Example #1
	def trigger_w(self, msg):
		"Usage: w <search term>. Prints a short description of the corresponding wikipedia article."
		if len(msg.args) == 0:
			self.bot.notice(msg.nick, "Please specify a search term")
			return

		params = {
			'action': 'opensearch',
			'format': 'xml',
			'limit': '2',
			'search': ' '.join(msg.args)
		}
		url = 'http://{:s}.wikipedia.org/w/api.php'.format(self.language)

		response = BeautifulStoneSoup(requests.post(url, data=params).text)

		# BS4 tag matching is case-sensitive for XML, hence all the regexes.
		if response.find(re.compile('text', re.I)):
			index = 0
			if "may refer to:" in response.find(re.compile('description', re.I)).string:
				index = 1

			info = response.find_all(re.compile('description', re.I))[index].string.strip()
			url = response.find_all(re.compile('url', re.I))[index].string

			short_url = self.shorten(url)

			message = u"\002Wikipedia ::\002 {} \002::\002 {}".format(info, short_url)
			self.bot.privmsg(msg.channel, message)
		else:
			self.bot.privmsg(msg.channel, "{}: no articles were found.".format(' '.join(msg.args)))
Example #2
    def read(self, xml, identifier):
        """
            Load a JATS/NLM (PubMed) XML into a SciDoc.

            :param xml: full xml string
            :type xml: basestring
            :param identifier: an identifier for this document, e.g. file name
                        If an actual full path, the path will be removed from it
                        when stored
            :type identifier: basestring
            :returns: :class:`SciDoc <SciDoc>` object
            :rtype: SciDoc
        """
        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
        #xml=fixNumberCitationsXML(xml)
        soup = BeautifulStoneSoup(xml)

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)
        metadata["original_citation_style"] = detectCitationStyle(xml)

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error",
                            "NO <BODY> IN THIS PAPER! file: " + identifier)
            newDocument["metadata"]["guid"] = cp.Corpus.generateGUID()
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadJATSMetadataFromPaper(newDocument, soup)
        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        back = soup.find("back")
        if back:
            ref_list = back.find("ref-list")
            # other things in <back> like appendices: ignore them for now
            if ref_list:
                for ref in ref_list.findAll("ref"):
                    self.loadJATSReference(ref, newDocument)

        newDocument.updateReferences()

        # Load Abstract
        self.loadJATSAbstract(soup, newDocument)

        for sec in body.findChildren("sec", recursive=False):
            self.loadJATSSection(sec, newDocument, "root")

        newDocument.updateAuthorsAffiliations()
        return newDocument
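A hypothetical usage sketch for this reader; the JatsXMLReader class name and the file path are illustrative assumptions, and only the read(xml, identifier) signature comes from the code above:

reader = JatsXMLReader()  # hypothetical name for the class defining read()
with open("PMC123456.nxml") as f:  # illustrative file name
    doc = reader.read(f.read(), "PMC123456.nxml")
print(doc["metadata"]["guid"])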
Example #3
    def read(self, xml, filename):
        """
            Load a document from the Athar corpus

            Args:
                xml: full xml string
                filename: name of the file the document came from
        """
        ##        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
        ##        BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

        soup = BeautifulStoneSoup(xml)

        paper_data_node = soup.find("div", {"class": "dstPaperData"})
        paper_data = {
            "id": paper_data_node.text,
            "title": "",
            "authors": "",
        }
        title = paper_data_node.find("div", {"class": "dstPaperTitle"})
        if title:
            paper_data["title"] = title.text

        authors = paper_data_node.find("div", {"class": "dstPaperAuthors"})
        if authors:
            author_chunks = authors.text.split(";")
            author_list = []
            for author in author_chunks:
                chunks = author.split(",")
                author_list.append({"given": chunks[1].strip(),
                                    "family": chunks[0].strip()})
            paper_data["authors"] = author_list


##        print(paper_data)

        all_contexts = []
        all_docs = []
        document_nodes = soup.findAll("table", {"class": "srcPaper"})
        for index, document_node in enumerate(document_nodes):
            try:
                doc, contexts = self.loadDocumentNode(document_node,
                                                      paper_data, index)
                all_docs.append(doc)
                all_contexts.extend(contexts)
            except ValueError:
                print("Error:", sys.exc_info()[1])
                break
        return all_docs, all_contexts
Example #4
    def get_info(self, account):
        request = urllib.request.Request(self.info_url)
        response = self.opener.open(request)
        content = response.read().decode(self.character).encode("utf-8")

        file = open('new/' + account + '.html', 'wb')
        file.write(content)
        file.close()

        detail_html = BeautifulStoneSoup(content)
        img_url = detail_html.find(id="Student11_Image1")
        link = img_url.get('src')
        link = link[2:]
        pto_url = 'http://szjy.swun.edu.cn/Sys/SystemForm' + link
        pto_url = pto_url.replace('照片', '%D5%D5%C6%AC')
        urllib.request.install_opener(opener=self.opener)
        img_name = 'photos/' + account + '.jpg'
        urllib.request.urlretrieve(pto_url, img_name)
        self.cookie.clear()
Example #5
    def parse_data(self, url):
        '''Collects the data into a dictionary'''
        request = self.session.get(url, headers=self.headers)
        if request.status_code == 200:
            soup = BeautifulStoneSoup(request.content)
            if not (bool(soup.find('div', {"class": 'error404__text'}))
                    or bool(soup.find('div', {"class": 'nothing-search'}))
                    or bool(soup.find('div', {"id": 'productList'}))):

                try:
                    name_of_product = soup.find('h1').next_element
                except Exception:
                    raise Format_Exeption('name', url)

                try:
                    price_for_all = soup.find(
                        'span', {
                            "class": "item__price item__price--normal-left"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_all = "Нет в наличии"
                try:
                    price_for_registered = soup.find(
                        'span', {
                            "class": "item__price item__price--red-bold"
                        }).next_element.replace(" ", "").replace("\n", "")
                except Exception:
                    price_for_registered = "Нет в наличии"

                try:
                    reference = soup.findAll(
                        'div', {"class": "item__card-info-articul"})
                    reference = reference[1].next_element
                    reference = str(reference).split()[2].replace("-", '')
                except Exception:
                    reference = "Нет номера"
                final = {
                    "name_of_product": name_of_product,
                    "price_for_all": price_for_all,
                    "price_for_registered": price_for_registered,
                    "reference": reference,
                    "url": url
                }
                return final
            else:
                print("Не тот формат, вот ссылка {0}".format(url))
                raise Format_Exeption
        else:
            raise Connection_Exception
Example #6
    def parse_data(self, url):
        '''Collects the data into a dictionary'''
        request = self.session.get(url, headers=self.headers)
        if request.status_code == 200:
            soap = BeautifulStoneSoup(request.content)
            if not (bool(soap.find('table', {"class": 'map-columns'})) or bool(
                    soap.find('div', {"class": 'col-md-12 catalog-items'}))):
                try:
                    name_of_product = soap.find('h1', {
                        'class': 'title'
                    }).next_element
                except Exception:
                    raise Format_Exeption('name', url)

                try:
                    price_for_all = soap.find('div', {
                        "class": "price"
                    }).next_element.replace(" ", "").replace("\n", "")[:-1]
                except Exception:
                    price_for_all = "Нет в наличии"
                try:
                    price_for_rozn = soap.find('div', {
                        "class": "rozn-price"
                    }).next_element.replace(" ", "").replace("\n", "")[:-1]
                    price_for_rozn = ''.join(
                        filter(str.isdigit, price_for_rozn))
                except Exception:
                    price_for_rozn = "Нет в наличии"
                try:
                    reference = soap.find('div', {
                        'class': 'article'
                    }).next_element.replace("-", '')[9:]
                except Exception:
                    reference = "Нет номера"

                final = {
                    "name_of_product": name_of_product,
                    "price_for_all": price_for_all,
                    "price_for_registered": price_for_rozn,
                    "reference": reference,
                    "url": url
                }
                return final
            else:
                print("Не тот формат, вот ссылка {0}".format(url))
                raise Format_Exeption
        else:
            raise Connection_Exception
Example #7
def processCitationXML(intext):
    """
        Extract the authors, date of an in-text citation <ref> from XML dom
    """
    if isinstance(intext, six.string_types):
        xml = BeautifulStoneSoup(intext)
    else:
        xml = intext

    if not xml:
        return None, None
    authors = []
    for a in xml.findAll("refauthor"):
        authors.append(a.text)
    date = xml.find("date")
    if date:
        date = cleanxml(date.__repr__())
    else:
        date = ""

    if authors == [] or date == "":
        return None, None
    else:
        return authors, date
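A hedged usage sketch; the <ref> fragment below is a guess at the expected input shape, inferred from the refauthor and date lookups inside the function:

fragment = "<ref><refauthor>Smith, J.</refauthor><date>1998</date></ref>"
authors, date = processCitationXML(fragment)
# Expected, assuming cleanxml() strips the <date> markup:
# authors == ["Smith, J."], date == "1998"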
Example #8
def loadSciXML(filename):
    """
        Load a Cambridge-style SciXML

    """
    def extractSentenceText(s, newSent_id, doc):
        """
            Returns a printable representation of the sentence where all references are now placeholders with numbers
        """
        global ref_rep_count
        ref_rep_count = 0

        newSent = doc.element_by_id[newSent_id]

        def repFunc(match):
            """
                Replaces one matched <ref> element with its numbered <CIT /> placeholder.
            """
            global ref_rep_count
            ref_rep_count += 1

            res = " <CIT ID=" + str(
                doc.citation_by_id[newSent["citations"][ref_rep_count -
                                                        1]]["id"]) + " />"
            return res

        text = s.renderContents()
        text = re.sub(r"<ref.*?</ref>", repFunc, text, 0,
                      re.IGNORECASE | re.DOTALL)
        text = re.sub(r"</?refauthor>", "", text, 0, re.IGNORECASE | re.DOTALL)
        return text

    def loadStructureProcessPara(p, newDocument, parent):
        newPar_id = newDocument.addParagraph(parent)

        for s in p.findChildren("s"):
            newSent_id = newDocument.addSentence(newPar_id, "")
            newSent = newDocument.element_by_id[newSent_id]
            loadAttributesIfPresent(s, ["ia", "az", "refid"], newSent)
            refs = s.findAll("ref")
            num = len(newDocument["citations"]) + 1
            ##            for cit in citations:
            ##                r["citation_id"]=num
            ##                num+=1
            loaded_refs = [
                loadCitation(r, newSent_id, newDocument, parent) for r in refs
            ]

            newSent["citations"] = [aref["id"] for aref in loaded_refs]
            newSent["text"] = extractSentenceText(s, newSent_id, newDocument)
            # deal with multiple citations within a few characters of each
            # other: mark them as a cluster. TODO: actually cluster them
            newDocument.countMultiCitations(newSent)

        return newPar_id

    def loadStructureProcessDiv(div, newDocument):
        header = div.find("header")
        if not header:
            header_id = 0
            header_text = ""
        else:
            header_id = header["id"] or 0
            header_text = re.sub(r"</?header.*?>", "", header.__repr__())

        newSection_id = newDocument.addSection("root", header_text, header_id)

        for p in div.findAll("p"):
            newPar_id = loadStructureProcessPara(p, newDocument, newSection_id)

    def loadMetadataIfExists(branch, key, doc):
        meta = branch.find(key)
        if meta:
            doc["metadata"][key] = meta.text

    def loadAttributesIfPresent(branch, attributes, sent):
        """
            For each element in attributes, if present in branch, it is added to sent
        """
        for a in attributes:
            if a in branch:
                sent[a] = branch[a]

    def loadMetadata(newDocument, paper, fileno):
        """
            Does all the painful stuff of trying to recover metadata from inside a badly converted
            SciXML file
        """
        title = paper.findChildren("title")
        newDocument["metadata"]["title"] = title[0].text if len(
            title) > 0 else "NO TITLE"

        if fileno == "":
            fileno = paper.find("fileno").text

        newDocument["metadata"]["fileno"] = fileno

        authors = []
        meta = soup.find("metadata")
        if not meta:
            debugAddMessage(newDocument, "error",
                            "NO METADATA IN DOCUMENT! file:" + filename)
            return newDocument

        for a in meta.findChildren("author"):
            authors.append(processPlainTextAuthor(a.text))

        if authors == []:
            authorlist = soup.find("authorlist")
            if authorlist:
                for author in authorlist.findChildren("refauthor"):
                    authors.append(author.text)

                if authors == []:
                    authors = extractAuthorsFromAuthorlist(authorlist)

        appeared = meta.find("appeared")
        if appeared:
            loadMetadataIfExists(appeared, "conference", newDocument)
            loadMetadataIfExists(appeared, "year", newDocument)

        newDocument["metadata"]["authors"] = authors
        newDocument["metadata"]["year"] = meta.find("year").text

    def sanitizeString(s, maxlen=200):
        s = s.replace("\t", " ")
        s = s[:maxlen]
        return s

    def makeSureValuesAreReadable(newDocument):
        newDocument["metadata"]["title"] = sanitizeString(
            newDocument["metadata"]["title"])
        newAuthors = []
        for author in newDocument["metadata"]["authors"]:
            newAuthors.append(sanitizeString(author, 70))
        newDocument["metadata"]["authors"] = newAuthors

        newSurnames = []
        for surname in newDocument["metadata"]["surnames"]:
            newSurnames.append(sanitizeString(surname, 25))
        newDocument["metadata"]["surnames"] = newSurnames

        newDocument["metadata"]["year"] = sanitizeString(
            newDocument["metadata"]["year"])
        if "conference" in newDocument["metadata"]:
            newDocument["metadata"]["conference"] = sanitizeString(
                newDocument["metadata"]["conference"])

    def matchCitationsWithReferences(newDocument):
        """
            Match each citation with its reference
        """
        allcitations = []
        for s in newDocument.allsentences:
            for citation_id in s["citations"]:
                cit = newDocument.citation_by_id[citation_id]

                if cit["ref_id"] != 0:  # the citation already has a matching reference id in the original document, use it
                    match = findMatchingReferenceByOriginalId(
                        cit["ref_id"], newDocument)
                    if not match:
                        ##                        print cit
                        match = newDocument.matchReferenceById(cit["ref_id"])
                else:
                    # attempt to guess which reference the citation should point to
                    match = matchCitationWithReference(cit["original_text"],
                                                       newDocument)

                if match:
                    # whatever the previous case, make sure citation points to the ID of its reference
                    cit["ref_id"] = match["id"]
                    match["citations"].append(
                        cit["id"]
                    )  # add the citation ID to the reference's list of citations
                    cit.pop("authors", "")
                    cit.pop("date", "")
                    cit.pop("original_text", "")
                else:
                    debugAddMessage(
                        newDocument, "notes",
                        "NO MATCH for CITATION in REFERENCES: " +
                        cleanxml(cit["original_text"]) + ", ")
                    pass

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = scidoc.SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
    key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
    if key in cp.Corpus.metadata_index:
        metadata = cp.Corpus.metadata_index[key]
    else:
        metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno)
        debugAddMessage(newDocument, "error",
                        "PAPER NOT IN METADATA FILE! file: " + filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    makeSureValuesAreReadable(newDocument)

    # Load all references from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            newSent_id = newDocument.addSentence(newPar_id, s.text)
            loadAttributesIfPresent(s, ["ia", "az", "refid"],
                                    newDocument.element_by_id[newSent_id])

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Not working because of: unicode
    ##    for ref in newDocument["references"]:
    ##        k=ref.get("AZ",["NO AZ"])
    ##        print k, most_common(k)

    return newDocument
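extractSentenceText and repFunc above share state through a module-level ref_rep_count. A self-contained sketch of the same substitution technique using a closure instead of a global; all names here are illustrative, not from the source:

import re

def substitute_cits(text, cit_ids):
    """Replace each <ref>...</ref> span with a numbered <CIT /> placeholder."""
    state = {"n": 0}

    def rep(match):
        cit_id = cit_ids[state["n"]]
        state["n"] += 1
        return " <CIT ID={} />".format(cit_id)

    return re.sub(r"<ref.*?</ref>", rep, text, 0, re.IGNORECASE | re.DOTALL)

print(substitute_cits("see <ref>Smith 98</ref> and <ref>Jones 01</ref>", [3, 4]))
# prints: see  <CIT ID=3 /> and  <CIT ID=4 />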
Example #9
                  {% if sense.examples|length > 0 %}
                    <div>
                        <ul style="" list-style-type: disc; "">
                    {% for example in sense.examples %}
                            <li style="" text-align: left; ""><font color="" #999999 "">{{ example }}</font></li>
                    {% endfor %}
                        </ul>
                    </div>
                  {% else %}
                    <div style="" text-align: left; "">&nbsp;</div>
                  {% endif %}
                {% endfor %}
                {% endfor %}
                </div>
            </div>
        '''

    def __call__(self, word: dict) -> str:
        html = Template(self._tmpl).render(word=word)
        return html.replace('\n', '')


if __name__ == '__main__':
    converter = AnkiHtmlConverter()
    for filename in glob.glob('htmls/*.html'):
        soup = BeautifulStoneSoup(open(filename))
        if not soup.find(class_='ldoceEntry Entry'):
            continue
        word = os.path.basename(filename).rsplit('.', 1)[0]
        print(word, f'"{converter(parse(word, soup))}"', sep='\t')
Example #10
def loadAZSciXML(filename):
    """
        Load a Cambridge-style SciXML

    """

    # main loadSciXML
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata, either from corpus or from file
##    key=cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
##    if cp.Corpus.metadata_index.has_key(key):
##        metadata=cp.Corpus.metadata_index[key]
##    else:
    metadata = None

    if metadata:
        newDocument["metadata"]["conference"] = ""
        for field in metadata:
            newDocument["metadata"][field] = metadata[field]
    else:
        loadMetadata(newDocument, paper, fileno, soup)


##        debugAddMessage(newDocument,"error","PAPER NOT IN METADATA FILE! file: "+filename)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
        newDocument["metadata"])

    # Clean up potential weird text in XML metadata
    ##    makeSureValuesAreReadable(newDocument) # remove if not dealing with crap conversion stuff

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)

        for s in abstract.findChildren("a-s"):
            addNewSentenceAndProcessRefs(
                s, newDocument, newPar_id,
                newSection_id)  # deals with all of the adding of a sentence

        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    matchCitationsWithReferences(newDocument)

    # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Not working because of: unicode
    ##    for ref in newDocument["references"]:
    ##        k=ref.get("AZ",["NO AZ"])
    ##        print k, most_common(k)

    return newDocument
Example #11
    def read(self, filename, identifier):
        """
        """
        # main loadSciXML
        text = loadFileText(filename)
        soup = BeautifulStoneSoup(text)

        fileno = soup.find("docno")
        fileno = fileno.text if fileno else ""

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        newDocument["metadata"]["filename"] = os.path.basename(filename)
        newDocument["metadata"]["filepath"] = filename

        paper = soup.find("paper")
        if not paper:
            debugAddMessage(newDocument, "error",
                            "NO <PAPER> IN THIS PAPER! file: " + filename)
            return newDocument

        # Load metadata, either from corpus or from file
        key = cp.Corpus.getFileUID(newDocument["metadata"]["filename"])
        if key in cp.Corpus.metadata_index:
            metadata = cp.Corpus.metadata_index[key]
        else:
            metadata = None

        if metadata:
            newDocument["metadata"]["conference"] = ""
            for field in metadata:
                newDocument["metadata"][field] = metadata[field]
        else:
            self.loadMetadata(newDocument, paper, fileno)
            debugAddMessage(newDocument, "error",
                            "PAPER NOT IN METADATA FILE! file: " + filename)

        newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(
            newDocument["metadata"])

        # Clean up potential weird text in XML metadata
        self.makeSureValuesAreReadable(newDocument)

        # Load all references from the XML
        for ref in soup.findAll("reference"):
            self.processReferenceXML(ref, newDocument)

        # Load Abstract
        abstract = soup.find("abstract")
        if not abstract:
            debugAddMessage(newDocument, "error",
                            "CANNOT LOAD ABSTRACT! file: " + filename + "\n")
            # TODO: LOAD first paragraph as abstract
        else:
            newSection_id = newDocument.addSection("root", "Abstract")
            newPar_id = newDocument.addParagraph(newSection_id)

            for s in abstract.findChildren("a-s"):
                newSent_id = newDocument.addSentence(newPar_id, s.text)
                self.loadAttributesIfPresent(
                    s, ["ia", "az", "refid"],
                    newDocument.element_by_id[newSent_id])

            newDocument.abstract = newDocument.element_by_id[newSection_id]

        for div in soup.findAll("div"):
            self.loadStructureProcessDiv(div, newDocument)

        # try to match each citation with its reference
        self.matchCitationsWithReferences(newDocument)

        # "in press", "forthcoming", "submitted", "to appear" = dates to fix & match
    # Not working because of: unicode
        ##    for ref in newDocument["references"]:
        ##        k=ref.get("AZ",["NO AZ"])
        ##        print k, most_common(k)

        return newDocument
Example #12
def loadAZannot(filename):
	"""
		Load an AZ-annotated document from the Teufel corpus into a "scidoc" JSON file
	"""

	def loadStructureProcessPara(p, glob):
		glob["p"]+=1
		newPar={"type":"p", "id":glob["p"]}
		newPar["sentences"]=[]

		for s in p.findChildren("s"):
			newSent={"type":"s","text":s.text,"ia":s.get("ia",""),"az":s.get("az",""),"id":glob["s"],"refs":[]}
			newSent["refs"]=[{"text":r.text, "link":0} for r in s.findAll("ref")]
			glob["s"]+=1
			newPar["sentences"].append(newSent)

		return newPar

	def loadStructureProcessDiv(div, doc, glob):
		header=div.find("header")

		newSection={"header":header, "paragraphs":[], "id":glob["sect"]}
		glob["sect"]+=1
		for p in div.findAll("p"):
			newPar=loadStructureProcessPara(p,glob)
			newSection["paragraphs"].append(newPar)

		doc["sections"].append(newSection)

	glob={"sect":0,"p":0,"s":0}


	f=codecs.open(filename,"rb","utf-8", errors="ignore")
	lines=f.readlines()
	text="".join(lines)
	soup=BeautifulStoneSoup(text)

	paper=soup.find("paper")
	title=paper.find("title").text

	newDocument={"title":title}
	newDocument["sections"]=[]
	newDocument["references"]=[]
	newDocument["metadata"]={"fileno":paper.find("fileno").text}

	authors=[]
	meta=soup.find("metadata")
	for a in meta.findChildren("author"):
		authors.append(processPlainTextAuthor(a.text))

	newDocument["authors"]=authors
	newDocument["year"]=meta.find("year").text

	for ref in soup.findAll("reference"):
		processReference(ref, newDocument)

	newSection={"header":"Abstract", "paragraphs":[], "id":glob["sect"]}
	glob["sect"]+=1
	newSection["paragraphs"].append({"type":"p", "sentences":[], "id":glob["p"]})
	glob["p"]+=1

	abstract=soup.find("abstract")
	for s in abstract.findChildren("a-s"):
		newSent={"type":"s","text":s.text,"ia":s["ia"],"az":s["az"],"id":glob["s"], "refs":[]}
		newSection["paragraphs"][-1]["sentences"].append(newSent)
		glob["s"]+=1

	newDocument["sections"].append(newSection)

	for div in soup.findAll("div"):
		loadStructureProcessDiv(div, newDocument, glob)

	sentences=getListOfSentenceObjects(newDocument)
	for s in sentences:
		for ref in s["refs"]:
			match=matchInTextReference(ref["text"],newDocument)
			if match:
##				print ref["text"]," -> ", match["authors"], match["year"]
##				print s.get("az","NO AZ")
##				print s.get("ia","NO IA")
				azs.append(s.get("az","NO AZ"))
				ias.append(s.get("ia","NO IA"))
				match["AZ"]=match.get("AZ",[])
				match["AZ"].append(s.get("az","OTH"))
				match["IA"]=match.get("AZ",[])
				match["IA"].append(s.get("az",""))
			else:
				print("NO MATCH for CITATION in REFERENCES:", ref["text"])
				pass

## "in press", "forthcoming", "submitted", "to appear"
# Not working because of: unicode
##	for ref in newDocument["references"]:
##		k=ref.get("AZ",["NO AZ"])
##		print k, most_common(k)

	return newDocument