def processSentenceText(self, s, doc):
        """
            Overridden hook: return the sentence as POS-tagged XML for the AZPrime classifier.

            A non-empty value already stored under ``s["pos_tagged"]`` is returned
            unchanged. Otherwise ``<cit id=.../>`` tags in ``s["text"]`` are rewritten
            to ``_CIT_<id>_`` placeholders, the text is run through ``cleanxml`` and
            ``escapeText``, then tokenized and POS-tagged. Placeholder tokens are
            emitted as ``<REF>`` elements and every other token as a ``<W>`` element
            whose ``C`` attribute holds its tag.

            :param s: sentence dict; "text" is read, "pos_tagged" is read and
                (only when ``self.save_pos_tags`` is set) written
            :param doc: document the sentence belongs to (not used by this method)
            :returns: the elements joined by single spaces
        """
        cached=s.get("pos_tagged","")
        if cached != "":
            return cached

        # Tokens that consist solely of one XML-special character get replaced by their entity
        entity_for={"<":"&lt;", ">":"&gt;", "&":"&amp;"}

        def render(word, tag):
            word=entity_for.get(word, word)
            if word.startswith("_CIT_"):
                return "<REF>%s</REF>" % word
            # Tags made of single quotes cannot sit inside a single-quoted attribute
            if tag in ["''","'"]:
                return "<W C=\"%s\">%s</W>" % (tag,word)
            return "<W C='%s'>%s</W>" % (tag,word)

        with_placeholders=re.sub(r"<cit\sid=\"?(.+?)\"?\s*?/>",r"_CIT_\1_",s["text"], flags=re.DOTALL|re.IGNORECASE)
        cleaned=escapeText(cleanxml(with_placeholders))
        tagged=pos_tag(word_tokenize(cleaned))

        xml_string=" ".join(render(pair[0], pair[1]) for pair in tagged)
        if self.save_pos_tags:
            s["pos_tagged"]=xml_string
        return xml_string
Example #2
0
    def loadJATSReference(self, ref, doc):
        """
            Load a reference from the bibliography section.

            The citation element is looked up as ``element-citation``, then
            ``mixed-citation``, then ``citation`` (whose type attribute is
            ``citation-type`` rather than ``publication-type``). Authors come from
            the author ``person-group`` if there is one, otherwise from a ``collab``
            element, otherwise from any ``name`` elements under the reference.

            When no citation element is found, the reference is returned with only
            authors, surnames, an empty link list, the placeholder title and the raw
            xml stored under "xml".

            :param ref: xml node for reference element
            :param doc: :class `SciDoc <SciDoc>` instance we're loading this for
            :returns: dict with the new loaded reference
            :rtype: dict
        """
        xmltext=repr(ref)
        authorlist=[]
        surnames=[]
        # .get() rather than ref["id"]: a reference without an id attribute must
        # not abort loading; the value is only used if it is present (see the end)
        original_id=ref.get("id")

        citation_type_key="publication-type"

        element=ref.find("element-citation")
        if not element:
            element=ref.find("mixed-citation")
            if not element:
                element=ref.find("citation")
                if element:
                    citation_type_key="citation-type"

        authors=None
        author_group=ref.find("person-group",{"person-group-type":"author"})
        if author_group:
            authors=author_group.findAll("name")
        else:
            collab=ref.find("collab")
            if collab:
                # A collaborative author is recorded exactly once, with the whole
                # name as the family name (it used to be appended twice)
                authorlist.append({"family":collab.text, "given":""})
                surnames.append(collab.text)
            else:
                # No explicit author group: take every <name> under the reference
                authors=ref.findAll("name")

        if authors:
            for a in authors:
                surname=a.find("surname")
                given_names=a.find("given-names")
                if surname:
                    surnames.append(surname.text)
                if given_names and surname:
                    authorlist.append({"given": given_names.text, "family": surname.text})
                else:
                    # Incomplete markup: fall back to guessing from the plain text
                    authorlist.append(guessNamesOfPlainTextAuthor(cleanxml(repr(a))))
        else:
            surnames.extend(s.text for s in ref.findAll("surname"))

        newref=doc.addReference()
        newref["authors"]=authorlist
        newref["surnames"]=surnames
        newref["external_links"]=[]
        newref["title"]="<NO TITLE>"
        if not element:
            newref["xml"]=xmltext
            return newref

        article_title=ref.find("article-title")
        source=element.find("source")
        newref["publication-name"]=source.text if source else ""

        publication_type=element.get(citation_type_key, "unknown")
        newref["publication-type"]=publication_type

        # Which nodes may supply the title, in order of preference, per type.
        # Any other type keeps the "<NO TITLE>" placeholder.
        if publication_type=="book":
            title_candidates=[source, article_title]
        elif publication_type=="journal":
            title_candidates=[article_title]
        elif publication_type=="other":
            title_candidates=[article_title, source]
        else:
            title_candidates=[]

        for candidate in title_candidates:
            if candidate:
                newref["title"]=candidate.text
                break

        if publication_type=="other":
            self.extractInfoFromPatentText(xmltext, newref)

        self.loadJATSmetadataIfExists(element,["volume","issue","fpage","lpage","year"],newref)

        # (pub-id-type attribute value, key under which it is stored)
        for pub_id_type, key in (("doi","doi"), ("pmid","pmid"), ("pmc","pmcid")):
            pub_id=element.find("pub-id",{"pub-id-type":pub_id_type})
            if pub_id:
                newref[key]=pub_id.text

        if newref["title"] == "":
            newref["title"]="<NO TITLE FOUND>"

        for extlink in element.findAll("ext-link",{"ext-link-type":"uri"}):
            # Skip links that carry no href instead of raising KeyError
            href=extlink.get("xlink:href")
            if href is not None:
                newref["external_links"].append(href)

        if original_id and self.USE_ORIGINAL_REF_ID:
            newref["id"]=original_id

        return newref