コード例 #1
0
ファイル: read_jatsxml.py プロジェクト: danieldmm/minerva
    def loadJATSReference(self, ref, doc):
        """
            Load a reference from the bibliography section.

            A new reference is obtained from ``doc.addReference()`` and filled
            with the authors, surnames, title, publication name/type,
            identifiers (doi, pmid, pmcid) and external links found in ``ref``.
            When ``ref`` contains no <element-citation>, <mixed-citation> or
            <citation> node, only authors, surnames, an empty link list, the
            placeholder title and the raw xml are stored.

            :param ref: xml node for reference element (assumed to be a
                BeautifulSoup-style tag offering ``find``, ``findAll`` and ``get``)
            :param doc: :class `SciDoc <SciDoc>` instance we're loading this for
            :returns: dict with the new loaded reference
            :rtype: dict
        """
        # The id is optional further down, so a <ref> without one must not
        # raise here (``ref["id"]`` would raise KeyError).
        original_id = ref.get("id")

        # Only the <citation> variant names its type attribute "citation-type"
        citation_type_key = "publication-type"
        element = ref.find("element-citation") or ref.find("mixed-citation")
        if not element:
            element = ref.find("citation")
            if element:
                citation_type_key = "citation-type"

        authorlist = []
        surnames = []

        author_group = ref.find("person-group", {"person-group-type": "author"})
        collab = None
        if not author_group:
            collab = ref.find("collab")
            if not collab:
                # neither an author group nor a collaboration: look for <name>
                # nodes anywhere inside the reference
                author_group = ref

        if author_group:
            authors = author_group.findAll("name")
        else:
            authors = None
            if collab:
                # A collaboration is recorded exactly once, with its whole
                # text as the family name. It used to be appended a second
                # time after being run through the personal-name guesser.
                authorlist.append({"family": collab.text, "given": ""})
                surnames.append(collab.text)

        if authors:
            for author in authors:
                surname = author.find("surname")
                if surname:
                    surnames.append(surname.text)
                given_names = author.find("given-names")
                if given_names and surname:
                    authorlist.append({"given": given_names.text, "family": surname.text})
                else:
                    # incomplete markup: guess the name parts from the plain text
                    authorlist.append(guessNamesOfPlainTextAuthor(cleanxml(repr(author))))
        else:
            # no <name> nodes: still collect every <surname> in the reference
            surnames.extend(node.text for node in ref.findAll("surname"))

        newref = doc.addReference()
        newref["authors"] = authorlist
        newref["surnames"] = surnames
        newref["external_links"] = []
        newref["title"] = "<NO TITLE>"
        if not element:
            # Nothing structured to read: keep the raw xml and stop.
            # NOTE(review): the original id is not applied on this path --
            # confirm whether that is intended.
            newref["xml"] = repr(ref)
            return newref

        article_title = ref.find("article-title")
        source = element.find("source")
        newref["publication-name"] = source.text if source else ""

        try:
            newref["publication-type"] = element[citation_type_key]
        except KeyError:
            # the citation element carries no type attribute
            newref["publication-type"] = "unknown"

        # NOTE(review): any other publication type keeps the "<NO TITLE>"
        # placeholder even when an <article-title> exists -- confirm.
        publication_type = newref["publication-type"]
        if publication_type == "book":
            if source:
                newref["title"] = source.text
            elif article_title:
                newref["title"] = article_title.text
        elif publication_type == "journal":
            if article_title:
                newref["title"] = article_title.text
        elif publication_type == "other":
            if article_title:
                newref["title"] = article_title.text
            elif source:
                newref["title"] = source.text
            self.extractInfoFromPatentText(repr(ref), newref)

        self.loadJATSmetadataIfExists(element, ["volume", "issue", "fpage", "lpage", "year"], newref)

        # <pub-id> type -> key under which its text is stored in the reference
        for pub_id_type, key in (("doi", "doi"), ("pmid", "pmid"), ("pmc", "pmcid")):
            pub_id = element.find("pub-id", {"pub-id-type": pub_id_type})
            if pub_id:
                newref[key] = pub_id.text

        if newref["title"] == "":
            newref["title"] = "<NO TITLE FOUND>"

        for extlink in element.findAll("ext-link", {"ext-link-type": "uri"}):
            # skip links without an href instead of failing the whole reference
            href = extlink.get("xlink:href")
            if href is not None:
                newref["external_links"].append(href)

        if original_id and self.USE_ORIGINAL_REF_ID:
            newref["id"] = original_id

        return newref
コード例 #2
0
ファイル: read_parscit.py プロジェクト: danieldmm/minerva
    def loadParsCitReference(self, reference):
        """
            Build the metadata dict for one XML <citation> node.

            Args:
                reference: XML node; queried through ``find``/``findAll``,
                    and the text of the returned nodes is read from ``.text``
            Returns:
                metadata of reference: a dict with the keys "title", "year",
                "volume", "pages", "journal", "publisher", "location",
                "raw_string", "institution", "authors" and "surnames", plus
                "publication" and "type" when a <journal> or <booktitle>
                node is found
        """
        # (key in the returned dict, XML tag whose text supplies its value)
        tag_for_key = (
            ("title", "title"),
            ("year", "date"),
            ("volume", "volume"),
            ("pages", "pages"),
            ("journal", "journal"),
            ("publisher", "publisher"),
            ("location", "location"),
            ("raw_string", "rawstring"),
            ("institution", "institution"),
        )

        metadata = {}
        for key, tag in tag_for_key:
            found = reference.find(tag)
            # a missing tag gives ""; dots at either end of the text are dropped
            metadata[key] = found.text.strip(".") if found else ""

        # often the title will end up as anything else: journal first, then location
        if len(metadata["title"]) < 2:
            for fallback in ("journal", "location"):
                if len(metadata[fallback]) > 2:
                    metadata["title"] = metadata[fallback]
                    metadata[fallback] = ""
                    break

        # Manual fixes for venue text that ended up inside the title. Each
        # entry is (pattern, index of the group holding the misplaced text):
        # first "In Proceedings of...", then thesis/dissertation suffixes.
        title_fixes = (
            (re.compile(r"([,.] ?In[\:]? (\w.*))"), 2),
            (re.compile(r", ((?:(?:Doctoral|MSc)? ?[Tt]hesis|(?:\w+|([A-Z]\. ?)+)[Dd]issertation).*)", flags=re.IGNORECASE), 1),
        )
        for pattern, group_index in title_fixes:
            found = pattern.search(metadata["title"])
            if found:
                # the misplaced text is put in front of whatever journal text exists
                metadata["journal"] = found.group(group_index) + metadata["journal"]
                metadata["title"] = pattern.sub("", metadata["title"]).strip(" ,")

        # remove hanging ". In" at the end of the title
        metadata["title"] = re.sub(r"[.,;] ?In ?$", "", metadata["title"])

        # when both nodes exist, "booktitle" is processed last and wins
        for atype in ("journal", "booktitle"):
            found = reference.find(atype)
            if found:
                metadata["publication"] = found.text.strip(".,: ")
                #TODO: expand this to inproceedings, etc.
                metadata["type"] = atype

        metadata["authors"] = [
            guessNamesOfPlainTextAuthor(author_node.text)
            for author_node in reference.findAll("author")
        ]
        metadata["surnames"] = [author["family"] for author in metadata["authors"]]
        return metadata