Exemple #1
0
    def read(self, xml, identifier):
        """
            Parse a JATS/NLM (PubMed) XML string and build a SciDoc from it.

            :param xml: the complete XML of the paper
            :type xml: basestring
            :param identifier: a name identifying the document, e.g. its file
                        name. Only the base name is stored: any directory
                        part is stripped with os.path.basename
            :type identifier: basestring
            :returns: the populated :class:`SciDoc <SciDoc>`. If the XML has
                        no <body>, the document is returned early with only
                        the filename, the detected citation style, an error
                        message and a GUID set
            :rtype: SciDoc
        """
        # Workaround for a "bug" in BeautifulStoneSoup with "sec" tags.
        # Note this is assigned on the parser class, not on an instance.
        BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
        # NOTE: the fixNumberCitationsXML(xml) preprocessing step is disabled
        parsed = BeautifulStoneSoup(xml)

        # The SciDoc that will hold the paper
        doc = SciDoc()
        doc_metadata = doc["metadata"]
        doc_metadata["filename"] = os.path.basename(identifier)
        doc_metadata["original_citation_style"] = detectCitationStyle(xml)

        paper_body = parsed.find("body")
        if not paper_body:
            # TODO: Make the error handling less terrible
            debugAddMessage(doc, "error", "NO <BODY> IN THIS PAPER! file: "+identifier)
            doc["metadata"]["guid"] = cp.Corpus.generateGUID()
            return doc

        # Metadata comes first: the GUID is generated from it
        self.loadJATSMetadataFromPaper(doc, parsed)
        doc_metadata["guid"] = cp.Corpus.generateGUID(doc_metadata)

        # References live in <back>/<ref-list>; anything else found in <back>
        # (e.g. appendices) is ignored for now
        back_matter = parsed.find("back")
        reference_list = back_matter.find("ref-list") if back_matter else None
        if reference_list:
            for reference in reference_list.findAll("ref"):
                self.loadJATSReference(reference, doc)

        doc.updateReferences()

        # Abstract, then every top-level <sec> directly under <body>
        self.loadJATSAbstract(parsed, doc)

        top_level_sections = paper_body.findChildren("sec", recursive=False)
        for section in top_level_sections:
            self.loadJATSSection(section, doc, "root")

        doc.updateAuthorsAffiliations()
        return doc
Exemple #2
0
    def read(self, xml, identifier):
        """
            Load a PaperXML into a SciDoc.

            Args:
                xml: full xml string
                identifier: an identifier for this document, e.g. file name
                        Important: supply an actual path so that we can check
                        for the metadata in bibtexml
                        If an actual full path, the path will be removed from it
                        when stored
            Returns:
                SciDoc instance. The document is returned early, with an
                error message attached and before any GUID is assigned here,
                when the XML has no <body>, when the loaded metadata has no
                author surnames, or when it has no title.
        """
        # NOTE: the BeautifulStoneSoup.NESTABLE_TAGS["sec"] workaround is
        # deliberately not applied for this format.
        xml = self.cleanUpPaperXML(xml)
        soup = BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        # Create a new SciDoc to store the paper
        newDocument = SciDoc()
        metadata = newDocument["metadata"]
        metadata["filename"] = os.path.basename(identifier)

        body = soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: "+identifier)
            return newDocument

        # Load metadata, either from corpus or from file
        self.loadPaperMetadata(newDocument, soup, identifier)

        # NOTE(review): both checks below only match an empty *list*; if the
        # metadata loader can leave these fields as "" or None they will not
        # trigger -- confirm against SciDoc's defaults.
        if metadata["surnames"] == []:
            debugAddMessage(newDocument, "error", "NO SURNAMES OF AUTHORS file: "+identifier)
            return newDocument

        if metadata["title"] == []:
            debugAddMessage(newDocument, "error", "NO TITLE file: "+identifier)
            return newDocument

        metadata["guid"] = cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        references = body.find("references")
        if references:
            self.loadPaperReferences(references, newDocument)

        newDocument.updateReferences()

        # Collect the top-level sections before the abstract is loaded, in
        # the same order as the original implementation.
        sections = body.findChildren("section", recursive=False)

        # The citation style is hard-coded: automatic detection is disabled
        # because there is no good detection algorithm for AFI. The text
        # sample that used to be rendered from the first sections for that
        # detection was never used, so it is no longer computed.
        metadata["original_citation_style"] = "APA"

        # Load Abstract
        self.loadPaperAbstract(soup, newDocument)

        for sec in sections:
            self.loadPaperSection(sec, newDocument, "root")

        # Refresh references again now that the sections have been loaded
        newDocument.updateReferences()
        newDocument.updateAuthorsAffiliations()
        return newDocument