Code example #1
File: read_paperxml.py  Project: danieldmm/minerva
 def __init__(self, parscit_api_url="http://127.0.0.1:5000/parscit/"):
     self.parscit_client=ParsCitClient(parscit_api_url)
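
A minimal usage sketch for this constructor, assuming a ParsCit REST service is listening at the endpoint (the URL shown is the snippet's own default):

reader = PaperXMLReader(parscit_api_url="http://127.0.0.1:5000/parscit/")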
Code example #2
File: read_paperxml.py  Project: danieldmm/minerva
import os
import re

# The following names come from the minerva project and its dependencies; their
# import lines are not part of this snippet: ParsCitClient, BaseSciDocXMLReader,
# SciDoc, BibTeXMLParser, BibliographyDataError, BeautifulStoneSoup
# (BeautifulSoup 3), normalizeUnicode, normalizeAuthor, normalizeTitle,
# sentenceSplit, annotateCitationsInSentence, matchCitationWithReference,
# debugAddMessage, CITATION_FORM, cp.
class PaperXMLReader(BaseSciDocXMLReader):
    """
        Reader class for Paper/NLM XML

        read()
    """
    def __init__(self, parscit_api_url="http://127.0.0.1:5000/parscit/"):
        self.parscit_client=ParsCitClient(parscit_api_url)

    def cleanUpPaperXML(self, xml):
        """
            Cleans up some messy stuff in PaperXML
        """
        xml=xml.replace(u"\xad","") # strip soft hyphens left over from badly processed end-of-line hyphenation
        xml=normalizeUnicode(xml)
        return xml

    def cleanUpReferencesText(self, reftext):
        """
            Same as above, but called only on the references text, so it takes less time
        """
        reftext=re.sub(r"Pro­ceedings","Proceedings",reftext)
        reftext=re.sub(r"[Ii]n\s?\s?[Pp]roceedings\s+?of\s+?","In Proceedings of ", reftext)
        reftext=re.sub(r"([Ii]n)([A-Z\d])",r"\1 \2", reftext)
        # Break reference lines after "In proceedings of [x]. Name, Name, date..."
##        reftext=re.sub(r"((ceedings\s+of.{4,40}|pages.{4,12})\.\s*)([A-Z][a-z]{1,14}\s+[A-Z].{1,14})",r"\1 \n\n \3",reftext)
        # add space after commas. Seems to affect conversion quality a lot
        reftext=re.sub(r"([,;\"])([^\s])",r"\1 \2",reftext)
        # make sure there's space after a full stop
        reftext=re.sub(r"([a-z\?\!]\.)([A-Z])",r"\1 \2",reftext)
        # make sure there's a space after a date (and after its closing parenthesis or dot, if any)
        reftext=re.sub(r"((?:19|20)[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)(\)?\.?)([\w\&;\"\'])",r"\1\2 \3",reftext)
        # Normalize line breaks
        reftext=re.sub(r"\n\r?\s?\n\r?\s?\n\r?\s?\n\r?\s?", r"\n\n",reftext)
        # Break apart several references on the same line
        # using apa_author and apa_year_num
        reftext=re.sub(r"(\w{2,150}\.) ((?:(?:(?:de |von |van )?[A-Z][A-Za-z'`-]+, [A-Z]\.) (?:and )?)+\((?:(?:19|20)[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)\))",r"\1 \n\n \2",reftext)
        # similar to above, different format.
        reftext=re.sub(r"(\w{2,150}\.)  ?((?:(?:(?:de |von |van )?[A-Z][A-Za-z'`-]+[ \.])+(?:and )?)+\s?\(?(?:(?:19|20)?[0-9][0-9][a-g]?|in\spress|to\sappear|forthcoming|submitted)\)?)",r"\1 \n\n \2",reftext)
##        ref_lines=re.split(r"\n\r?\n\r?",reftext)
##        print(reftext)
        return reftext
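
    # Illustrative behavior of cleanUpReferencesText, hand-traced against the
    # substitution rules above (an assumption, not from project documentation):
    #   cleanUpReferencesText(u"A title.In Proceedings of the ACL,pages 1-10.")
    #   returns u"A title. In Proceedings of the ACL, pages 1-10."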


    def loadPaperMainAuthorXML(self, author_node):
        """
            Returns BibJSON-compatible author info

            Args:
                author_node: XML node
            Returns:
                dict with author data
        """
        res={"family":author_node.get("surname",""),"given":author_node.get("givenname","")}
        orgs=author_node.findAll("org")
        if orgs:
            res["affiliation"]=[]
            for org in orgs:
                location=",".join([org.get("city",""),org.get("country","")]).strip(",")
                res["affiliation"].append({"name":org.get("name",""),"location":location})
        res=normalizeAuthor(res)
        return res
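
    # Illustrative (assumed) return value, before normalizeAuthor() is applied,
    # for <author surname="Doe" givenname="Jane"> containing one
    # <org name="ACME" city="Berlin" country="Germany"> child:
    #   {"family": "Doe", "given": "Jane",
    #    "affiliation": [{"name": "ACME", "location": "Berlin,Germany"}]}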

    def loadPaperMetadata(self, newDocument, soup, filename):
        """
            Tries to recover metadata from Paper file
        """
        header=soup.find("firstpageheader")
        if header:
            title=header.find("title")
            if title:
                newDocument.metadata["title"]=title.text

        path,fname=os.path.split(filename)
        metafilename=re.sub(r"(.*)-paper\.xml",r"\1.xml",fname,flags=re.IGNORECASE)
        metafilename=os.path.join(path, metafilename)

        self.bibtex_parser = BibTeXMLParser()
##        print("trying to load BibTeXML from ", metafilename)
        bib_data=None
        try:
            bib_data = self.bibtex_parser.parse_file(metafilename)
        except BibliographyDataError as e:
            print(e)
        except Exception:
            print("COULDN'T LOAD BIBTEXML FOR ",metafilename)

        if bib_data:
            entry=bib_data.entries[list(bib_data.entries.keys())[0]]
            for field in entry.fields:
                newDocument.metadata[field]=entry.fields[field].replace(u"\u2013",u"-")

        authors=[]
        if header:
            for a in header.findChildren("author"):
                authors.append(self.loadPaperMainAuthorXML(a))
        newDocument["metadata"]["authors"]=authors
        newDocument["metadata"]["surnames"]=[a["family"] for a in authors]
        newDocument["metadata"]["norm_title"]=normalizeTitle(newDocument["metadata"].get("title",""))
##        print (json.dumps(newDocument.metadata),"\n\n")

    def loadPaperAbstract(self, soup, newDocument):
        """
            Loads the abstract, including sections
        """
        abstract_node=soup.find("abstract")
        if not abstract_node:
            debugAddMessage(newDocument,"error","CANNOT LOAD ABSTRACT! file: %s\n" % newDocument.metadata.get("filename","None"))
            # !TODO: LOAD first paragraph as abstract if no abstract available?
        else:
            abstract=newDocument.addSection("root","Abstract")

            paras=abstract_node.findAll("p")
            if len(paras) == 0:
                # no <p> children: treat the whole abstract node as a single paragraph
                paras.append(abstract_node)
            for p in paras:
                self.loadPaperParagraph(p,newDocument,abstract["id"])

            newDocument.abstract=abstract

    def loadPaperSection(self, sec, newDocument, parent):
        """
            Gets called for each section.

            Args:
                sec: XML node
                newDocument: SciDoc
                parent: id of this section's parent in newDocument
        """
        header_id=0 # CHANGE
        header_text=sec.get("title","")
        # make sure first letter is capitalized
        if len(header_text) > 0:
            header_text=header_text[0].upper()+header_text[1:]

        newSection=newDocument.addSection(parent, header_text, header_id)

        contents=sec.findChildren(["subsection", "p", "figure"], recursive=False)
        if contents:
            for element in contents:
                if element.name=="subsection":
                    self.loadPaperSection(element,newDocument,newSection["id"])
                elif element.name=="p":
                    newPar=self.loadPaperParagraph(element, newDocument, newSection["id"])
                elif element.name=="figure":
                    newPar=newDocument.addParagraph(newSection["id"])
                    newSent=newDocument.addSentence(newPar["id"],"")
                    newSent["text"]=element.get("caption","")
                    newSent["type"]="fig-caption"
                    # TODO improve figure loading

    def loadPaperSentence(self, s, newDocument, parent):
        """
            Given a string, adds the sentence to the SciDoc, parses the citations,
            matches them with the references

            Args:
                s: string
                newDocument: SciDoc
                parent: id of element this sentence will hang from (p)
        """

        def replaceTempCitToken(s, temp, final):
            """
                replace temporary citation placeholder with the permanent one
            """
            return re.sub(CITATION_FORM % temp, CITATION_FORM % final, s, flags=re.IGNORECASE)

        newSent=newDocument.addSentence(parent,"")

        annotated_s,citations_found=annotateCitationsInSentence(s, newDocument.metadata["original_citation_style"])
        annotated_citations=[]

        if newDocument.metadata["original_citation_style"]=="APA":
            for index,citation in enumerate(citations_found):
                newCit=newDocument.addCitation(sent_id=newSent["id"])
                reference=matchCitationWithReference(citation, newDocument["references"])
##                print (citation["text"]," -> ", formatReference(reference))
                if reference:
                    newCit["ref_id"]=reference["id"]
                else:
                    # do something else?
                    newCit["ref_id"]=None
                annotated_citations.append(newCit)
                annotated_s=replaceTempCitToken(annotated_s, index+1, newCit["id"])

        elif newDocument.metadata["original_citation_style"]=="AFI":
            for index,citation in enumerate(citations_found):
                newCit=newDocument.addCitation(sent_id=newSent["id"])
                # TODO check this: maybe not this simple. May need matching function.
                newCit["ref_id"]="ref"+str(int(citation["num"])-1)

                annotated_citations.append(newCit)
                annotated_s=replaceTempCitToken(annotated_s, index+1, newCit["id"])

        newSent["citations"]=[acit["id"] for acit in annotated_citations]
        newSent["text"]=annotated_s

        # mark citations that sit within a few characters of each other as a cluster
        # TODO cluster citations? Store them in some other way?
        newDocument.countMultiCitations(newSent)


    def loadPaperParagraph(self, p, newDocument, parent_id):
        """
            Creates a paragraph in newDocument, splits the text into sentences,
            creates a sentence object for each
        """
        if p.parent.name == "td":
            # This is not a content paragraph, but the content of a table cell
            return None

        par_text=p.renderContents(encoding=None)
        if re.match(r"(<i>)?proceedings\s+of\s+the\s+.*",par_text,flags=re.IGNORECASE):
            # This is not a content paragraph, we throw it away
            return None

        newPar=newDocument.addParagraph(parent_id)

        try:
            sentences=sentenceSplit(par_text)
        except UnicodeError:
            print("UNICODE ERROR!",par_text)
            sentences=[par_text]

        for s in sentences:
            self.loadPaperSentence(s,newDocument,newPar["id"])
        return newPar

    def loadPaperReferences(self, ref_section, doc):
        """
            Load the reference section

            Args:
                ref_section: XML node for the references element
                doc: SciDoc instance we're loading this for

            Adds each parsed reference to doc.
        """

        all_elements=ref_section.findAll(["p","doubt"])
        for index,element in enumerate(all_elements):
            if element.name=="doubt":
                # merge <doubt> fragments into the next element, or into the
                # previous one if this is the last element
                if len(all_elements) > index+1:
                    all_elements[index+1].setString(element.text+" "+all_elements[index+1].text)
                elif index > 0:
                    all_elements[index-1].setString(all_elements[index-1].text+" "+element.text)

        plain_text=[]
        for element in ref_section.findAll(["p"]):
            text=element.text
            plain_text.append(re.sub(r"</?i>"," ",text))


        reftext="\n\n".join(plain_text)
        # clean up the terrible references text, normalize spaces, commas, etc.
        reftext=self.cleanUpReferencesText(reftext)

        if not plain_text:
            print("WARNING: NO REFERENCES! in ", doc.metadata.get("filename",""))
        else:
            parsed_refs=self.parscit_client.extractReferenceList(reftext)
            if parsed_refs:
                for ref in parsed_refs:
                    doc.addExistingReference(ref)
            else:
                # TODO integrate FreeCite/other fallback parsers before raising
                raise ValueError("Couldn't parse references! in %s" % doc.metadata.get("filename",""))

##        print (json.dumps(doc.references))

    def read(self, xml, identifier):
        """
            Load a PaperXML into a SciDoc.

            Args:
                xml: full xml string
                identifier: an identifier for this document, e.g. the file name.
                        Important: supply an actual path so that we can check
                        for the metadata in BibTeXML.
                        If it is a full path, the directory part is removed
                        before it is stored.
            Returns:
                SciDoc instance
        """
##        # this solves a "bug" in BeautifulStoneSoup with "sec" tags
##        BeautifulStoneSoup.NESTABLE_TAGS["sec"]=[]

        xml=self.cleanUpPaperXML(xml)
        soup=BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

        # Create a new SciDoc to store the paper
        newDocument=SciDoc()
        metadata=newDocument["metadata"]
        metadata["filename"]=os.path.basename(identifier)
##        if not citation_style:
##            raise ValueError("Cannot determine citation style")
            # default citation style if not otherwise detected
##            citation_style="APA"
        body=soup.find("body")
        if not body:
            # TODO: Make the error handling less terrible
            debugAddMessage(newDocument,"error","NO <BODY> IN THIS PAPER! file: "+identifier)
##            newDocument["metadata"]["guid"]=cp.Corpus.getFileUID(metadata["filename"])
            return newDocument


        # Load metadata, either from corpus or from file
        self.loadPaperMetadata(newDocument, soup, identifier)
        if metadata["surnames"] == []:
            debugAddMessage(newDocument,"error","NO SURNAMES OF AUTHORS file: "+identifier)
            return newDocument

        if metadata["title"] == []:
            debugAddMessage(newDocument,"error","NO TITLE file: "+identifier)
            return newDocument

        metadata["guid"]=cp.Corpus.generateGUID(metadata)

        # Load all references from the XML
        references=body.find("references")
        if references:
            self.loadPaperReferences(references, newDocument)

        newDocument.updateReferences()
##        print (newDocument.references)
##        print("\n\n")
        sections=body.findChildren("section", recursive=False)

        detect_style_text="".join([sec.renderContents() for sec in sections[:3]])
##        citation_style=detectCitationStyle(detect_style_text, default="APA")
        # turns out I don't have a good detection algorithm for AFI
        citation_style="APA"
        metadata["original_citation_style"]=citation_style

        # Load Abstract
        self.loadPaperAbstract(soup,newDocument)

        for sec in sections:
            self.loadPaperSection(sec, newDocument, "root")

        newDocument.updateReferences()
        newDocument.updateAuthorsAffiliations()
        return newDocument
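
A minimal end-to-end usage sketch, assuming the minerva imports above, a running ParsCit service, and a hypothetical input path (the *-paper.xml naming matters because loadPaperMetadata looks for a matching BibTeXML file next to it):

import codecs

reader = PaperXMLReader()
filename = "/path/to/W99-0001-paper.xml"  # hypothetical path
with codecs.open(filename, "r", "utf-8") as f:
    doc = reader.read(f.read(), filename)
print(doc.metadata.get("title"))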