コード例 #1
0
ファイル: read_paperxml.py プロジェクト: danieldmm/minerva
    def loadPaperSentence(self, s, newDocument, parent):
        """
            Given a string, adds the sentence to the SciDoc, parses the citations,
            matches them with the references

            A new, empty sentence is first added under `parent`. The citations
            found in `s` by annotateCitationsInSentence() are then added to the
            document one by one, and the temporary placeholder of each one is
            replaced in the annotated text by a placeholder carrying the id of
            the new citation. Finally the sentence gets its "citations" and
            "text" values and is passed to newDocument.countMultiCitations().

            Only the "APA" and "AFI" citation styles are processed; with any
            other style no citation is added and the annotated text is stored
            as returned by annotateCitationsInSentence().

            Args:
                s: string
                newDocument: SciDoc
                parent: id of element this sentence will hang from (p)
        """

        def replaceTempCitToken(text, temp, final):
            """
                Return `text` with the temporary citation placeholder `temp`
                replaced by the permanent one, `final`
            """
            # Works on the text it is given rather than on a variable captured
            # from the enclosing scope, so the result depends only on the arguments
            return re.sub(CITATION_FORM % temp, CITATION_FORM % final, text, flags=re.IGNORECASE)

        newSent=newDocument.addSentence(parent,"")

        citation_style=newDocument.metadata["original_citation_style"]
        annotated_s,citations_found=annotateCitationsInSentence(s, citation_style)
        annotated_citations=[]

        # Temporary placeholders are addressed by the 1-based position of the
        # citation in citations_found
        if citation_style=="APA":
            for temp_id,citation in enumerate(citations_found, start=1):
                newCit=newDocument.addCitation(sent_id=newSent["id"])
                reference=matchCitationWithReference(citation, newDocument["references"])
                # A citation with no matching reference is kept, with ref_id None
                newCit["ref_id"]=reference["id"] if reference else None
                annotated_citations.append(newCit)
                annotated_s=replaceTempCitToken(annotated_s, temp_id, newCit["id"])

        elif citation_style=="AFI":
            for temp_id,citation in enumerate(citations_found, start=1):
                newCit=newDocument.addCitation(sent_id=newSent["id"])
                # TODO check this: maybe not this simple. May need matching function.
                # NOTE(review): a "num" below 1 yields a negative reference
                # index (e.g. "ref-1") - confirm whether that can happen here
                newCit["ref_id"]="ref"+str(int(citation["num"])-1)

                annotated_citations.append(newCit)
                annotated_s=replaceTempCitToken(annotated_s, temp_id, newCit["id"])

        newSent["citations"]=[acit["id"] for acit in annotated_citations]
        newSent["text"]=annotated_s

        # deal with many citations within characters of each other: make them know they are a cluster
        # TODO cluster citations? Store them in some other way?
        newDocument.countMultiCitations(newSent)
コード例 #2
0
    def annotatePlainTextCitations(self, s, newDocument, newSent):
        """
            If the citations aren't tagged with <xref> because Sapienta stripped
            them away, try to extract the citations from plain text. *sigh*

            When the document has no "original_citation_style" metadata (or an
            empty one), it is set to "AFI" before the sentence is processed.
            The citations found by annotateCitationsInSentence() are added to
            the document and their temporary placeholders in the annotated
            text are replaced by placeholders carrying the new citation ids.

            :param s: sentence to annotate, passed unchanged to
                annotateCitationsInSentence() (described by the original
                docstring as a BeautifulSoup tag of the sentence)
            :param newDocument: SciDoc instance we are populating
            :param newSent: new sentence in this document that we are adding;
                its "text" is always set, its "citations" only when at least
                one citation was added
        """

        def replaceTempCitToken(text, temp, final):
            """
                Return `text` with the temporary citation placeholder `temp`
                replaced by the permanent one, `final`
            """
            # Works on the text it is given rather than on a variable captured
            # from the enclosing scope, so the result depends only on the arguments
            return re.sub(CITATION_FORM % temp, CITATION_FORM % final, text, flags=re.IGNORECASE)

        def replaceTempCitTokenMulti(text, temp, final_list):
            """
                Return `text` with the temporary citation placeholder `temp`
                replaced by a sequence of permanent ones, one per id in
                `final_list`, to deal with multi citations, e.g. [1,2,3]
            """
            assert isinstance(final_list, list)
            rep_string="".join([CITATION_FORM % final for final in final_list])
            return re.sub(CITATION_FORM % temp, rep_string, text, flags=re.IGNORECASE)

        if not newDocument.metadata.get("original_citation_style", None):
            newDocument.metadata["original_citation_style"]="AFI"
        citation_style=newDocument.metadata["original_citation_style"]
        annotated_s,citations_found=annotateCitationsInSentence(s, citation_style)
        annotated_citations=[]

        # Temporary placeholders are addressed by the 1-based position of the
        # citation in citations_found
        if citation_style=="APA":
            for temp_id,citation in enumerate(citations_found, start=1):
                newCit=newDocument.addCitation(sent_id=newSent["id"])
                reference=matchCitationWithReference(citation, newDocument["references"])
                # A citation with no matching reference is kept, with ref_id None
                newCit["ref_id"]=reference["id"] if reference else None
                annotated_citations.append(newCit)
                annotated_s=replaceTempCitToken(annotated_s, temp_id, newCit["id"])

        elif citation_style=="AFI":
            for temp_id,citation in enumerate(citations_found, start=1):
                valid_citation=True
                nums=[]
                for num in citation["nums"]:
                    cit_num=int(num)-1
                    if cit_num < 0:
                        # this is not a citation! Probably something like "then sampled a random number from uniform distribution, u ~ U[0,1]"
                        valid_citation=False
                        break
                    nums.append(cit_num)

                # A rejected candidate adds nothing to the document and its
                # temporary placeholder is left untouched in the text
                if not valid_citation:
                    continue

                cit_ids=[]
                for num in nums:
                    newCit=newDocument.addCitation(sent_id=newSent["id"])
                    # TODO check this: maybe not this simple? May need matching function.
                    newCit["ref_id"]="ref"+str(num)
                    cit_ids.append(newCit["id"])
                    annotated_citations.append(newCit)

                # One temporary placeholder can stand for several numbers, so
                # it is expanded into one permanent placeholder per citation
                annotated_s=replaceTempCitTokenMulti(annotated_s, temp_id, cit_ids)

        # "citations" is only written when something was found, so a value
        # already present on newSent is not overwritten with an empty list
        if annotated_citations:
            newSent["citations"]=[acit["id"] for acit in annotated_citations]
        newSent["text"]=annotated_s