def loadPaperSentence(self, s, newDocument, parent):
    """
    Given a string, adds the sentence to the SciDoc, annotates the citations
    found in it and links each one to a reference.

    A new empty sentence is created under `parent`. The text is passed to
    annotateCitationsInSentence(), which returns the text with temporary
    citation placeholders plus the list of citations found. For every
    citation found a citation is added to the document, and the placeholder
    addressed as CITATION_FORM % (index + 1) is swapped for
    CITATION_FORM % <new citation id>.

    How a citation gets its "ref_id" depends on
    newDocument.metadata["original_citation_style"]:
        "APA": result of matchCitationWithReference(), or None if no match
        "AFI": "ref" + (citation number - 1)
    For any other style no citations are created.

    Finally the sentence's "citations" and "text" are set and
    newDocument.countMultiCitations() is called on it.

    Args:
        s: string, text of the sentence
        newDocument: SciDoc being populated
        parent: id of element this sentence will hang from (p)

    Raises:
        KeyError: if the "original_citation_style" metadata entry is missing
    """
    def replaceTempCitToken(text, temp, final):
        """
        Replace temporary citation placeholder with permanent one.

        Operates on the `text` argument (the previous version ignored its
        argument and read the enclosing variable instead). The placeholder
        is escaped and the replacement is supplied through a callable so
        that both are treated as literal text by the regex engine.
        """
        pattern = re.escape(CITATION_FORM % temp)
        permanent = CITATION_FORM % final
        return re.sub(pattern, lambda _match: permanent, text, flags=re.IGNORECASE)

    # The sentence is created first so citations can point at its id.
    newSent = newDocument.addSentence(parent, "")
    citation_style = newDocument.metadata["original_citation_style"]
    annotated_s, citations_found = annotateCitationsInSentence(s, citation_style)
    annotated_citations = []

    if citation_style == "APA":
        for index, citation in enumerate(citations_found):
            newCit = newDocument.addCitation(sent_id=newSent["id"])
            reference = matchCitationWithReference(citation, newDocument["references"])
            # Unmatched citations are kept, just without a reference.
            # (do something else for them?)
            newCit["ref_id"] = reference["id"] if reference else None
            annotated_citations.append(newCit)
            annotated_s = replaceTempCitToken(annotated_s, index + 1, newCit["id"])
    elif citation_style == "AFI":
        for index, citation in enumerate(citations_found):
            newCit = newDocument.addCitation(sent_id=newSent["id"])
            # TODO check this: maybe not this simple. May need matching function.
            # NOTE(review): this reads citation["num"] while
            # annotatePlainTextCitations reads citation["nums"], and there is
            # no guard here against a number of 0 (which yields "ref-1").
            # Confirm against annotateCitationsInSentence before changing.
            newCit["ref_id"] = "ref" + str(int(citation["num"]) - 1)
            annotated_citations.append(newCit)
            annotated_s = replaceTempCitToken(annotated_s, index + 1, newCit["id"])

    newSent["citations"] = [acit["id"] for acit in annotated_citations]
    newSent["text"] = annotated_s

    # deal with many citations within characters of each other: make them know they are a cluster
    # TODO cluster citations? Store them in some other way?
    newDocument.countMultiCitations(newSent)
def annotatePlainTextCitations(self, s, newDocument, newSent):
    """
    If the citations aren't tagged with <xref> because Sapienta stripped
    them away, try to extract the citations from plain text. *sigh*

    The sentence content is passed to annotateCitationsInSentence(), which
    returns the text with temporary citation placeholders plus the list of
    citations found. Citations are added to the document and each
    placeholder, addressed as CITATION_FORM % (index + 1), is swapped for
    the permanent token(s).

    How citations are created depends on
    newDocument.metadata["original_citation_style"] (set to "AFI" here if
    missing or empty):
        "APA": one citation per match; "ref_id" comes from
               matchCitationWithReference(), or None if no match
        "AFI": one citation per number in citation["nums"], with "ref_id"
               "ref" + (number - 1); a match containing a number below 1
               is not treated as a citation and is skipped
    For any other style no citations are created.

    newSent is only modified when at least one citation was created.

    :param s: content of the sentence, handed to annotateCitationsInSentence
        (earlier documentation describes it as a BeautifulSoup tag -
        TODO confirm whether a tag or its text is expected)
    :param newDocument: SciDoc instance we are populating
    :param newSent: new sentence in this document that we are adding
    """
    def replaceTempCitToken(text, temp, final):
        """
        Replace temporary citation placeholder with permanent one.

        Operates on the `text` argument (the previous version ignored its
        argument and read the enclosing variable instead). The placeholder
        is escaped and the replacement is supplied through a callable so
        that both are treated as literal text by the regex engine.
        """
        pattern = re.escape(CITATION_FORM % temp)
        permanent = CITATION_FORM % final
        return re.sub(pattern, lambda _match: permanent, text, flags=re.IGNORECASE)

    def replaceTempCitTokenMulti(text, temp, final_list):
        """
        Replace temporary citation placeholder with a list of permanent
        ones to deal with multi citations, e.g. [1,2,3]

        An empty final_list removes the placeholder from the text.
        """
        assert isinstance(final_list, list)
        pattern = re.escape(CITATION_FORM % temp)
        permanent = "".join([CITATION_FORM % final for final in final_list])
        return re.sub(pattern, lambda _match: permanent, text, flags=re.IGNORECASE)

    # Documents with no recorded style are treated as numeric ("AFI"), and
    # that default is written back into the metadata.
    if not newDocument.metadata.get("original_citation_style", None):
        newDocument.metadata["original_citation_style"] = "AFI"
    citation_style = newDocument.metadata["original_citation_style"]

    annotated_s, citations_found = annotateCitationsInSentence(s, citation_style)
    annotated_citations = []

    if citation_style == "APA":
        for index, citation in enumerate(citations_found):
            newCit = newDocument.addCitation(sent_id=newSent["id"])
            reference = matchCitationWithReference(citation, newDocument["references"])
            # Unmatched citations are kept, just without a reference.
            # (do something else for them?)
            newCit["ref_id"] = reference["id"] if reference else None
            annotated_citations.append(newCit)
            annotated_s = replaceTempCitToken(annotated_s, index + 1, newCit["id"])
    elif citation_style == "AFI":
        for index, citation in enumerate(citations_found):
            # Turn the 1-based numbers in the text into 0-based reference
            # indices, giving up on the whole match at the first bad number.
            ref_indices = []
            for num in citation["nums"]:
                ref_index = int(num) - 1
                if ref_index < 0:
                    # this is not a citation! Probably something like
                    # "then sampled a random number from uniform
                    # distribution, u ~ U[0,1]"
                    ref_indices = None
                    break
                ref_indices.append(ref_index)

            if ref_indices is None:
                # NOTE(review): the temporary placeholder of a skipped match
                # is left in annotated_s; if another match in the same
                # sentence is accepted, it ends up in the stored text.
                continue

            cit_ids = []
            for ref_index in ref_indices:
                newCit = newDocument.addCitation(sent_id=newSent["id"])
                # TODO check this: maybe not this simple? May need matching function.
                newCit["ref_id"] = "ref" + str(ref_index)
                cit_ids.append(newCit["id"])
                annotated_citations.append(newCit)

            annotated_s = replaceTempCitTokenMulti(annotated_s, index + 1, cit_ids)

    # Leave the sentence untouched unless something was actually annotated.
    if annotated_citations:
        newSent["citations"] = [acit["id"] for acit in annotated_citations]
        newSent["text"] = annotated_s