def processSentenceText(self, s, doc):
    """
    POS-tag every token of a sentence and render it as word-level XML.

    This overridden function spoon-feeds the sentence to the AZPrime
    classifier: the result is a space-separated run of
    ``<W C='POS'>token</W>`` elements, with citation placeholders rendered
    as ``<REF>_CIT_id_</REF>``.

    :param s: sentence dict; ``s["text"]`` is read, and a non-empty
        ``s["pos_tagged"]`` is returned as-is without re-tagging
    :param doc: document the sentence belongs to (not used by this method)
    :returns: the POS-tagged sentence as an XML fragment
    :rtype: str
    """
    # Reuse a previously stored tagging if there is one.
    cached = s.get("pos_tagged", "")
    if cached != "":
        return cached

    # Replace every <cit id="..."/> element with a _CIT_<id>_ placeholder,
    # which is recognised again below once the text has been tagged.
    text = re.sub(
        r"<cit\sid=\"?(.+?)\"?\s*?/>",
        r"_CIT_\1_",
        s["text"],
        flags=re.DOTALL | re.IGNORECASE,
    )
    text = escapeText(cleanxml(text))
    tags = pos_tag(word_tokenize(text))

    # Tokens that are bare markup characters must be written as entity
    # references, otherwise they corrupt the XML built below. The previous
    # code assigned each of these characters to itself, which had no effect.
    # NOTE(review): assumes escapeText() does not already turn these
    # characters into entities that survive tokenization -- confirm.
    entities = {"<": "&lt;", ">": "&gt;", "&": "&amp;"}

    items = []
    for token, pos in tags:
        token = entities.get(token, token)
        if token.startswith("_CIT_"):
            items.append("<REF>%s</REF>" % token)
        elif pos in ("''", "'"):
            # The tag itself contains single quotes, so the attribute has to
            # be delimited with double quotes.
            items.append("<W C=\"%s\">%s</W>" % (pos, token))
        else:
            items.append("<W C='%s'>%s</W>" % (pos, token))

    xml_string = " ".join(items)
    if self.save_pos_tags:
        # Store the result on the sentence so later calls take the early return.
        s["pos_tagged"] = xml_string
    return xml_string
def loadJATSReference(self, ref, doc):
    """
    Load a reference from the bibliography section.

    A new reference is created through ``doc.addReference()`` and filled
    with ``authors``, ``surnames``, ``external_links`` and ``title``. When a
    citation element (``element-citation``, ``mixed-citation`` or
    ``citation``) is present, ``publication-name``, ``publication-type``,
    the ``doi``/``pmid``/``pmcid`` identifiers and any URI links are added
    as well; otherwise the raw XML is stored under ``xml`` and the
    reference is returned as it stands.

    :param ref: xml node for reference element (must offer ``find``,
        ``findAll`` and ``get`` in the style of a BeautifulSoup tag)
    :param doc: :class `SciDoc <SciDoc>` instance we're loading this for
    :returns: dict with the new loaded reference
    :rtype: dict
    """
    xmltext = ref.__repr__()
    authorlist = []
    surnames = []
    # .get() instead of ref["id"]: a reference without an id attribute must
    # not abort loading, and the guard at the end already copes with a
    # missing id.
    original_id = ref.get("id")

    # Locate the citation element. Only the plain <citation> variant keeps
    # its type in the "citation-type" attribute.
    citation_type_key = "publication-type"
    element = ref.find("element-citation")
    if not element:
        element = ref.find("mixed-citation")
    if not element:
        element = ref.find("citation")
        if element:
            citation_type_key = "citation-type"

    # Work out where the author names come from.
    authors = None
    author_group = ref.find("person-group", {"person-group-type": "author"})
    if author_group:
        authors = author_group.findAll("name")
    else:
        collab = ref.find("collab")
        if collab:
            # A collaboration is stored once, with its whole name as the
            # family name. It used to be appended a second time through
            # guessNamesOfPlainTextAuthor(), duplicating the author.
            authorlist.append({"family": collab.text, "given": ""})
            surnames.append(collab.text)
        else:
            # No author group and no collaboration: take every <name>
            # found anywhere in the reference.
            authors = ref.findAll("name")

    if authors:
        for author in authors:
            surname = author.find("surname")
            if surname:
                surnames.append(surname.text)
            given_names = author.find("given-names")
            if given_names and surname:
                authorlist.append({"given": given_names.text, "family": surname.text})
            else:
                # Incomplete markup: guess the name parts from the plain text.
                authorlist.append(guessNamesOfPlainTextAuthor(cleanxml(author.__repr__())))
    else:
        surnames.extend(node.text for node in ref.findAll("surname"))

    newref = doc.addReference()
    newref["authors"] = authorlist
    newref["surnames"] = surnames
    newref["external_links"] = []
    newref["title"] = "<NO TITLE>"

    if not element:
        # Nothing structured to read; keep the raw XML instead.
        newref["xml"] = xmltext
        return newref

    article_title = ref.find("article-title")
    source = element.find("source")
    newref["publication-name"] = source.text if source else ""

    try:
        newref["publication-type"] = element[citation_type_key]
    except KeyError:
        # The citation element has no type attribute.
        newref["publication-type"] = "unknown"

    # Books are titled by their source first, the other known types by the
    # article title first. Any other type keeps the placeholder title.
    publication_type = newref["publication-type"]
    if publication_type == "book":
        if source:
            newref["title"] = source.text
        elif article_title:
            newref["title"] = article_title.text
    elif publication_type == "journal":
        if article_title:
            newref["title"] = article_title.text
    elif publication_type == "other":
        if article_title:
            newref["title"] = article_title.text
        elif source:
            newref["title"] = source.text

    self.extractInfoFromPatentText(xmltext, newref)
    self.loadJATSmetadataIfExists(element, ["volume", "issue", "fpage", "lpage", "year"], newref)

    # Copy the identifiers that are present; note that "pmc" is stored
    # under the "pmcid" key.
    for pub_id_type, key in (("doi", "doi"), ("pmid", "pmid"), ("pmc", "pmcid")):
        pub_id = element.find("pub-id", {"pub-id-type": pub_id_type})
        if pub_id:
            newref[key] = pub_id.text

    if newref["title"] == "":
        newref["title"] = "<NO TITLE FOUND>"

    for extlink in element.findAll("ext-link", {"ext-link-type": "uri"}):
        # Skip links that carry no target instead of raising on them.
        href = extlink.get("xlink:href")
        if href is not None:
            newref["external_links"].append(href)

    if original_id and self.USE_ORIGINAL_REF_ID:
        newref["id"] = original_id

    return newref