def read(self, xml, identifier):
    """
    Load a JATS/NLM (PubMed) XML into a SciDoc.

    :param xml: full xml string
    :type xml: basestring
    :param identifier: an identifier for this document, e.g. file name
        If an actual full path, the path will be removed from it when stored
    :type identifier: basestring
    :returns: :class:`SciDoc <SciDoc>` object
    :rtype: SciDoc
    """
    # Work around a BeautifulStoneSoup quirk: declaring <sec> as
    # non-nestable stops the parser from mangling the section tree.
    BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
    parsed = BeautifulStoneSoup(xml)

    # Fresh SciDoc that will hold the converted paper
    doc = SciDoc()
    meta = doc["metadata"]
    meta["filename"] = os.path.basename(identifier)
    meta["original_citation_style"] = detectCitationStyle(xml)

    body = parsed.find("body")
    if not body:
        # TODO: Make the error handling less terrible
        debugAddMessage(doc, "error", "NO <BODY> IN THIS PAPER! file: " + identifier)
        doc["metadata"]["guid"] = cp.Corpus.generateGUID()
        return doc

    # Metadata first: the GUID is derived from it
    self.loadJATSMetadataFromPaper(doc, parsed)
    meta["guid"] = cp.Corpus.generateGUID(meta)

    # References live under <back>/<ref-list>; other children of <back>
    # (e.g. appendices) are ignored for now.
    back = parsed.find("back")
    ref_list = back.find("ref-list") if back else None
    if ref_list:
        for ref in ref_list.findAll("ref"):
            self.loadJATSReference(ref, doc)
    doc.updateReferences()

    # Abstract, then the top-level body sections under a "root" parent
    self.loadJATSAbstract(parsed, doc)
    for sec in body.findChildren("sec", recursive=False):
        self.loadJATSSection(sec, doc, "root")

    doc.updateAuthorsAffiliations()
    return doc
def wrapInSciDoc(self, contexts, doc_from_id, doc_to_id):
    """
    Returns a SciDoc ready to be passed to the standard context_extract
    functions, where each context is a paragraph.

    Args:
        contexts: list of context dicts; each has a "lines" list of
            dicts with "text" and "sentiment" keys
        doc_from_id: corpus_id of this SciDoc
        doc_to_id: corpus_id of target document (citation)
    Returns:
        SciDoc
    Raises:
        ValueError: if the target document is not in the corpus
    """
    newDocument = SciDoc()
    metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_from_id)
    if metadata:
        newDocument.loadExistingMetadata(metadata)
    else:
        # No stored metadata: fall back to using the corpus id as the guid
        newDocument.metadata["guid"] = doc_from_id
    # Hoisted out of both branches: either path must yield a usable guid
    assert newDocument.metadata["guid"] != ""
    newDocument.metadata["corpus_id"] = doc_from_id

    # Single root section holds one paragraph per context
    newSection_id = newDocument.addSection("root", "", 0)

    metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_to_id)
    if not metadata:
        # FIX: the original had an unreachable `return` after this raise
        raise ValueError("Target document %s is not in corpus!" % doc_to_id)

    ref = newDocument.addExistingReference(metadata)
    ref["corpus_id"] = doc_to_id

    for context in contexts:
        newPar_id = newDocument.addParagraph(newSection_id)
        for line in context["lines"]:
            newSent_id = newDocument.addSentence(newPar_id)
            text = line["text"]
            citations = []
            if re.search(CIT_MARKER, text):
                # Replace every citation marker with an inline citation
                # element pointing at the single target reference
                newCit = newDocument.addCitation(newSent_id, ref["id"])
                text = re.sub(CIT_MARKER, CITATION_FORM % newCit["id"], text)
                citations.append(newCit["id"])
            sent = newDocument.element_by_id[newSent_id]
            sent["sentiment"] = line["sentiment"]
            sent["text"] = text
            if citations:
                sent["citations"] = citations
    return newDocument
def read(self, xml, identifier):
    """
    Load a PaperXML into a SciDoc.

    Args:
        xml: full xml string
        identifier: an identifier for this document, e.g. file name.
            Important: supply an actual path so that we can check for the
            metadata in bibtexml. If an actual full path, the path will be
            removed from it when stored.
    Returns:
        SciDoc instance
    """
    xml = self.cleanUpPaperXML(xml)
    soup = BeautifulStoneSoup(xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    metadata = newDocument["metadata"]
    metadata["filename"] = os.path.basename(identifier)

    body = soup.find("body")
    if not body:
        # TODO: Make the error handling less terrible
        debugAddMessage(newDocument, "error", "NO <BODY> IN THIS PAPER! file: " + identifier)
        return newDocument

    # Load metadata, either from corpus or from file
    self.loadPaperMetadata(newDocument, soup, identifier)
    # FIX: the original guards compared against [] which never matches an
    # empty string, so a missing string-valued field slipped through;
    # truthiness covers [], "" and None alike.
    if not metadata["surnames"]:
        debugAddMessage(newDocument, "error", "NO SURNAMES OF AUTHORS file: " + identifier)
        return newDocument
    if not metadata["title"]:
        debugAddMessage(newDocument, "error", "NO TITLE file: " + identifier)
        return newDocument

    metadata["guid"] = cp.Corpus.generateGUID(metadata)

    # Load all references from the XML
    references = body.find("references")
    if references:
        self.loadPaperReferences(references, newDocument)
    newDocument.updateReferences()

    sections = body.findChildren("section", recursive=False)
    # Citation-style detection proved unreliable for this format, so the
    # style is hard-coded to APA; the dead detection pass that rendered
    # the first three sections into an unused string has been removed.
    citation_style = "APA"
    metadata["original_citation_style"] = citation_style

    # Load Abstract
    self.loadPaperAbstract(soup, newDocument)

    for sec in sections:
        self.loadPaperSection(sec, newDocument, "root")

    # Second pass picks up the in-text citations added while loading sections
    newDocument.updateReferences()
    newDocument.updateAuthorsAffiliations()
    return newDocument
def loadAZSciXML(filename):
    """
    Load a Cambridge-style SciXML file into a new SciDoc.

    Args:
        filename: path to the SciXML file to load
    Returns:
        SciDoc instance. If the file has no <paper> element the document
        is returned early with an error message attached.
    """
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error", "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # FIX: removed an unreachable branch — the original set metadata=None
    # and then tested `if metadata:` (a leftover of a disabled corpus
    # metadata-index lookup), so metadata is always loaded from the file.
    loadMetadata(newDocument, paper, fileno, soup)

    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load Abstract
    abstract = soup.find("abstract")
    if not abstract:
        debugAddMessage(newDocument, "error", "CANNOT LOAD ABSTRACT! \nfile: " + filename + "\n")
        # TODO: LOAD first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)
        for s in abstract.findChildren("a-s"):
            # deals with all of the adding of a sentence
            addNewSentenceAndProcessRefs(s, newDocument, newPar_id, newSection_id)
        newDocument.abstract = newDocument.element_by_id[newSection_id]

    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # try to match each citation with its reference
    # NOTE(review): "in press", "forthcoming", "submitted", "to appear"
    # dates still need fixing & matching here.
    matchCitationsWithReferences(newDocument)
    return newDocument