def read(self, xml, identifier):
    """Parse a JATS/NLM (PubMed) XML string into a new SciDoc.

    :param xml: full XML string of the paper
    :type xml: basestring
    :param identifier: identifier for this document, e.g. a file name.
        If it is a full path, the directory part is stripped before storing.
    :type identifier: basestring
    :returns: :class:`SciDoc <SciDoc>` object
    :rtype: SciDoc
    """
    # Work around a BeautifulStoneSoup "bug" with nested <sec> tags
    BeautifulStoneSoup.NESTABLE_TAGS["sec"] = []
    soup = BeautifulStoneSoup(xml)

    # Fresh SciDoc that will hold the parsed paper
    doc = SciDoc()
    meta = doc["metadata"]
    meta["filename"] = os.path.basename(identifier)
    meta["original_citation_style"] = detectCitationStyle(xml)

    body = soup.find("body")
    if not body:
        # TODO: Make the error handling less terrible
        debugAddMessage(doc, "error",
                        "NO <BODY> IN THIS PAPER! file: " + identifier)
        doc["metadata"]["guid"] = cp.Corpus.generateGUID()
        return doc

    # Load metadata, either from corpus or from file
    self.loadJATSMetadataFromPaper(doc, soup)
    meta["guid"] = cp.Corpus.generateGUID(meta)

    # References live under <back>/<ref-list>; other <back> content
    # (e.g. appendices) is ignored for now
    back = soup.find("back")
    if back:
        ref_list = back.find("ref-list")
        if ref_list:
            for ref in ref_list.findAll("ref"):
                self.loadJATSReference(ref, doc)
    doc.updateReferences()

    # Abstract first, then each top-level <sec> of the body
    self.loadJATSAbstract(soup, doc)
    for section in body.findChildren("sec", recursive=False):
        self.loadJATSSection(section, doc, "root")

    doc.updateAuthorsAffiliations()
    return doc
def read(self, xml, identifier):
    """Load a PaperXML document into a SciDoc.

    Args:
        xml: full XML string.
        identifier: an identifier for this document, e.g. the file name.
            Important: supply an actual path so that the metadata can be
            checked for in bibtexml. If it is a full path, the directory
            part is removed before storing.

    Returns:
        SciDoc instance. If the XML has no <body>, no author surnames or
        no title, an "error" message is recorded on the document and the
        (incomplete) document is returned as-is.
    """
    xml = self.cleanUpPaperXML(xml)
    soup = BeautifulStoneSoup(
        xml, convertEntities=BeautifulStoneSoup.HTML_ENTITIES)

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    metadata = newDocument["metadata"]
    metadata["filename"] = os.path.basename(identifier)

    body = soup.find("body")
    if not body:
        # TODO: Make the error handling less terrible
        debugAddMessage(newDocument, "error",
                        "NO <BODY> IN THIS PAPER! file: " + identifier)
        return newDocument

    # Load metadata, either from corpus or from file
    self.loadPaperMetadata(newDocument, soup, identifier)
    # Truthiness checks (instead of "== []") also catch empty strings
    # and None, not just an empty list
    if not metadata["surnames"]:
        debugAddMessage(newDocument, "error",
                        "NO SURNAMES OF AUTHORS file: " + identifier)
        return newDocument
    if not metadata["title"]:
        debugAddMessage(newDocument, "error",
                        "NO TITLE file: " + identifier)
        return newDocument

    metadata["guid"] = cp.Corpus.generateGUID(metadata)

    # Load all references from the XML
    references = body.find("references")
    if references:
        self.loadPaperReferences(references, newDocument)
    newDocument.updateReferences()

    sections = body.findChildren("section", recursive=False)

    # No reliable detection algorithm for the citation style yet,
    # so default to APA
    citation_style = "APA"
    metadata["original_citation_style"] = citation_style

    # Load Abstract
    self.loadPaperAbstract(soup, newDocument)

    for sec in sections:
        self.loadPaperSection(sec, newDocument, "root")

    # NOTE(review): updateReferences is called a second time after the
    # sections are loaded — presumably to link in-text citations parsed
    # from the sections; confirm before removing either call
    newDocument.updateReferences()
    newDocument.updateAuthorsAffiliations()
    return newDocument