def wrapInSciDoc(self, contexts, doc_from_id, doc_to_id):
    """
    Build a SciDoc from extracted citation contexts, ready to be passed to
    the standard context_extract functions; each context becomes a paragraph.

    Args:
        contexts: list of context dicts, each with a "lines" list of
            {"text": ..., "sentiment": ...} dicts
        doc_from_id: corpus_id of this SciDoc (the citing document)
        doc_to_id: corpus_id of the target (cited) document

    Returns:
        SciDoc

    Raises:
        ValueError: if the target document is not in the corpus
    """
    newDocument = SciDoc()

    # Reuse existing corpus metadata for the source document when available;
    # otherwise fall back to using the corpus_id itself as the guid.
    metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_from_id)
    if metadata:
        newDocument.loadExistingMetadata(metadata)
    else:
        newDocument.metadata["guid"] = doc_from_id
    # Both branches must leave a non-empty guid (hoisted from the duplicated
    # assert that previously appeared in each branch).
    assert newDocument.metadata["guid"] != ""
    newDocument.metadata["corpus_id"] = doc_from_id

    newSection_id = newDocument.addSection("root", "", 0)

    # The target document must exist in the corpus so we can reference it.
    metadata = cp.Corpus.getMetadataByField("metadata.corpus_id", doc_to_id)
    if not metadata:
        # Fixed: an unreachable `return` used to follow this raise.
        raise ValueError("Target document %s is not in corpus!" % doc_to_id)

    ref = newDocument.addExistingReference(metadata)
    ref["corpus_id"] = doc_to_id

    for context in contexts:
        newPar_id = newDocument.addParagraph(newSection_id)
        for line in context["lines"]:
            newSent_id = newDocument.addSentence(newPar_id)
            text = line["text"]
            citations = []
            if re.search(CIT_MARKER, text):
                # Replace the citation marker in the text with a rendered
                # citation element and record its id on the sentence.
                newCit = newDocument.addCitation(newSent_id, ref["id"])
                text = re.sub(CIT_MARKER, CITATION_FORM % newCit["id"], text)
                citations.append(newCit["id"])
            sent = newDocument.element_by_id[newSent_id]
            sent["sentiment"] = line["sentiment"]
            sent["text"] = text
            if citations:
                sent["citations"] = citations

    return newDocument
def loadAZSciXML(filename):
    """
    Load a Cambridge-style SciXML file into a SciDoc.

    Args:
        filename: path to the SciXML file

    Returns:
        SciDoc with metadata, references, abstract and body structure
        loaded. If the file has no <paper> element, a SciDoc containing
        only an error debug message is returned.
    """
    text = loadFileText(filename)
    soup = BeautifulStoneSoup(text)

    fileno = soup.find("docno")
    fileno = fileno.text if fileno else ""

    # Create a new SciDoc to store the paper
    newDocument = SciDoc()
    newDocument["metadata"]["filename"] = os.path.basename(filename)
    newDocument["metadata"]["filepath"] = filename

    paper = soup.find("paper")
    if not paper:
        debugAddMessage(newDocument, "error",
                        "NO <PAPER> IN THIS PAPER! file: " + filename)
        return newDocument

    # Load metadata from the file itself. (Fixed: a dead branch that tried
    # to read metadata from a corpus index was removed — `metadata` was
    # hard-set to None immediately before it, so it could never run.)
    loadMetadata(newDocument, paper, fileno, soup)
    newDocument["metadata"]["guid"] = cp.Corpus.generateGUID(newDocument["metadata"])

    # Load all references (at the end of the document) from the XML
    for ref in soup.findAll("reference"):
        processReferenceXML(ref, newDocument)

    # Load the abstract, one sentence per <a-s> element
    abstract = soup.find("abstract")
    if not abstract:
        # Fixed: this message previously contained a raw line break inside
        # the string literal; it is now a proper \n escape.
        debugAddMessage(newDocument, "error",
                        "CANNOT LOAD ABSTRACT! \nfile: " + filename + "\n")
        # TODO: load first paragraph as abstract
    else:
        newSection_id = newDocument.addSection("root", "Abstract")
        newPar_id = newDocument.addParagraph(newSection_id)
        for s in abstract.findChildren("a-s"):
            # deals with all of the adding of a sentence
            addNewSentenceAndProcessRefs(s, newDocument, newPar_id, newSection_id)
        newDocument.abstract = newDocument.element_by_id[newSection_id]

    # Load the document body structure
    for div in soup.findAll("div"):
        loadStructureProcessDiv(div, newDocument)

    # Try to match each citation with its reference.
    # TODO: "in press", "forthcoming", "submitted", "to appear" are dates
    # that still need fixing and matching.
    matchCitationsWithReferences(newDocument)

    return newDocument