def generateSideBySide(doc_list):
    """
    Generates side-by-side visualizations of a JATS XML: one using an XML to
    HTML converter, one loading the XML into SciDocJSON and rendering it back
    as HTML.

    Args:
        doc_list: list of XML filenames, relative to the corpus inputXML path

    Side effects:
        Writes one "<name>_1.html"/"<name>_2.html" pair per input file plus a
        "file_data.json" manifest into the conversion_visualization directory.
    """
    from read_auto import AutoXMLReader

    reader = AutoXMLReader()
    output_dir = os.path.join(cp.Corpus.ROOT_DIR, "conversion_visualization\\")
    file_list = []
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file = cp.Corpus.paths.inputXML + filename
        output_file = output_dir + "%s_1.html" % os.path.basename(filename)
        doc = reader.readFile(input_file)
        # Sanity check only: the SciDoc data must round-trip through JSON.
        # json.dumps raises TypeError (or ValueError for circular refs) on
        # non-serializable content; a bare except here would hide real bugs.
        try:
            json.dumps(doc.data)
        except (TypeError, ValueError):
            print("Not JSON Serializable!!!!")
        html = SciDocRenderer(doc).prettyPrintDocumentHTML(True, True, True, True)
        output_file2 = output_file.replace("_1.html", "_2.html")
        writeFileText(html, output_file2)
        file_list.append([os.path.basename(output_file), os.path.basename(output_file2)])
    file_list_json = "file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json, output_dir + "file_data.json")
def explainAnchorTextZoning(guid, max_inlinks=10, use_full_text=False):
    """
    Generates a clipping collection file for a document: collects the citation
    contexts that other files in the collection direct at it.

    Args:
        guid: identifier of the cited document
        max_inlinks: stop after this many inlinking documents
        use_full_text: if True, render inlinking documents in full (with
            bibliography) instead of trimming them to the relevant bits
    """
    global CURRENT_CITATION

    metadata = cp.Corpus.getMetadataByGUID(guid)
    header = """<h1 class="title">%s</h1><span>Inlink context summary for %s</span>""" % (
        metadata["title"], formatCitation(metadata))
    rendered_sections = [header]

    # The escaped citation string is consumed elsewhere via this global.
    CURRENT_CITATION = re.escape(formatCitation(metadata))

    for position, inlink_guid in enumerate(metadata["inlinks"]):
        if position == max_inlinks:
            break
        print("Processing anchor text from %s" % inlink_guid)
        inlinking_doc = cp.Corpus.loadSciDoc(inlink_guid)
        if not use_full_text:
            trimDocToRelevantBits(inlinking_doc, guid)
        section_html = SciDocRenderer(inlinking_doc).prettyPrintDocumentHTML(
            formatspans=True,
            include_bibliography=use_full_text,
            wrap_with_HTML_tags=False,
            extra_attribute_function=extraAttributes,
            citation_formatting_function=citationFormatting,
            reference_formatting_function=referenceFormatting)
        rendered_sections.append(section_html)

    page = padWithHTML(" ".join(rendered_sections))
    writeFileText(page, os.path.join(cp.Corpus.paths.output, guid + "_ilc_zoning.html"))
def fixPaperReferences(annotated_file, pmc_file, pmc_id, original_text=None):
    """
    Replaces the <ref-list> section in `annotated_file` with that from
    `pmc_file`.

    Checking that they are actually the same file is done outside.

    Args:
        annotated_file: path to the annotated XML whose <ref-list> is replaced
        pmc_file: path to the original PMC XML supplying the <ref-list>
        pmc_id: PMC identifier of `pmc_file`, used to locate its <ref-list>
        original_text: optional pre-loaded text of `pmc_file`; loaded from
            disk when None

    Side effects:
        Overwrites `annotated_file` on disk. Returns silently (best-effort)
        if either file has no selectable <ref-list>.
    """
    annotated_text = loadFileText(annotated_file)
    if not original_text:
        original_text = loadFileText(pmc_file)

    try:
        orig_start, orig_end = selectRefListSection(original_text, pmc_file, pmc_id)
    except ValueError:
        # No usable <ref-list> in the original: nothing to copy over.
        return

    try:
        annot_start, annot_end = selectRefListSection(
            annotated_text, annotated_file, getFilePMCID(annotated_file))
    except ValueError:
        # No usable <ref-list> in the annotated file: nothing to replace.
        return

    # (removed the unused `original_refs` local that duplicated this slice)
    new_annotated_text = (annotated_text[:annot_start]
                          + original_text[orig_start:orig_end]
                          + annotated_text[annot_end:])
    writeFileText(new_annotated_text, annotated_file)
def explainInFileZoning(guid):
    """
    Given a guid, generates a HTML visualization where sentences are tagged
    with their class for the contents of that file.
    """
    scidoc = cp.Corpus.loadSciDoc(guid)
    rendered = SciDocRenderer(scidoc).prettyPrintDocumentHTML(
        True, True, False,
        extra_attribute_function=extraAttributes,
        wrap_with_HTML_tags=False,
        reference_formatting_function=referenceFormatting)
    output_path = os.path.join(cp.Corpus.paths.output, guid + "_text_zoning.html")
    writeFileText(padWithHTML(rendered), output_path)
def generateVisualizationFileList(self, files):
    """
    For each element in the list, generates the _data.json file, then writes
    out the global file_data.json.

    Args:
        files: list of guids
    """
    file_data = []
    for index, guid in enumerate(files):
        # Was a Python-2 `print` statement; the rest of the file uses the
        # print() function, so this is both a consistency and a py3 fix.
        print("Processing #%d - %s" % (index, guid))
        file_info = self.generateVisualizationOneFile(guid)
        file_data.append(file_info)

    json_str = "file_data=" + json.dumps(file_data) + ";"
    writeFileText(json_str, self.output_dir + "file_data.json")
def fixPaperReferences(annotated_file, pmc_file, original_text=None):
    """
    Replaces the <ref-list> section in `annotated_file` with that from
    `pmc_file`.

    Checking that they are actually the same file is done outside.

    Args:
        annotated_file: path to the annotated XML whose <ref-list> is replaced
        pmc_file: path to the original PMC XML supplying the <ref-list>
        original_text: optional pre-loaded text of `pmc_file`; loaded from
            disk when None

    Side effects:
        Overwrites `annotated_file` on disk. Returns silently (best-effort)
        if either file has no selectable <ref-list>.
    """
    annotated_text = loadFileText(annotated_file)
    if not original_text:
        original_text = loadFileText(pmc_file)

    # Bug fix: `pmc_id` was referenced here but never defined (NameError);
    # derive it from the PMC file itself, as the sibling overload does for
    # the annotated file.
    try:
        orig_start, orig_end = selectRefListSection(
            original_text, pmc_file, getFilePMCID(pmc_file))
    except ValueError:
        # No usable <ref-list> in the original: nothing to copy over.
        return

    # Bug fix: the annotated span was previously selected on `original_text`
    # with missing arguments; it must be located in `annotated_text`.
    try:
        annot_start, annot_end = selectRefListSection(
            annotated_text, annotated_file, getFilePMCID(annotated_file))
    except ValueError:
        # No usable <ref-list> in the annotated file: nothing to replace.
        return

    annotated_text = (annotated_text[:annot_start]
                      + original_text[orig_start:orig_end]
                      + annotated_text[annot_end:])
    writeFileText(annotated_text, annotated_file)
def generateSideBySide(doc_list, output_dir="g:\\nlp\\phd\\aac\\conversion_visualization\\"):
    """
    Generates side-by-side visualizations of a Paper XML: one using an XML to
    HTML converter, one loading the XML into SciDocJSON and rendering it back
    as HTML.

    Args:
        doc_list: list of XML filenames, relative to the corpus inputXML path
        output_dir: directory to write the HTML pairs and file_data.json to.
            Generalized from a hard-coded path; the default preserves the
            previous behavior.

    Side effects:
        Creates `output_dir` if missing; writes one "_1.html"/"_2.html" pair
        per input file plus a "file_data.json" manifest.
    """
    reader = PaperXMLReader()
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    file_list = []
    for filename in doc_list:
        print("Converting %s" % filename)
        input_file = cp.Corpus.paths.inputXML + filename
        output_file = output_dir + "%s_1.html" % os.path.basename(filename)
        input_text = loadFileText(input_file)
        writeFileText(input_text, output_file)
        doc = reader.read(input_text, input_file)
        # Sanity check only: the SciDoc data must round-trip through JSON.
        # Narrowed from a bare except, which would have hidden real bugs.
        try:
            json.dumps(doc.data)
        except (TypeError, ValueError):
            print("Not JSON Serializable!!!!")
        html = SciDocRenderer(doc).prettyPrintDocumentHTML(True, True, True, True)
        output_file2 = output_file.replace("_1.html", "_2.html")
        writeFileText(html, output_file2)
        file_list.append([os.path.basename(output_file), os.path.basename(output_file2)])
    file_list_json = "file_data=%s;" % json.dumps(file_list)
    writeFileText(file_list_json, output_dir + "file_data.json")
def generateVisualizationOneFile(self, guid):
    """
    Given a guid, it prepares its explainer document.

    Builds term-overlap data between the document and each reference that can
    be matched in the corpus index, renders the document and each matched
    reference to HTML, writes a "<guid>_data.json" file into self.output_dir,
    and returns a manifest entry describing that file.

    Args:
        guid: identifier of the document in the corpus

    Returns:
        dict with keys "json_file", "id", "title" and "details"
    """
    doc=cp.Corpus.loadSciDoc(guid)
    cp.Corpus.tagAllReferencesAsInCollectionOrNot(doc)
    counts1=self.getDocumentTokens(doc)
    # generate a unique id for each unique term, make a dictionary
    for index, token in enumerate(counts1):
        self.term_info[token]={"token_id":str(index), "references": []}
    self.overlapping_tokens={}
    ref_data={}
    # NOTE(review): assigned but never read below -- confirm whether it can go
    in_collection_references=cp.Corpus.getMetadataByGUID(guid)["outlinks"]
    for ref in doc["references"]:
        match=cp.Corpus.matchReferenceInIndex(ref)
        if match:
            doc2=cp.Corpus.loadSciDoc(match["guid"])
            counts2=self.getDocumentTokens(doc2)
            # for each in_collection_reference number (0 onwards) we store the list
            # of its overlapping tokens with the current document
            self.overlapping_tokens[ref["id"]]=self.getOverlappingTokens(counts1, counts2)
            # record, per overlapping term, which reference ids share it
            for token in self.overlapping_tokens[ref["id"]]:
                ref_list=self.term_info[token]["references"]
                if ref["id"] not in ref_list:
                    ref_list.append(ref["id"])
            # NOTE(review): called directly on the SciDoc here, whereas other
            # functions in this file render via SciDocRenderer(doc) -- confirm
            # SciDoc exposes prettyPrintDocumentHTML itself
            ref_html=doc2.prettyPrintDocumentHTML(
                True, True, False,
##                extra_attribute_function=self.extraAttributes,
##                citation_formatting_function=self.citationFormatting,
##                reference_formatting_function=self.referenceFormatting,
                text_formatting_function=self.textFormatting
                )
            details="%s - %s - %s" % (match["guid"], formatCitation(ref), doc2["metadata"]["title"])
            ref_record={"full_html":ref_html, "details":details}
            ref_data[ref["id"]]=ref_record
    # try to find some signal in the noise
    self.filterTokens(doc)
    html=doc.prettyPrintDocumentHTML(
        True, True, False,
##        extra_attribute_function=self.extraAttributes,
        citation_formatting_function=self.citationFormatting,
        reference_formatting_function=self.referenceFormatting,
        text_formatting_function=self.textFormatting
        )
##    html=self.padWithHTML(html, guid)
    token_data={"full_html":html, "ref_data":ref_data}
    json_str=json.dumps(token_data)
    json_file=guid+"_data.json"
    writeFileText(json_str, self.output_dir+json_file)
    # TODO: generate file description
    # TODO: add title
    details="%s - %s - %s" % (guid, formatCitation(doc["metadata"]), doc["metadata"]["title"])
    file_info={
        "json_file":json_file,
        "id":guid,
        "title":doc["metadata"]["title"],
        "details": details}
    return file_info