def explainRelevance(guid):
    """
    Build the HTML "explainer" document for the paper identified by `guid`
    and save it to the corpus output directory as <guid>_explain.html.
    """
    doc = cp.Corpus.loadSciDoc(guid)
    rendered = doc.prettyPrintDocumentHTML(
        True,
        True,
        False,
        tokenizeAndTag,
        referenceFormatting,
    )
    # Overlap computation runs after rendering, as in the original flow.
    computeOverlapForAllReferences(doc, doc.glob["_inverted_index"])
    writeFileText(rendered, cp.Corpus.dir_output + guid + "_explain.html")
def explainRelevance(guid):
    """Prepare and write the relevance-explainer HTML page for one document.

    NOTE(review): the reference-overlap pass happens only after the HTML is
    rendered — confirm prettyPrintDocumentHTML does not need that data.
    """
    sci_doc = cp.Corpus.loadSciDoc(guid)
    page = sci_doc.prettyPrintDocumentHTML(True, True, False, tokenizeAndTag, referenceFormatting)
    computeOverlapForAllReferences(sci_doc, sci_doc.glob["_inverted_index"])
    out_path = cp.Corpus.dir_output + guid + "_explain.html"
    writeFileText(page, out_path)
def generateVisualization(self, guid, output_dir=None):
    """
    Build the explainer visualization (HTML page + JSON token data) for
    the document identified by `guid`.

    For each reference of the document that can be matched in the corpus
    index, the tokens it shares with the citing document are computed and
    stored in `self.overlapping_tokens` / `self.term_info`, then the
    document is rendered to HTML and written out next to a JSON data file
    (<guid>_data.json, <guid>_vis.html).

    :param guid: corpus identifier of the document to visualize
    :param output_dir: directory the output files are written into.
        Defaults to the historical hard-coded path so existing callers
        are unaffected (TODO: switch the default to Corpus.dir_output).
    """
    if output_dir is None:
        ## output_dir = Corpus.dir_output
        output_dir = r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

    doc = Corpus.loadSciDoc(guid)
    Corpus.tagAllReferencesAsInCollectionOrNot(doc)

    counts1 = self.getDocumentTokens(doc)
    # Give every unique term a stable string id and an (initially empty)
    # list of reference ids it overlaps with.
    for index, token in enumerate(counts1):
        self.term_info[token] = {"token_id": str(index), "references": []}

    # NOTE(review): dropped the unused lookup
    # Corpus.getMetadataByGUID(guid)["outlinks"] — its result was never read.
    self.overlapping_tokens = {}
    for ref in doc["references"]:
        match = Corpus.matchReferenceInIndex(ref)
        if match:
            doc2 = Corpus.loadSciDoc(match["guid"])
            counts2 = self.getDocumentTokens(doc2)
            # For each matched reference, store the list of its tokens that
            # also appear in the current document.
            self.overlapping_tokens[ref["id"]] = self.getOverlappingTokens(counts1, counts2)
            for token in self.overlapping_tokens[ref["id"]]:
                ref_list = self.term_info[token]["references"]
                if ref["id"] not in ref_list:
                    ref_list.append(ref["id"])

    # Try to find some signal in the noise.
    self.filterTokens()

    json_str = "var token_data=" + json.dumps(self.overlapping_tokens) + ";"
    writeFileText(json_str, output_dir + guid + "_data.json")

    html = doc.prettyPrintDocumentHTML(
        True, True, False,
        ## extra_attribute_function=self.extraAttributes,
        citation_formatting_function=self.citationFormatting,
        reference_formatting_function=self.referenceFormatting,
        text_formatting_function=self.textFormatting
    )
    html = self.padWithHTML(html, guid)
    writeFileText(html, output_dir + guid + "_vis.html")
def generateVisualization(self, guid):
    """Render the explainer visualization (HTML page + JSON token data) for `guid`."""
    ## output_dir = Corpus.dir_output
    output_dir = r"C:\Users\dd\Documents\Dropbox\PhD\code\doc_visualization\\"

    doc = Corpus.loadSciDoc(guid)
    Corpus.tagAllReferencesAsInCollectionOrNot(doc)

    doc_tokens = self.getDocumentTokens(doc)
    # Assign each unique term a numeric id and an empty reference list.
    for term_index, term in enumerate(doc_tokens):
        self.term_info[term] = {"token_id": str(term_index), "references": []}

    self.overlapping_tokens = {}
    in_collection_references = Corpus.getMetadataByGUID(guid)["outlinks"]

    for ref in doc["references"]:
        indexed = Corpus.matchReferenceInIndex(ref)
        if not indexed:
            continue
        cited_doc = Corpus.loadSciDoc(indexed["guid"])
        cited_tokens = self.getDocumentTokens(cited_doc)
        # Record, per reference id, which tokens the cited document shares
        # with the current one, and link each shared term back to the ref.
        shared = self.getOverlappingTokens(doc_tokens, cited_tokens)
        self.overlapping_tokens[ref["id"]] = shared
        for term in shared:
            linked_refs = self.term_info[term]["references"]
            if ref["id"] not in linked_refs:
                linked_refs.append(ref["id"])

    # try to find some signal in the noise
    self.filterTokens()

    writeFileText("var token_data=" + json.dumps(self.overlapping_tokens) + ";",
                  output_dir + guid + "_data.json")

    page = doc.prettyPrintDocumentHTML(
        True, True, False,
        ## extra_attribute_function=self.extraAttributes,
        citation_formatting_function=self.citationFormatting,
        reference_formatting_function=self.referenceFormatting,
        text_formatting_function=self.textFormatting
    )
    writeFileText(self.padWithHTML(page, guid), output_dir + guid + "_vis.html")