def computeOverlapForAllReferences(doc, inverted_index):
    """
    Compute the word overlap between a document and each of its references
    that can be matched in the corpus.

    Args:
        doc: SciDoc-like document exposing getFullDocumentText() and a
            "references" list of reference dicts (each with an "id").
        inverted_index: index passed straight through to computeWordOverlap().

    Returns:
        dict mapping reference id -> overlap score, containing only the
        references for which the corpus matcher found a match.
    """
    text1 = doc.getFullDocumentText(True, False)
    per_reference_tokens = {}
    for ref in doc["references"]:
        match = cp.Corpus.matcher.matchReference(ref)
        # unmatched references are silently skipped, as before
        if not match:
            continue
        doc2 = cp.Corpus.loadSciDoc(match["guid"])
        text2 = doc2.getFullDocumentText(True, False)
        # NOTE: the original also tokenized both texts into tokens1/tokens2
        # but never used them; computeWordOverlap takes the raw texts.
        per_reference_tokens[ref["id"]] = computeWordOverlap(
            text1, text2, inverted_index)
    return per_reference_tokens
def computeOverlapForAllReferences(doc, inverted_index):
    """
    Compute the word overlap between a document and each of its references
    that can be matched in the corpus.

    Args:
        doc: SciDoc-like document exposing getFullDocumentText() and a
            "references" list of reference dicts (each with an "id").
        inverted_index: index passed straight through to computeWordOverlap().

    Returns:
        dict mapping reference id -> overlap score, containing only the
        references for which the corpus matcher found a match.
    """
    text1 = doc.getFullDocumentText(True, False)
    per_reference_tokens = {}
    for ref in doc["references"]:
        match = cp.Corpus.matcher.matchReference(ref)
        # unmatched references are silently skipped, as before
        if not match:
            continue
        doc2 = cp.Corpus.loadSciDoc(match["guid"])
        text2 = doc2.getFullDocumentText(True, False)
        # NOTE: the original also tokenized both texts into tokens1/tokens2
        # but never used them; computeWordOverlap takes the raw texts.
        per_reference_tokens[ref["id"]] = computeWordOverlap(
            text1, text2, inverted_index)
    return per_reference_tokens
def getDocumentTokens(self, doc):
    """
    Return a dict of all terms in the document with their counts.

    Args:
        doc: document exposing getFullDocumentText().

    Returns:
        dict mapping lowercased token -> occurrence count, with tokens
        from local_stopwords_list removed.
    """
    full_text = doc.getFullDocumentText(True, False).lower()
    # remove many stopwords. hack!
    # build the set once so each membership test is O(1) instead of
    # scanning the stopword list per token
    stopwords = set(local_stopwords_list)
    tokens = [token for token in tokenizeText(full_text)
              if token not in stopwords]
    return getDictOfTokenCounts(tokens)
def tokenizeAndTag(text, glob):
    """
    Function to be passed to SciDoc.prettyPrintHTML that will mark each
    token individually.

    Args:
        text: raw text to tokenize and wrap in per-token <span> tags.
        glob: shared state dict; accumulates "_token_counter" (next token
            id across calls) and "_inverted_index" (lowercased token ->
            set of token positions).

    Returns:
        The tokens rejoined with spaces, each wrapped in a
        <span class="token" id="tN"> element.
    """
    # setdefault avoids constructing a throwaway defaultdict on every
    # call (dict.get evaluates its default eagerly); defaultdict(set)
    # is equivalent to defaultdict(lambda: set())
    counter = glob.setdefault("_token_counter", 0)
    inverted_index = glob.setdefault("_inverted_index", defaultdict(set))
    res = []
    for token in tokenizeText(text):
        inverted_index[token.lower()].add(counter)
        res.append('<span class="token" id="t' + str(counter) + '">' + token + '</span>')
        counter += 1
    # write the advanced counter back so the numbering continues on the
    # next call
    glob["_token_counter"] = counter
    return " ".join(res)
def tokenizeAndTag(text, glob):
    """
    Markup callback for SciDoc.prettyPrintHTML: wraps every token in a
    <span> carrying a unique sequential id, while recording each token's
    position in an inverted index kept inside *glob*.

    Args:
        text: sentence text to tokenize.
        glob: shared state dict ("_token_counter", "_inverted_index").

    Returns:
        Space-joined HTML of the tagged tokens.
    """
    if "_token_counter" not in glob:
        glob["_token_counter"] = 0
    if "_inverted_index" not in glob:
        glob["_inverted_index"] = defaultdict(lambda: set())
    index = glob["_inverted_index"]
    spans = []
    for word in tokenizeText(text):
        position = glob["_token_counter"]
        index[word.lower()].add(position)
        spans.append('<span class="token" id="t' + str(position) + '">' + word + '</span>')
        glob["_token_counter"] = position + 1
    return " ".join(spans)
def textFormatting(self, text, glob):
    """
    Text formatting function to be passed to
    JsonDoc.prettyPrintDocumentHTML().

    Args:
        text: text of sentence
        glob: globals dictionary (unused here)

    Returns:
        The sentence HTML, each token wrapped in a <span> carrying its
        term id and reference classes, with trailing periods stripped.
    """
    pieces = []
    for token in tokenizeText(text.strip(".")):
        info = self.term_info.get(
            token.lower(), {"token_id": 0, "references": []})
        term_id = str(info["token_id"])
        classes = term_id + " " + " ".join(info["references"])
        pieces.append(
            '<span term_id="%s" class="%s">%s</span>' % (term_id, classes, token))
    return " ".join(pieces).strip().strip(".")