def getDocumentTokens(self, doc):
    """
    Count the non-stopword tokens of a document.

    The document's full text is fetched, lower-cased and tokenized; tokens
    found in ``local_stopwords_list`` are dropped before counting.

    Args:
        doc: document object exposing ``getFullDocumentText()``
    Returns:
        the result of ``getDictOfTokenCounts()`` for the remaining tokens
        (a dict of the terms in the document)
    """
    # NOTE(review): the meaning of the two positional flags is defined by
    # getFullDocumentText(), which is not visible here -- confirm there.
    lowered_text = doc.getFullDocumentText(True, False).lower()

    # HACK: crude stopword removal against a locally defined list.
    kept_tokens = [
        word
        for word in tokenizeText(lowered_text)
        if word not in local_stopwords_list
    ]
    return getDictOfTokenCounts(kept_tokens)
def textFormatting(self, text, glob):
    """
    Wrap every token of a sentence in an annotated ``<span>`` element.

    Meant to be handed to JsonDoc.prettyPrintDocumentHTML() as its text
    formatting callback.

    Each token is looked up (lower-cased) in ``self.term_info``. The span
    carries the entry's ``token_id`` as ``term_id`` and, in ``class``, the
    ``token_id`` followed by the space-joined ``references``. Tokens without
    an entry get ``token_id`` 0 and no references.

    Args:
        text: text of sentence
        glob: globals dictionary (not used by this function)
    Returns:
        the spans joined by single spaces, with surrounding whitespace and
        then leading/trailing "." characters stripped
    """
    # Fallback entry for tokens absent from term_info; it is only read.
    unknown_term = {"token_id": 0, "references": []}

    spans = []
    # Periods at either end of the sentence are removed before tokenizing.
    for word in tokenizeText(text.strip(".")):
        entry = self.term_info.get(word.lower(), unknown_term)
        joined_refs = " ".join(entry["references"])
        term_id = str(entry["token_id"])
        css_classes = term_id + " " + joined_refs
        # NOTE(review): the token text is interpolated verbatim, without
        # HTML escaping -- confirm the caller supplies safe text.
        spans.append(
            '<span term_id="%s" class="%s">%s</span>'
            % (term_id, css_classes, word)
        )

    return " ".join(spans).strip().strip(".")