def parse(self, words, weight=1, TF_IDF=True):
    """Segment ``words`` and return a weighted tally of the keywords found.

    Every whitespace character in ``words`` is replaced with
    ``ENGLISH_SEGMENT_SEPARATOR``; the result is converted with ``to_str``
    and handed to ``self.seg.seg_txt``. Each token produced is decoded from
    UTF-8, has every run of ``Z`` replaced by a single space, is stripped,
    and is counted only if ``self.is_keyword`` accepts it.

    :param words: text to parse. Anything that is not a ``basestring``
        yields an empty result.
    :param weight: amount contributed by each keyword occurrence.
    :param TF_IDF: when true, each occurrence contributes
        ``weight * self.keywords.get(token, 1)``; otherwise it contributes
        plain ``weight``.
    :return: dict mapping each accepted token to its accumulated score.
        Always a dict, including the empty ``{}`` for non-string input, so
        callers can treat every result uniformly.
    """
    if not isinstance(words, basestring):
        # Same (dict) type as the normal return path; still empty and falsy.
        return {}

    # ``words`` is passed to smart_print before and after the substitution,
    # exactly as the original code did.
    smart_print(words)
    words = re.sub(r'\s', ENGLISH_SEGMENT_SEPARATOR, words)
    smart_print(words)
    words = to_str(words)

    scores = {}
    for token in self.seg.seg_txt(words):
        token = token.decode('utf-8')
        # NOTE(review): 'Z+' presumably mirrors ENGLISH_SEGMENT_SEPARATOR,
        # turning the inserted separators back into spaces -- confirm.
        token = re.sub('Z+', ' ', token).strip()
        if not self.is_keyword(token):
            continue
        if TF_IDF:
            increment = weight * self.keywords.get(token, 1)
        else:
            increment = weight
        scores[token] = scores.get(token, 0) + increment
    return scores
def __repr__(self):
    """Return ``self.name`` converted with ``to_str``."""
    label = self.name
    return to_str(label)