class PythonSeg(object):
    """Wrap a ``Segment`` instance together with a keyword -> score table.

    On construction the table is filled from ``Tag.objects()``; ``parse``
    then splits text with the segmenter and returns per-token weights.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` so the
    # type check in ``parse`` does not raise NameError on Python 3.
    try:
        _STRING_TYPES = basestring  # noqa: F821
    except NameError:
        _STRING_TYPES = str

    def __init__(self):
        """Create the segmenter, an empty keyword table, and load the tags."""
        self.seg = Segment()
        self.keywords = {}
        # Must run last: load() goes through add_word(), which needs both
        # attributes above to exist.
        self.load()

    def load(self):
        """Register every object yielded by ``Tag.objects()`` as a word.

        A tag without a ``score`` attribute gets
        ``settings.NEW_WORD_DEFAULT_VALUE``.
        """
        for tag in Tag.objects():
            score = getattr(tag, 'score', settings.NEW_WORD_DEFAULT_VALUE)
            self.add_word(tag.name, score)

    def add_word(self, name, score=settings.NEW_WORD_DEFAULT_VALUE):
        """Add the stripped ``name`` to the segmenter and the keyword table.

        Args:
            name: Word to register; surrounding whitespace is removed.
            score: Value stored for the word in ``self.keywords``.
        """
        cleaned = name.strip()
        self.seg.add(cleaned)
        self.add_keyword(cleaned, score)

    def add_keyword(self, word, score=settings.NEW_WORD_DEFAULT_VALUE):
        """Store ``score`` for the stripped ``word``, replacing any old value.

        Unlike ``add_word`` this does not touch the segmenter.
        """
        self.keywords[word.strip()] = score

    def is_keyword(self, word):
        """Return True if ``word`` (compared as given) is in the keyword table."""
        return word in self.keywords

    def parse(self, words, weight=1, TF_IDF=True):
        """Segment ``words`` and accumulate a weight for each token.

        Tokens of length 1 or less are ignored. Each remaining occurrence
        of a token adds ``weight * self.keywords.get(token, 1)`` to its
        total when ``TF_IDF`` is true, or plain ``weight`` otherwise.

        Args:
            words: Text to segment.
            weight: Base amount added per token occurrence.
            TF_IDF: When true, scale ``weight`` by the token's keyword
                score (1 for tokens that are not keywords).

        Returns:
            A dict mapping token -> accumulated weight, or an empty list
            when ``words`` is not a string.
        """
        if not isinstance(words, self._STRING_TYPES):
            # Historical behaviour: a list, not a dict, is returned here.
            # Kept as-is so existing callers are unaffected.
            return []

        totals = {}
        for token in self.seg.seg_text(words):
            if len(token) <= 1:
                continue
            increment = weight * self.keywords.get(token, 1) if TF_IDF else weight
            if token in totals:
                totals[token] += increment
            else:
                totals[token] = increment
        return totals
def __init__(self):
    """Give ``self`` a segmenter and an empty keyword table, then load it.

    NOTE(review): this looks like a stray copy of ``PythonSeg.__init__``
    sitting outside the class -- confirm whether it is still needed.
    """
    self.seg = Segment()
    self.keywords = {}
    # Called last so both attributes above exist before loading starts.
    self.load()