Esempio n. 1
0
def main():
    """Train the text tagger on tagged items and write the model to disk.

    Every document returned by ``get_mongo('item').find()`` is deserialized;
    documents with an empty ``tag`` value are skipped. Label occurrences are
    counted over the remaining documents, and only labels whose count lies
    strictly between 0.15% and 10% of the total number of label occurrences
    are kept. The collected ``(content, labels)`` pairs are shuffled,
    truncated to ``MAX_DATA_SIZE``, passed through ``filter_data_label``
    together with the kept labels, and used to fit a ``TextTagger`` which is
    then dumped to ``ukconfig.tagger_path``.

    Progress is reported on stdout. All ``print`` calls use the
    single-argument function form so the block parses on both Python 2.7
    and Python 3.
    """
    label_cnt = defaultdict(int)
    db = get_mongo('item')
    data = []
    for item in db.find():
        desc = ItemDescBase.deserialize(item['desc'])
        labels = item['tag']
        if not labels:
            continue
        for label in labels:
            label_cnt[label] += 1
        data.append((desc.render_content(), labels))

    total_cnt = sum(label_cnt.values())
    print(total_cnt)

    # Thresholds are loop-invariant: drop labels that are too rare or too
    # common relative to the total number of label occurrences.
    lower_bound = total_cnt * 0.0015
    upper_bound = total_cnt * 0.1
    available_labels = {
        label for label, cnt in label_cnt.items()
        if lower_bound < cnt < upper_bound
    }

    # Two spaces after the colon reproduce the output of the former
    # Python 2 statement `print 'remaining labels: ', n`.
    print('remaining labels:  {}'.format(len(available_labels)))

    print("#documents: {}".format(len(data)))
    print("training ...\n")
    random.shuffle(data)
    data = data[:MAX_DATA_SIZE]
    data = filter_data_label(data, available_labels)
    tagger = TextTagger(nr_min_word_count=3)
    tagger.fit(data)
    print("writing model...\n")
    tagger.dump(ukconfig.tagger_path)
Esempio n. 2
0
def auto_tagging(ctx, doc):
    """Extend ``doc['tag']`` in place with automatically predicted tags.

    The tagger model is loaded from ``ukconfig.tagger_path`` the first time
    it is needed and cached in the module-level ``_tagger``. The model must
    have been trained beforehand: if loading raises ``IOError`` this is
    logged and the function returns without modifying ``doc``.

    :param ctx: not used by this function.
    :param doc: item mapping. ``doc['desc'].render_content()`` supplies the
        text to tag; ``doc['tag']`` is concatenated with the predicted tags,
        de-duplicated through a ``set`` (so order is not preserved) and
        stored back as a list.
    :return: ``None`` in every case.
    """
    global _tagger
    if _tagger is None:
        log_info('loading tagger ...')
        # Guard only the load itself so an unrelated IOError is not
        # misreported as a missing model.
        try:
            _tagger = TextTagger.load(ukconfig.tagger_path)
        except IOError:
            log_info('tagger model not found.')
            return

    tags = _tagger.predict_one(doc['desc'].render_content())
    declare_tag(tags)
    log_info('original tag: ' + str(doc['tag']))
    log_info('autotagging: ' + str(tags))
    doc['tag'] = list(set(doc['tag'] + tags))