Example #1
0
def main():
    """Train the text tagger on tagged items and write the model to disk.

    Steps performed:

    1. Iterate over every document returned by ``find()`` on the ``item``
       handle obtained from ``get_mongo``; documents whose ``tag`` field is
       empty are skipped.
    2. Count how often each label occurs and collect
       ``(rendered content, labels)`` pairs.
    3. Keep only labels whose occurrence count lies strictly between 0.15%
       and 10% of the total number of label occurrences.
    4. Shuffle the pairs, truncate them to ``MAX_DATA_SIZE``, pass them
       through ``filter_data_label`` with the kept labels, fit a
       ``TextTagger`` and dump it to ``ukconfig.tagger_path``.

    Progress is reported on stdout.  Every ``print`` call takes exactly one
    argument, so it produces the same output whether ``print`` is the
    Python 2 statement or the print function.
    """
    label_cnt = defaultdict(int)
    data = []
    for item in get_mongo('item').find():
        labels = item['tag']
        if not labels:
            continue
        # Deserialize only after the tag check so that no work is spent on
        # items that are discarded anyway.
        desc = ItemDescBase.deserialize(item['desc'])
        for label in labels:
            label_cnt[label] += 1
        data.append((desc.render_content(), labels))

    # Note: this is the number of label occurrences, not of documents.
    total_cnt = sum(label_cnt.values())
    print(total_cnt)

    # Both bounds are loop-invariant; compute them once.  Labels that are
    # too rare or too common (relative to all occurrences) are dropped.
    min_cnt = total_cnt * 0.0015
    max_cnt = total_cnt * 0.1
    available_labels = {
        label for label, cnt in label_cnt.items()
        if min_cnt < cnt < max_cnt
    }

    # Two spaces after the colon reproduce the output of the former
    # statement ``print 'remaining labels: ', n``.
    print('remaining labels:  {}'.format(len(available_labels)))

    print("#documents: {}".format(len(data)))
    print("training ...\n")
    random.shuffle(data)
    data = data[:MAX_DATA_SIZE]
    data = filter_data_label(data, available_labels)
    tagger = TextTagger(nr_min_word_count=3)
    tagger.fit(data)
    print("writing model...\n")
    tagger.dump(ukconfig.tagger_path)
Example #2
0
def apply_postfilter(user_id, docs):
    """Apply the postfilters enabled for a user to ``docs``, in place.

    :param user_id: identifies the user; handed to ``PostfilterContext`` and
        ``get_enabled_user_postfilter``
    :param docs: raw documents retrieved from the mongodb item collection.
        The argument itself is modified: each document's ``desc`` entry is
        replaced by the result of ``ItemDescBase.deserialize``, every enabled
        postfilter is applied to the collection, and finally it is sorted by
        ``creation_time``, newest first.  Nothing is returned.
    """
    for doc in docs:
        raw_desc = doc['desc']
        doc['desc'] = ItemDescBase.deserialize(raw_desc)

    context = PostfilterContext(user_id)

    for filter_key in get_enabled_user_postfilter(user_id):
        postfilter = PostfilterBase._postfilter_list[filter_key]
        postfilter.apply(context, docs)

    docs.sort(key=lambda doc: doc['creation_time'], reverse=True)
Example #3
0
 def chg(d):
     """Rewrite selected values of ``d`` in place.

     ``datetime`` values are replaced by their ``str()`` form; ``Binary``
     values are deserialized with ``ItemDescBase`` and replaced by the
     rendered title.  All other values are left untouched.
     """
     # Only values of existing keys are reassigned, so the dict keeps its
     # size while it is being iterated.
     for key, value in d.iteritems():
         if isinstance(value, datetime):
             d[key] = str(value)
         elif isinstance(value, Binary):
             d[key] = ItemDescBase.deserialize(value).render_title()