def main():
    """Train a TextTagger on the labelled items and dump the model.

    Every document of the 'item' collection that carries at least one tag
    contributes a ``(rendered content, tags)`` pair to the training data.
    Only tags whose occurrence count lies strictly between 0.15% and 10% of
    the total number of tag occurrences are kept.  The data is shuffled,
    truncated to ``MAX_DATA_SIZE`` entries, filtered through
    ``filter_data_label`` and fed to the tagger, whose model is then written
    to ``ukconfig.tagger_path``.

    The ``print`` calls use a single argument so that they produce the same
    output whether ``print`` is the Python 2 statement or the function.
    """
    label_cnt = defaultdict(int)
    data = []
    for item in get_mongo('item').find():
        labels = item['tag']
        if not labels:
            continue
        for label in labels:
            label_cnt[label] += 1
        # Deserialize only after the label check: untagged items are skipped,
        # so decoding their description would be wasted work.
        desc = ItemDescBase.deserialize(item['desc'])
        data.append((desc.render_content(), labels))

    total_cnt = sum(label_cnt.values())
    print(total_cnt)

    # Drop labels that are too rare or too common to be useful.
    lower_bound = total_cnt * 0.0015
    upper_bound = total_cnt * 0.1
    available_labels = set(
        label for label, cnt in label_cnt.items()
        if lower_bound < cnt < upper_bound
    )

    # Two spaces after the colon reproduce the output of the former
    # ``print 'remaining labels: ', n`` statement.
    print('remaining labels:  {}'.format(len(available_labels)))
    print("#documents: {}".format(len(data)))
    print("training ...\n")

    random.shuffle(data)
    data = data[:MAX_DATA_SIZE]
    data = filter_data_label(data, available_labels)

    tagger = TextTagger(nr_min_word_count=3)
    tagger.fit(data)

    print("writing model...\n")
    tagger.dump(ukconfig.tagger_path)
def apply_postfilter(user_id, docs):
    """Apply the user's enabled post-filters to ``docs`` in place.

    :param user_id: user whose enabled post-filters are looked up; also used
        to build the ``PostfilterContext`` handed to each filter
    :param docs: list of raw documents retrieved from the mongodb item
        collection; it is modified in place: every document's ``'desc'`` is
        replaced by its deserialized form, each enabled post-filter is
        applied to the list, and the list is finally sorted by
        ``'creation_time'`` in descending order
    """
    for doc in docs:
        doc['desc'] = ItemDescBase.deserialize(doc['desc'])

    context = PostfilterContext(user_id)
    for filter_id in get_enabled_user_postfilter(user_id):
        postfilter = PostfilterBase._postfilter_list[filter_id]
        postfilter.apply(context, docs)

    docs.sort(key=lambda doc: doc['creation_time'], reverse=True)
def chg(d):
    """Replace datetime and Binary values of ``d`` with strings, in place.

    ``datetime`` values are replaced by ``str(value)``; ``Binary`` values are
    replaced by ``ItemDescBase.deserialize(value).render_title()``.  All
    other values are left untouched.

    :param d: dict to convert; modified in place
    :return: None
    """
    # Iterate over a snapshot of the items: ``dict.iteritems`` only exists on
    # Python 2, and a snapshot keeps the loop independent of the assignments
    # made to ``d`` inside it.
    for key, value in list(d.items()):
        if isinstance(value, datetime):
            d[key] = str(value)
        elif isinstance(value, Binary):
            d[key] = ItemDescBase.deserialize(value).render_title()