def step3_cal(): """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息 """ print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp) inputs = [] subevents = event.getSubEvents() for subevent in subevents: subeventid = subevent["_id"] inputs.extend(event.getSubeventInfos(subeventid)) for r in inputs: r["title"] = r["title"].encode("utf-8") r["content"] = r["content168"].encode("utf-8") r["label"] = r["subeventid"] # 计算各簇的存量特征词 cluster_feature = extract_feature(inputs) for label, fwords in cluster_feature.iteritems(): feature = Feature(label) feature.upsert_newest(fwords) # 计算文本权重 for r in inputs: weight = text_weight_cal(r, cluster_feature[r['label']]) news = News(r["_id"], event.id) news.update_news_weight(weight) # 文本去重 items_dict = {} for r in inputs: try: items_dict[r["label"]].append(r) except KeyError: items_dict[r["label"]] = [r] for label, items in items_dict.iteritems(): results = duplicate(items) for r in results: news = News(r["_id"], event.id) news.update_news_duplicate(r["duplicate"], r["same_from"]) # 更新簇的大小、增幅信息 before_size = event.get_subevent_size(label) event.update_subevent_size(label, len(items)) event.update_subevent_addsize(label, len(items) - before_size) if initializing: # 更新事件状态由initializing变为active event.activate() print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
def step3_cal(): """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息 """ print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp) inputs = [] subevents = event.getSubEvents() for subevent in subevents: subeventid = subevent["_id"] inputs.extend(event.getSubeventInfos(subeventid)) for r in inputs: r["title"] = r["title"].encode("utf-8") r["content"] = r["content168"].encode("utf-8") r["label"] = r["subeventid"] # 计算各簇的存量特征词 cluster_feature = extract_feature(inputs) for label, fwords in cluster_feature.iteritems(): feature = Feature(label) feature.upsert_newest(fwords) # 计算文本权重 for r in inputs: weight = text_weight_cal(r, cluster_feature[r['label']]) news = News(r["_id"], event.id) news.update_news_weight(weight) # 文本去重 items_dict = {} for r in inputs: try: items_dict[r["label"]].append(r) except KeyError: items_dict[r["label"]] = [r] for label, items in items_dict.iteritems(): results = duplicate(items) for r in results: news = News(r["_id"], event.id) news.update_news_duplicate(r["duplicate"], r["same_from"]) # 更新簇的大小、增幅信息 before_size = event.get_subevent_size(label) event.update_subevent_size(label, len(items)) event.update_subevent_addsize(label, len(items) - before_size) if initializing: # 更新事件状态由initializing变为active event.activate() print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)