def step4_cal(): """ 24 点时merge已有的簇 """ if not initializing and now_hour == 0: print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp) subevents = event.getSubEvents() subevent_fwords = dict() for subevent in subevents: subeventid = subevent["_id"] feature = Feature(subeventid) # 获取每个子事件最新的特征词 fwords = feature.get_newest() subevent_fwords[subeventid] = fwords subeventids_sort_timestamp = event.get_sorted_subeventids() cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp, top_tfidf_para=10, top_percent=0.3) for res_id, mer_id in mids: # 将mer_id下的文本扔入res_id下的簇,remove mer_id的簇 temp_infos = event.get_subevent_infos(mer_id) for r in temp_infos: news = News(r["_id"], event.id) news.update_news_subeventid(res_id) event.remove_subevents([mer_id]) # 重新计算各簇的特征词, 并计算文本权重, 并去重 if len(mids): step3_cal() print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
def step1_cal(): """第一步计算,获取子事件特征词,新文本与特征词匹配分类 """ print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp) if initializing: # 若话题需要做初始聚类,获取话题开始时间之前的文本 results = event.getInitialInfos() else: # 若话题已做完初始聚类,获取话题最新一小时的文本 results = event.getInfos(timestamp - 3600, timestamp) if now_hour == 0: # 如果不是在做初始化,24时的时候, 一定把当天(大于或等于0时小于24时)产生的簇(非其他簇)下的文本重新做一下匹配, 同时删除这些簇 temp_subeventids = event.getTodayCreatSubeventIds() temp_infos = event.getTodayCreatSubeventInfos() event.remove_subevents(temp_subeventids) results.extend(temp_infos) print eventid, ' before classify: ', len(results) # 获取子事件 subevents = event.getSubEvents() labels_list = [] feature_words_list = [] for subevent in subevents: subeventid = subevent["_id"] feature = Feature(subeventid) # 获取每个子事件最新的特征词 fwords = feature.get_newest() feature_words_list.append(fwords) labels_list.append(subeventid) for r in results: text = (r['title'] + r['content168']).encode('utf-8') feature_words_inputs = [] for fwords in feature_words_list: wcdict = dict() for w, c in fwords.iteritems(): if isinstance(w, unicode): w = w.encode('utf-8') wcdict[w] = c feature_words_inputs.append(wcdict) # 单条文本与各子事件的特征词进行匹配,得到每条文本的簇标签 label = subevent_classifier(text, labels_list, feature_words_inputs) if label == "other": label = event.getOtherSubEventID() news = News(r["_id"], event.id) news.update_news_subeventid(label) print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)
def step1_cal(): """第一步计算,获取子事件特征词,新文本与特征词匹配分类 """ print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp) if initializing: # 若话题需要做初始聚类,获取话题开始时间之前的文本 results = event.getInitialInfos() else: # 若话题已做完初始聚类,获取话题最新一小时的文本 results = event.getInfos(timestamp - 3600, timestamp) if now_hour == 0: # 如果不是在做初始化,24时的时候, 一定把当天(大于或等于0时小于24时)产生的簇(非其他簇)下的文本重新做一下匹配, 同时删除这些簇 temp_subeventids = event.getTodayCreatSubeventIds() temp_infos = event.getTodayCreatSubeventInfos() event.remove_subevents(temp_subeventids) results.extend(temp_infos) print eventid, ' before classify: ', len(results) # 获取子事件 subevents = event.getSubEvents() labels_list = [] feature_words_list = [] for subevent in subevents: subeventid = subevent["_id"] feature = Feature(subeventid) # 获取每个子事件最新的特征词 fwords = feature.get_newest() feature_words_list.append(fwords) labels_list.append(subeventid) for r in results: text = (r['title'] + r['content168']).encode('utf-8') feature_words_inputs = [] for fwords in feature_words_list: wcdict = dict() for w, c in fwords.iteritems(): if isinstance(w, unicode): w = w.encode('utf-8') wcdict[w] = c feature_words_inputs.append(wcdict) # 单条文本与各子事件的特征词进行匹配,得到每条文本的簇标签 label = subevent_classifier(text, labels_list, feature_words_inputs) if label == "other": label = event.getOtherSubEventID() news = News(r["_id"], event.id) news.update_news_subeventid(label) print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)
def one_topic_merge(eventid_initializing):
    """Merge the clusters (sub-events) of one topic.

    Args:
        eventid_initializing: a 2-item sequence ``(eventid, initializing)``
            where ``eventid`` is the topic id and ``initializing`` tells
            whether initial clustering is to be done (per the original
            docstring).

    For every ``(res_id, mer_id)`` pair returned by ``merge_subevents`` the
    news items of ``mer_id`` are re-labelled to ``res_id`` and the ``mer_id``
    sub-event is removed. Nothing is returned.
    """
    eventid, initializing = eventid_initializing

    # Build the topic instance from its id.
    event = Event(eventid)

    # "Current" timestamp (int), taken as last-modified time + 3600.
    # NOTE(review): timestamp, now_hour and initializing are computed here but
    # not used anywhere below in this function.
    timestamp = event.getLastmodify() + 3600
    now_hour = int(time.strftime('%H', time.localtime(timestamp)))

    # Newest feature words of every sub-event, keyed by sub-event id.
    fwords_by_id = dict()
    for sub in event.getSubEvents():
        sub_id = sub["_id"]
        fwords_by_id[sub_id] = Feature(sub_id).get_newest()

    ordered_ids = event.get_sorted_subeventids()
    cids, mids = merge_subevents(fwords_by_id, ordered_ids,
                                 top_tfidf_para=10, top_percent=0.3)

    for res_id, mer_id in mids:
        # Move the texts of mer_id's cluster into res_id's cluster, then
        # remove mer_id's cluster.
        for info in event.get_subevent_infos(mer_id):
            moved = News(info["_id"], event.id)
            moved.update_news_subeventid(res_id)

        event.remove_subevents([mer_id])
def step4_cal(): """ 24 点时merge已有的簇 """ if not initializing and now_hour == 0: print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp) subevents = event.getSubEvents() subevent_fwords = dict() for subevent in subevents: subeventid = subevent["_id"] feature = Feature(subeventid) # 获取每个子事件最新的特征词 fwords = feature.get_newest() subevent_fwords[subeventid] = fwords subeventids_sort_timestamp = event.get_sorted_subeventids() cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp, top_tfidf_para=10, top_percent=0.3) for res_id, mer_id in mids: # 将mer_id下的文本扔入res_id下的簇,remove mer_id的簇 temp_infos = event.get_subevent_infos(mer_id) for r in temp_infos: news = News(r["_id"], event.id) news.update_news_subeventid(res_id) event.remove_subevents([mer_id]) # 重新计算各簇的特征词, 并计算文本权重, 并去重 if len(mids): step3_cal() print '[%s] ' % ts2datetime(int(time.time( ))), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
def test_subevent_classifier(): labels_list = [] feature_words_inputs = [] subevents = event.getSubEvents() for subevent in subevents: subeventid = subevent["_id"] if subeventid != "575612b6-a26f-4df9-a2de-01c85cae56a2": labels_list.append(subeventid) feature = Feature(subeventid) feature_words = feature.get_newest() new_feature_words = dict() for k, v in feature_words.iteritems(): new_feature_words[k.encode('utf-8')] = v feature_words_inputs.append(new_feature_words) news_id = "http://news.xinhuanet.com/comments/2014-11/03/c_1113084515.htm" news = News(news_id, event.id) ns = news.get_news_info() text = ns['title'].encode('utf-8') + ns['content168'].encode('utf-8') label = subevent_classifier(text, labels_list, feature_words_inputs) print label
def one_topic_merge(eventid_initializing):
    """Merge the clusters (sub-events) of one topic.

    Args:
        eventid_initializing: a 2-item sequence ``(eventid, initializing)``
            where ``eventid`` is the topic id and ``initializing`` tells
            whether initial clustering is to be done (per the original
            docstring).

    For every ``(res_id, mer_id)`` pair returned by ``merge_subevents`` the
    news items of ``mer_id`` are re-labelled to ``res_id`` and the ``mer_id``
    sub-event is removed. Nothing is returned.
    """
    eventid, initializing = eventid_initializing

    # Build the topic instance from its id.
    event = Event(eventid)

    # "Current" timestamp (int), taken as last-modified time + 3600.
    # NOTE(review): timestamp, now_hour and initializing are computed here but
    # not used anywhere below in this function.
    timestamp = event.getLastmodify() + 3600
    now_hour = int(time.strftime('%H', time.localtime(timestamp)))

    # Newest feature words of every sub-event, keyed by sub-event id.
    fwords_by_id = dict()
    for sub in event.getSubEvents():
        sub_id = sub["_id"]
        fwords_by_id[sub_id] = Feature(sub_id).get_newest()

    ordered_ids = event.get_sorted_subeventids()
    cids, mids = merge_subevents(fwords_by_id, ordered_ids,
                                 top_tfidf_para=10, top_percent=0.3)

    for res_id, mer_id in mids:
        # Move the texts of mer_id's cluster into res_id's cluster, then
        # remove mer_id's cluster.
        for info in event.get_subevent_infos(mer_id):
            moved = News(info["_id"], event.id)
            moved.update_news_subeventid(res_id)

        event.remove_subevents([mer_id])