Ejemplo n.º 1
0
    def step4_cal():
        """ 24 点时merge已有的簇
        """
        if not initializing and now_hour == 0:
            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp)

            subevents = event.getSubEvents()
            subevent_fwords = dict()
            for subevent in subevents:
                subeventid = subevent["_id"]
                feature = Feature(subeventid)

                # 获取每个子事件最新的特征词
                fwords = feature.get_newest()
                subevent_fwords[subeventid] = fwords

            subeventids_sort_timestamp = event.get_sorted_subeventids()

            cids, mids = merge_subevents(subevent_fwords, subeventids_sort_timestamp, top_tfidf_para=10, top_percent=0.3)

            for res_id, mer_id in mids:
                # 将mer_id下的文本扔入res_id下的簇,remove mer_id的簇
                temp_infos = event.get_subevent_infos(mer_id)

                for r in temp_infos:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(res_id)

                event.remove_subevents([mer_id])

            # 重新计算各簇的特征词, 并计算文本权重, 并去重
            if len(mids):
                step3_cal()

            print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
Ejemplo n.º 2
0
    def step1_cal():
        """Step 1: fetch each sub-event's newest feature words and classify
        the incoming texts against them.
        """
        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp)

        if initializing:
            # initial clustering: take the texts from before the topic start
            results = event.getInitialInfos()
        else:
            # incremental run: take the latest hour of texts
            results = event.getInfos(timestamp - 3600, timestamp)

            if now_hour == 0:
                # at the midnight run, re-classify the texts of the clusters
                # created today (>= 0h and < 24h, excluding the 'other'
                # cluster) and delete those clusters
                temp_subeventids = event.getTodayCreatSubeventIds()
                temp_infos = event.getTodayCreatSubeventInfos()
                event.remove_subevents(temp_subeventids)
                results.extend(temp_infos)

        print eventid, ' before classify: ', len(results)

        # collect the sub-events
        subevents = event.getSubEvents()
        labels_list = []
        feature_words_list = []
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)

            # newest feature words of this sub-event
            fwords = feature.get_newest()
            feature_words_list.append(fwords)
            labels_list.append(subeventid)

        for r in results:
            text = (r['title'] + r['content168']).encode('utf-8')
            # utf-8 encode every feature word before matching
            feature_words_inputs = []
            for fwords in feature_words_list:
                wcdict = dict()
                for w, c in fwords.iteritems():
                    if isinstance(w, unicode):
                        w = w.encode('utf-8')
                    wcdict[w] = c
                feature_words_inputs.append(wcdict)

            # match the text against each sub-event's feature words to get
            # its cluster label
            label = subevent_classifier(text, labels_list,
                                        feature_words_inputs)

            if label == "other":
                label = event.getOtherSubEventID()

            news = News(r["_id"], event.id)
            news.update_news_subeventid(label)

        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)
Ejemplo n.º 3
0
    def step1_cal():
        """第一步计算,获取子事件特征词,新文本与特征词匹配分类
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step1' % ts2datetime(timestamp)

        if initializing:
            # 若话题需要做初始聚类,获取话题开始时间之前的文本
            results = event.getInitialInfos()
        else:
            # 若话题已做完初始聚类,获取话题最新一小时的文本
            results = event.getInfos(timestamp - 3600, timestamp)

            if now_hour == 0:
                # 如果不是在做初始化,24时的时候, 一定把当天(大于或等于0时小于24时)产生的簇(非其他簇)下的文本重新做一下匹配, 同时删除这些簇
                temp_subeventids = event.getTodayCreatSubeventIds()
                temp_infos = event.getTodayCreatSubeventInfos()
                event.remove_subevents(temp_subeventids)
                results.extend(temp_infos)

        print eventid, ' before classify: ', len(results)

        # 获取子事件
        subevents = event.getSubEvents()
        labels_list = []
        feature_words_list = []
        for subevent in subevents:
            subeventid = subevent["_id"]
            feature = Feature(subeventid)

            # 获取每个子事件最新的特征词
            fwords = feature.get_newest()
            feature_words_list.append(fwords)
            labels_list.append(subeventid)

        for r in results:
            text = (r['title'] + r['content168']).encode('utf-8')
            feature_words_inputs = []
            for fwords in feature_words_list:
                wcdict = dict()
                for w, c in fwords.iteritems():
                    if isinstance(w, unicode):
                        w = w.encode('utf-8')
                    wcdict[w] = c
                feature_words_inputs.append(wcdict)

            # 单条文本与各子事件的特征词进行匹配,得到每条文本的簇标签
            label = subevent_classifier(text, labels_list, feature_words_inputs)

            if label == "other":
                label = event.getOtherSubEventID()

            news = News(r["_id"], event.id)
            news.update_news_subeventid(label)

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step1' % ts2datetime(timestamp)
Ejemplo n.º 4
0
    def step2_cal():
        """Step 2: decide whether the 'other' cluster must be split; if so,
        run kmeans on its texts and evaluate the resulting clusters.
        """
        # number of top-frequency words used during cluster evaluation
        TOPK_FREQ_WORD = 50
        # minimum cluster size accepted by the evaluation
        LEAST_SIZE = 8

        # should the 'other' cluster be split?
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(initializing)
            print eventid, ' after classify before split: ', len(inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # kmeans clustering
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # cluster evaluation
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=min_tfidf)
                else:
                    # hourly runs do not compare against the minimum tfidf
                    # of the existing clusters
                    final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE)

                # update news labels and the sub-event table
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # refresh this cluster's tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)
Ejemplo n.º 5
0
def text_kmeans_clustering():
    """Cluster the comments of every news item of the topic with kmeans,
    evaluate the clusters, then compute per-cluster feature words and
    per-comment weights.
    """
    # number of top-frequency words used during cluster evaluation
    TOPK_FREQ_WORD = 50

    # minimum cluster size accepted by the evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        # keep only the comments that pass the ad filter
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment classification
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # kmeans clustering and evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(kmeans_results, \
                top_num=reserve_num, topk_freq=TOPK_FREQ_WORD, least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)

            # NOTE(review): 'label' is rebound inside the loop below, so the
            # save_cluster call afterwards sees the otherClusterId of the
            # LAST item's news when the cluster was 'other' — confirm this
            # is intended.
            for item in items:
                news = News(item['news_id'])

                if label == 'other':
                    label = news.otherClusterId

                comment = Comment(item['_id'])
                comment.update_comment_label(label)

            eventcomment.save_cluster(label, news_id, int(time.time()))

        # feature words of each kept cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # weight of each comment text
        for input in inputs:
            weight = text_weight_cal(input, cluster_feature[input['label']])
            comment = Comment(input['_id'])
            comment.update_comment_weight(weight)
Ejemplo n.º 6
0
    def step3_cal():
        """Step 3: recompute per-cluster feature words, representative texts
        and duplicates, and refresh each cluster's size / growth figures.
        """
        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # accumulated feature words of each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # weight of each text within its cluster
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # deduplicate texts, grouped by cluster label
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # refresh the cluster's size and its growth since the last run
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # flip the event state from initializing to active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
Ejemplo n.º 7
0
    def step4_cal():
        """ 24 点时merge已有的簇
        """
        if not initializing and now_hour == 0:
            print '[%s] ' % ts2datetime(int(time.time(
            ))), 'event ', eventid, ' %s start step4' % ts2datetime(timestamp)

            subevents = event.getSubEvents()
            subevent_fwords = dict()
            for subevent in subevents:
                subeventid = subevent["_id"]
                feature = Feature(subeventid)

                # 获取每个子事件最新的特征词
                fwords = feature.get_newest()
                subevent_fwords[subeventid] = fwords

            subeventids_sort_timestamp = event.get_sorted_subeventids()

            cids, mids = merge_subevents(subevent_fwords,
                                         subeventids_sort_timestamp,
                                         top_tfidf_para=10,
                                         top_percent=0.3)

            for res_id, mer_id in mids:
                # 将mer_id下的文本扔入res_id下的簇,remove mer_id的簇
                temp_infos = event.get_subevent_infos(mer_id)

                for r in temp_infos:
                    news = News(r["_id"], event.id)
                    news.update_news_subeventid(res_id)

                event.remove_subevents([mer_id])

            # 重新计算各簇的特征词, 并计算文本权重, 并去重
            if len(mids):
                step3_cal()

            print '[%s] ' % ts2datetime(int(time.time(
            ))), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
Ejemplo n.º 8
0
def test_subevent_classifier():
    labels_list = []
    feature_words_inputs = []
    subevents = event.getSubEvents()
    for subevent in subevents:
        subeventid = subevent["_id"]
        if subeventid != "575612b6-a26f-4df9-a2de-01c85cae56a2":
            labels_list.append(subeventid)
            feature = Feature(subeventid)
            feature_words = feature.get_newest()
            new_feature_words = dict()
            for k, v in feature_words.iteritems():
                new_feature_words[k.encode('utf-8')] = v
            feature_words_inputs.append(new_feature_words)

    news_id = "http://news.xinhuanet.com/comments/2014-11/03/c_1113084515.htm"
    news = News(news_id, event.id)
    ns = news.get_news_info()
    text = ns['title'].encode('utf-8') + ns['content168'].encode('utf-8')
    label = subevent_classifier(text, labels_list, feature_words_inputs)

    print label
Ejemplo n.º 9
0
 def add(self, id):
     """Parse the news item *id* and persist it; return True on success,
     False when parsing fails at any stage.
     """
     parser = NewsParser(id)
     if not parser.success:
         return False
     try:
         date = parser.get_date()
         header = parser.get_header()
         text = parser.get_text()
     except Exception:
         # any parser failure means the entry is skipped
         return False
     self.session.add(News(id=id, date=date, header=header, text=text))
     self.session.commit()
     return True
Ejemplo n.º 10
0
    def step3_cal():
        """计算各簇的特征词、代表文本、去重, 更新簇的大小、增幅信息
        """
        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s start step3' % ts2datetime(timestamp)

        inputs = []
        subevents = event.getSubEvents()
        for subevent in subevents:
            subeventid = subevent["_id"]
            inputs.extend(event.getSubeventInfos(subeventid))

        for r in inputs:
            r["title"] = r["title"].encode("utf-8")
            r["content"] = r["content168"].encode("utf-8")
            r["label"] = r["subeventid"]

        # 计算各簇的存量特征词
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            feature = Feature(label)
            feature.upsert_newest(fwords)

        # 计算文本权重
        for r in inputs:
            weight = text_weight_cal(r, cluster_feature[r['label']])
            news = News(r["_id"], event.id)
            news.update_news_weight(weight)

        # 文本去重
        items_dict = {}
        for r in inputs:
            try:
                items_dict[r["label"]].append(r)
            except KeyError:
                items_dict[r["label"]] = [r]

        for label, items in items_dict.iteritems():
            results = duplicate(items)
            for r in results:
                news = News(r["_id"], event.id)
                news.update_news_duplicate(r["duplicate"], r["same_from"])

            # 更新簇的大小、增幅信息
            before_size = event.get_subevent_size(label)
            event.update_subevent_size(label, len(items))
            event.update_subevent_addsize(label, len(items) - before_size)

        if initializing:
            # 更新事件状态由initializing变为active
            event.activate()

        print '[%s] ' % ts2datetime(int(time.time())), 'event ', eventid, ' %s end step3' % ts2datetime(timestamp)
Ejemplo n.º 11
0
def one_topic_calculation_comments_v2(topicid):
    """Cluster the comments of each news item of *topicid* (v2 pipeline):
    ad filtering, sentiment, tfidf word clustering, text classification
    and cluster evaluation.
    """
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # keep only the comments that pass the ad filter
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment classification
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # cluster count = ceil(len(inputs) / 5), clamped to [5, 10]
        # NOTE(review): the original comment said "divide by 2" but the
        # code divides by 5 — the code is taken as authoritative here.
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # cluster the comment words
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # global weight of each comment text
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # classify the comment texts into the word clusters
        results = text_classify(inputs, word_label, tfidf_word)

        # cluster evaluation
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3  # minimum cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(results, top_num=reserved_num, topk_freq=TOPK_FREQ, \
                least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE, topk_weight=TOPK_WEIGHT)
        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId

            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))

            if label != news.otherClusterId:
                fwords = word_label[label]
                eventcomment.update_feature_words(label, fwords)

            for item in items:
                comment = Comment(item['_id'], topicid)
                comment.update_comment_label(label)
                comment.update_comment_weight(item['weight'])
Ejemplo n.º 12
0
def one_topic_calculation_comments_v7(topicid):
    """Cluster the comments of each news item of *topicid* (v7 pipeline):
    ad/rubbish filtering, automatic cluster-count selection, text
    classification, cluster evaluation and sentiment computation.
    """
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # normalize the fields the pipeline expects
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # drop rubbish and news-like texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if not subob_rub_neu_label in [1, 0]:
                # 1 = rubbish text, 0 = news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # tfidf words; the cluster count is selected automatically
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM,
                                     MAX_CLUSTER_NUM)

            # classify the comment texts into the chosen clusters
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])

        # sentiment computation
        # NOTE(review): after the filter above the labels here are presumably
        # only 2 or -1; if any other value slipped through, 'sentiment' below
        # would be stale (or unbound on the first iteration) — confirm.
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 = neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r)  # 1 happy, 2 angry, 3 sad, 0 no emotion
                if sentiment == 0:
                    sentiment = mid_sentiment_classify(r['text'])

                if sentiment == -1:
                    sentiment = 0  # neutral

            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
Ejemplo n.º 13
0
def one_topic_calculation_comments_v4(topicid):
    """Cluster the comments of each news item of *topicid* (v4 pipeline):
    ad filtering, sentiment, automatic cluster-count selection, text
    classification and cluster evaluation.
    """
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # keep only the comments that pass the ad filter
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment classification
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM,
                                     MAX_CLUSTER_NUM)

            # for k, v in results.iteritems():
            #     print k, len(v)

            # classify the comment texts into the chosen clusters
            cluster_text = text_classify(inputs, results, tfidf_word)

            evaluation_inputs = []

            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId

                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))

                if label != news.otherClusterId:
                    fwords = results[label]
                    eventcomment.update_feature_words(label, fwords)

                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])
Ejemplo n.º 14
0
    def step2_cal():
        """第二步计算,判断其他类是否需要分裂,若需要,则对其他类进行文本聚类,并做聚类评价
        """
        # 聚类评价时选取TOPK_FREQ_WORD的高频词
        TOPK_FREQ_WORD = 50
        # 聚类评价时最小簇的大小
        LEAST_SIZE = 8

        # 判断其他类是否需要分裂
        ifsplit = event.check_ifsplit(initializing)
        print '[%s] ' % ts2datetime(
            int(time.time())
        ), 'event ', eventid, ' split ', ifsplit, ' %s start step2' % ts2datetime(
            timestamp)

        if ifsplit:
            inputs, kmeans_cluster_num, reserve_num = event.getOtherSubEventInfos(
                initializing)
            print eventid, ' after classify before split: ', len(
                inputs), kmeans_cluster_num, reserve_num
            if len(inputs) > 2:
                items = []
                for r in inputs:
                    r["title"] = r["title"].encode("utf-8")
                    r["content"] = r["content168"].encode("utf-8")
                    items.append(r)

                # kmeans聚类
                kmeans_results = kmeans(items, k=kmeans_cluster_num)

                # 聚类评价
                if initializing or now_hour == 0:
                    min_tfidf = event.get_min_tfidf()
                    final_cluster_results, tfidf_dict = cluster_evaluation(
                        kmeans_results,
                        top_num=reserve_num,
                        topk_freq=TOPK_FREQ_WORD,
                        least_size=LEAST_SIZE,
                        min_tfidf=min_tfidf)
                else:
                    # 每小时聚类时,不用和已有簇的最小tfidf作比
                    final_cluster_results, tfidf_dict = cluster_evaluation(
                        kmeans_results,
                        top_num=reserve_num,
                        topk_freq=TOPK_FREQ_WORD,
                        least_size=LEAST_SIZE)

                # 更新新闻簇标签,更新子事件表
                for label, items in final_cluster_results.iteritems():
                    if label == "other":
                        label = event.getOtherSubEventID()

                    event.save_subevent(label, timestamp)

                    if label != event.getOtherSubEventID():
                        # 更新每类的tfidf
                        event.update_subevent_tfidf(label, tfidf_dict[label])

                    for r in items:
                        news = News(r["_id"], event.id)
                        news.update_news_subeventid(label)
            else:
                print 'inputs less than 2, kmeans aborted'

        print '[%s] ' % ts2datetime(int(time.time(
        ))), 'event ', eventid, ' %s end step2' % ts2datetime(timestamp)