def text_kmeans_clustering():
    # top TOPK_FREQ_WORD high-frequency words used during cluster evaluation
    TOPK_FREQ_WORD = 50
    # minimum cluster size accepted during cluster evaluation
    LEAST_SIZE = 8

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        results = eventcomment.getNewsComments(news_id)

        # field preprocessing and ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'])
            comment.update_comment_sentiment(sentiment)

        # k-means clustering and cluster evaluation
        kmeans_results = kmeans(inputs, k=10)
        reserve_num = 5
        final_cluster_results, tfidf_dict = cluster_evaluation(
            kmeans_results, top_num=reserve_num, topk_freq=TOPK_FREQ_WORD,
            least_size=LEAST_SIZE, min_tfidf=None)

        inputs = []
        for label, items in final_cluster_results.iteritems():
            if label != 'other':
                inputs.extend(items)
            for item in items:
                news = News(item['news_id'])
                if label == 'other':
                    label = news.otherClusterId
                comment = Comment(item['_id'])
                comment.update_comment_label(label)
                eventcomment.save_cluster(label, news_id, int(time.time()))

        # compute the feature words of each cluster
        cluster_feature = extract_feature(inputs)
        for label, fwords in cluster_feature.iteritems():
            eventcomment.update_feature_words(label, fwords)

        # compute per-text weights
        for input in inputs:
            weight = text_weight_cal(input, cluster_feature[input['label']])
            comment = Comment(input['_id'])
            comment.update_comment_weight(weight)
def classify_without_sentiment(uid_weibo, uid_list, start_date, end_date):
    '''
    Main classification routine for weibo data without sentiment labels.
    Input:  list object [[uid, text, time], [uid, text, time], ...]
    Output: dict object {uid1: str1, uid2: str2, ...}
    '''
    uid_sentiment = dict()
    new_uid = []
    min_ts = int(time.mktime(time.strptime(start_date, '%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date, '%Y-%m-%d')))
    time_index, time_list = sta_time_list(min_ts, max_ts)

    for uid, text, ts in uid_weibo:
        if uid not in new_uid:
            new_uid.append(uid)
        sentiment = triple_classifier({'text': text})
        date_str = time.strftime('%Y-%m-%d', time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item
        else:
            item = time_list  # note: reuses the shared time_list template; a per-uid copy may be intended
            index = time_index[date_str]
            if sentiment == 2 or sentiment == 5:
                item[index][0] = item[index][0] + 1
            if sentiment == 3 or sentiment == 4:
                item[index][1] = item[index][1] + 1
            item[index][2] = item[index][2] + 1
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment, min_ts, max_ts)
    com_result = dict()
    if len(uid_list):
        for uid in uid_list:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]
    else:
        for uid in new_uid:
            if s_result.has_key(uid):
                com_result[uid] = SEN_DICT[s_result[uid]]
            else:
                com_result[uid] = SEN_DICT[0]
    return com_result
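# A minimal usage sketch for classify_without_sentiment above. The rows in sample_weibo and the
# date range are made-up illustration data; sta_time_list, sentiment_classify, SEN_DICT and
# triple_classifier are assumed to be importable from the surrounding module, exactly as the
# function itself assumes.
if __name__ == '__main__':
    sample_weibo = [
        ['10001', u'今天心情很好', '1388548800'],
        ['10001', u'太让人生气了', '1388635200'],
        ['10002', u'平平常常的一天', '1388721600'],
    ]
    # An empty uid_list means the result covers every uid seen in sample_weibo.
    result = classify_without_sentiment(sample_weibo, [], '2014-01-01', '2014-01-07')
    for uid, label in result.iteritems():
        print uid, label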
def diamond_classifier(item):
    # default: other / no emotion
    sentiment = 0
    if '【' in item['text'].encode('utf-8') and '】' in item['text'].encode('utf-8'):
        # simple rule: 【...】 brackets usually mark news-style text
        sentiment = 4
    else:
        # three-way sentiment classifier: positive, angry, sad
        sentiment = triple_classifier(item)
    return sentiment
def _diamond_classifier(text):
    # default: other / no emotion
    sentiment = 0
    text_utf8 = text.encode('utf-8')
    if '【' in text_utf8 and '】' in text_utf8:
        # simple rule: 【...】 brackets usually mark news-style text
        sentiment = 4
    else:
        # three-way sentiment classifier: positive, angry, sad
        sentiment = triple_classifier(text)
    return sentiment
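# The integer sentiment codes used across these snippets are not defined in one place; the mapping
# below is a sketch assembled from the inline comments (triple_classifier: 0 no emotion, 1 happy,
# 2 angry, 3 sad; the diamond classifiers add 4 for news-like text; some call sites also test for 5
# and group it with 2 as "impulsive"). Treat it as an assumption, not an authoritative table.
SENTIMENT_CODE_SKETCH = {
    0: 'neutral / no emotion',
    1: 'happy / positive',
    2: 'angry',
    3: 'sad',
    4: 'news-like / other (rule on 【...】 brackets)',
    5: 'grouped with 2 as "impulsive" in some call sites',
}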
def get_sentiment(uid_weibo, name):
    # learn the emotion-related parameters
    uid_sentiment = dict()
    uid_list = []
    min_ts = MIN_TS
    max_ts = MAX_TS

    for item in uid_weibo:
        uid = item[0]
        text = item[1]
        ts = item[2]
        if int(ts) <= min_ts:
            min_ts = int(ts)
        if int(ts) >= max_ts:
            max_ts = int(ts)
        if uid not in uid_list:
            uid_list.append(uid)

        sentiment = triple_classifier({'text': text})
        date_str = time.strftime('%Y-%m-%d', time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            if item.has_key(date_str):
                row = item[date_str]
                row.append(sentiment)
                item[date_str] = row
            else:
                row = []
                row.append(sentiment)
                item[date_str] = row
            uid_sentiment[uid] = item
        else:
            item = dict()
            row = []
            row.append(sentiment)
            item[date_str] = row
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment, min_ts, max_ts)
    write_e_result(s_result, name)
def classify_without_sentiment(uid_weibo, uid_list, start_date, end_date):
    '''
    Main classification routine for weibo data without sentiment labels.
    Input:  list object; each row here is unpacked as [uid, text, s, ts]
    Output: dict object {uid1: str1, uid2: str2, ...}
    '''
    uid_sentiment = dict()
    min_ts = int(time.mktime(time.strptime(start_date, '%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date, '%Y-%m-%d')))

    for uid, text, s, ts in uid_weibo:
        sentiment = triple_classifier({'text': text})
        date_str = time.strftime('%Y-%m-%d', time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            if item.has_key(date_str):
                row = item[date_str]
                row.append(sentiment)
                item[date_str] = row
            else:
                row = []
                row.append(sentiment)
                item[date_str] = row
            uid_sentiment[uid] = item
        else:
            item = dict()
            row = []
            row.append(sentiment)
            item[date_str] = row
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment, min_ts, max_ts)
    com_result = dict()
    for uid in uid_list:
        com_result[uid] = SEN_DICT[s_result[uid]]
    return com_result
def test(ft_type):
    print ft_type
    if ft_type == 'facebook':
        index_name_pre = facebook_flow_text_index_name_pre
        index_type = facebook_flow_text_index_type
        user_index_name = facebook_user_index_name
        user_index_type = facebook_user_index_type
    else:
        index_name_pre = twitter_flow_text_index_name_pre
        index_type = twitter_flow_text_index_type
        user_index_name = twitter_user_index_name
        user_index_type = twitter_user_index_type

    # date_list = load_date_list(True)
    date_list = load_date_list()
    DFA = createWordTree()
    query_body = {
        'post_filter': {
            'missing': {
                'field': 'keywords_string'
            }
        },
        'query': {
            'filtered': {
                'filter': {
                    'bool': {
                        'must': [{
                            'range': {
                                'flag_ch': {
                                    'gte': -1
                                }
                            }
                        }]
                    }
                }
            }
        }
    }

    for date in date_list:
        count = 0
        bulk_action = []
        index_name = index_name_pre + date
        try:
            es_scan_results = scan(es, query=query_body, size=1000,
                                   index=index_name, doc_type=index_type)
            while True:
                try:
                    scan_data = es_scan_results.next()
                    item = scan_data['_source']
                    text = item['text_ch']
                    uid = item['uid']
                    if ft_type == 'facebook':
                        _id = item['fid']
                    else:
                        _id = item['tid']

                    ts = datetime2ts(date)

                    # add the sentiment field to the post
                    sentiment, keywords_list = triple_classifier(item)

                    # add keywords to the post
                    keywords_dict, keywords_string = get_weibo_keywords(keywords_list)

                    # sensitive_words_dict
                    sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
                    if sensitive_words_dict:
                        sensitive_words_string_data = "&".join(sensitive_words_dict.keys())
                        sensitive_words_dict_data = json.dumps(sensitive_words_dict)
                    else:
                        sensitive_words_string_data = ""
                        sensitive_words_dict_data = json.dumps({})

                    # accumulate per-uid sensitive-word counts in redis
                    if sensitive_words_dict:
                        sensitive_count_string = r_cluster.hget('sensitive_' + str(ts), str(uid))
                        if sensitive_count_string:  # redis already has counts for this uid
                            sensitive_count_dict = json.loads(sensitive_count_string)
                            for word in sensitive_words_dict.keys():
                                if sensitive_count_dict.has_key(word):
                                    sensitive_count_dict[word] += sensitive_words_dict[word]
                                else:
                                    sensitive_count_dict[word] = sensitive_words_dict[word]
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_count_dict))
                        else:
                            r_cluster.hset('sensitive_' + str(ts), str(uid),
                                           json.dumps(sensitive_words_dict))

                    # sensitive score
                    sensitive_score = 0
                    if sensitive_words_dict:
                        for k, v in sensitive_words_dict.iteritems():
                            tmp_stage = r_sensitive.hget("sensitive_words", k)
                            if tmp_stage:
                                sensitive_score += v * sensitive_score_dict[str(tmp_stage)]

                    # directed_uid
                    directed_uid_data = 0
                    directed_uid, directed_uname = get_root_retweet(text, uid, ft_type)
                    if directed_uid:
                        directed_uid_data = long(directed_uid)

                    # hashtag
                    hashtag = ''
                    RE = re.compile(u'#([0-9a-zA-Z-_⺀-⺙⺛-⻳⼀-⿕々〇〡-〩〸-〺〻㐀-䶵一-鿃豈-鶴侮-頻並-龎]+)[ ," =.。: :、]')
                    hashtag_list = re.findall(RE, text)
                    if hashtag_list:
                        hashtag = '&'.join(hashtag_list)

                    # bulk update action and payload
                    action = {'update': {'_id': _id}}
                    action_data = {
                        'sentiment': str(sentiment),
                        'keywords_dict': json.dumps(keywords_dict),
                        'keywords_string': keywords_string,
                        'sensitive_words_string': sensitive_words_string_data,
                        'sensitive_words_dict': sensitive_words_dict_data,
                        'sensitive': sensitive_score,
                        'directed_uid': directed_uid_data,
                        'directed_uname': directed_uname,
                        'hashtag': hashtag,
                    }
                    bulk_action.extend([action, {'doc': action_data}])

                    count += 1
                    if count % 1000 == 0 and count != 0:
                        if bulk_action:
                            # note: the Facebook doc_type is used here even in the twitter branch
                            es.bulk(bulk_action, index=index_name,
                                    doc_type=facebook_flow_text_index_type, timeout=600)
                        bulk_action = []
                        count = 0
                except StopIteration:
                    break

            if bulk_action:
                es.bulk(bulk_action, index=index_name,
                        doc_type=facebook_flow_text_index_type, timeout=600)
        except Exception, e:
            # ES document does not exist
            print e
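# test() above is driven only by its ft_type argument; a hedged usage sketch. The es client,
# the r_cluster / r_sensitive redis handles and the index-name constants are assumed to be
# configured in the surrounding module, as the function itself assumes.
test('facebook')   # process the Facebook flow-text indices for every date in load_date_list()
test('twitter')    # the same pass over the Twitter flow-text indices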
def comments_sentiment_rubbish_calculation(comments, logger):
    """Input: a batch of comments with fields title and content168.
       Output:
           item_infos: a list of single items, each carrying sentiment, same_from, duplicate
    """
    # clusterid for meaningless items: ads caught by ad_filter, spam from the SVM,
    # and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'
    # per-item list; each item stores the clusterid, weight and sentiment fields
    items_infos = []

    # strip any stale sentiment / label / clusterid / ad_label / subob_label / rub_label / weight fields
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        try:
            r['content168'] = r['content168'].encode('utf-8')
        except:
            r['content168'] = r['text'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # neutral classifier first, then the three-way classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:  # subjective/objective classifier
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # spam removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:  # SVM spam filter
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # deduplicate texts within each sentiment class
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
def one_topic_calculation_comments_v2(topicid):
    """Cluster the comments of one topic.
    """
    from comment_clustering_tfidf_v2 import kmeans, tfidf_v2, text_classify, \
            cluster_evaluation, global_text_weight

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # field preprocessing and ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        tfidf_word = tfidf_v2(inputs)

        # number of clusters = ceil(len(inputs) / 5), clamped to the range [5, 10]
        kmeans_cluster_number = int(math.ceil(float(len(inputs)) / 5.0))
        if kmeans_cluster_number > 10:
            kmeans_cluster_number = 10
        if kmeans_cluster_number < 5:
            kmeans_cluster_number = 5

        # cluster the comment words
        word_label = kmeans(tfidf_word, inputs, k=kmeans_cluster_number)

        # compute global text weights
        for r in inputs:
            gweight = global_text_weight(r['content'], tfidf_word)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_global_weight(gweight)

        # classify comment texts into the clusters
        results = text_classify(inputs, word_label, tfidf_word)

        # cluster evaluation
        reserved_num = int(math.ceil(float(kmeans_cluster_number) / 2.0))
        LEAST_CLUSTER_SIZE = 3  # minimum cluster size
        TOPK_FREQ = 10
        TOPK_WEIGHT = 5
        LEAST_FREQ = 0
        final_cluster_results = cluster_evaluation(
            results, top_num=reserved_num, topk_freq=TOPK_FREQ,
            least_freq=LEAST_FREQ, least_size=LEAST_CLUSTER_SIZE, topk_weight=TOPK_WEIGHT)

        for label, items in final_cluster_results.iteritems():
            if label == 'other':
                label = news.otherClusterId
            if len(items):
                eventcomment.save_cluster(label, news_id, int(time.time()))
                if label != news.otherClusterId:
                    fwords = word_label[label]
                    eventcomment.update_feature_words(label, fwords)
                for item in items:
                    comment = Comment(item['_id'], topicid)
                    comment.update_comment_label(label)
                    comment.update_comment_weight(item['weight'])
def one_topic_calculation_comments_v7(topicid):
    """Cluster the comments of one topic.
    """
    from comment_clustering_tfidf_v7 import tfidf_v2, text_classify, \
            cluster_evaluation, choose_cluster
    from weibo_subob_rub_neu_classifier import weibo_subob_rub_neu_classifier

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # field preprocessing and ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content168'] = r['content168'].encode('utf-8')
            r['content'] = r['content168']
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # drop spam and news texts
        items = weibo_subob_rub_neu_classifier(inputs)
        inputs = []
        for item in items:
            subob_rub_neu_label = item['subob_rub_neu_label']
            if not subob_rub_neu_label in [1, 0]:  # 1 = spam text, 0 = news text
                inputs.append(item)

        MIN_CLUSTERING_INPUT = 30
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 10
        # TF-IDF terms and automatic selection of the cluster number; the VSM attributes
        # should also be configurable as parameters
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v2(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)

            # assign comment texts to the chosen clusters
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])

        # sentiment calculation
        for r in inputs:
            if r['subob_rub_neu_label'] == 2:
                sentiment = 0  # 0 = neutral
            elif r['subob_rub_neu_label'] == -1:
                sentiment = triple_classifier(r)  # 1 happy, 2 angry, 3 sad, 0 no emotion
                if sentiment == 0:
                    sentiment = mid_sentiment_classify(r['text'])
                if sentiment == -1:
                    sentiment = 0  # neutral
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)
def classify_without_sentiment(uid_weibo, uid_list, start_date, end_date):
    '''
    Main classification routine for weibo data without sentiment labels.
    Input:  list object [[uid, text, time], [uid, text, time], ...]
    Output: dict object keyed by uid; in this variant each value is a dict
            such as {'impulse': ..., 'depressed': ...}
    '''
    uid_sentiment = dict()
    new_uid = []
    min_ts = int(time.mktime(time.strptime(start_date, '%Y-%m-%d')))
    max_ts = int(time.mktime(time.strptime(end_date, '%Y-%m-%d')))
    time_index, time_list = sta_time_list(min_ts, max_ts)
    n = len(time_list)

    for uid, text, ts in uid_weibo:
        if uid not in new_uid:
            new_uid.append(uid)
        if isinstance(text, unicode):  # decode to unicode only when needed
            sentiment = triple_classifier({'text': text})
        else:
            sentiment = triple_classifier({'text': text.decode('utf-8')})
        date_str = time.strftime('%Y-%m-%d', time.localtime(float(ts)))
        if uid_sentiment.has_key(uid):
            item = uid_sentiment[uid]
            index = time_index[date_str]
            if sentiment == 0:  # neutral
                item[index][0] = item[index][0] + 1
            elif sentiment == 2 or sentiment == 5:  # impulsive
                item[index][1] = item[index][1] + 1
            elif sentiment == 3 or sentiment == 4:  # depressed
                item[index][2] = item[index][2] + 1
            elif sentiment == 1:  # positive
                item[index][3] = item[index][3] + 1
            else:
                item[index][4] = item[index][4] + 1
            uid_sentiment[uid] = item
        else:
            item = list(np.zeros((n, 5)))
            index = time_index[date_str]
            if sentiment == 0:  # neutral
                item[index][0] = item[index][0] + 1
            elif sentiment == 2 or sentiment == 5:  # impulsive
                item[index][1] = item[index][1] + 1
            elif sentiment == 3 or sentiment == 4:  # depressed
                item[index][2] = item[index][2] + 1
            elif sentiment == 1:  # positive
                item[index][3] = item[index][3] + 1
            else:
                item[index][4] = item[index][4] + 1
            uid_sentiment[uid] = item

    s_result = sentiment_classify(uid_sentiment, min_ts, max_ts)
    com_result = dict()
    if len(uid_list):
        for uid in uid_list:
            if s_result.has_key(uid):
                com_result[uid] = s_result[uid]
            else:
                com_result[uid] = {'impulse': 0, 'depressed': 0}
    else:
        for uid in new_uid:
            if s_result.has_key(uid):
                com_result[uid] = s_result[uid]
            else:
                com_result[uid] = {'impulse': 0, 'depressed': 0}
    return com_result
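# In the variant above each uid maps to a per-day matrix with five counters per day
# (columns: neutral, impulsive, depressed, positive, other), created as list(np.zeros((n, 5))).
# A small sketch of how one day's row accumulates; the data is made up and numpy is assumed
# to be available as np, just as the function itself assumes.
import numpy as np
day_row = list(np.zeros((1, 5)))[0]
day_row[1] += 1  # one "impulsive" weibo (sentiment 2 or 5) that day
day_row[3] += 1  # one "positive" weibo (sentiment 1) that day
print day_row    # shows the five per-day counters for that day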
def one_topic_calculation_comments_v4(topicid):
    """Cluster the comments of one topic.
    """
    from comment_clustering_tfidf_v4 import kmeans, tfidf_v4, text_classify, \
            cluster_evaluation, choose_cluster

    eventcomment = EventComments(topicid)
    newsIds = eventcomment.getNewsIds()

    for news_id in newsIds:
        eventcomment.clear_cluster(news_id)
        results = eventcomment.getNewsComments(news_id)
        news = News(news_id)

        # field preprocessing and ad filtering
        inputs = []
        for r in results:
            r['title'] = ''
            r['content'] = r['content168'].encode('utf-8')
            r['text'] = r['content168']
            item = ad_filter(r)
            if item['ad_label'] == 0:
                inputs.append(item)

        # sentiment calculation
        for r in inputs:
            sentiment = triple_classifier(r)
            comment = Comment(r['_id'], topicid)
            comment.update_comment_sentiment(sentiment)

        MIN_CLUSTERING_INPUT = 50
        MIN_CLUSTER_NUM = 2
        MAX_CLUSTER_NUM = 15
        if len(inputs) >= MIN_CLUSTERING_INPUT:
            tfidf_word, input_dict = tfidf_v4(inputs)
            results = choose_cluster(tfidf_word, inputs, MIN_CLUSTER_NUM, MAX_CLUSTER_NUM)
            # for k, v in results.iteritems():
            #     print k, len(v)

            # assign comment texts to the chosen clusters
            cluster_text = text_classify(inputs, results, tfidf_word)
            evaluation_inputs = []
            for k, v in enumerate(cluster_text):
                inputs[k]['label'] = v['label']
                inputs[k]['weight'] = v['weight']
                evaluation_inputs.append(inputs[k])

            # cluster evaluation
            recommend_text = cluster_evaluation(evaluation_inputs)
            for label, items in recommend_text.iteritems():
                if label == 'other':
                    label = news.otherClusterId
                if len(items):
                    eventcomment.save_cluster(label, news_id, int(time.time()))
                    if label != news.otherClusterId:
                        fwords = results[label]
                        eventcomment.update_feature_words(label, fwords)
                    for item in items:
                        comment = Comment(item['_id'], topicid)
                        comment.update_comment_label(label)
                        comment.update_comment_weight(item['weight'])
action = []
xdata = []
class_ts = time.time()
while 1:
    item = receiver.recv_json()
    if not item:
        continue

    if int(item['sp_type']) == 1:
        read_count += 1
        text = item['text']
        uid = item['uid']

        # add the sentiment field to the weibo
        sentiment, keywords_list = triple_classifier(item)
        item['sentiment'] = str(sentiment)

        # add keywords to the weibo
        keywords_dict, keywords_string = get_weibo_keywords(keywords_list)
        item['keywords_dict'] = json.dumps(keywords_dict)  # used for computation
        item['keywords_string'] = keywords_string          # used for search

        # sensitive-word lookup via the DFA word tree
        sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
        if sensitive_words_dict:
            item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
            item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
        else:
            item['sensitive_words_string'] = ""
            item['sensitive_words_dict'] = json.dumps({})

        timestamp = item['timestamp']
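# The loop above pulls items from `receiver`, which exposes recv_json(); a minimal sketch of how
# such a receiver could be created with pyzmq is shown below. The endpoint address is a made-up
# assumption, and the original socket setup is not part of this fragment.
import zmq
context = zmq.Context()
receiver = context.socket(zmq.PULL)
receiver.connect('tcp://127.0.0.1:5557')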
def comments_sentiment_rubbish_calculation(comments, logger):
    """Input: a batch of comments with fields title and content168.
       Output:
           item_infos: a list of single items, each carrying sentiment, same_from, duplicate
    """
    # clusterid for meaningless items: ads caught by ad_filter, spam from the SVM,
    # and news from the subjective/objective classifier
    NON_CLUSTER_ID = 'nonsense'
    # clusterid for meaningful items
    MEAN_CLUSTER_ID = 'sentiment'
    # per-item list; each item stores the clusterid, weight and sentiment fields
    items_infos = []

    # strip any stale sentiment / label / clusterid / ad_label / subob_label / rub_label / weight fields
    clear_keys = ['sentiment', 'label', 'clusterid', 'ad_label', 'subob_label', 'rub_label', 'weight']
    inputs = []
    for r in comments:
        for key in clear_keys:
            if key in r:
                del r[key]
        inputs.append(r)
    comments = inputs

    # field preprocessing
    inputs = []
    for r in comments:
        r['title'] = ''
        r['content168'] = r['content168'].encode('utf-8')
        r['content'] = r['content168']
        r['text'] = r['content168']
        inputs.append(r)

    # neutral classifier first, then the three-way classifier
    svm_inputs = []
    for r in inputs:
        sentiment = neutral_classifier(r)
        if sentiment != 0:
            sentiment = triple_classifier(r)
            if sentiment == 0:
                svm_inputs.append(r)
            else:
                r['sentiment'] = sentiment
                items_infos.append(r)
        else:
            svm_inputs.append(r)

    # sentiment adjustment
    senti_modify_inputs = []
    for r in svm_inputs:
        sentiment = mid_sentiment_classify(r['text'])
        if sentiment == -1:
            sentiment = 0  # neutral
        if sentiment != 0:
            r['sentiment'] = sentiment
            items_infos.append(r)
        else:
            r['sentiment'] = sentiment
            senti_modify_inputs.append(r)

    # news classification
    inputs = []
    for r in senti_modify_inputs:
        r = subob_classifier(r)
        if r['subob_label'] == 1:  # subjective/objective classifier
            r['sentiment'] = NON_CLUSTER_ID + '_news'  # news
            items_infos.append(r)
        else:
            inputs.append(r)

    # spam removal
    items = rubbish_classifier(inputs)
    for item in items:
        if item['rub_label'] == 1:  # SVM spam filter
            item['sentiment'] = NON_CLUSTER_ID + '_rub'
        else:
            # simple rule-based ad filter
            item = ad_filter(item)
            if item['ad_label'] == 1:
                item['sentiment'] = NON_CLUSTER_ID + '_rub'
        items_infos.append(item)

    # deduplicate texts within each sentiment class
    sentiment_dict = dict()
    for item in items_infos:
        if 'sentiment' in item:
            sentiment = item['sentiment']
            try:
                sentiment_dict[sentiment].append(item)
            except KeyError:
                sentiment_dict[sentiment] = [item]

    items_infos = []
    for sentiment, items in sentiment_dict.iteritems():
        items_list = duplicate(items)
        items_infos.extend(items_list)

    return {'item_infos': items_infos}
item = scan_data['_source']
text = item['text']
uid = item['uid']
try:
    text_ch = trans([text])
    if text_ch:
        text = text_ch[0]
        item['text'] = text_ch[0]
except:
    pass

#add sentiment field to weibo
sentiment, keywords_list = triple_classifier(item)
item['sentiment'] = str(sentiment)

#add key words to weibo
keywords_dict, keywords_string = get_weibo_keywords(keywords_list)
item['keywords_dict'] = json.dumps(keywords_dict)  # use to compute
item['keywords_string'] = keywords_string  # use to search

sensitive_words_dict = searchWord(text.encode('utf-8', 'ignore'), DFA)
if sensitive_words_dict:
    item['sensitive_words_string'] = "&".join(sensitive_words_dict.keys())
    item['sensitive_words_dict'] = json.dumps(sensitive_words_dict)
def fill_sentiment(item):
    sentiment = triple_classifier(item)
    item[XAPIAN_EXTRA_FIELD] = sentiment
    return item
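# fill_sentiment above is a thin adapter that stores the triple_classifier result on the item
# under the XAPIAN_EXTRA_FIELD key (a constant from the surrounding module). A hedged usage
# sketch with a made-up weibo dict:
weibo = {'text': u'今天真开心', 'uid': '10001'}
weibo = fill_sentiment(weibo)
# weibo now also carries the sentiment label under the XAPIAN_EXTRA_FIELD key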
def sentimentCronTopic(topic, weibos_list, start_ts, over_ts, sort_field=SORT_FIELD,
                       save_fields=RESP_ITER_KEYS, during=Fifteenminutes,
                       w_limit=TOP_WEIBOS_LIMIT, k_limit=TOP_KEYWORDS_LIMIT):
    import sys
    sys.path.append('../triple_classifier/')
    from triple_sentiment_classifier import triple_classifier

    start_ts = int(start_ts)
    over_ts = int(over_ts)
    over_ts = ts2HourlyTime(over_ts, during)
    interval = (over_ts - start_ts) / during
    logFile.write('start_ts: ' + str(start_ts) + '\r\n')
    logFile.write('over_ts: ' + str(over_ts) + '\r\n')
    logFile.write('during: ' + str(during) + '\r\n')
    logFile.write('interval: ' + str(interval) + '\r\n')

    for i in range(interval, 0, -1):
        begin_ts = over_ts - during * i
        end_ts = begin_ts + during

        emotions_count = {}
        emotions_kcount = {}
        emotions_weibo = {}
        emotions_rcount = {}
        weiboIDs = {}
        for k, v in emotions_kv.iteritems():
            zero = 0
            emotions_count[v] = [end_ts, 0]
            emotions_kcount[v] = [end_ts, '']
            emotions_weibo[v] = [end_ts, []]
            weiboIDs[v] = [end_ts, []]

        # print begin_ts, end_ts, 'topic %s starts calculate' % topic.encode('utf-8')
        slide = get_weibos_slide(weibos_list, begin_ts, end_ts)
        string = ['', '', '', '']
        emo0 = 0
        emo1 = 1
        for weibo in slide:
            sentiment, emo = triple_classifier(weibo)
            if sentiment != 0 and emo == 0:
                emo0 += 1
            elif sentiment != 0 and emo == 1:
                emo1 += 1
            # words = jieba.cut(weibo['text'], cut_all=False)
            weibo['sentiment'] = sentiment
            string[sentiment] = string[sentiment] + weibo['text']
            if sentiment != 0:
                emotions_count[sentiment][1] += 1
                # kcount = emotions_kcount[sentiment][1]
                emotions_weibo[sentiment][1].append(weibo)
            else:
                zero += 1

        for k, v in emotions_kv.iteritems():
            # sorted_kcount = sorted(emotions_kcount[v][1].iteritems(), key=lambda(k, v): v, reverse=False)
            # sorted_kcount = {k: v for k, v in sorted_kcount[len(sorted_kcount) - k_limit:]}
            # emotions_kcount[v][1] = sorted_kcount
            sorted_weibos = sorted(emotions_weibo[v][1], key=lambda i: i[sort_field], reverse=False)
            emotions_weibo[v][1] = sorted_weibos[len(sorted_weibos) - w_limit:]
            for item in emotions_weibo[v][1]:
                weiboIDs[v][1].append(item['key'])

            wordd = {}
            if string[v] != '':
                words = GetKeyWords(string[v].encode('utf-8'), 5, True)
                word_list = words.split('#')
                for word in word_list:
                    token = word.split(r'/')
                    if (len(token) == 3 and not (token[0] in STOPWORDS)):
                        # wordd.append({token[0]: token[2]})
                        wordd[token[0]] = token[2]
            emotions_kcount[v][1] = wordd

        print emo0, emo1
        print zero, emotions_count[1][1], emotions_count[2][1], emotions_count[3][1]

        save_rt_results('count', topic, emotions_count, during)
        save_rt_results('kcount', topic, emotions_kcount, during, k_limit, w_limit)
        save_rt_results('weibos', topic, weiboIDs, during, k_limit, w_limit)

        j = interval - i
        logFile.write('finish ' + str(j) + ' slide' + '\r\n')
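# Slide arithmetic in sentimentCronTopic above: assuming during = 900 seconds (fifteen minutes,
# which the Fifteenminutes default name suggests) and a 3-hour window, the function walks
# interval = (over_ts - start_ts) / during = 10800 / 900 = 12 slides, each covering
# [begin_ts, begin_ts + during). The concrete figures are an illustrative assumption, not values
# taken from the original code.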