def get_news_main(news_text):
    # Extract named entities from one news item. A text of the form
    # "【title】body" is split into its title and body before extraction.
    news = re_cut(news_text)
    if len(news):
        if '】' not in news:
            title = ''
            text = news
        else:
            title = news_text.split('】')[0]
            text = news_text.split('】')[-1]
            if '【' in title:
                title = title[2:]  # drop the characters before the title proper
        result = cut_main(text, title)
    else:
        result = 'Null'
    name_list = ['people', 'organization', 'place', 'time']
    if result != 'Null':
        re_dict = dict(zip(name_list, result))
    else:
        re_dict = {'people': 'Null', 'organization': 'Null',
                   'time': 'Null', 'place': 'Null'}
    return re_dict

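# Usage sketch for get_news_main (the headline below is invented; re_cut and
# cut_main come from elsewhere in this repo, and the literal is assumed to be
# a utf-8 byte string as in the rest of the module):
entities = get_news_main('【张三访问北京】张三今日抵达北京,与李四举行会谈。')
print entities['people']  # extracted person names, or 'Null' if nothing was found
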
def topic_classfiy(uid_weibo):  # topic classification entry point
    '''
    Classify each user's weibo texts into topics.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dicts
        each user's distribution over the 18 topics:
            {uid1: {'art': 0.1, 'social': 0.2, ...}, ...}
        the topics each user focuses on most (at most 3):
            {uid1: ['art', 'social', 'media'], ...}
    '''
    weibo_text = dict()
    for k, v in uid_weibo.items():
        item = ''
        for i in range(0, len(v)):
            text = re_cut(v[i]['text'])
            item = item + '.' + text
        weibo_text[k] = item
    result_data = load_weibo(weibo_text)  # run the topic classifier
    uid_topic = rank_result(result_data)
    return result_data, uid_topic

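# Usage sketch for topic_classfiy (uid and texts invented; load_weibo and
# rank_result are defined elsewhere in this repo):
dist, top = topic_classfiy({'10001': [{'text': u'今天去看了画展'},
                                      {'text': u'新专辑终于发布了'}]})
print top['10001']  # up to three dominant topic labels, e.g. ['art', ...]
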
def cut_flow_text(text):
    # Split a retweet/comment chain into the original post and the comments on it.
    text = re_cut(text)
    if '//@' in text:
        texts = text.split('//@')
    elif '/@' in text:  # comments are present
        texts = text.split('/@')
    else:
        return 0, 0
    text_list = []
    for i in range(0, len(texts)):
        text_str = texts[i].strip()
        if not len(text_str):  # empty segment
            continue
        if i == 0:
            text_list.append(text_str)
        else:
            # segments after the first look like "user: comment"
            if ':' in text_str:
                s_content = text_str.split(':')[-1]
                if len(s_content):
                    text_list.append(s_content)
            else:
                text_list.append(text_str)
    try:
        key = text_list[-1]  # the last segment is the original post
    except IndexError:
        return 0, 0
    if len(key) < 2:
        return 0, 0
    return key, text_list[:-1]

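# Usage sketch for cut_flow_text with an invented retweet chain, assuming
# re_cut leaves the //@ markers in place:
origin, comments = cut_flow_text('说得好//@userA: 转发//@userB: 这是原始微博内容')
# origin holds the original post at the end of the chain,
# comments holds the reactions that were layered on top of it
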
def input_data():  # test input
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # keep nouns, verbs and adjectives from the segmentation result and
            # drop single words (len() counts utf-8 bytes here, so > 3 means
            # more than one Chinese character); note that unlike the second
            # input_data() below, this version filters OUT whitelisted words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list
    return uid_list, uid_weibo

def freq_word(items):
    """
    Compute the word frequencies of one text, filtering the text before segmentation.
    input:
        items: weibo dict, {"mid": 12345, "text": text}
    output:
        top_word: dict of words and relative frequencies, e.g. {word: freq, ...}
    """
    text = re_cut(items["text"])
    cut_text = sw.participle(text.encode('utf-8'))
    # keep only the terms whose part of speech is listed in cx_dict
    word_list = [term for term, cx in cut_text if cx in cx_dict]
    counter = Counter(word_list)
    total = sum(counter.values())
    if not total:  # nothing survived filtering; avoid dividing by zero
        return {}
    top_word = {k: (float(v) / float(total)) for k, v in counter.most_common()}
    return top_word

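# Usage sketch for freq_word (mid and text invented; sw, cx_dict and re_cut
# are module globals in this repo, and the text must be unicode because the
# function encodes it before segmentation):
freqs = freq_word({"mid": 12345, "text": u'今天天气不错,适合出门散步'})
for w, p in freqs.items():
    print w, p  # each retained word and its share of the token count
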
def flow_psychology_classfiy(text):  # psychological-state classification entry point
    w_text = re_cut(text)
    if len(w_text):  # non-empty after filtering
        label = find_label(w_text, DS_DICT, DS_COUNT)
    else:
        label = 'other'
    return label

def text_classify(inputs, word_label, tfidf_word):
    """
    For every comment, compute its weight for each cluster and assign it to
    the cluster with the largest weight.
    Input:
        inputs: list of comment dicts, [{'_id': comment id, 'news_id': news id, 'text': comment text}]
        word_label: word clustering result, {'cluster label': [word1, word2, ...]}
        tfidf_word: tf-idf top-k words and weights, [(word, weight)]
    Output:
        the input list, with each comment's 'text' replaced by its cluster
        'label' and the 'weight' it had for that cluster
    """
    # arrange the words and their weights into a dict
    word_weight = {}
    for idx, w in enumerate(tfidf_word):
        word_weight[w[0]] = w[1]
    # compute each comment's weight for every cluster
    for input in inputs:
        text_weight = {}
        text = re_cut(input['text'])
        cut_text = sw.participle(text.encode('utf-8'))
        text_list = [term for term, cx in cut_text if cx in cx_dict]
        if text_list == []:
            continue
        for l, w_list in word_label.iteritems():
            weight = 0
            for w in w_list:
                weight += text.encode('utf-8').count(w) * word_weight[w]
            text_weight[l] = float(weight) / float(len(text_list))
        sorted_weight = sorted(text_weight.iteritems(), key=lambda asd: asd[1], reverse=True)
        if sorted_weight[0][1] != 0:
            clusterid, weight = sorted_weight[0]
        else:
            clusterid = 'other'
            weight = 0
        input.pop('text')
        input['label'] = clusterid
        input['weight'] = weight
    return inputs

def get_topic_word(texts, nt):
    '''
    Input:
        texts: list where each item is one weibo text
        nt: number of topics wanted
    Output:
        list of topics
    '''
    text_list = []
    for text in texts:
        w_text = re_cut(text)
        words = sw.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black and word[1] in cx_dict and len(word[0]) > 3:
                word_list.append(word[0])
        text_list.append(word_list)
    # build the dictionary
    dictionary = corpora.Dictionary(text_list)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    corpus = [dictionary.doc2bow(text) for text in text_list
              if len(dictionary.doc2bow(text))]
    if not len(corpus):
        return 'Null'
    # build the tf-idf matrix
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # train the LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                          num_topics=nt, update_every=1,
                                          chunksize=5000, passes=1)
    topics = lda.show_topics(num_topics=nt, num_words=10, log=False, formatted=True)
    return topics

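# Usage sketch for get_topic_word: a toy corpus repeated so its words survive
# filter_extremes(no_below=5). Real input should be many distinct weibo texts;
# with too little data the function returns 'Null'.
sample = ['股市 大涨 经济 复苏 行情'] * 10 + ['球队 夺冠 比赛 精彩 球迷'] * 10
print get_topic_word(sample, 2)  # gensim-formatted topic strings, or 'Null'
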
def flow_psychology_classfiy(text):  # psychological-state classification entry point
    '''
    Returns: the emotion label of the text (int)
        0 other negative
        2 anger
        3 anxiety
        4 sadness
        5 disgust
    '''
    w_text = re_cut(text)
    if len(w_text):  # non-empty after filtering
        label = find_label(w_text, DZ_DICT, DZ_COUNT)
    else:
        label = 0
    return label

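# Usage sketch for the integer-label variant (invented sentence; DZ_DICT and
# DZ_COUNT are emotion dictionaries loaded elsewhere in the module):
label = flow_psychology_classfiy('最近总是睡不着,心里很不安')
print label  # one of the integer labels listed in the docstring above
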
def get_keyword(weibo_text):  # event keyword extraction
    tr_list = []
    text_list = []
    for text in weibo_text:
        w_text = re_cut(text)
        if not len(w_text):
            continue
        words = SW.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black_word and word[1] in cx_dict and len(word[0]) > 3:
                word_list.append(word[0])
                text_list.append(word[0])
        tr_list.append(word_list)
    keywords = use_topicrank([text_list], tr_list, N_GRAM, WORD_N, TOPIC_N)
    return keywords

def get_weibo_single(text, n_gram=2, n_count=3):
    '''
    Extract keywords from a single weibo; relatively inefficient.
    Input:
        text: one weibo text, utf-8 encoded
        n_gram: sliding-window size over words, 2 is recommended
        n_count: number of keywords to return
    Output:
        dict mapping each keyword to its weight
    '''
    w_text = re_cut(text)
    if w_text:
        uid_word = get_keyword(w_text, n_gram, n_count)
    else:
        uid_word = dict()
    return uid_word

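# Usage sketch for get_weibo_single (invented utf-8 text):
print get_weibo_single('今天参观了博物馆的新展览,收获很大', n_gram=2, n_count=3)
# -> {keyword: weight, ...} with at most n_count entries, or {} for empty input
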
def psychology_classfiy(uid_weibo):  # psychological-state classification entry point
    """
    Classify each user's psychological state.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dict; each user maps to two dicts, the state proportions of
    the first-level classifier and of the second-level (negative-state) classifier:
        {uid1: {'first': {'negemo': 0.2, 'posemo': 0.3, 'middle': 0.5},
                'second': {'anger': 0.2, 'anx': 0.5, 'sad': 0.1, 'other': 0.2}}, ...}
    """
    df_dict, df_count = load_dict(f_label)
    ds_dict, ds_count = load_dict(s_label)
    # copy the module-level label lists so repeated calls don't keep appending
    data_s = s_label[:]
    data_f = f_label[:]
    data_s.append("other")
    data_f.append("middle")
    sw = load_scws()
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for i in range(0, len(v)):
            w_text = re_cut(v[i])
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, sw, df_dict, df_count, ds_dict, ds_count)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1
        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f["negemo"] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0
        result_data[k] = {"first": domain_f, "second": domain_s}
    return result_data

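# Usage sketch for psychology_classfiy (invented uid and texts; f_label,
# s_label and the dictionary files are module-level data in this repo):
ratios = psychology_classfiy({'10001': ['今天很开心', '又要加班,好累']})
print ratios['10001']['first']   # e.g. {'negemo': ..., 'posemo': ..., 'middle': ...}
print ratios['10001']['second']  # negative-state breakdown, all 0 if negemo is 0
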
def get_weibo_test():
    # Test helper: read weibo rows from CSV and collect per-uid word lists.
    uid_dict = dict()
    uid_tr = dict()
    texts = ''
    reader = csv.reader(file('./text_data/text_0.csv', 'rb'))
    count = 0
    ori_text = []
    for uid, text, ts, geo in reader:
        ori_text.append(text)
        w_text = re_cut(text)
        if count == 0:
            texts = texts + w_text
        else:
            texts = texts + '。' + w_text
        words = sw.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black and word[1] in cx_dict and len(word[0]) > 3:
                if uid in uid_dict:
                    uid_dict[uid].append(word[0])
                else:
                    uid_dict[uid] = [word[0]]
                word_list.append(word[0])
        count = count + 1
        if len(word_list):
            if uid in uid_tr:
                uid_tr[uid].append(word_list)
            else:
                uid_tr[uid] = [word_list]
    uid_list = []
    text_list = []
    for k, v in uid_dict.iteritems():
        uid_list.append(k)
        text_list.append(v)
    return texts, text_list, ori_text

def get_group_keywords(uid_list):
    # Count keyword frequencies over the group's flow texts of the last two days.
    now_ts = time.time()
    now_ts = datetime2ts('2013-09-03')  # hard-coded test date overriding the current time
    former_ts = now_ts - DAY
    flow_index_1 = flow_text_index_name_pre + ts2datetime(now_ts)
    flow_index_2 = flow_text_index_name_pre + ts2datetime(former_ts)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "uid": uid_list
                    }
                }
            }
        },
        "size": 10000
    }
    text_list = []      # texts before segmentation
    word_dict = dict()  # word counts after segmentation
    text_results = es_flow_text.search(index=[flow_index_1, flow_index_2],
                                       doc_type=flow_text_index_type,
                                       body=query_body)["hits"]["hits"]
    if text_results:
        for item in text_results:
            iter_text = item['_source']['text'].encode('utf-8', "ignore")
            iter_text = re_cut(iter_text)
            text_list.append(iter_text)
    if text_list:
        for iter_text in text_list:
            cut_text = sw.participle(iter_text)
            cut_word_list = [term for term, cx in cut_text if cx in cx_dict]
            for w in cut_word_list:
                if w in word_dict:
                    word_dict[w] += 1
                else:
                    word_dict[w] = 1
    return word_dict

def input_data():  # test input
    # Variant of input_data() above: accumulates word counts per mid across
    # rows, and keeps single words that appear in single_word_whitelist.
    uid_weibo = dict()
    uid_list = []
    sw = load_scws()
    reader = csv.reader(file('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        text = re_cut(w_text)
        if mid not in uid_list:
            uid_list.append(mid)
        # reuse the existing counter for this mid, if any
        word_dict = uid_weibo.get(mid, dict())
        words = sw.participle(text)
        for word in words:
            # keep nouns, verbs and adjectives; drop single words unless whitelisted
            if (word[1] in cx_dict) and \
                    (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and \
                    (word[0] not in black_word):
                if str(word[0]) in word_dict:
                    word_dict[str(word[0])] = word_dict[str(word[0])] + 1
                else:
                    word_dict[str(word[0])] = 1
        uid_weibo[mid] = word_dict
    return uid_list, uid_weibo

def psychology_classfiy(uid_weibo):  # psychological-state classification entry point
    '''
    Classify each user's psychological state. Unlike the variant above, the
    weibo items are dicts with a 'text' field and the emotion dictionaries
    (DF_DICT/DS_DICT) are loaded at module level.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dict; each user maps to two dicts, the state proportions of
    the first-level classifier and of the second-level (negative-state) classifier:
        {uid1: {'first': {'negemo': 0.2, 'posemo': 0.3, 'middle': 0.5},
                'second': {'anger': 0.2, 'anx': 0.5, 'sad': 0.1, 'other': 0.2}}, ...}
    '''
    # copy the module-level label lists so repeated calls don't keep appending
    data_s = s_label[:]
    data_f = f_label[:]
    data_s.append('other')
    data_f.append('middle')
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for i in range(0, len(v)):
            w_text = re_cut(v[i]['text'])
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, DF_DICT, DF_COUNT, DS_DICT, DS_COUNT)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1
        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f['negemo'] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0
        result_data[k] = {'first': domain_f, 'second': domain_s}
    return result_data

def text_generation_main(text_list, keyword_list):  # text generation entry point
    '''
    Input:
        text_list: list of texts
        keyword_list: list of (opinion) keywords
    Output:
        summary: the generated post text
    '''
    sen_list = []
    if len(text_list) == 0 or len(keyword_list) == 0:
        return ''
    for text in text_list:
        if isinstance(text, unicode):
            new_text = text.encode('utf-8')
        else:
            new_text = text
        text_re = re_cut(new_text)
        for t in text_re.split('。'):
            if t not in sen_list:
                sen_list.append(t)
    new_keywords = []
    for key in keyword_list:
        if isinstance(key, unicode):
            new_keywords.append(key.encode('utf-8'))
        else:
            new_keywords.append(key)
    rank_text = rank_text_list(sen_list, new_keywords)
    if len(rank_text) == 0:  # guard against an empty ranking result
        summary = ''
    else:
        summary = combine_rank_text(rank_text)
    return summary

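# Usage sketch for text_generation_main (invented posts and keyword;
# rank_text_list and combine_rank_text are defined elsewhere in this repo):
posts = [u'食品安全问题必须重视。监管要跟上。', u'大家都很关心食品安全。']
print text_generation_main(posts, [u'食品安全'])
# -> a short post stitched together from the top-ranked sentences, or ''
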
def get_weibo(text, n_gram=2, n_count=20):
    '''
    Extract keywords from a batch of weibo texts.
    Input:
        text: list of weibo texts, utf-8 encoded
        n_gram: sliding-window size over words, 2 is recommended
        n_count: number of keywords to return
    Output:
        dict mapping each keyword to its weight
    '''
    text_str = ''
    for item in text:
        w_text = re_cut(item)
        if w_text:
            text_str = text_str + '。' + w_text
    if text_str:
        uid_word = get_keyword(text_str, n_gram, n_count)
    else:
        uid_word = dict()
    return uid_word

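# Usage sketch: the batch counterpart of get_weibo_single above (invented texts):
print get_weibo(['股市今天大涨', '投资者情绪高涨'], n_gram=2, n_count=20)
# -> {keyword: weight, ...} aggregated over the whole batch
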