def input_data(name):  # test input
    uid_list = []
    reader = csv.reader(file(abs_path + '/weibo_data/0122_uid.txt', 'rb'))
    for line in reader:
        uid = line[0].strip('\t\r\n')
        uid = uid.strip('\xef\xbb\xbf')  # strip a leading UTF-8 BOM if present
        uid_list.append(uid)
    uid_weibo = dict()
    sw = load_scws()
    reader = csv.reader(file(abs_path + '/test_weibo/com_weibo0126.csv', 'rb'))
    for mid, w_text, ts in reader:
        mid = mid.strip('\xef\xbb\xbf')
        if mid in uid_list:
            if uid_weibo.has_key(mid):
                uid_weibo[mid] = uid_weibo[mid] + '_' + w_text
            else:
                uid_weibo[mid] = w_text
    uid_word = dict()
    for k, v in uid_weibo.iteritems():
        item = dict()
        words = sw.participle(v)
        for word in words:
            if item.has_key(word[0]):
                item[word[0]] = item[word[0]] + 1
            else:
                item[word[0]] = 1
        uid_word[k] = item
    return uid_list, uid_word
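# The byte-level '\xef\xbb\xbf' stripping above removes a UTF-8 BOM by hand.
# A less error-prone alternative is Python 2's 'utf-8-sig' codec, which
# consumes the BOM automatically. A minimal sketch (it reads the uid file
# line by line instead of through csv.reader; not the project's loader):
import codecs

def read_uids(path):
    f = codecs.open(path, 'r', encoding='utf-8-sig')  # BOM stripped transparently
    uids = [line.strip('\t\r\n ') for line in f if line.strip()]
    f.close()
    return uids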
def input_data():  # test input
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        if uid_weibo.has_key(str(mid)):
            uid_weibo[str(mid)] = uid_weibo[str(mid)] + '-' + w_text
        else:
            uid_weibo[str(mid)] = w_text
        if mid not in uid_list:
            uid_list.append(mid)
    uid_word = dict()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        word_list = dict()
        for word in words:
            # keep the segmenter's nouns/verbs/adjectives and drop single words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word_list.has_key(word[0]):
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_word[k] = word_list
    return uid_list, uid_word
def load_weibo(uid_weibo):
    ts = time.time()
    domain_dict, domain_count = load_train()
    end = time.time()
    print '%s' % (end - ts)  # training-dict load time
    len_dict = dict()
    total = 0
    for k, v in domain_dict.items():
        len_dict[k] = len(v)
        total = total + len(v)
    sw = load_scws()
    black = load_black_words()
    result_data = dict()
    ts = time.time()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        domain_p = start_p(name_list)
        word_list = dict()
        for word in words:
            # keep nouns/verbs/adjectives and drop single words; the
            # `not in word_list` guard means each word is counted at most once
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black) \
                    and (word[0] not in single_word_whitelist) \
                    and (word[0] not in word_list):
                if word_list.has_key(word[0]):
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        for d_k in domain_p.keys():
            start = time.time()
            # probability that the document belongs to each class
            domain_p[d_k] = com_p(word_list, domain_dict[d_k],
                                  domain_count[d_k], len_dict[d_k], total)
            end_time = time.time()
            print '%s' % (end_time - start)
        result_data[k] = domain_p
        end = time.time()
        print '%s takes %s...' % (k, end - ts)
        ts = end
    return result_data
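# com_p() is defined elsewhere; judging from its arguments (the user's word
# counts, one class's word-frequency dict, that class's token total, its
# vocabulary size, and the corpus-wide vocabulary total) it reads like a
# Laplace-smoothed Naive Bayes class score. A minimal sketch under that
# assumption -- the project's real com_p may compute something different:
import math

def com_p_sketch(word_list, class_dict, class_count, class_len, total):
    p = math.log(float(class_len) / float(total))  # class prior
    for word, freq in word_list.items():
        seen = class_dict.get(word, 0) + 1  # add-one (Laplace) smoothing
        p += freq * math.log(float(seen) / float(class_count + class_len))
    return p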
def psychology_classfiy(uid_weibo):  # main entry for psychological-state classification
    """
    Main function for classifying a user's psychological state.
    Input example: dict {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dict (two sub-dicts per user: the first-level classifier's
    state ratios, and the second-level classifier's ratios for the negative states)
    {uid1: {'first': {'negemo': 0.2, 'posemo': 0.3, 'middle': 0.5},
            'second': {'anger': 0.2, 'anx': 0.5, 'sad': 0.1, 'other': 0.2}}, ...}
    """
    df_dict, df_count = load_dict(f_label)
    ds_dict, ds_count = load_dict(s_label)
    data_s = s_label
    data_f = f_label
    data_s.append('other')   # note: this mutates the module-level label lists
    data_f.append('middle')
    sw = load_scws()
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for i in range(0, len(v)):
            w_text = re_cut(v[i])
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, sw, df_dict, df_count,
                                          ds_dict, ds_count)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1
        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f['negemo'] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0
        result_data[k] = {'first': domain_f, 'second': domain_s}
    return result_data
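# A minimal usage sketch for psychology_classfiy(); the uids and weibo texts
# below are made up, and the module-level labels and helpers (load_dict,
# re_cut, find_label, start_p) are assumed to be loaded as above:
uid_weibo = {
    'u001': ['今天很开心', '天气不错'],
    'u002': ['加班到深夜,压力很大'],
}
result = psychology_classfiy(uid_weibo)
for uid, states in result.items():
    print uid, states['first']   # e.g. {'negemo': 0.0, 'posemo': 0.5, 'middle': 0.5}
    print uid, states['second']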
def input_data():  # test input
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # keep nouns/verbs/adjectives and drop single words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word_list.has_key(word[0]):
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list
    return uid_list, uid_weibo
def input_data():  # test input
    uid_weibo = dict()
    uid_list = []
    sw = load_scws()
    reader = csv.reader(file('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        text = re_cut(w_text)
        if mid not in uid_list:
            uid_list.append(mid)
        # accumulate word counts across all rows for the same mid
        word_dict = uid_weibo.get(mid, dict())
        words = sw.participle(text)
        for word in words:
            # keep nouns/verbs/adjectives; allow whitelisted single words
            if (word[1] in cx_dict) \
                    and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) \
                    and (word[0] not in black_word):
                if word_dict.has_key(str(word[0])):
                    word_dict[str(word[0])] = word_dict[str(word[0])] + 1
                else:
                    word_dict[str(word[0])] = 1
        uid_weibo[mid] = word_dict
    return uid_list, uid_weibo
def read_csv(domain_dict, domain_count, d_time):
    sw = load_scws()
    black = load_black_words()
    text = ''
    reader = csv.reader(file('./add_dict/%s_new.csv' % d_time, 'rb'))
    for line in reader:
        # csv.reader yields a list per row; take its first field before concatenating
        text = text + ',' + line[0]
    words = sw.participle(text)
    for word in words:
        # keep nouns/verbs/adjectives; allow whitelisted single words
        if (word[1] in cx_dict) \
                and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) \
                and (word[0] not in black):
            if domain_dict.has_key(str(word[0])):
                domain_dict[str(word[0])] = domain_dict[str(word[0])] + 1
            else:
                domain_dict[str(word[0])] = 1
            domain_count = domain_count + 1
    return domain_dict, domain_count
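# The same POS/length/blacklist filter is re-implemented inline in every
# variant above, always consuming the segmenter output as word[0] (token)
# and word[1] (POS tag). A small shared helper, sketched under that
# assumption (cx_dict is the module-level POS set used throughout):
def keep_word(word, black, whitelist):
    token, pos = word[0], word[1]
    if pos not in cx_dict or token in black:
        return False
    # keep multi-character tokens, or single words on the whitelist
    return 3 < len(token) < 30 or token in whitelist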
#from global_utils import es_flow_text as es_text
#from global_utils import flow_text_index_name_pre, flow_text_index_type
#from time_utils import ts2datetime, datetime2ts
#from parameter import SOCIAL_SENSOR_TIME_INTERVAL as time_interval
#from parameter import SOCIAL_SENSOR_FORWARD_RANGE as forward_time_range

PROCESS_GRAM = 3
Min_CLUSTER_NUM = 2
MAX_CLUSTER_NUM = 15
CLUTO_FOLDER = 'cluto'
COMMENT_WORDS_CLUSTER_NUM = 10
CLUSTERING_KMEANS_CLUSTERING_NUM = 10
CLUTO_EXECUTE_PATH = './cluto-2.1.2/Linux-i686/vcluster'
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')

sw = load_scws()
#cx_dict = set(['Ag','a','an','Ng','n','nr','ns','nt','nz','Vg','v','vd','vn','@','j'])
cx_dict = set(['Ng', 'n', 'nr', 'ns', 'nt', 'nz'])  # POS set for keywords: nouns only


def freq_word(items):
    """
    Count the term frequency of one text; the text is filtered before segmentation.
    input:
        items: weibo dict, {"mid": 12345, "text": text}
    output:
        top_word: dict of word to frequency, e.g. {word: freq, word: freq}
    """
    word_list = []
    text = items["text"]
def write_has(filename, has_word):
    n = len(has_word)
    keyword = TopkHeap(n)
    for k, v in has_word.items():
        keyword.Push((v, k))
    keyword_data = keyword.TopK()
    with open('%s/topic_dict/%s_ori.csv' % (abs_path, filename), 'wb') as f:
        writer = csv.writer(f)
        for i in range(0, len(keyword_data)):
            if keyword_data[i][0] > 1:   # only keep words seen more than once
                writer.writerow((keyword_data[i][0], keyword_data[i][1]))


if __name__ == '__main__':
    sw = load_scws()
    for j in name_list:
        # update the per-domain dictionary
        new_dict, new_count = read_csv(sw, DOMAIN_DICT_ORI[j], DOMAIN_COUNT_ORI[j], j)
        #write_has(j, new_dict)  # write the result to file
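# TopkHeap is imported from elsewhere; from its use in write_has() it keeps
# the k largest items pushed into it. A minimal heapq-based sketch matching
# that Push/TopK interface -- the project's real class may differ:
import heapq

class TopkHeap(object):
    def __init__(self, k):
        self.k = k
        self.data = []  # min-heap holding the k largest items seen so far

    def Push(self, elem):
        if len(self.data) < self.k:
            heapq.heappush(self.data, elem)
        elif elem > self.data[0]:
            heapq.heapreplace(self.data, elem)  # evict the current smallest

    def TopK(self):
        return sorted(self.data, reverse=True)  # largest first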