def domain_classfiy_by_text(user_weibo):
    """Classify users into domains based on the text of their weibos.

    Input:
        user_weibo: dict {uid: weibo string (multiple weibos joined by
            commas), ...}

    Output:
        A 2-tuple ``(labels, ranks)``:
        labels -- dict {uid: label, ...}, the first value returned by
            rank_dict for that user;
        ranks -- dict {uid: rank_data, ...}, the second value returned by
            rank_dict for that user.
    """
    domain_dict, domain_count = load_train()

    # Size of each domain's training entry, plus the sum over all domains;
    # both are handed to com_p below.
    len_dict = dict((name, len(entries)) for name, entries in domain_dict.items())
    total = sum(len_dict.values())

    segmenter = load_scws()
    blacklist = load_black_words()

    labels_by_uid = {}
    ranks_by_uid = {}
    for uid, text in user_weibo.items():
        user_started = time.time()
        tokens = segmenter.participle(text)
        domain_p = start_p()

        # Count the tokens worth keeping.  Per the original note, this keeps
        # nouns, verbs and adjectives from the segmentation result and drops
        # single-character words (token[0] is the word, token[1] its tag).
        counts = {}
        for token in tokens:
            if token[1] not in cx_dict:
                continue
            term = token[0]
            if not 3 < len(term) < 30:
                continue
            if term in blacklist or term in single_word_whitelist:
                continue
            counts[term] = counts.get(term, 0) + 1

        for domain in domain_p.keys():
            domain_started = time.time()
            # Probability that the document belongs to this domain.
            domain_p[domain] = com_p(
                counts,
                domain_dict[domain],
                domain_count[domain],
                len_dict[domain],
                total,
            )
            domain_elapsed = time.time() - domain_started
            print('%s domain takes %s second...' % (domain, domain_elapsed))

        label, rank_data = rank_dict(domain_p)
        labels_by_uid[uid] = label
        ranks_by_uid[uid] = rank_data

        user_elapsed = time.time() - user_started
        print('%s takes %s second...' % (uid, user_elapsed))

    return labels_by_uid, ranks_by_uid
mediaw = getMediaWords()


def getBusinessWords():
    """Load the business-person vocabulary.

    Reads './domain_dict/businessw.txt' and returns a list with one entry
    per line of the file, each stripped of leading/trailing whitespace.
    """
    # The context manager closes the file handle as soon as reading finishes
    # (or fails) instead of leaving it open.
    with open('./domain_dict/businessw.txt', 'r') as f:
        return [line.strip() for line in f]


businessw = getBusinessWords()
s = load_scws()


def user_domain_classifier_v2(user):
    # NOTE(review): only the field-extraction preamble of this function is
    # visible here; the classification logic and return value are not shown
    # and are left untouched.
    r = user
    label = labels[11]  # presumably the default/fallback label -- TODO confirm
    verified_type = r['verified_type']
    location = r['location'].encode('utf-8')
    # First space-separated token of the location string.
    province = location.split(' ')[0]
    followers_count = r['followers_count']
    statuses_count = r['statuses_count']
    name = r['name']
    description = r['description']
return mediaw mediaw = getMediaWords() def getBusinessWords(): businessw = [] f = open('./domain_dict/businessw.txt', 'r') for line in f: businessw.append(line.strip()) # 商业人士词汇 return businessw businessw = getBusinessWords() s = load_scws() def user_domain_classifier_v2(user): r = user label = labels[11] verified_type = r['verified_type'] location = r['location'].encode('utf-8') province = location.split(' ')[0] followers_count = r['followers_count'] statuses_count = r['statuses_count'] name = r['name'] description = r['description']