def load_weibo(uid_weibo):
    # uid_weibo: {uid: weibo text}; returns {uid: {domain: probability}}
    ts = time.time()
    domain_dict, domain_count = load_train()
    end = time.time()
    print '%s' % (end - ts)
    len_dict = dict()
    total = 0
    for k, v in domain_dict.items():
        len_dict[k] = len(v)
        total = total + len(v)
    sw = load_scws()
    black = load_black_words()
    result_data = dict()
    ts = time.time()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        domain_p = start_p(name_list)
        word_list = dict()
        for word in words:
            # keep nouns, verbs and adjectives from the segmentation result
            # and drop single-character and blacklisted words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and \
                    (word[0] not in black) and (word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] += 1
                else:
                    word_list[word[0]] = 1
        for d_k in domain_p.keys():
            start = time.time()
            # probability that this document belongs to each domain
            domain_p[d_k] = com_p(word_list, domain_dict[d_k], domain_count[d_k],
                                  len_dict[d_k], total)
            end_time = time.time()
            print '%s' % (end_time - start)
        result_data[k] = domain_p
        end = time.time()
        print '%s takes %s...' % (k, end - ts)
        ts = end
    return result_data
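
# NOTE: com_p() is defined elsewhere in this project and is not shown in this
# file. The sketch below is only an assumption of what such a scorer might look
# like: a Laplace-smoothed Naive Bayes log-likelihood of the document's term
# counts under one domain. The parameter order mirrors the call site in
# load_weibo() above, but the real implementation may differ.
import math


def com_p_sketch(word_list, domain_words, domain_count, domain_len, total_len):
    # word_list:    {term: frequency in the current document}
    # domain_words: {term: frequency in this domain's training corpus}
    # domain_count: total number of term occurrences in this domain
    # domain_len:   distinct terms in this domain (kept only to mirror the call site)
    # total_len:    distinct terms over all domains, used as the smoothing vocabulary
    log_p = 0.0
    for term, freq in word_list.items():
        hit = domain_words.get(term, 0)
        # add-one smoothing so unseen terms do not zero out the product
        log_p += freq * math.log((hit + 1.0) / (domain_count + total_len))
    return log_p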

def read_csv(domain_dict, domain_count, d_time):
    sw = load_scws()
    black = load_black_words()
    text = ''
    reader = csv.reader(open('./add_dict/%s_new.csv' % d_time, 'rb'))
    for line in reader:
        # concatenate the first column of every row into one text blob
        text = text + ',' + line[0]
    words = sw.participle(text)
    for word in words:
        # keep nouns, verbs and adjectives from the segmentation result and
        # drop single-character words (unless whitelisted) and blacklisted words
        if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or
                word[0] in single_word_whitelist) and (word[0] not in black):
            if str(word[0]) in domain_dict:
                domain_dict[str(word[0])] += 1
            else:
                domain_dict[str(word[0])] = 1
            domain_count = domain_count + 1
    return domain_dict, domain_count
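
# Hypothetical driver for read_csv(): it accumulates one domain's training
# dictionary over several dated CSV exports. The './add_dict/<date>_new.csv'
# naming comes from read_csv() above; the date values below are only examples.
def build_domain_dict(date_list):
    domain_dict = dict()
    domain_count = 0
    for d_time in date_list:
        domain_dict, domain_count = read_csv(domain_dict, domain_count, d_time)
    return domain_dict, domain_count

# e.g. domain_dict, domain_count = build_domain_dict(['20160901', '20160902'])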

# -*- coding: UTF-8 -*-
import os
import time
import re
import scws
import csv
import sys
import json
from textrank4zh import TextRank4Keyword, TextRank4Sentence
from config import load_scws, load_black_words, re_cut

black = load_black_words()
tr4w = TextRank4Keyword()


def get_keyword(w_text, n_gram, n_count):
    # TextRank keyword extraction: co-occurrence window n_gram, top n_count keywords
    tr4w.analyze(text=w_text, lower=True, window=n_gram)
    word_list = dict()
    k_dict = tr4w.get_keywords(n_count, word_min_len=2)
    for item in k_dict:
        # skip pure digits and blacklisted words
        if item.word.encode('utf-8').isdigit() or item.word.encode('utf-8') in black:
            continue
        word_list[item.word.encode('utf-8')] = item.weight
    return word_list


def get_weibo_single(text, n_gram=2, n_count=3):
    '''
    Extract keywords for a single weibo post; relatively inefficient.
    Input data: