Example #1
def input_data(name):  # test input; the 'name' argument is unused in this variant

    uid_list = []
    reader = csv.reader(open(abs_path + '/weibo_data/0122_uid.txt', 'rb'))
    for line in reader:
        uid = line[0].strip('\t\r\n')
        uid = uid.strip('\xef\xbb\xbf')  # strip any UTF-8 BOM
        uid_list.append(uid)

    uid_weibo = dict()
    sw = load_scws()
    reader = csv.reader(open(abs_path + '/test_weibo/com_weibo0126.csv', 'rb'))
    for mid, w_text, ts in reader:
        mid = mid.strip('\xef\xbb\xbf')  # strip any UTF-8 BOM
        if mid in uid_list:
            if mid in uid_weibo:
                uid_weibo[mid] = uid_weibo[mid] + '_' + w_text
            else:
                uid_weibo[mid] = w_text

    uid_word = dict()
    for k, v in uid_weibo.iteritems():
        item = dict()
        words = sw.participle(v)
        for word in words:
            item[word[0]] = item.get(word[0], 0) + 1
        uid_word[k] = item

    return uid_list, uid_word
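A note on the counting pattern: the if/else tally in the loops above recurs in every example below. On Python 2.7+ the same accumulation can be written with collections.Counter. A minimal sketch, assuming sw.participle() returns (word, POS) tuples as in the examples:

from collections import Counter

def count_words(sw, text):
    # Tally the surface form (word[0]) of each token the segmenter returns;
    # equivalent to the manual dict counting above.
    return Counter(word[0] for word in sw.participle(text))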
Example #3
def input_data():  # test input

    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(open('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        if str(mid) in uid_weibo:
            uid_weibo[str(mid)] = uid_weibo[str(mid)] + '-' + w_text
        else:
            uid_weibo[str(mid)] = w_text
        if mid not in uid_list:
            uid_list.append(mid)

    uid_word = dict()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        word_list = dict()
        for word in words:
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and \
               (word[0] not in black_word) and \
               (word[0] not in single_word_whitelist):  # keep nouns, verbs and adjectives; drop single characters
                word_list[word[0]] = word_list.get(word[0], 0) + 1
        uid_word[k] = word_list

    return uid_list, uid_word
Example #4
def input_data():  # test input

    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(open(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and \
               (word[0] not in black_word) and \
               (word[0] not in single_word_whitelist):  # keep nouns, verbs and adjectives; drop single characters
                word_list[word[0]] = word_list.get(word[0], 0) + 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list

    return uid_list, uid_weibo
Example #5
def load_weibo(uid_weibo):

    ts = time.time()
    domain_dict, domain_count = load_train()
    end = time.time()

    print '%s' % (end - ts)

    len_dict = dict()
    total = 0
    for k, v in domain_dict.items():
        len_dict[k] = len(v)
        total = total + len(v)

    sw = load_scws()
    black = load_black_words()
    result_data = dict()
    ts = time.time()
    for k, v in uid_weibo.items():
        words = sw.participle(v)
        domain_p = start_p(name_list)
        word_list = dict()
        for word in words:
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 and \
               (word[0] not in black) and \
               (word[0] not in single_word_whitelist):  # keep nouns, verbs and adjectives; drop single characters
                word_list[word[0]] = word_list.get(word[0], 0) + 1
        for d_k in domain_p.keys():
            start = time.time()
            # probability that the document belongs to each category
            domain_p[d_k] = com_p(word_list, domain_dict[d_k],
                                  domain_count[d_k], len_dict[d_k], total)
            end_time = time.time()
            print '%s' % (end_time - start)
        result_data[k] = domain_p
        end = time.time()
        print '%s takes %s...' % (k, end - ts)
        ts = end

    return result_data
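com_p is defined elsewhere in the project. Judging from the arguments it receives (the user's word counts, one category's word-frequency dict and total word count, that category's vocabulary size, and the summed vocabulary size), a naive-Bayes score with add-one smoothing is one plausible shape. A hedged sketch under that assumption, not the original implementation:

import math

def com_p(word_list, d_dict, d_count, d_len, total):
    # Sketch only (assumed naive Bayes): log prior from relative category
    # size, plus an add-one-smoothed log likelihood per observed word.
    p = math.log(float(d_len) / float(total))
    for word, freq in word_list.items():
        p += freq * math.log((d_dict.get(word, 0) + 1.0) / (d_count + d_len))
    return p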
Example #6
def psychology_classfiy(uid_weibo):  # main entry for mental-state classification
    """
    用户心理状态分类主函数
    输入数据示例:字典
    {uid1:[weibo1,weibo2,weibo3,...]}

    输出数据示例:字典(每个用户对应两个字典,一个是一层分类器的状态比例,另一个是二层分类器(消极状态)的状态比例)
    {uid1:{'first':{'negemo':0.2,'posemo':0.3,'middle':0.5},'second':{'anger':0.2,'anx':0.5,'sad':0.1,'other':0.2}}...}
    """

    df_dict, df_count = load_dict(f_label)
    ds_dict, ds_count = load_dict(s_label)

    data_s = list(s_label)  # copy, so the module-level label lists are not mutated
    data_f = list(f_label)
    data_s.append("other")
    data_f.append("middle")

    sw = load_scws()
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for text in v:
            w_text = re_cut(text)
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, sw, df_dict, df_count, ds_dict, ds_count)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1

        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f["negemo"] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0

        result_data[k] = {"first": domain_f, "second": domain_s}

    return result_data
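A usage sketch matching the docstring's input shape; the uid and texts are invented for illustration:

if __name__ == '__main__':
    sample = {'uid1': [u'今天很开心', u'有点累', u'还不错']}  # invented test data
    result = psychology_classfiy(sample)
    for uid, ratios in result.items():
        print uid, ratios['first'], ratios['second']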
def input_data():  # test input

    uid_weibo = dict()
    uid_list = []
    sw = load_scws()
    reader = csv.reader(file("./weibo_data/uid_text_0728.csv", "rb"))
    for mid, w_text in reader:
        text = re_cut(w_text)
        if mid not in uid_list:
            uid_list.append(mid)
        # merge into any counts already accumulated for this uid
        word_dict = uid_weibo.get(mid, dict())
        words = sw.participle(text)
        for word in words:
            if (
                (word[1] in cx_dict)
                and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist)
                and (word[0] not in black_word)
            ):  # keep nouns, verbs and adjectives; drop single characters unless whitelisted
                word_dict[str(word[0])] = word_dict.get(str(word[0]), 0) + 1
        uid_weibo[mid] = word_dict

    return uid_list, uid_weibo
Example #11
def read_csv(domain_dict, domain_count, d_time):
    sw = load_scws()
    black = load_black_words()
    text = ''
    reader = csv.reader(open('./add_dict/%s_new.csv' % d_time, 'rb'))
    for line in reader:
        text = text + ',' + line[0]

    words = sw.participle(text)
    for word in words:
        if (word[1] in cx_dict) and (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) \
           and (word[0] not in black):  # keep nouns, verbs and adjectives; drop single characters unless whitelisted
            domain_dict[str(word[0])] = domain_dict.get(str(word[0]), 0) + 1
            domain_count = domain_count + 1

    return domain_dict, domain_count
Example #14
#from global_utils import es_flow_text as es_text
#from global_utils import flow_text_index_name_pre, flow_text_index_type
#from time_utils import ts2datetime, datetime2ts
#from parameter import SOCIAL_SENSOR_TIME_INTERVAL as time_interval
#from parameter import SOCIAL_SENSOR_FORWARD_RANGE as forward_time_range

PROCESS_GRAM = 3
Min_CLUSTER_NUM = 2
MAX_CLUSTER_NUM = 15
CLUTO_FOLDER = 'cluto'
COMMENT_WORDS_CLUSTER_NUM = 10
CLUSTERING_KMEANS_CLUSTERING_NUM = 10
CLUTO_EXECUTE_PATH = './cluto-2.1.2/Linux-i686/vcluster'
AB_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), './')

sw = load_scws()
#cx_dict = set(['Ag','a','an','Ng','n','nr','ns','nt','nz','Vg','v','vd','vn','@','j'])
cx_dict = set(['Ng', 'n', 'nr', 'ns', 'nt', 'nz'])  # POS dictionary for keywords: keep nouns only


def freq_word(items):
    """
    统计一条文本的词频,对文本进行过滤后再分词
    input:
        items:微博字典,{"mid": 12345, "text": text}
    output:
        top_word:词和词频构成的字典,如:{词:词频, 词:词频}
    """
    word_list = []
    text = items["text"]
    #print type(text)
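A quick check of freq_word with an invented weibo dict (assumes the completion above):

if __name__ == '__main__':
    item = {"mid": 12345, "text": "这是一条测试微博文本"}  # invented test data
    print freq_word(item)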
Example #15

def write_has(filename, has_word):

    n = len(has_word)
    keyword = TopkHeap(n)

    for k, v in has_word.items():
        keyword.Push((v, k))

    keyword_data = keyword.TopK()

    with open('%s/topic_dict/%s_ori.csv' % (abs_path, filename), 'wb') as f:
        writer = csv.writer(f)
        for count, word in keyword_data:
            if count > 1:
                writer.writerow((count, word))
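
TopkHeap is imported from elsewhere in the project; write_has only relies on its Push() and TopK() methods. A minimal heapq-based stand-in with that interface, an assumption about the original rather than its actual code:

import heapq

class TopkHeap(object):
    # Keeps the k largest items pushed into it; TopK() returns them in
    # descending order. Interface inferred from write_has above.
    def __init__(self, k):
        self.k = k
        self.data = []

    def Push(self, elem):
        if len(self.data) < self.k:
            heapq.heappush(self.data, elem)
        elif elem > self.data[0]:
            heapq.heapreplace(self.data, elem)

    def TopK(self):
        return sorted(self.data, reverse=True)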

if __name__ == '__main__':

    sw = load_scws()
    for j in name_list:
        new_dict, new_count = read_csv(sw, DOMAIN_DICT_ORI[j], DOMAIN_COUNT_ORI[j], j)  # update this category's dictionary
        #write_has(j, new_dict)  # write the results to a file