Example 1
def triple_classifier(tweet):
    '''
    Sentiment classes returned:
    0 neutral
    1 positive
    2 angry
    3 anxious
    4 sad
    5 disgusted
    6 other negative
    '''
    sentiment = MIDDLE    # MIDDLE / POSITIVE are sentiment constants from the host module

    text = tweet['text']
    keywords_list = []

    # First pass: use emoticons when they carry a clear polarity
    emoticon_sentiment = emoticon(text)
    if emoticon_sentiment != MIDDLE:
        entries = cut(fc, text)
        entry = [e for e in entries]
        keywords_list = entry
        if emoticon_sentiment == POSITIVE:
            sentiment = emoticon_sentiment
            text = u''
        else:
            # Negative emoticons: refine into a specific negative class
            sentiment = flow_psychology_classfiy(text)
            if sentiment == 0:
                sentiment = 6    # fall back to "other negative"
            text = u''

    if text != u'':
        # No decisive emoticon: fall back to the two-step bag-of-words model
        entries = fc.get_text_fc(text)
        entry = [e for e in entries]
        keywords_list = entry

        # Step 1: multiply per-word class scores raised to their counts
        # (naive-Bayes-style product over the bag of words)
        bow = dictionary_1.doc2bow(entry)
        s = [1, 1]
        for pair in bow:
            s[0] = s[0] * (step1_score[pair[0]][0]**pair[1])
            s[1] = s[1] * (step1_score[pair[0]][1]**pair[1])
        if s[0] < s[1]:
            # Step 2: the first stage did not favour neutral, so run the polarity model
            bow = dictionary_2.doc2bow(entry)
            s2 = [1, 1]
            for pair in bow:
                s2[0] = s2[0] * (step2_score[pair[0]][0]**pair[1])
                s2[1] = s2[1] * (step2_score[pair[0]][1]**pair[1])
            if s2[0] > s2[1]:
                sentiment = POSITIVE
            elif s2[0] == s2[1]:
                sentiment = MIDDLE
            else:
                # Negative polarity: refine into a specific negative class
                sentiment = flow_psychology_classfiy(text)
                if sentiment == 0:
                    sentiment = 6    # fall back to "other negative"
        else:
            sentiment = MIDDLE

    return sentiment
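
A minimal invocation sketch (not from the original source): it assumes the host module already provides fc, emoticon, flow_psychology_classfiy, the dictionary_1/dictionary_2 score tables and the MIDDLE/POSITIVE constants, and that a tweet is a dict carrying its text under the text key.

# Hypothetical usage; every dependency above comes from the host module.
tweet = {'text': u'今天心情不错 :)'}
label = triple_classifier(tweet)
print(label)  # one of 0-6 as documented in the docstring
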
Example 2
def cut_words_noun(text):
    '''Tokenize the text, filter single words via the blacklist, and keep only nouns.
       input:
           text: input text, utf-8 string
       output:
           terms: list of keyword terms
    '''
    if not isinstance(text, str):
        raise ValueError("cut words input text must be string")

    # cx=True makes the segmenter return (term, pos_tag) pairs
    cx_terms = fc.get_text_fc(text, cx=True)

    # keep terms whose tag is in the noun-tag set and that are not blacklisted
    return [term for term, cx in cx_terms if cx in cx_dict_noun_utils and term not in black_words]
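
A short usage sketch (hypothetical), assuming fc, cx_dict_noun_utils and black_words are already set up in the host module; the input must be a utf-8 string, as the type check requires.

# Hypothetical call with a plain utf-8 string.
nouns = cut_words_noun('自然语言处理是人工智能的一个重要方向')
print(nouns)  # noun terms with blacklisted words removed
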
Example 3
def test_data(weibo, flag):
    # word -> feature-index map, loaded from the SVM feature file
    # (ABS_PATH and the csv module are provided at module level)
    word_dict = dict()
    with open(ABS_PATH + '/svm/new_feature.csv', 'r') as f:
        reader = csv.reader(f)
        for w, c in reader:
            word_dict[str(w)] = c

    # per-document term-frequency counts
    items = []
    for i in range(0, len(weibo)):
        words = fc.get_text_fc(weibo[i]['content168'])
        row = dict()
        for word in words:
            if str(word[0]) in row:
                row[str(word[0])] = row[str(word[0])] + 1
            else:
                row[str(word[0])] = 1
        items.append(row)

    # Build one libsvm-style sparse line per document:
    # "<label> <feature_index>:<count> ...", with the label fixed to 1
    f_items = []
    for i in range(0, len(items)):
        row = items[i]
        f_row = str(1)
        for k, v in word_dict.items():
            if k in row:
                item = str(word_dict[k]) + ':' + str(row[k])
                f_row = f_row + ' ' + str(item)
        f_items.append(f_row)

    with open(ABS_PATH + '/svm_test/test%s.txt' % flag, 'w') as f:
        writer = csv.writer(f)
        for i in range(0, len(f_items)):
            writer.writerow([f_items[i]])
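
A minimal call sketch (hypothetical): it assumes weibo is a list of dicts carrying the raw text under content168, and that fc and ABS_PATH are configured in the host module.

# Hypothetical input; the field name follows the function body above.
weibo = [{'content168': u'今天天气不错'}, {'content168': u'心情很糟糕'}]
test_data(weibo, flag='demo')
# writes ABS_PATH + '/svm_test/testdemo.txt', one libsvm-style sparse line per weibo
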
Example 4
def word_net(weibo, k_cluster):  # build a word-frequency word network
    single_word_whitelist = load_single_word_whitelist()
    black = load_black_words()
    cx_dict = set(['Ag','a','an','Ng','n','nr','ns','nt','nz','Vg','v','vd','vn','@','j'])
    n = 0
    ts = time.time()

    f_dict = dict()  # word frequency counts
    total = 0  # total number of words kept
    weibo_word = []
    for i in range(0,len(weibo)):
        text = weibo[i]['content168']
        words = fc.get_text_fc(text ,cx=True)
        row = []
        for word in words:
            if (word[1] in cx_dict) and (1 < len(word[0]) < 10 or word[0] in single_word_whitelist) and (word[0] not in black):  # keep nouns/verbs/adjectives from the segmentation, drop single-character words (unless whitelisted) and blacklisted words
                total = total + 1
                if str(word[0]) in f_dict:
                    f_dict[str(word[0])] = f_dict[str(word[0])] + 1
                else:
                    f_dict[str(word[0])] = 1
                row.append(word[0])
        weibo_word.append(row)

    keyword = TopkHeap(300)
    for k, v in f_dict.items():  # score each candidate word by its frequency
        if v >= 2 and (float(v)/float(total)) <= 0.8:  # drop words occurring fewer than 2 times or accounting for more than 80% of all words
            p = v
            keyword.Push((p, k))  # push into the top-k heap

    keyword_data = keyword.TopK()  # the top high-frequency words become the network vertices
    ts = time.time()

    keyword = []
    k_value = dict()
    for i in range(0,len(keyword_data)):
        keyword.append(keyword_data[i][1])
        k_value[str(keyword_data[i][1])] = float(keyword_data[i][0])/float(total)

    word_net = dict()  # word co-occurrence network: edge key "w1_w2" -> count
    # Count adjacent co-occurrences between each keyword and its left/right neighbours,
    # merging the two edge directions into a single undirected key
    for i in range(0,len(weibo_word)):
        row = weibo_word[i]
        for j in range(0,len(row)):
            if row[j] in keyword:
                if j-1 >= 0 and row[j] != row[j-1]:
                    if str(row[j]+'_'+row[j-1]) in word_net:
                        word_net[str(row[j]+'_'+row[j-1])] = word_net[str(row[j]+'_'+row[j-1])] + 1
                    elif str(row[j-1]+'_'+row[j]) in word_net:
                        word_net[str(row[j-1]+'_'+row[j])] = word_net[str(row[j-1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j-1]+'_'+row[j])] = 1
                if j+1 < len(row) and row[j] != row[j+1]:
                    if str(row[j]+'_'+row[j+1]) in word_net:
                        word_net[str(row[j]+'_'+row[j+1])] = word_net[str(row[j]+'_'+row[j+1])] + 1
                    elif str(row[j+1]+'_'+row[j]) in word_net:
                        word_net[str(row[j+1]+'_'+row[j])] = word_net[str(row[j+1]+'_'+row[j])] + 1
                    else:
                        word_net[str(row[j]+'_'+row[j+1])] = 1

    weight = TopkHeap(500)   # the top-k chosen here bounds the number of edges fed to the clustering below
    for k, v in word_net.items():  # compute edge weights
        k1,k2 = k.split('_')
        if k1 not in k_value:
            k_value[k1] = 0
        if k2 not in k_value:
            k_value[k2] = 0
        if k_value[k1] > k_value[k2]:
            p = v*k_value[k1]
        else:
            p = v*k_value[k2]
        weight.Push((p, k))  # push into the top-k heap

    data = weight.TopK()
    word = []
    word_weight = dict()
    for i in range(0,len(data)):
        if data[i][1] not in word:
            word.append(data[i][1])
            word_weight[data[i][1]] = data[i][0]

    # Clustering: for each edge, build a per-document count vector of its two words, then run k-means
    feature = []
    for w in word:
        k1,k2 = w.split('_')
        c = []
        for i in range(0, len(weibo)):
            n1 = weibo[i]['content168'].count(str(k1))
            n2 = weibo[i]['content168'].count(str(k2))
            n = n1 + n2
            c.append(n)
        feature.append(c)
    features = np.array(feature)
    result = kmeans(features,k_cluster,'summary')

    word_result_before = dict()
    for i in range(0,len(result)):
        label = result[i][0]
        w = (word[i],word_weight[word[i]])
        try:
            word_result_before[label].append(w)
        except KeyError:
            word_result_before[label] = [w]
    
    word_result = dict()
    word_main = dict()
    for label in word_result_before:
        main_words = sorted(word_result_before[label],key = lambda x:x[1],reverse = True)
        word_result[label] = [i[0] for i in word_result_before[label]]
        word_main[label] = [i[0] for i in main_words[:k_cluster]]

    return word_result, word_weight, word_main
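
A minimal call sketch (hypothetical): it assumes the host module provides TopkHeap, kmeans, fc and the whitelist/blacklist loaders, and that each weibo dict carries its text under content168.

# Hypothetical input; field names follow the function body above.
weibo = [{'content168': u'placeholder weibo text one'},
         {'content168': u'placeholder weibo text two'}]
word_result, word_weight, word_main = word_net(weibo, k_cluster=5)
# word_result: cluster label -> all edge keys ("w1_w2") in that cluster
# word_main:   cluster label -> top-weighted edge keys in that cluster
# word_weight: edge key -> weight used for the ranking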