Example #1
from textrank4zh import TextRank4Sentence

# word_net, text_net and re_cut are project-internal helpers (not shown here).
def summary_main(weibo_data):  # main entry point for automatic summarization
    '''
        Input:
        weibo list: [weibo1, weibo2, ...]
    '''
    word_result, word_weight = word_net(weibo_data, 5)

    text_list = text_net(word_result, word_weight, weibo_data)

    text_str = ''
    for text in text_list:
        re_t = re_cut(text)
        if not re_t:
            continue
        # make sure every fragment ends with a full stop before joining
        if re_t[-1] != '。':
            text_str = text_str + re_t + '。'
        else:
            text_str = text_str + re_t

    tr4s = TextRank4Sentence()
    tr4s.analyze(text=text_str, lower=True, source='all_filters')

    result = []
    for item in tr4s.get_key_sentences(num=10):
        result.append(item.sentence)

    return result
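The final ranking step relies on the textrank4zh library's TextRank4Sentence. A minimal standalone sketch of that API, independent of the project helpers word_net/text_net/re_cut, might look like this (the sample text is made up for illustration):

from textrank4zh import TextRank4Sentence

text = '今天上午城区突降暴雨。多条道路出现积水。交通部门已启动应急预案。'
tr4s = TextRank4Sentence()
tr4s.analyze(text=text, lower=True, source='all_filters')
for item in tr4s.get_key_sentences(num=2):
    # each item carries .index, .weight and .sentence
    print(item.weight, item.sentence)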
Example #2
def opinion_main(weibo_data, k_cluster):
    '''
        Main entry point for opinion mining.

        Input:
        weibo_data: list of weibos, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics

        Output:
        opinion_name: sub-topic name dict, {topic1: name1, topic2: name2, ...}
        word_result: sub-topic keyword pairs, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    weibo_new = []
    for text in weibo_data:
        # drop spammy texts that mention five or more users
        if str(text).count('@') >= 5:
            continue
        value = cut_filter(text)
        # skip empty results and bare reposts ('转发微博' means "repost")
        if len(value) > 0 and text != '转发微博':
            weibo_new.append(value)

    word_result, word_weight = word_net(weibo_new, k_cluster)  # extract keyword pairs

    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts

    return opinion_name, word_result, text_list
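A hypothetical call, assuming cut_filter, word_net and text_net are importable from the same module (they are not shown in the excerpt):

weibo_data = [
    '地铁二号线今天又晚点了,通勤太难了',
    '转发微博',                      # dropped by the repost rule
    '@a @b @c @d @e 来看这个',        # dropped by the '@' spam rule
]
opinion_name, word_result, text_list = opinion_main(weibo_data, k_cluster=2)
for topic in opinion_name:
    print(opinion_name[topic], word_result[topic])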
Example #3
def opinion_main(weibo_data, k_cluster):
    '''
        Main entry point for opinion mining.

        Input:
        weibo_data: list of weibos, [weibo1, weibo2, ...]
        k_cluster: number of sub-topics

        Output:
        opinion_name: sub-topic name dict, {topic1: name1, topic2: name2, ...}
        word_result: sub-topic keyword pairs, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    print('\t\tGetting keywords...')
    limit_num = 30000
    weibo_data = weibo_num_limit(weibo_data, limit_num)
    # Disabled retry loop around word_net, kept for reference:
    # while True:
    word_result, word_weight, word_main = word_net(weibo_data,
                                                   k_cluster)  # extract keyword pairs
    # if len(word_result):
    #     break
    # else:
    #     print('Cluto wrong!!! Trying again... If you want to stop it, just kill it...')

    print('\t\tGetting present text...')
    # extract representative texts; guarantees each cluster holds an equal number of weibos
    text_list, opinion_name = text_net(word_result, word_weight,
                                       weibo_data)

    return opinion_name, word_result, text_list, word_main
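This variant first caps the corpus at limit_num items via weibo_num_limit, which the excerpt does not define. One plausible reading is a uniform sample (plain truncation would be another); a sketch under that assumption:

import random

def weibo_num_limit(weibo_data, limit_num):
    # Hypothetical sketch: the real helper is not shown in the excerpt.
    # Keep at most limit_num items so clustering stays tractable.
    if len(weibo_data) <= limit_num:
        return weibo_data
    return random.sample(weibo_data, limit_num)

The commented-out while loop suggests word_net shells out to the Cluto clustering tool, which can fail intermittently; the retry was disabled rather than removed.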
Example #4
def search_weibo_from_word(uidlist, keywords):
    '''
        Strategy 4: first retrieve texts with BM25, then filter them by the
        intersection of their keywords with the query keywords.

        Input:
        uidlist: list of uids
        keywords: list of keywords, the word-segmentation result of a trending news story

        Output:
        text_list: the filtered weibo texts
    '''
    # fetch the texts and their segmented words for the given uids
    text_list, word_set = get_text_word_by_id(uidlist)

    text_set, word_dict = get_text_by_BM(text_list, word_set, keywords)

    # keep the top half of the BM25 candidates, but at least one
    n = max(1, int(0.5 * len(text_set)))
    result_list = TopkHeap(n)

    # require at least half of the query keywords to match, but at least one
    w_n = max(1, int(0.5 * len(keywords)))

    for i in range(len(word_dict)):
        words = word_dict[i]
        len_n = len(set(words) & set(keywords))
        if len_n >= w_n:
            result_list.Push((len_n, text_set[i]))

    result = result_list.TopK()
    text_list = []
    for i in range(len(result)):
        if result[i][1] not in text_list:
            text_list.append(result[i][1])

    if len(text_list) >= 10:
        # OPINION_CLUSTER is a module-level constant, not shown here
        word_result, word_weight = word_net(text_list, OPINION_CLUSTER)
        text_list = text_net(word_result, word_weight, text_list)
        result = []
        for text in text_list:
            s = summary_text(text)
            # skip a summary that is too similar (>= 0.5) to one already kept
            max_r, _ = get_s(result, s)
            if max_r >= 0.5:
                continue
            result.append(s)
    else:
        result = [summary_text(text_list)]

    return result
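TopkHeap is a project helper not included in the excerpt. Judging from the Push/TopK calls, it keeps the k highest-scoring items pushed so far, which a size-bounded min-heap can do; a sketch under that assumption:

import heapq

class TopkHeap:
    # Hypothetical sketch of the helper the example assumes: retain the k
    # largest items by keeping a min-heap of size at most k.
    def __init__(self, k):
        self.k = k
        self.data = []

    def Push(self, item):
        if len(self.data) < self.k:
            heapq.heappush(self.data, item)
        elif item > self.data[0]:
            # evict the current smallest item to make room
            heapq.heapreplace(self.data, item)

    def TopK(self):
        # largest first, e.g. [(score, text), ...]
        return sorted(self.data, reverse=True)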
Example #5
def opinion_main(weibo_data, k_cluster):
    '''
        Main entry point for opinion mining.

        Input:
        weibo_data: list of weibos,
                    [[mid, text, uid, timestamp, uname, forwarding_count, comment_count], ...]
        k_cluster: number of sub-topics

        Output:
        opinion_name: sub-topic name dict, {topic1: name1, topic2: name2, ...}
        word_result: sub-topic keyword pairs, {topic1: [w1, w2, ...], topic2: [w1, w2, ...], ...}
        text_list: texts per sub-topic, {topic1: [text1, text2, ...], topic2: [text1, text2, ...], ...}
    '''
    weibo_new = []
    for mid, text, uid, timestamp, uname, forwarding_count, comment_count in weibo_data:
        # drop spammy texts that mention five or more users
        if text.count('@') >= 5:
            continue

        value = cut_filter(text)
        # skip empty results and bare reposts ('转发微博' means "repost")
        if len(value) > 0 and text != '转发微博':
            weibo_new.append((value, mid, uid, timestamp, uname,
                              forwarding_count, comment_count))

    word_result, word_weight = word_net(weibo_new, k_cluster)  # extract keyword pairs

    text_list, opinion_name = text_net(word_result, word_weight, weibo_new)  # extract representative texts

    return opinion_name, word_result, text_list
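A hypothetical call with the record layout from the docstring, again assuming cut_filter, word_net and text_net come from the surrounding project (field values are made up for illustration):

weibo_data = [
    ['m001', '小区停水三天了,物业一直没有回应', 'u1', 1480000000,
     'user_a', 3, 5],
    ['m002', '转发微博', 'u2', 1480000100, 'user_b', 0, 0],  # filtered out
]
opinion_name, word_result, text_list = opinion_main(weibo_data, k_cluster=2)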
Example #6
import csv

def main(flag, k_cluster):
    weibo = []
    weibo_dict = dict()
    with open('./test/weibo%s.csv' % flag, 'r') as f:
        for mid, text in csv.reader(f):
            # drop spammy texts that mention five or more users
            if str(text).count('@') >= 5:
                continue
            value = cut_filter(text)
            # skip empty results and bare reposts ('转发微博' means "repost")
            if len(value) > 0 and text != '转发微博':
                weibo.append(str(mid))
                weibo_dict[str(mid)] = str(text)

    test(weibo, weibo_dict, flag)  # generate test data

    label = choose_ad(flag)  # advertisement filtering

    ind, word = word_net(weibo, weibo_dict, label, flag, k_cluster)  # extract keyword pairs

    write(ind, word, flag)  # write the keyword pairs out

    text_net(weibo, weibo_dict, label, ind, word, flag)  # extract representative texts
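main() expects a two-column CSV of (mid, text) rows at ./test/weibo<flag>.csv. A hypothetical fixture for a quick run could be built like this (the rows are made up for illustration):

import csv

rows = [('3901', '今天的油价调整引发热议'),
        ('3902', '转发微博')]  # the second row is filtered out
with open('./test/weibo1.csv', 'w', newline='', encoding='utf-8') as f:
    csv.writer(f).writerows(rows)

main(flag=1, k_cluster=5)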