def get_news_main(news_text):
    # Extract named entities from one news item. A text of the form
    # "【title】body" is split into its title and body before extraction.
    news = re_cut(news_text)
    if len(news):
        if '】' not in news:
            title = ''
            text = news
        else:
            title = news_text.split('】')[0]
            text = news_text.split('】')[-1]
            if '【' in title:
                title = title[2:]  # drop the characters before the title proper
        result = cut_main(text, title)
    else:
        result = 'Null'
    name_list = ['people', 'organization', 'place', 'time']
    if result != 'Null':
        re_dict = dict(zip(name_list, result))
    else:
        re_dict = {'people': 'Null', 'organization': 'Null',
                   'time': 'Null', 'place': 'Null'}
    return re_dict

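# Usage sketch for get_news_main (the headline below is invented; re_cut and
# cut_main come from elsewhere in this repo, and the literal is assumed to be
# a utf-8 byte string as in the rest of the module):
entities = get_news_main('【张三访问北京】张三今日抵达北京,与李四举行会谈。')
print entities['people']  # extracted person names, or 'Null' if nothing was found
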
def topic_classfiy(uid_weibo):  # topic classification entry point
    '''
    Classify each user's weibo texts into topics.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dicts
        each user's distribution over the 18 topics:
            {uid1: {'art': 0.1, 'social': 0.2, ...}, ...}
        the topics each user focuses on most (at most 3):
            {uid1: ['art', 'social', 'media'], ...}
    '''
    weibo_text = dict()
    for k, v in uid_weibo.items():
        item = ''
        for i in range(0, len(v)):
            text = re_cut(v[i]['text'])
            item = item + '.' + text
        weibo_text[k] = item
    result_data = load_weibo(weibo_text)  # run the topic classifier
    uid_topic = rank_result(result_data)
    return result_data, uid_topic

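# Usage sketch for topic_classfiy (uid and texts invented; load_weibo and
# rank_result are defined elsewhere in this repo):
dist, top = topic_classfiy({'10001': [{'text': u'今天去看了画展'},
                                      {'text': u'新专辑终于发布了'}]})
print top['10001']  # up to three dominant topic labels, e.g. ['art', ...]
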
def cut_flow_text(text):
    # Split a retweet/comment chain into the original post and the comments on it.
    text = re_cut(text)
    if '//@' in text:
        texts = text.split('//@')
    elif '/@' in text:  # comments are present
        texts = text.split('/@')
    else:
        return 0, 0
    text_list = []
    for i in range(0, len(texts)):
        text_str = texts[i].strip()
        if not len(text_str):  # empty segment
            continue
        if i == 0:
            text_list.append(text_str)
        else:
            # segments after the first look like "user: comment"
            if ':' in text_str:
                s_content = text_str.split(':')[-1]
                if len(s_content):
                    text_list.append(s_content)
            else:
                text_list.append(text_str)
    try:
        key = text_list[-1]  # the last segment is the original post
    except IndexError:
        return 0, 0
    if len(key) < 2:
        return 0, 0
    return key, text_list[:-1]

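# Usage sketch for cut_flow_text with an invented retweet chain, assuming
# re_cut leaves the //@ markers in place:
origin, comments = cut_flow_text('说得好//@userA: 转发//@userB: 这是原始微博内容')
# origin holds the original post at the end of the chain,
# comments holds the reactions that were layered on top of it
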
def input_data():  # test input
    sw = load_scws()
    uid_weibo = dict()
    uid_list = []
    reader = csv.reader(file(abs_path + '/weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        v = re_cut(w_text.decode('utf-8'))
        words = sw.participle(v.encode('utf-8'))
        word_list = dict()
        for word in words:
            # keep nouns, verbs and adjectives from the segmentation result and
            # drop single words (len() counts utf-8 bytes here, so > 3 means
            # more than one Chinese character); note that unlike the second
            # input_data() below, this version filters OUT whitelisted words
            if (word[1] in cx_dict) and 3 < len(word[0]) < 30 \
                    and (word[0] not in black_word) \
                    and (word[0] not in single_word_whitelist):
                if word[0] in word_list:
                    word_list[word[0]] = word_list[word[0]] + 1
                else:
                    word_list[word[0]] = 1
        uid_list.append(mid)
        uid_weibo[mid] = word_list
    return uid_list, uid_weibo

def freq_word(items):
    """
    Compute the word frequencies of one text, filtering the text before segmentation.
    input:
        items: weibo dict, {"mid": 12345, "text": text}
    output:
        top_word: dict of words and relative frequencies, e.g. {word: freq, ...}
    """
    text = re_cut(items["text"])
    cut_text = sw.participle(text.encode('utf-8'))
    # keep only the terms whose part of speech is listed in cx_dict
    word_list = [term for term, cx in cut_text if cx in cx_dict]
    counter = Counter(word_list)
    total = sum(counter.values())
    if not total:  # nothing survived filtering; avoid dividing by zero
        return {}
    top_word = {k: (float(v) / float(total)) for k, v in counter.most_common()}
    return top_word

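# Usage sketch for freq_word (mid and text invented; sw, cx_dict and re_cut
# are module globals in this repo, and the text must be unicode because the
# function encodes it before segmentation):
freqs = freq_word({"mid": 12345, "text": u'今天天气不错,适合出门散步'})
for w, p in freqs.items():
    print w, p  # each retained word and its share of the token count
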
def flow_psychology_classfiy(text):  # psychological-state classification entry point
    w_text = re_cut(text)
    if len(w_text):  # non-empty after filtering
        label = find_label(w_text, DS_DICT, DS_COUNT)
    else:
        label = 'other'
    return label

def text_classify(inputs, word_label, tfidf_word):
    """
    For every comment, compute its weight for each cluster and assign it to
    the cluster with the largest weight.
    Input:
        inputs: list of comment dicts, [{'_id': comment id, 'news_id': news id, 'text': comment text}]
        word_label: word clustering result, {'cluster label': [word1, word2, ...]}
        tfidf_word: tf-idf top-k words and weights, [(word, weight)]
    Output:
        the input list, with each comment's 'text' replaced by its cluster
        'label' and the 'weight' it had for that cluster
    """
    # arrange the words and their weights into a dict
    word_weight = {}
    for idx, w in enumerate(tfidf_word):
        word_weight[w[0]] = w[1]
    # compute each comment's weight for every cluster
    for input in inputs:
        text_weight = {}
        text = re_cut(input['text'])
        cut_text = sw.participle(text.encode('utf-8'))
        text_list = [term for term, cx in cut_text if cx in cx_dict]
        if text_list == []:
            continue
        for l, w_list in word_label.iteritems():
            weight = 0
            for w in w_list:
                weight += text.encode('utf-8').count(w) * word_weight[w]
            text_weight[l] = float(weight) / float(len(text_list))
        sorted_weight = sorted(text_weight.iteritems(), key=lambda asd: asd[1], reverse=True)
        if sorted_weight[0][1] != 0:
            clusterid, weight = sorted_weight[0]
        else:
            clusterid = 'other'
            weight = 0
        input.pop('text')
        input['label'] = clusterid
        input['weight'] = weight
    return inputs

def get_topic_word(texts, nt):
    '''
    Input:
        texts: list where each item is one weibo text
        nt: number of topics wanted
    Output:
        list of topics
    '''
    text_list = []
    for text in texts:
        w_text = re_cut(text)
        words = sw.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black and word[1] in cx_dict and len(word[0]) > 3:
                word_list.append(word[0])
        text_list.append(word_list)
    # build the dictionary
    dictionary = corpora.Dictionary(text_list)
    dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=None)
    corpus = [dictionary.doc2bow(text) for text in text_list
              if len(dictionary.doc2bow(text))]
    if not len(corpus):
        return 'Null'
    # build the tf-idf matrix
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    # train the LDA model
    lda = gensim.models.ldamodel.LdaModel(corpus=corpus_tfidf, id2word=dictionary,
                                          num_topics=nt, update_every=1,
                                          chunksize=5000, passes=1)
    topics = lda.show_topics(num_topics=nt, num_words=10, log=False, formatted=True)
    return topics

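# Usage sketch for get_topic_word: a toy corpus repeated so its words survive
# filter_extremes(no_below=5). Real input should be many distinct weibo texts;
# with too little data the function returns 'Null'.
sample = ['股市 大涨 经济 复苏 行情'] * 10 + ['球队 夺冠 比赛 精彩 球迷'] * 10
print get_topic_word(sample, 2)  # gensim-formatted topic strings, or 'Null'
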
def flow_psychology_classfiy(text):  # psychological-state classification entry point
    '''
    Returns: the emotion label of the text (int)
        0 other negative
        2 anger
        3 anxiety
        4 sadness
        5 disgust
    '''
    w_text = re_cut(text)
    if len(w_text):  # non-empty after filtering
        label = find_label(w_text, DZ_DICT, DZ_COUNT)
    else:
        label = 0
    return label

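# Usage sketch for the integer-label variant (invented sentence; DZ_DICT and
# DZ_COUNT are emotion dictionaries loaded elsewhere in the module):
label = flow_psychology_classfiy('最近总是睡不着,心里很不安')
print label  # one of the integer labels listed in the docstring above
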
def get_keyword(weibo_text):  # event keyword extraction
    tr_list = []
    text_list = []
    for text in weibo_text:
        w_text = re_cut(text)
        if not len(w_text):
            continue
        words = SW.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black_word and word[1] in cx_dict and len(word[0]) > 3:
                word_list.append(word[0])
                text_list.append(word[0])
        tr_list.append(word_list)
    keywords = use_topicrank([text_list], tr_list, N_GRAM, WORD_N, TOPIC_N)
    return keywords

def get_weibo_single(text, n_gram=2, n_count=3):
    '''
    Extract keywords from a single weibo; relatively inefficient.
    Input:
        text: one weibo text, utf-8 encoded
        n_gram: sliding-window size over words, 2 is recommended
        n_count: number of keywords to return
    Output:
        dict mapping each keyword to its weight
    '''
    w_text = re_cut(text)
    if w_text:
        uid_word = get_keyword(w_text, n_gram, n_count)
    else:
        uid_word = dict()
    return uid_word

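# Usage sketch for get_weibo_single (invented utf-8 text):
print get_weibo_single('今天参观了博物馆的新展览,收获很大', n_gram=2, n_count=3)
# -> {keyword: weight, ...} with at most n_count entries, or {} for empty input
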
def psychology_classfiy(uid_weibo):  # psychological-state classification entry point
    """
    Classify each user's psychological state.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dict; each user maps to two dicts, the state proportions of
    the first-level classifier and of the second-level (negative-state) classifier:
        {uid1: {'first': {'negemo': 0.2, 'posemo': 0.3, 'middle': 0.5},
                'second': {'anger': 0.2, 'anx': 0.5, 'sad': 0.1, 'other': 0.2}}, ...}
    """
    df_dict, df_count = load_dict(f_label)
    ds_dict, ds_count = load_dict(s_label)
    # copy the module-level label lists so repeated calls don't keep appending
    data_s = s_label[:]
    data_f = f_label[:]
    data_s.append("other")
    data_f.append("middle")
    sw = load_scws()
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for i in range(0, len(v)):
            w_text = re_cut(v[i])
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, sw, df_dict, df_count, ds_dict, ds_count)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1
        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f["negemo"] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0
        result_data[k] = {"first": domain_f, "second": domain_s}
    return result_data

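# Usage sketch for psychology_classfiy (invented uid and texts; f_label,
# s_label and the dictionary files are module-level data in this repo):
ratios = psychology_classfiy({'10001': ['今天很开心', '又要加班,好累']})
print ratios['10001']['first']   # e.g. {'negemo': ..., 'posemo': ..., 'middle': ...}
print ratios['10001']['second']  # negative-state breakdown, all 0 if negemo is 0
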
def get_weibo_test():
    # Test helper: read weibo rows from CSV and collect per-uid word lists.
    uid_dict = dict()
    uid_tr = dict()
    texts = ''
    reader = csv.reader(file('./text_data/text_0.csv', 'rb'))
    count = 0
    ori_text = []
    for uid, text, ts, geo in reader:
        ori_text.append(text)
        w_text = re_cut(text)
        if count == 0:
            texts = texts + w_text
        else:
            texts = texts + '。' + w_text
        words = sw.participle(w_text)
        word_list = []
        for word in words:
            if word[0] not in black and word[1] in cx_dict and len(word[0]) > 3:
                if uid in uid_dict:
                    uid_dict[uid].append(word[0])
                else:
                    uid_dict[uid] = [word[0]]
                word_list.append(word[0])
        count = count + 1
        if len(word_list):
            if uid in uid_tr:
                uid_tr[uid].append(word_list)
            else:
                uid_tr[uid] = [word_list]
    uid_list = []
    text_list = []
    for k, v in uid_dict.iteritems():
        uid_list.append(k)
        text_list.append(v)
    return texts, text_list, ori_text

def get_group_keywords(uid_list):
    # Count keyword frequencies over the group's flow texts of the last two days.
    now_ts = time.time()
    now_ts = datetime2ts('2013-09-03')  # hard-coded test date overriding the current time
    former_ts = now_ts - DAY
    flow_index_1 = flow_text_index_name_pre + ts2datetime(now_ts)
    flow_index_2 = flow_text_index_name_pre + ts2datetime(former_ts)
    query_body = {
        "query": {
            "filtered": {
                "filter": {
                    "terms": {
                        "uid": uid_list
                    }
                }
            }
        },
        "size": 10000
    }
    text_list = []      # texts before segmentation
    word_dict = dict()  # word counts after segmentation
    text_results = es_flow_text.search(index=[flow_index_1, flow_index_2],
                                       doc_type=flow_text_index_type,
                                       body=query_body)["hits"]["hits"]
    if text_results:
        for item in text_results:
            iter_text = item['_source']['text'].encode('utf-8', "ignore")
            iter_text = re_cut(iter_text)
            text_list.append(iter_text)
    if text_list:
        for iter_text in text_list:
            cut_text = sw.participle(iter_text)
            cut_word_list = [term for term, cx in cut_text if cx in cx_dict]
            for w in cut_word_list:
                if w in word_dict:
                    word_dict[w] += 1
                else:
                    word_dict[w] = 1
    return word_dict

def input_data():  # test input
    # Variant of input_data() above: accumulates word counts per mid across
    # rows, and keeps single words that appear in single_word_whitelist.
    uid_weibo = dict()
    uid_list = []
    sw = load_scws()
    reader = csv.reader(file('./weibo_data/uid_text_0728.csv', 'rb'))
    for mid, w_text in reader:
        text = re_cut(w_text)
        if mid not in uid_list:
            uid_list.append(mid)
        # reuse the existing counter for this mid, if any
        word_dict = uid_weibo.get(mid, dict())
        words = sw.participle(text)
        for word in words:
            # keep nouns, verbs and adjectives; drop single words unless whitelisted
            if (word[1] in cx_dict) and \
                    (3 < len(word[0]) < 30 or word[0] in single_word_whitelist) and \
                    (word[0] not in black_word):
                if str(word[0]) in word_dict:
                    word_dict[str(word[0])] = word_dict[str(word[0])] + 1
                else:
                    word_dict[str(word[0])] = 1
        uid_weibo[mid] = word_dict
    return uid_list, uid_weibo

def psychology_classfiy(uid_weibo):  # psychological-state classification entry point
    '''
    Classify each user's psychological state. Unlike the variant above, the
    weibo items are dicts with a 'text' field and the emotion dictionaries
    (DF_DICT/DS_DICT) are loaded at module level.
    Input example: dict
        {uid1: [weibo1, weibo2, weibo3, ...]}
    Output example: dict; each user maps to two dicts, the state proportions of
    the first-level classifier and of the second-level (negative-state) classifier:
        {uid1: {'first': {'negemo': 0.2, 'posemo': 0.3, 'middle': 0.5},
                'second': {'anger': 0.2, 'anx': 0.5, 'sad': 0.1, 'other': 0.2}}, ...}
    '''
    # copy the module-level label lists so repeated calls don't keep appending
    data_s = s_label[:]
    data_f = f_label[:]
    data_s.append('other')
    data_f.append('middle')
    result_data = dict()
    for k, v in uid_weibo.items():
        domain_f = start_p(data_f)
        domain_s = start_p(data_s)
        for i in range(0, len(v)):
            w_text = re_cut(v[i]['text'])
            if not len(w_text):
                continue
            label_f, label_s = find_label(w_text, DF_DICT, DF_COUNT, DS_DICT, DS_COUNT)
            domain_f[label_f] = domain_f[label_f] + 1
            domain_s[label_s] = domain_s[label_s] + 1
        for k1, v1 in domain_f.items():
            domain_f[k1] = float(v1) / float(len(v))
        for k1, v1 in domain_s.items():
            if domain_f['negemo'] != 0:
                domain_s[k1] = float(v1) / float(len(v))
            else:
                domain_s[k1] = 0
        result_data[k] = {'first': domain_f, 'second': domain_s}
    return result_data

def text_generation_main(text_list, keyword_list):  # text generation entry point
    '''
    Input:
        text_list: list of texts
        keyword_list: list of (opinion) keywords
    Output:
        summary: the generated post text
    '''
    sen_list = []
    if len(text_list) == 0 or len(keyword_list) == 0:
        return ''
    for text in text_list:
        if isinstance(text, unicode):
            new_text = text.encode('utf-8')
        else:
            new_text = text
        text_re = re_cut(new_text)
        for t in text_re.split('。'):
            if t not in sen_list:
                sen_list.append(t)
    new_keywords = []
    for key in keyword_list:
        if isinstance(key, unicode):
            new_keywords.append(key.encode('utf-8'))
        else:
            new_keywords.append(key)
    rank_text = rank_text_list(sen_list, new_keywords)
    if len(rank_text) == 0:  # guard against an empty ranking result
        summary = ''
    else:
        summary = combine_rank_text(rank_text)
    return summary

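# Usage sketch for text_generation_main (invented posts and keyword;
# rank_text_list and combine_rank_text are defined elsewhere in this repo):
posts = [u'食品安全问题必须重视。监管要跟上。', u'大家都很关心食品安全。']
print text_generation_main(posts, [u'食品安全'])
# -> a short post stitched together from the top-ranked sentences, or ''
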
def get_weibo(text, n_gram=2, n_count=20):
    '''
    Extract keywords from a batch of weibo texts.
    Input:
        text: list of weibo texts, utf-8 encoded
        n_gram: sliding-window size over words, 2 is recommended
        n_count: number of keywords to return
    Output:
        dict mapping each keyword to its weight
    '''
    text_str = ''
    for item in text:
        w_text = re_cut(item)
        if w_text:
            text_str = text_str + '。' + w_text
    if text_str:
        uid_word = get_keyword(text_str, n_gram, n_count)
    else:
        uid_word = dict()
    return uid_word

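# Usage sketch: the batch counterpart of get_weibo_single above (invented texts):
print get_weibo(['股市今天大涨', '投资者情绪高涨'], n_gram=2, n_count=20)
# -> {keyword: weight, ...} aggregated over the whole batch
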