def freq_word_comment(items):
    """Collect the nouns of each comment and compute global word frequencies.

    Args:
        items: list of news/comment dicts, e.g.
            [{'_id': comment id, 'news_id': news id, 'content': text}]

    Returns:
        top_word: dict mapping word -> relative frequency over all items
        word_comment: dict mapping an item's list index -> list of nouns
            extracted from that item's content
    """
    words_list = []
    word_comment = {}  # nouns of each comment, keyed by list index
    # enumerate replaces the original items.index(item) lookup, which was
    # O(n) per item and wrong when two items compare equal (it always
    # returns the index of the first match)
    for idx, item in enumerate(items):
        # cut_words_noun is a project helper that extracts nouns from text
        word_item = list(cut_words_noun(item['content']))
        words_list.extend(word_item)
        word_comment[idx] = word_item

    counter = Counter(words_list)
    total = sum(counter.values())  # total number of noun tokens
    # normalize counts to relative frequencies (empty input yields {})
    top_word = {k: float(v) / float(total) for k, v in counter.most_common()}

    return top_word, word_comment
# Example #2 — alternate copy of the function above (scrape-artifact separator converted to a comment)
def freq_word_comment(items):
    """Extract per-comment nouns and the relative frequency of every noun.

    Args:
        items: list of news/comment dicts, e.g.
            [{'_id': comment id, 'news_id': news id, 'content': text}]

    Returns:
        (top_word, word_comment) where top_word maps word -> relative
        frequency across all comments and word_comment maps each item's
        position in `items` -> the list of nouns found in it
    """
    all_words = []
    word_comment = {}  # nouns per comment, keyed by position
    for position, item in enumerate(items):
        # fixed: the original used items.index(item), which is an O(n) scan
        # and misattributes results when equal items appear more than once
        nouns = list(cut_words_noun(item['content']))
        all_words.extend(nouns)
        word_comment[position] = nouns

    counter = Counter(all_words)
    total = sum(counter.values())  # total noun-token count
    top_word = {word: float(count) / float(total)
                for word, count in counter.most_common()}

    return top_word, word_comment
def text_weight_cal(item, feature_words):
    """Compute one text's weight from a class's feature words.

    Args:
        item: dict with "title" and "content" keys (utf-8 text)
        feature_words: dict of feature words for one class; only the keys
            are used — the stored counts are ignored (as in the original)

    Returns:
        float: total occurrences of the feature words in the text,
        normalized by the noun count (+1 avoids division by zero)
    """
    text = item["title"] + item["content"]
    words = cut_words_noun(text)
    # iterate the dict's keys directly: dict.iteritems() is Python-2-only,
    # and the unused count value doesn't need unpacking; a generator avoids
    # materializing an intermediate list
    weight = sum(text.count(word) for word in feature_words) / (float(len(words)) + 1.0)

    return weight
# Example #4 — alternate copy of the function above (scrape-artifact separator converted to a comment)
def text_weight_cal(item, feature_words):
    """Weight a single text by how often a class's feature words occur in it.

    Args:
        item: dict with "title" and "content" keys (utf-8 text)
        feature_words: feature-word dict for one class; values are unused
    Returns:
        float: feature-word hit count / (number of nouns + 1)
    """
    text = item["title"] + item["content"]
    nouns = cut_words_noun(text)
    # dict.iteritems() only exists on Python 2; iterating the dict itself
    # yields the keys on both 2 and 3, which is all this sum needs
    hits = sum(text.count(word) for word in feature_words)
    # +1.0 in the denominator guards against texts with no nouns
    return hits / (float(len(nouns)) + 1.0)
# Example #5 — scrape-artifact separator converted to a comment
def global_weight_cal_tfidf(tfidf_word, item):
    """根据tfidf词计算全局文本权重
    """
    #将词及权值整理为字典格式
    word_weight = {}
    for idx, w in enumerate(tfidf_word):
        word_weight[w[0]] = w[1]

    text = item["title"] + item["content"]
    text_word = cut_words_noun(text)  #每句话分词结果,用于text_weight中

    weight = 0
    for w, c in tfidf_word:
        weight += text.count(w) * word_weight[w]
    text_weight = float(weight) / (float(len(text_word)) + 1.0)

    return text_weight
def global_weight_cal_tfidf(tfidf_word, item):
    """Compute the global weight of one text using tf-idf word weights.

    Args:
        tfidf_word: list of (word, weight) pairs
        item: dict with "title" and "content" keys
    Returns:
        float: weighted occurrence total / (noun count + 1)
    """
    # organize the (word, weight) pairs into a lookup table
    word_weight = {}
    for pair in tfidf_word:
        word_weight[pair[0]] = pair[1]

    full_text = item["title"] + item["content"]
    nouns = cut_words_noun(full_text)  # segmentation result of this text

    total = 0
    for word, _unused in tfidf_word:
        total += full_text.count(word) * word_weight[word]

    return float(total) / (float(len(nouns)) + 1.0)
# Example #7 — scrape-artifact separator converted to a comment
def freq_word_news(item):
    """Compute noun frequencies for one news text.

    Args:
        item: the news text as a string

    Returns:
        dict mapping each noun to its relative frequency,
        {word: count / total count}; empty when no nouns are found
    """
    # feed the segmenter output straight into Counter — the original's
    # manual append loop and its unused word_item list are gone
    counter = Counter(cut_words_noun(item))
    total = sum(counter.values())  # total number of noun tokens
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def freq_word_news(item):
    """Return the relative frequency of every noun in a news string.

    Args:
        item: the news text (a string)
    Returns:
        {word: frequency} dict; empty if the text has no nouns
    """
    # Counter accepts any iterable of words, so the original manual
    # append loop (and the never-used word_item variable) is unnecessary
    counts = Counter(cut_words_noun(item))
    total = float(sum(counts.values()))  # total noun-token count
    return {word: float(freq) / total for word, freq in counts.most_common()}
# Example #9 — scrape-artifact separator converted to a comment
def cal_global_weight(items, topk=50):
    """Compute a global weight for each text in a batch.

    Args:
        items: list of dicts, each with a utf-8 'content' field
        topk: number of most-common nouns to use as feature words

    Returns:
        the same item dicts, each annotated with a 'weight' field
    """
    # join once instead of += in a loop — repeated string concatenation
    # is quadratic in the worst case
    texts = ''.join(item['content'] for item in items)

    counter = Counter(cut_words_noun(texts))
    topk_words = dict(counter.most_common(topk))

    results = []
    for item in items:
        # text_weight_cal normalizes feature-word hits by the noun count
        item['weight'] = text_weight_cal(item, topk_words)
        results.append(item)

    return results
def cal_global_weight(items, topk=50):
    """Annotate every item in a batch with its global text weight.

    Args:
        items: list of dicts carrying a utf-8 'content' field
        topk: how many of the most common nouns form the feature set
    Returns:
        the input items, each with a 'weight' key added
    """
    # build the corpus with a single join rather than the original
    # quadratic texts += item['content'] loop
    corpus = ''.join(doc['content'] for doc in items)

    noun_counts = Counter(cut_words_noun(corpus))
    feature_words = dict(noun_counts.most_common(topk))

    annotated = []
    for doc in items:
        doc['weight'] = text_weight_cal(doc, feature_words)
        annotated.append(doc)

    return annotated
# Example #11 — scrape-artifact separator converted to a comment
def freq_word(items):
    """Compute the noun frequencies of a single text.

    Args:
        items: a dict describing one news/comment,
            {'_id': comment id, 'news_id': news id, 'content': text}

    Returns:
        dict mapping each noun to its relative frequency, {word: freq};
        empty when the content contains no nouns
    """
    # Counter consumes the segmenter's output directly; the original's
    # word-by-word append loop added nothing
    counter = Counter(cut_words_noun(items['content']))
    total = sum(counter.values())  # total noun tokens
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def freq_word(items):
    """Return {word: relative frequency} for the nouns of one text.

    Args:
        items: dict with at least a 'content' key,
            e.g. {'_id': ..., 'news_id': ..., 'content': text}
    Returns:
        {word: frequency} dict; empty if no nouns are extracted
    """
    # no need for the original append loop — Counter takes an iterable
    counts = Counter(cut_words_noun(items['content']))
    total = float(sum(counts.values()))  # total noun-token count
    return {word: float(n) / total for word, n in counts.most_common()}
# Example #13 — scrape-artifact separator converted to a comment
def text_classify(inputs, word_label, tfidf_word):
    """Assign each comment to the class for which its weight is largest.

    Args:
        inputs: list of comment dicts,
            [{'_id': comment id, 'news_id': news id, 'content': text}]
        word_label: word clustering result, {'class label': [word1, word2, ...]}
        tfidf_word: top-k tf-idf words with weights, [(word, weight)]

    Returns:
        the same input dicts, each annotated with 'label' (the winning
        class, or 'other' when every class weight is 0) and 'weight'
    """
    # organize the (word, weight) pairs into a lookup dict
    word_weight = {w: wt for w, wt in tfidf_word}

    # compute the weight of each comment for every class
    for entry in inputs:  # renamed from `input`, which shadowed the builtin
        text = entry['content']
        text_word = cut_words_noun(text)  # noun tokens, for normalization
        norm = float(len(text_word)) + 1.0  # +1 avoids division by zero

        text_weight = {}
        # dict.iteritems() is Python-2-only; .items() works on 2 and 3
        for label, w_list in word_label.items():
            hits = sum(text.count(w) * word_weight[w] for w in w_list)
            text_weight[label] = float(hits) / norm

        # max() picks the same winner as sorting descending (both return
        # the first-seen entry among ties, since sorted() is stable)
        best_label, best_weight = max(text_weight.items(), key=lambda kv: kv[1])
        if best_weight == 0:
            # only classify a text when at least one class weight is non-zero
            best_label, best_weight = 'other', 0

        entry['label'] = best_label
        entry['weight'] = best_weight

    return inputs
def text_classify(inputs, word_label, tfidf_word):
    """Classify every comment into the class with the highest weight.

    Args:
        inputs: list of comment dicts,
            [{'_id': comment id, 'news_id': news id, 'content': text}]
        word_label: word clustering result, {'class label': [word1, ...]}
        tfidf_word: top-k tf-idf (word, weight) pairs

    Returns:
        the input dicts, each annotated with 'label' and 'weight';
        'label' is 'other' when all class weights are zero
    """
    # build a word -> weight lookup table
    word_weight = {}
    for word, wt in tfidf_word:
        word_weight[word] = wt

    for doc in inputs:  # `doc` instead of the builtin-shadowing `input`
        text = doc['content']
        nouns = cut_words_noun(text)  # segmentation of this text

        doc_weights = {}
        # .items() replaces the Python-2-only .iteritems()
        for label, members in word_label.items():
            score = 0
            for w in members:
                score += text.count(w) * word_weight[w]
            doc_weights[label] = float(score) / (float(len(nouns)) + 1.0)

        ranked = sorted(doc_weights.items(), key=lambda kv: kv[1], reverse=True)
        if ranked[0][1] != 0:  # classify only when some weight is non-zero
            clusterid, weight = ranked[0]
        else:
            clusterid, weight = 'other', 0

        doc['label'] = clusterid
        doc['weight'] = weight

    return inputs