def freq_word_comment(items):
    """Count the nouns of every comment and their global frequencies.

    input:
        items: list of comment dicts,
               e.g. [{'_id': comment id, 'news_id': news id, 'content': text}]
    output:
        top_word: dict mapping word -> relative frequency over all comments
        word_comment: dict mapping each item's position in *items* -> list of
               nouns extracted from that comment
    """
    words_list = []
    word_comment = {}  # nouns of each comment, keyed by item position
    # enumerate instead of items.index(item): index() rescans the list on
    # every iteration (O(n^2) overall) and returns the FIRST equal element,
    # which silently assigns the wrong key when two comments are identical
    for idx, item in enumerate(items):
        word_item = list(cut_words_noun(item['content']))
        words_list.extend(word_item)
        word_comment[idx] = word_item
    counter = Counter(words_list)
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns at all (empty input): avoid ZeroDivisionError below
        return {}, word_comment
    top_word = {k: float(v) / float(total) for k, v in counter.most_common()}
    return top_word, word_comment
def freq_word_comment(items):
    """Count the nouns of every comment and their global frequencies.

    input:
        items: list of comment dicts,
               e.g. [{'_id': comment id, 'news_id': news id, 'content': text}]
    output:
        top_word: dict mapping word -> relative frequency over all comments
        word_comment: dict mapping each item's position in *items* -> list of
               nouns extracted from that comment
    """
    words_list = []
    word_comment = {}  # nouns of each comment, keyed by item position
    # enumerate instead of items.index(item): index() rescans the list on
    # every iteration (O(n^2) overall) and returns the FIRST equal element,
    # which silently assigns the wrong key when two comments are identical
    for idx, item in enumerate(items):
        word_item = list(cut_words_noun(item['content']))
        words_list.extend(word_item)
        word_comment[idx] = word_item
    counter = Counter(words_list)
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns at all (empty input): avoid ZeroDivisionError below
        return {}, word_comment
    top_word = {k: float(v) / float(total) for k, v in counter.most_common()}
    return top_word, word_comment
def text_weight_cal(item, feature_words):
    """Compute a single text's weight from one class's feature words.

    input:
        item: one record, {"title": ..., "content": ...}, utf-8 encoded
        feature_words: feature words of the class, dict (word -> value;
                       the values are not used here)
    output:
        weight of this text (float)
    """
    text = item["title"] + item["content"]
    words = cut_words_noun(text)
    # iterate the dict's keys directly: the values were never used, and plain
    # key iteration works on both Python 2 and 3 (iteritems() is Py2-only);
    # a generator expression avoids building a throwaway list inside sum()
    hits = sum(text.count(word) for word in feature_words)
    # the +1.0 in the denominator guards against division by zero when the
    # text yields no nouns
    return hits / (float(len(words)) + 1.0)
def text_weight_cal(item, feature_words):
    """Compute a single text's weight from one class's feature words.

    input:
        item: one record, {"title": ..., "content": ...}, utf-8 encoded
        feature_words: feature words of the class, dict (word -> value;
                       the values are not used here)
    output:
        weight of this text (float)
    """
    text = item["title"] + item["content"]
    words = cut_words_noun(text)
    # iterate the dict's keys directly: the values were never used, and plain
    # key iteration works on both Python 2 and 3 (iteritems() is Py2-only);
    # a generator expression avoids building a throwaway list inside sum()
    hits = sum(text.count(word) for word in feature_words)
    # the +1.0 in the denominator guards against division by zero when the
    # text yields no nouns
    return hits / (float(len(words)) + 1.0)
def global_weight_cal_tfidf(tfidf_word, item):
    """Compute the global weight of one text from its tf-idf words.

    input:
        tfidf_word: list of (word, weight) pairs
        item: one record, {"title": ..., "content": ...}
    output:
        text weight (float)
    """
    text = item["title"] + item["content"]
    text_word = cut_words_noun(text)  # nouns of this text, for normalization
    # the original built a word->weight dict (with an unused enumerate index)
    # only to look up the very (word, weight) pairs it was already iterating;
    # accumulate directly from the pairs instead
    weight = 0.0
    for word, w in tfidf_word:
        weight += text.count(word) * w
    # +1.0 avoids division by zero when the text has no nouns
    return float(weight) / (float(len(text_word)) + 1.0)
def global_weight_cal_tfidf(tfidf_word, item):
    """Compute the global weight of one text from its tf-idf words.

    input:
        tfidf_word: list of (word, weight) pairs
        item: one record, {"title": ..., "content": ...}
    output:
        text weight (float)
    """
    text = item["title"] + item["content"]
    text_word = cut_words_noun(text)  # nouns of this text, for normalization
    # the original built a word->weight dict (with an unused enumerate index)
    # only to look up the very (word, weight) pairs it was already iterating;
    # accumulate directly from the pairs instead
    weight = 0.0
    for word, w in tfidf_word:
        weight += text.count(word) * w
    # +1.0 avoids division by zero when the text has no nouns
    return float(weight) / (float(len(text_word)) + 1.0)
def freq_word_news(item):
    """Count the nouns of one news article.

    input: the news text, a string
    output: dict mapping word -> relative frequency, {word: frequency}
    """
    # feed the extracted nouns straight into Counter; the original copied
    # them into a list one by one and also created an unused word_item list
    counter = Counter(cut_words_noun(item))
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns extracted (e.g. empty string): avoid ZeroDivisionError
        return {}
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def freq_word_news(item):
    """Count the nouns of one news article.

    input: the news text, a string
    output: dict mapping word -> relative frequency, {word: frequency}
    """
    # feed the extracted nouns straight into Counter; the original copied
    # them into a list one by one and also created an unused word_item list
    counter = Counter(cut_words_noun(item))
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns extracted (e.g. empty string): avoid ZeroDivisionError
        return {}
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def cal_global_weight(items, topk=50):
    """Given a batch of texts, compute each text's global weight.

    input:
        items: list of dicts, each with a 'content' field (utf-8 encoded)
        topk: number of most frequent nouns to use as feature words
    output:
        the same item dicts, each with a 'weight' field added
    """
    # join instead of `texts += ...` in a loop, which is quadratic in the
    # total text size
    texts = ''.join(item['content'] for item in items)
    words = cut_words_noun(texts)
    topk_words = dict(Counter(words).most_common(topk))
    results = []
    for item in items:
        # NOTE: mutates the input dicts in place, as the original did
        item['weight'] = text_weight_cal(item, topk_words)
        results.append(item)
    return results
def freq_word(items):
    """Count the word frequencies of one text.

    input:
        items: one record dict,
               e.g. {'_id': comment id, 'news_id': news id, 'content': text}
    output:
        top_word: dict mapping word -> relative frequency, {word: freq, ...}
    """
    # feed the extracted nouns straight into Counter instead of copying them
    # into a list one element at a time
    counter = Counter(cut_words_noun(items['content']))
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns extracted (e.g. empty content): avoid ZeroDivisionError
        return {}
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def freq_word(items):
    """Count the word frequencies of one text.

    input:
        items: one record dict,
               e.g. {'_id': comment id, 'news_id': news id, 'content': text}
    output:
        top_word: dict mapping word -> relative frequency, {word: freq, ...}
    """
    # feed the extracted nouns straight into Counter instead of copying them
    # into a list one element at a time
    counter = Counter(cut_words_noun(items['content']))
    total = sum(counter.values())  # total token count
    if total == 0:
        # no nouns extracted (e.g. empty content): avoid ZeroDivisionError
        return {}
    return {k: float(v) / float(total) for k, v in counter.most_common()}
def text_classify(inputs, word_label, tfidf_word):
    """Compute each comment's weight for every class and assign it to the
    class with the largest weight.

    input:
        inputs: list of comment dicts,
                [{'_id': comment id, 'news_id': news id, 'content': text}]
        word_label: word clustering result, {'label': [word1, word2, ...]}
        tfidf_word: tf-idf top-k words and weights, [(word, weight)]
    output:
        the same dicts, each with 'label' and 'weight' fields filled in
    """
    # tf-idf pairs as a lookup dict; dict() replaces the manual loop with
    # its unused enumerate index
    word_weight = dict(tfidf_word)
    # loop variable renamed from 'input' -- don't shadow the builtin
    for entry in inputs:
        text = entry['content']
        text_word = cut_words_noun(text)  # nouns, used for normalization
        denom = float(len(text_word)) + 1.0  # +1 avoids division by zero
        text_weight = {}
        # .items() works on both Python 2 and 3 (iteritems() is Py2-only)
        for label, w_list in word_label.items():
            # .get(w, 0): a cluster word missing from the tf-idf top-k used
            # to raise KeyError; treat it as zero weight instead
            hits = sum(text.count(w) * word_weight.get(w, 0) for w in w_list)
            text_weight[label] = float(hits) / denom
        # max() replaces sorting the whole dict just to read the first entry;
        # on ties both pick the first-encountered maximum
        best_label, best_weight = max(text_weight.items(),
                                      key=lambda kv: kv[1])
        if best_weight != 0:
            entry['label'] = best_label
            entry['weight'] = best_weight
        else:
            # a text whose weight is zero for every class stays unclassified
            entry['label'] = 'other'
            entry['weight'] = 0
    return inputs
def text_classify(inputs, word_label, tfidf_word):
    """Compute each comment's weight for every class and assign it to the
    class with the largest weight.

    input:
        inputs: list of comment dicts,
                [{'_id': comment id, 'news_id': news id, 'content': text}]
        word_label: word clustering result, {'label': [word1, word2, ...]}
        tfidf_word: tf-idf top-k words and weights, [(word, weight)]
    output:
        the same dicts, each with 'label' and 'weight' fields filled in
    """
    # tf-idf pairs as a lookup dict; dict() replaces the manual loop with
    # its unused enumerate index
    word_weight = dict(tfidf_word)
    # loop variable renamed from 'input' -- don't shadow the builtin
    for entry in inputs:
        text = entry['content']
        text_word = cut_words_noun(text)  # nouns, used for normalization
        denom = float(len(text_word)) + 1.0  # +1 avoids division by zero
        text_weight = {}
        # .items() works on both Python 2 and 3 (iteritems() is Py2-only)
        for label, w_list in word_label.items():
            # .get(w, 0): a cluster word missing from the tf-idf top-k used
            # to raise KeyError; treat it as zero weight instead
            hits = sum(text.count(w) * word_weight.get(w, 0) for w in w_list)
            text_weight[label] = float(hits) / denom
        # max() replaces sorting the whole dict just to read the first entry;
        # on ties both pick the first-encountered maximum
        best_label, best_weight = max(text_weight.items(),
                                      key=lambda kv: kv[1])
        if best_weight != 0:
            entry['label'] = best_label
            entry['weight'] = best_weight
        else:
            # a text whose weight is zero for every class stays unclassified
            entry['label'] = 'other'
            entry['weight'] = 0
    return inputs