def extract_keyword(items):
    """Accumulate weighted keyword scores over a batch of news items.

    input:
        items: list of news dicts, e.g. [{'title': ..., 'content': ...}, ...]
    output:
        dict mapping keyword -> accumulated weight, after dropping words
        whose weight ratio is >= 0.8 (too dominant) or whose weight is
        <= 3 (too rare)
    """
    # NOTE(review): relies on module-level cut_words, title_term_weight and
    # content_term_weight defined elsewhere in this file.
    keywords_weight = {}
    for item in items:
        # Title terms and content terms contribute different weights.
        for term in cut_words(item['title']):
            keywords_weight[term] = keywords_weight.get(term, 0) + title_term_weight
        for term in cut_words(item['content']):
            keywords_weight[term] = keywords_weight.get(term, 0) + content_term_weight

    # Drop words with frequency ratio >= 0.8 or absolute weight <= 3.
    keywords_count = {}
    total_weight = sum(keywords_weight.values())
    # .items() instead of the old .iteritems(): works on Python 2 and 3.
    for keyword, weight in keywords_weight.items():
        ratio = float(weight) / float(total_weight)
        if ratio >= 0.8 or weight <= 3:
            continue
        keywords_count[keyword] = weight
    return keywords_count
def freq_word_evaluation_half(items, topk=10, topk_weight=5):
    '''
    Select the comments ranked in the top ``topk_weight`` by weight and
    count the topk most frequent words among them.

    input:
        items: list of comment dicts, e.g. [{'_id': ..., 'content': ...,
               'lable': ..., 'weight': ...}, ...]
        topk: how many most-frequent words to return (default 10)
        topk_weight: how many highest-weight comments to use (default 5)
    output:
        (keywords_dict, total_weight): {word: count} for the topk words,
        plus the total word-occurrence count over the selected comments
    '''
    # Rank comments by weight, descending.
    # BUGFIX: the original initialized idx = 0 but never incremented it, so
    # weight_dict only ever held one entry and the ranking was broken;
    # enumerate() restores the intended per-comment indexing.
    weight_dict = {idx: item['weight'] for idx, item in enumerate(items)}
    # .items() instead of the old .iteritems(): works on Python 2 and 3.
    sorted_weight = sorted(weight_dict.items(), key=lambda kv: kv[1], reverse=True)
    half_item = [items[int(idx)] for idx, _ in sorted_weight[:topk_weight]]

    words_list = []
    for item in half_item:
        words_list.extend(cut_words(item['content']))
    counter = Counter(words_list)
    total_weight = sum(counter.values())  # total word occurrences
    keywords_dict = dict(counter.most_common(topk))
    return keywords_dict, total_weight
def freq_word(items, topk=20):
    '''
    Count the topk most frequent words over a batch of texts.

    input:
        items: list of news dicts, e.g. [{'_id': ..., 'source_from_name': ...,
               'title': ..., 'content': ..., 'timestamp': ..., 'lable': ...}, ...]
        topk: how many most-frequent words to return (default 20)
    output:
        (keywords_dict, total_weight): {word: count} for the topk words,
        plus the total word-occurrence count over all texts
    '''
    from utils import cut_words
    from collections import Counter
    words_list = []
    for item in items:
        # Title and body are segmented together.
        words_list.extend(cut_words(item['title'] + item['content']))
    counter = Counter(words_list)
    # Equivalent to the old sum(dict(counter.most_common()).values()) but
    # without materializing a sorted copy of the whole counter.
    total_weight = sum(counter.values())
    keywords_dict = dict(counter.most_common(topk))
    return keywords_dict, total_weight
def word_bag(word, inputs, gram): ''' 取出一个词前三个词和后三个词,包括动词、名词、形容词 输入数据: word:选取出的特征词 inputs:过滤后的评论文本 gram:取特征词前面gram个词和后面gram个词 输出数据: counter_dict:{特征词:counter(在每个维度上特征值)} ''' #一个词与前后三个词构成词袋 words_bag = [] counter_dict = {} for w in word: for input in inputs: if w[0] in input['content']: text = input['content'] words = cut_words(text) if w[0] in words: index = words.index(w[0]) if index - gram < 0: bag = words[:index] else: bag = words[index - gram:index] bag.extend(words[index:index + gram]) words_bag.extend(bag) counter = Counter(words_bag) top_words = counter.most_common() counter_dict[w] = {k: v for k, v in top_words} #特征词列表 feature_list = list(set(words_bag)) return counter_dict, feature_list
def word_bag(word, inputs, gram):
    '''
    For every feature word, collect a context window of surrounding words
    from each comment that contains it.

    input:
        word: selected feature items (the word string is accessed as w[0])
        inputs: filtered comment dicts carrying a 'content' field
        gram: window size around the feature word
    output:
        counter_dict: {feature item: {context word: count}}
        feature_list: deduplicated list of all collected context words
    '''
    words_bag = []
    counter_dict = {}
    for w in word:
        target = w[0]
        for input in inputs:
            content = input['content']
            # Fast substring check first, then confirm on segmented tokens.
            if target not in content:
                continue
            words = cut_words(content)
            if target not in words:
                continue
            index = words.index(target)
            start = index - gram
            bag = words[:index] if start < 0 else words[start:index]
            bag = bag + words[index:index + gram]
            words_bag.extend(bag)
        # Counter over everything collected so far for this (and prior) words.
        counter_dict[w] = dict(Counter(words_bag).most_common())
    feature_list = list(set(words_bag))
    return counter_dict, feature_list
def filter_comment(inputs):
    """
    Filter one news article's batch of comments.

    Removes '@user' mentions and emoticon markup from each comment, labels
    comments that hit an advertising word or whose useful word count is
    outside [3, 20] as rubbish, and keeps the rest only if their nouns also
    appear in the high-frequency (news + comment) vocabulary.

    input:
        inputs: comment dicts, e.g. [{'_id': ..., 'news_id': ...,
                'content': ...}, ...] — each record must also carry a
                'news_content' field (see loop below)
    output:
        filtered comments: kept ones first, rubbish-labelled ones appended
    """
    # NOTE(review): this loop just leaves news_content set to the LAST
    # record's value (and raises NameError below if inputs is empty) —
    # presumably every record carries the same news text; confirm.
    for r in inputs:
        news_content = r['news_content']
    item_reserved = []
    item_rubbish = []
    at_pattern = r'@(.+?)\s'
    emotion_pattern = r'\[(\S+?)\]'
    for input in inputs:
        rub_label = 0  # 0 = not rubbish
        # Trailing space lets the pattern also match an '@user' at the end.
        text = re.sub(at_pattern, '', input['content'] + ' ')
        text = text.strip(' ')
        text = re.sub(emotion_pattern, '', text)
        words = cut_words(text)
        if len(words) >= 3 and len(words) <= 20:
            for word in words:
                if word in market_words:
                    rub_label = 1  # hit an advertising word: rubbish
                    input['rub_label'] = rub_label
                    item_rubbish.append(input)
                    break
            if rub_label == 0:
                # Keep the cleaned text on the record.
                input['content'] = text
                item_reserved.append(input)
        else:
            # Too short or too long after cleaning.
            rub_label = 1
            input['rub_label'] = rub_label
            item_rubbish.append(input)
    # Keep a comment only if one of its nouns also appears in the news text.
    comment_top, comment_noun = freq_word_comment(item_reserved)  # comment words + counts
    news_word = freq_word_news(news_content)  # news words + counts
    imp_word = word_list(comment_top, news_word)  # merged vocabulary
    results = comment_word_in_news(comment_noun, imp_word, item_reserved)
    return results + item_rubbish
def filter_comment(inputs):
    """
    Filter a group of comments belonging to one news article.

    Each comment has '@user' mentions and emoticon markup stripped; it is
    marked rubbish if it hits an advertising word or has fewer than 3 or
    more than 20 useful words. Surviving comments are then kept only when
    their nouns occur in the high-frequency (news + comment) vocabulary.

    input:
        inputs: comment dicts, e.g. [{'_id': ..., 'news_id': ...,
                'content': ..., 'news_content': ...}, ...]
    output:
        filtered comments (kept first, rubbish-labelled appended)
    """
    # Every record carries the news text; the last one seen is used.
    for record in inputs:
        news_content = record['news_content']

    kept = []
    rubbish = []
    mention_pattern = r'@(.+?)\s'
    emoticon_pattern = r'\[(\S+?)\]'
    for input in inputs:
        # The appended space also catches an '@user' sitting at the end.
        cleaned = re.sub(mention_pattern, '', input['content'] + ' ').strip(' ')
        cleaned = re.sub(emoticon_pattern, '', cleaned)
        words = cut_words(cleaned)
        if 3 <= len(words) <= 20:
            if any(token in market_words for token in words):
                input['rub_label'] = 1  # advertising word hit
                rubbish.append(input)
            else:
                input['content'] = cleaned
                kept.append(input)
        else:
            input['rub_label'] = 1  # too short or too long
            rubbish.append(input)

    # Retain a comment only when its nouns also appear in the news.
    comment_top, comment_noun = freq_word_comment(kept)
    news_word = freq_word_news(news_content)
    imp_word = word_list(comment_top, news_word)
    results = comment_word_in_news(comment_noun, imp_word, kept)
    return results + rubbish
def process_for_cluto(inputs, cluto_input_folder=None): """ 数据预处理函数 input: inputs: 新闻数据, 示例:[{'_id':新闻id,'source_from_name':新闻来源,'title':新闻标题,'content':新闻内容,'timestamp':时间戳}] output: cluto输入文件路径 """ # handle default if not cluto_input_folder: cluto_input_folder = os.path.join(AB_PATH, CLUTO_FOLDER) feature_set = set() # 不重复的词集合 words_list = [] # 所有新闻分词结果集合 for input in inputs: text = input['title'] + input['content'] words = cut_words(text) words_list.append(words) # 特征词字典 dictionary = corpora.Dictionary(words_list) # 将feature中的词转换成列表 feature_set = set(dictionary.keys()) row_count = len(inputs) # documents count column_count = len(feature_set) # feature count nonzero_count = 0 # nonzero elements count # 文件名以PID命名 if not os.path.exists(cluto_input_folder): os.makedirs(cluto_input_folder) file_name = os.path.join(cluto_input_folder, '%s.txt' % os.getpid()) with open(file_name, 'w') as fw: lines = [] for words in words_list: bow = dictionary.doc2bow(words) nonzero_count += len(bow) line = ' '.join(['%s %s' % (w + 1, c) for w, c in bow]) + '\n' lines.append(line) fw.write('%s %s %s\n' % (row_count, column_count, nonzero_count)) fw.writelines(lines) return file_name
def freq_word(items):
    '''
    Compute relative word frequencies for a single text.

    input:
        items: one comment dict, e.g. {'_id': ..., 'news_id': ...,
               'content': ...}
    output:
        top_word: dict mapping word -> relative frequency,
                  e.g. {word: freq, word: freq, ...}
    '''
    # Feed the segmented words straight into Counter; the old code copied
    # them into a second list one element at a time for no benefit.
    counter = Counter(cut_words(items['content']))
    total = sum(counter.values())  # total word occurrences
    # No words -> empty dict; the division is never reached, so no
    # ZeroDivisionError (same as the original behavior).
    return {word: float(count) / float(total) for word, count in counter.items()}
def freq_word(items):
    '''
    Compute the relative frequency of every word in one text.

    input:
        items: a single comment dict: {'_id': ..., 'news_id': ...,
               'content': ...}
    output:
        top_word: dict mapping word -> frequency ratio,
                  e.g. {word: freq, word: freq, ...}
    '''
    segmented = cut_words(items['content'])
    collected = []
    for token in segmented:
        collected.append(token)
    counter = Counter(collected)
    total = sum(counter.values())  # total token count
    top_word = {}
    for token, count in counter.most_common():
        top_word[token] = float(count) / float(total)
    return top_word
def freq_word_evaluation(items, topk=10):
    '''
    For cluster evaluation: count the topk most frequent words over one
    class of texts.

    input:
        items: list of dicts, e.g. [{'_id': ..., 'content': ...,
               'lable': ...}, ...]
        topk: how many most-frequent words to return (default 10)
    output:
        (keywords_dict, total_weight): {word: count} for the topk words,
        plus the total word-occurrence count across all texts
    '''
    words_list = []
    for item in items:
        words_list.extend(cut_words(item['content']))
    counter = Counter(words_list)
    # Equivalent to the old sum(dict(counter.most_common()).values()) but
    # without building a sorted copy of the entire counter just to sum it.
    total_weight = sum(counter.values())
    keywords_dict = dict(counter.most_common(topk))
    return keywords_dict, total_weight
def freq_word_evaluation_half(items, topk=10, topk_weight=5):
    '''
    Take the ``topk_weight`` highest-weighted comments of a cluster and
    count the topk most frequent words among them.

    input:
        items: list of comment dicts, e.g. [{'_id': ..., 'content': ...,
               'lable': ..., 'weight': ...}, ...]
        topk: how many most-frequent words to return (default 10)
        topk_weight: how many top-weighted comments to select (default 5)
    output:
        (keywords_dict, total_weight): {word: count} for the topk words,
        plus the total word-occurrence count over the selected comments
    '''
    # BUGFIX: the original set idx = 0 and never incremented it inside the
    # loop, so every item overwrote weight_dict[0] and only one comment was
    # ever ranked. enumerate() supplies the intended per-item index.
    weight_dict = {}
    for idx, item in enumerate(items):
        weight_dict[idx] = item['weight']
    # Sort comment indices by weight, descending (.items() is valid on both
    # Python 2 and 3, unlike the old .iteritems()).
    sorted_weight = sorted(weight_dict.items(), key=lambda pair: pair[1], reverse=True)
    result_weight = sorted_weight[:topk_weight]
    half_item = [items[int(pos)] for pos, _weight in result_weight]

    words_list = []
    for item in half_item:
        words_list.extend(cut_words(item['content']))
    counter = Counter(words_list)
    total_weight = sum(counter.values())  # total word occurrences
    keywords_dict = dict(counter.most_common(topk))
    return keywords_dict, total_weight
def ad_filter(item, market_words=market_words):
    """
    Rule-based single-comment filter: strip '@user' mentions and emoticons,
    then label the comment as rubbish when it hits an advertising word or
    when its useful word count falls outside [3, 20].

    input:
        item: comment dict, e.g. {'_id': ..., 'content': ...}
    output:
        the same dict, augmented with:
            ad_label: 0 = not rubbish, 1 = rubbish/ad
            text_filter_ad: the cleaned comment text
    """
    text = remove_emoticon(remove_at(item['content']))
    words = cut_words(text)
    if 3 <= len(words) <= 20:  # chained comparison, same bounds as before
        # isdisjoint avoids building set(market_words) on every call.
        ad_label = 0 if set(words).isdisjoint(market_words) else 1
    else:
        ad_label = 1  # too few or too many useful words
    item['ad_label'] = ad_label
    item['text_filter_ad'] = text
    return item
def ad_filter(item, market_words=market_words):
    """
    Simple rule-based filter for one comment: remove '@user' mentions and
    emoticons, then keep the comment only if it has between 3 and 20
    useful words and contains no advertising word.

    input:
        item: comment dict, e.g. {'_id': ..., 'content': ...}
    output:
        the same dict with 'ad_label' (0 = keep, 1 = rubbish) and
        'text_filter_ad' (the cleaned text) added
    """
    cleaned = item['content']
    cleaned = remove_at(cleaned)
    cleaned = remove_emoticon(cleaned)
    tokens = cut_words(cleaned)
    token_count = len(tokens)
    ad_label = 0  # assume the comment is clean by default
    if token_count < 3 or token_count > 20:
        ad_label = 1  # outside the acceptable length range
    elif set(tokens) & set(market_words):
        ad_label = 1  # overlaps the marketing vocabulary
    item['ad_label'] = ad_label
    item['text_filter_ad'] = cleaned
    return item