def combine_features(self, news):
    """Build a per-candidate-keyword feature matrix for one news item.

    Returns a list of ``[[ner], [feature, ...]]`` pairs, one per candidate
    keyword, where the feature vector mixes precomputed scores with simple
    surface statistics (lengths, counts, positions).

    When ``self.load_from_file`` is True, per-keyword base features come
    from a pre-tokenized jieba result file; otherwise candidates and their
    scores come from the tf-idf and TextRank scorers.
    """
    features = []
    # Loop-invariant values, hoisted so the title+content string is not
    # re-concatenated for every single candidate keyword.
    full_text = news['title'] + news['content']
    title_len = len(news['title'])
    content_len = len(news['content'])
    if self.load_from_file is True:
        # Lazily load the pre-tokenized feature dict on first use.
        if self.feature_data_dict is None:
            self.feature_data_dict = fenci.get_fenci_feature_func(
                '../jieba_fenci_model/result/result_jieba_fenci.txt')
        for ner in self.feature_data_dict[news['newsId']]:
            features.append([
                [ner],
                self.feature_data_dict[news['newsId']][ner] + [
                    len(ner),
                    self.num_of_not_word(ner),
                    news['content'].count(ner),   # frequency in body
                    news['title'].count(ner),     # frequency in title
                    full_text.count(ner),         # total frequency
                    full_text.index(ner),         # first occurrence position
                    full_text.rindex(ner),        # last occurrence position
                    title_len,                    # title length
                    content_len                   # body length
                ]
            ])
        return features

    content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
    content_words_textRank, title_words_textRank = self.get_textRank_Score(
        news)
    # Union of every candidate produced by either scorer.
    # NOTE(review): .index/.rindex below assume every candidate actually
    # occurs in title+content — presumably guaranteed by the scorers;
    # a missing candidate would raise ValueError.
    keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
            | content_words_textRank.keys() | title_words_textRank.keys())
    for ner in keys:
        features.append([
            [ner],
            [
                content_words_tfidf.get(ner, 0),     # tf-idf in body
                title_words_tfidf.get(ner, 0),       # tf-idf in title
                content_words_textRank.get(ner, 0),  # TextRank in body
                title_words_textRank.get(ner, 0),    # TextRank in title
                len(ner),                     # entity length
                self.num_of_not_word(ner),    # count of non-word symbols
                news['content'].count(ner),   # frequency in body
                news['title'].count(ner),     # frequency in title
                full_text.count(ner),         # total frequency
                full_text.index(ner),         # first occurrence position
                full_text.rindex(ner),        # last occurrence position
                title_len,                    # title length
                content_len                   # body length
                # keyword probability over the training set was tried and
                # hurt performance, so it is intentionally left out
            ]
        ])
    return features
def combine_features(self, news):
    """Build a per-candidate-keyword feature matrix for one news item.

    Variant that extends the base surface statistics with BERT membership
    (``bert_obj.is_in_bert``) and LTP label features (``ltp_obj.get_label``
    with ``model=3`` yielding one value and ``model=1`` yielding a list
    that is concatenated onto the vector).  Returns a list of
    ``[[ner], [feature, ...]]`` pairs.
    """
    features = []
    # Loop-invariant values, hoisted so the title+content string is not
    # re-concatenated for every single candidate keyword.
    full_text = news['title'] + news['content']
    title_len = len(news['title'])
    content_len = len(news['content'])
    # NOTE(review): the hard-coded True bypasses self.load_from_file, so
    # the tf-idf/TextRank branch below is currently unreachable; it is
    # kept so the original behaviour can be restored by reverting this
    # condition.
    if True:  # if self.load_from_file is True:
        if self.feature_data_dict is None:
            print('加载预分词')
            self.feature_data_dict = fenci.get_fenci_feature_func(
                '../jieba_fenci_model/result/result_jieba_fenci.txt')
        for ner in self.feature_data_dict[news['newsId']]:
            features.append([
                [ner],
                self.feature_data_dict[news['newsId']][ner] + [
                    len(ner),
                    self.num_of_not_word(ner),
                    news['content'].count(ner),   # frequency in body
                    news['title'].count(ner),     # frequency in title
                    full_text.count(ner),         # total frequency
                    full_text.index(ner),         # first occurrence position
                    full_text.rindex(ner),        # last occurrence position
                    title_len,                    # title length
                    content_len,                  # body length
                    self.bert_obj.is_in_bert(news['newsId'], ner),
                    self.ltp_obj.get_label(ner, news['newsId'], model=3)
                    # keyword probability over the training set was tried
                    # and hurt performance, so it is left out
                ] + self.ltp_obj.get_label(ner, news['newsId'], model=1)
            ])
        return features

    content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
    content_words_textRank, title_words_textRank = self.get_textRank_Score(
        news)
    # Union of every candidate produced by either scorer.
    keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
            | content_words_textRank.keys() | title_words_textRank.keys())
    for ner in keys:
        # A one-hot POS encoding was tried here and replaced by the single
        # scaled POS-index feature inside the vector below.
        features.append([
            [ner],
            [
                content_words_tfidf.get(ner, 0),     # tf-idf in body
                title_words_tfidf.get(ner, 0),       # tf-idf in title
                content_words_textRank.get(ner, 0),  # TextRank in body
                title_words_textRank.get(ner, 0),    # TextRank in title
                len(ner),                     # entity length
                self.num_of_not_word(ner),    # count of non-word symbols
                news['content'].count(ner),   # frequency in body
                news['title'].count(ner),     # frequency in title
                full_text.count(ner),         # total frequency
                full_text.index(ner),         # first occurrence position
                full_text.rindex(ner),        # last occurrence position
                title_len,                    # title length
                content_len,                  # body length
                # scaled index of the keyword's POS tag, falling back to
                # the index of 'n' when the tag is unknown
                (self.key_word_pos.index(self.word_pos[ner])
                 if (ner in self.word_pos
                     and self.word_pos[ner] in self.key_word_pos)
                 else self.key_word_pos.index('n')) * 0.1,
                self.bert_obj.is_in_bert(news['newsId'], ner),
                self.ltp_obj.get_label(ner, news['newsId'], model=3)
                # keyword probability over the training set was tried and
                # hurt performance, so it is left out
            ] + self.ltp_obj.get_label(ner, news['newsId'], model=1)
        ])
    return features
def combine_features(self, news):
    """Build a per-candidate-keyword feature matrix for one news item.

    Returns a list of ``[[ner], [feature, ...]]`` pairs.  When
    ``self.load_from_file`` is True the per-keyword base features come from
    a pre-tokenized jieba result file; otherwise candidates and their
    scores come from the tf-idf and TextRank scorers.
    """
    features = []
    # Loop-invariant values, hoisted so the title+content string is not
    # re-concatenated for every single candidate keyword.
    full_text = news['title'] + news['content']
    title_len = len(news['title'])
    content_len = len(news['content'])
    if self.load_from_file is True:
        # Lazily load the pre-tokenized feature dict on first use.
        if self.feature_data_dict is None:
            self.feature_data_dict = fenci.get_fenci_feature_func(
                '../jieba_fenci_model/result/result_jieba_fenci.txt')
        for ner in self.feature_data_dict[news['newsId']]:
            features.append([
                [ner],
                self.feature_data_dict[news['newsId']][ner] + [
                    len(ner),
                    self.num_of_not_word(ner),
                    news['content'].count(ner),   # frequency in body
                    news['title'].count(ner),     # frequency in title
                    full_text.count(ner),         # total frequency
                    full_text.index(ner),         # first occurrence position
                    full_text.rindex(ner),        # last occurrence position
                    title_len,                    # title length
                    content_len                   # body length
                ]
            ])
        return features

    content_words_tfidf, title_words_tfidf = self.get_tfidf_Score(news)
    content_words_textRank, title_words_textRank = self.get_textRank_Score(
        news)
    # Union of every candidate produced by either scorer.  (A redundant
    # re-initialisation of `features` that used to sit here was removed:
    # the list is still empty on this path.)
    keys = (content_words_tfidf.keys() | title_words_tfidf.keys()
            | content_words_textRank.keys() | title_words_textRank.keys())
    for ner in keys:
        features.append([
            [ner],
            [
                content_words_tfidf.get(ner, 0),     # tf-idf in body
                title_words_tfidf.get(ner, 0),       # tf-idf in title
                content_words_textRank.get(ner, 0),  # TextRank in body
                title_words_textRank.get(ner, 0),    # TextRank in title
                len(ner),                     # entity length
                self.num_of_not_word(ner),    # count of non-word symbols
                news['content'].count(ner),   # frequency in body
                news['title'].count(ner),     # frequency in title
                full_text.count(ner),         # total frequency
                full_text.index(ner),         # first occurrence position
                full_text.rindex(ner),        # last occurrence position
                title_len,                    # title length
                content_len                   # body length
            ]
        ])
    # Feature-matrix normalisation and a pkuseg-based candidate source were
    # both tried here and hurt performance, so they were removed.
    return features