def seed_trigger(self, seed_words):
    """Rank each event's trigger words by frequency and keep the top entries.

    For every event label the candidate words are ranked via
    ``self.sort_words`` and the ranked result is persisted both as CSV and
    JSON under ``self.trigger_path``.

    :param seed_words: mapping of event label -> list of candidate words
    :return: dict mapping each event label to its ranked (word, count) pairs
    """
    ranked_by_event = {
        event: self.sort_words(items=words)
        for event, words in seed_words.items()
    }
    datapreprocess.write_file(
        path=self.trigger_path + "top_event_trigger.csv",
        item_data=pd.DataFrame(ranked_by_event).T)
    datapreprocess.write_json_file(
        path=self.trigger_path + "top_event_trigger.json",
        item_data=ranked_by_event)
    return ranked_by_event
def form_seed_words(self):
    """Collect the initial seed trigger words for every event label.

    Groups the tokenised contents of the training set by label: each event
    accumulates all words seen in documents carrying that label.  The raw
    grouping is persisted as ``init_trigger.json``.

    :return: dict mapping int event label -> list of trigger words
    """
    logger.info("形成种子事件词...")
    target_train_data = self.target_train_data.copy()[[
        "Label", "Split_content"
    ]]
    trigger_words = {}
    # itertuples replaces the removed DataFrame.ix indexer (gone since
    # pandas 1.0) and avoids one positional lookup per cell.
    for row in target_train_data.itertuples(index=False):
        label = int(row.Label)
        # Split_content is stored as a stringified Python list.
        words = literal_eval(row.Split_content)
        trigger_words.setdefault(label, []).extend(words)
    datapreprocess.write_json_file(
        path=self.trigger_path + "init_trigger.json",
        item_data=trigger_words)
    return trigger_words
def add_new_word(self,
                 threshold=0.8,
                 top=2000,
                 enrich=True,
                 source_from=1,
                 new=False):
    """Enrich the trigger lexicon with high-frequency words from a corpus.

    Extracts the ``top`` most frequent words from either the background
    corpus (``source_from == 1``) or the test set, assigns each unseen word
    to the event whose seed vector it is most cosine-similar to (if the
    similarity reaches ``threshold``), truncates every event's list to the
    size of the smallest one, and persists the intermediate and final
    lexicons under ``self.trigger_path``.

    :param threshold: minimum cosine similarity for a word to be adopted
    :param top: number of high-frequency candidate words to consider
    :param enrich: when False, return a copy of the seed lexicon unchanged
    :param source_from: 1 -> use ``self.corpora_split_content``,
        otherwise load the test-set Split_content column
    :param new: when True, start from empty event lists instead of the
        seed triggers (and do not skip already-known words)
    :return: dict mapping event label -> list of (word, frequency) pairs
    """
    if not enrich:
        return self.top_seed_trigger.copy()

    logger.info("丰富种子动词形成触发器词...")
    if source_from == 1:
        corpora_split_content = self.corpora_split_content.copy()
    else:
        corpora_split_content = \
            datapreprocess.read_file(
                path="../Data/pre_process/target_test_x.csv"
            )["Split_content"]

    model = Word2Vec.load(self.model_path)
    seed_event_vec = self.seed_event_vec

    if new:
        new_trigger_words = {1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
        # with a fresh lexicon nothing is "already known"
        known_words = set()
    else:
        # NOTE(review): shallow copy — appends below mutate the lists shared
        # with self.top_seed_trigger; preserved from the original behaviour.
        new_trigger_words = self.top_seed_trigger.copy()
        known_words = {
            wd[0]
            for value in new_trigger_words.values() for wd in value
        }

    # Normalize to an iterable of serialized token lists: a DataFrame when
    # source_from == 1, already a Series otherwise (the original .ix lookup
    # on the Series was broken).
    if isinstance(corpora_split_content, pd.DataFrame):
        contents = corpora_split_content["Split_content"]
    else:
        contents = corpora_split_content
    all_word = []
    for content in contents:
        all_word.extend(literal_eval(content))
    high_frequency_words = self.sort_words(all_word, top=top)

    # Assign each new high-frequency word to its most similar event.
    for item in high_frequency_words:
        if item[0] in known_words:
            continue
        match = self._best_event(model, seed_event_vec, item[0])
        if match is None:
            continue
        event, simi = match
        if simi >= threshold:
            logger.info(
                "增加词:{word},频率:{frequency},相似度:{simi},对应事件{event}".format(
                    word=item[0],
                    frequency=item[1],
                    simi=simi,
                    event=event))
            new_trigger_words[event].append(item)

    datapreprocess.write_json_file(
        path=self.trigger_path + "event_words_add_new.json",
        item_data=new_trigger_words)

    # Truncate every event list to the size of the smallest one so all
    # events contribute equally many trigger words.
    high_top = min(len(value) for value in new_trigger_words.values())
    logger.info("每类标签的词数{num}".format(num=high_top))
    final_trigger_words = {
        key: value[:high_top]
        for key, value in new_trigger_words.items()
    }
    datapreprocess.write_json_file(
        path=self.trigger_path + "final_event_words.json",
        item_data=final_trigger_words)
    datapreprocess.write_file(
        path=self.trigger_path + "final_event_word.csv",
        item_data=pd.DataFrame(final_trigger_words).T)
    return final_trigger_words

def _best_event(self, model, seed_event_vec, word):
    """Return (event_label, max_cosine) for ``word``, or None if unknown.

    The event label is the 1-based position of the best-matching key in
    ``sorted(seed_event_vec)``, matching the original indexing scheme.
    Membership is tested on ``model.wv`` (testing the Word2Vec model object
    itself was removed in gensim 4).
    """
    if word not in model.wv:
        return None
    word_vec = model.wv[word]
    cosine_value = [
        cosine_similarity([seed_event_vec[key], word_vec])[1, 0]
        for key in sorted(seed_event_vec.keys())
    ]
    if not cosine_value:
        return None
    best = max(cosine_value)
    return cosine_value.index(best) + 1, best