Example #1
0
 def seed_trigger(self, seed_words):
     """
     Build the top trigger words for every event class.

     Each event's candidate words are ranked by frequency through
     ``self.sort_words`` (which keeps only the leading entries), then the
     result is persisted under ``self.trigger_path`` both as CSV and JSON.

     :param seed_words: mapping of event label -> raw candidate word list
     :return: dict mapping each event label to its sorted top words
     """
     top_seed_trigger = {
         event: self.sort_words(items=words)
         for event, words in seed_words.items()
     }
     # Persist a tabular snapshot (events as rows) and the raw mapping.
     datapreprocess.write_file(
         path=self.trigger_path + "top_event_trigger.csv",
         item_data=pd.DataFrame(top_seed_trigger).T)
     datapreprocess.write_json_file(
         path=self.trigger_path + "top_event_trigger.json",
         item_data=top_seed_trigger)
     return top_seed_trigger
Example #2
0
 def form_seed_words(self):
     """
     Collect the initial trigger (seed) words per event label.

     Groups the tokenized contents of the training set by their integer
     ``Label``: all ``Split_content`` token lists sharing a label are
     concatenated into one list.  The mapping is written to
     ``<trigger_path>/init_trigger.json`` as a side effect.

     :return: dict mapping int event label -> list of seed words
     """
     logger.info("形成种子事件词...")
     target_train_data = self.target_train_data.copy()[[
         "Label", "Split_content"
     ]]
     trigger_words = {}
     # iterrows replaces the removed pandas ``.ix`` indexer and avoids the
     # positional/label ambiguity the original lookup relied on.
     for _, row in target_train_data.iterrows():
         label = int(row["Label"])
         # ``Split_content`` is stored as a stringified Python list;
         # literal_eval turns it back into a list of tokens.
         tokens = literal_eval(row["Split_content"])
         trigger_words.setdefault(label, []).extend(tokens)
     datapreprocess.write_json_file(path=self.trigger_path +
                                    "init_trigger.json",
                                    item_data=trigger_words)
     return trigger_words
Example #3
0
    def _most_similar_event(self, model, seed_event_vec, events, word,
                            threshold):
        """
        Find the seed event whose centroid vector is closest to *word*.

        :param model: loaded gensim Word2Vec model
        :param seed_event_vec: mapping event label -> seed event vector
        :param events: sorted list of event labels (keys of seed_event_vec)
        :param word: candidate word to match
        :param threshold: minimum accepted cosine similarity
        :return: ``(event, similarity)`` of the best match, or ``None`` when
                 the word is out of vocabulary or no event reaches the
                 threshold.
        """
        # Membership must be tested on ``model.wv``: ``word in model`` was
        # deprecated in gensim 3 and removed in gensim 4.
        if word not in model.wv:
            return None
        word_vec = model.wv[word]
        similarities = [
            cosine_similarity([seed_event_vec[event], word_vec])[1, 0]
            for event in events
        ]
        if not similarities:
            return None
        best = max(similarities)
        if best < threshold:
            return None
        return events[similarities.index(best)], best

    def add_new_word(self,
                     threshold=0.8,
                     top=2000,
                     enrich=True,
                     source_from=1,
                     new=False):
        """
        Enrich the seed trigger words with high-frequency corpus words.

        The ``top`` most frequent words of the chosen corpus are matched
        against each event's seed vector; a word is attached to the event
        with the highest cosine similarity when that similarity reaches
        ``threshold``.  All event word lists are finally trimmed to the
        length of the shortest one, and intermediate/final results are
        written to ``self.trigger_path`` as JSON/CSV side effects.

        :param threshold: minimum cosine similarity to accept a new word
        :param top: number of high-frequency words considered
        :param enrich: when False, return a copy of the existing
                       ``self.top_seed_trigger`` untouched
        :param source_from: 1 -> use ``self.corpora_split_content``,
                            anything else -> load the test-set split content
        :param new: when True, rebuild the trigger lists from scratch
                    (only matched high-frequency words); when False, append
                    matches to the existing seed triggers
        :return: dict event label -> list of (word, frequency) pairs
        """
        if not enrich:
            return self.top_seed_trigger.copy()
        logger.info("丰富种子动词形成触发器词...")
        # Pick the word source: the training corpus or the test split.
        if source_from == 1:
            corpora_split_content = self.corpora_split_content.copy()
        else:
            corpora_split_content = \
                datapreprocess.read_file(path="../Data/pre_process/target_test_x.csv")["Split_content"]
        model = Word2Vec.load(self.model_path)
        seed_event_vec = self.seed_event_vec
        events = sorted(seed_event_vec.keys())

        # Flatten every stringified token list into one big word list.
        # The source may be a DataFrame (training corpus) or a Series
        # (test split) — normalize to an iterable of raw cell values; this
        # also replaces the removed pandas ``.ix`` indexer.
        if isinstance(corpora_split_content, pd.DataFrame):
            raw_contents = corpora_split_content["Split_content"]
        else:
            raw_contents = corpora_split_content
        all_word = []
        for raw in raw_contents:
            all_word.extend(literal_eval(raw))
        high_frequency_words = self.sort_words(all_word, top=top)

        if new:
            # Rebuild from scratch: one empty bucket per event, and no word
            # is excluded for already being a seed trigger.
            new_trigger_words = {event: [] for event in events}
            known_words = set()
        else:
            new_trigger_words = self.top_seed_trigger.copy()
            known_words = {
                wd[0]
                for value in new_trigger_words.values() for wd in value
            }

        for item in high_frequency_words:
            if item[0] in known_words:
                continue
            match = self._most_similar_event(model, seed_event_vec, events,
                                             item[0], threshold)
            if match is None:
                continue
            event, similarity = match
            logger.info(
                "增加词:{word},频率:{frequency},相似度:{simi},对应事件{event}".format(
                    word=item[0],
                    frequency=item[1],
                    simi=similarity,
                    event=event))
            new_trigger_words[event].append(item)

        datapreprocess.write_json_file(path=self.trigger_path +
                                       "event_words_add_new.json",
                                       item_data=new_trigger_words)
        # Trim every event to the size of the smallest one so classes stay
        # balanced.
        high_top = min(len(words) for words in new_trigger_words.values())
        logger.info("每类标签的词数{num}".format(num=high_top))
        final_trigger_words = {
            event: words[:high_top]
            for event, words in new_trigger_words.items()
        }
        datapreprocess.write_json_file(path=self.trigger_path +
                                       "final_event_words.json",
                                       item_data=final_trigger_words)
        datapreprocess.write_file(
            path=self.trigger_path + "final_event_word.csv",
            item_data=pd.DataFrame(final_trigger_words).T)
        return final_trigger_words