Example 1
def form_clean_corpora(raw, company):
    """
    Extract from the raw corpus only the rows whose company is in `company`.
    :param raw: raw corpus as a DataFrame with a "Company" column
    :param company: list of companies to keep
    :return: the filtered DataFrame
    """
    indexs = []
    for i in range(len(raw.index)):
        # collect rows whose company is not in the target list
        # (positional access replaces the removed DataFrame.ix accessor;
        #  dropping by `i` assumes a default RangeIndex, as the original did)
        if raw.iloc[i]["Company"] not in company:
            indexs.append(i)
    clean_data = raw.drop(indexs, axis=0)
    datapreprocess.write_file(path="../Data_2/raw/clean_company_corpora.csv",
                              item_data=clean_data)
    return clean_data
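The same filtering can be written with a vectorized boolean mask instead of a positional loop. A minimal sketch, assuming `datapreprocess.write_file` simply writes a DataFrame to CSV (the `write_file` stand-in below is hypothetical):

import pandas as pd

def write_file(path, item_data):
    # hypothetical stand-in for datapreprocess.write_file
    item_data.to_csv(path, index=False)

def form_clean_corpora_masked(raw, company):
    # keep only rows whose Company value is in the target list
    clean_data = raw[raw["Company"].isin(company)].copy()
    write_file(path="clean_company_corpora.csv", item_data=clean_data)
    return clean_data

if __name__ == "__main__":
    raw = pd.DataFrame({"Company": ["A", "B", "C"], "Text": ["x", "y", "z"]})
    print(form_clean_corpora_masked(raw, ["A", "C"]))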
Example 2
def seed_trigger(self, seed_words):
    """
    :param seed_words: mapping of event class -> candidate trigger words
    :return: for each event class, its trigger words sorted by frequency,
             keeping only the top entries
    """
    top_seed_trigger = {}
    for event, value in seed_words.items():
        # rank each event's candidate words by frequency and keep the top ones
        top_seed_trigger[event] = self.sort_words(items=value)
    datapreprocess.write_file(path=self.trigger_path +
                              "top_event_trigger.csv",
                              item_data=pd.DataFrame(top_seed_trigger).T)
    datapreprocess.write_json_file(path=self.trigger_path +
                                   "top_event_trigger.json",
                                   item_data=top_seed_trigger)
    return top_seed_trigger
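`self.sort_words` is not shown in this example; a minimal sketch of what such a frequency ranking might look like, assuming the input is a flat list of words (this standalone `sort_words` is an illustrative assumption, not the project's implementation):

from collections import Counter

def sort_words(items, top=100):
    # count occurrences and return the `top` most frequent (word, count) pairs
    return Counter(items).most_common(top)

if __name__ == "__main__":
    words = ["acquire", "merge", "acquire", "sue", "merge", "acquire"]
    print(sort_words(words, top=2))  # [('acquire', 3), ('merge', 2)]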
Example 3
def seed_vec(self, e_trigger, filename, weight=False):
    """
    :param e_trigger: mapping of event class -> trigger words
    :return: the vector representation of each event class
    """
    logger.info("Computing the event vectors from the seed data...")
    model = Word2Vec.load(self.model_path)
    seed_vec = {}
    for e, value in e_trigger.items():
        # aggregate the trigger-word vectors of each event into a single event vector
        seed_vec[e] = self.compute_vec(item=value,
                                       model=model,
                                       weight=weight)
    datapreprocess.write_file(path=self.trigger_path + filename,
                              item_data=pd.DataFrame(seed_vec).T)
    return seed_vec
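`self.compute_vec` is also not shown; one plausible implementation, assuming each value is a list of (word, count) pairs and the event vector is the (optionally frequency-weighted) mean of the trigger-word vectors, might look like this sketch:

import numpy as np

def compute_vec(item, model, weight=False):
    # average the Word2Vec vectors of the trigger words, optionally weighted by frequency
    vecs, weights = [], []
    for word, count in item:
        if word in model.wv:
            vecs.append(model.wv[word])
            weights.append(count if weight else 1)
    if not vecs:
        return np.zeros(model.wv.vector_size)
    return np.average(np.array(vecs), axis=0, weights=weights)

if __name__ == "__main__":
    from gensim.models import Word2Vec
    model = Word2Vec([["acquire", "merge", "sue"]] * 50, vector_size=20, min_count=1)
    print(compute_vec([("acquire", 3), ("merge", 2)], model, weight=True).shape)  # (20,)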
Example 4
def form_target_data(item, label_map, company):
    # keep only rows whose company is in the target list
    # (positional access replaces the removed DataFrame.ix accessor)
    indexs = []
    for i in range(len(item.index)):
        if item.iloc[i]["Company"] not in company:
            indexs.append(i)
    clean_data = item.drop(indexs, axis=0)
    clean_data = clean_data.dropna()
    # map the raw label values to integer class ids
    clean_data["Label"] = clean_data["Label"].apply(lambda x: label_map[x])
    # print per-company and per-label sample counts as a sanity check
    for i in range(len(company)):
        print(company[i],
              len(clean_data[clean_data["Company"] == company[i]].index))
    for i in range(1, 7):
        print(i, len(clean_data[clean_data["Label"] == i].index))
    print(len(clean_data.index))
    print(clean_data["Company"].unique())
    datapreprocess.write_file(path="../Data_2/raw/target.csv",
                              item_data=clean_data)
    return clean_data
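The per-company and per-label counts printed above can also be obtained with `value_counts`; a small equivalent sketch:

import pandas as pd

clean_data = pd.DataFrame({"Company": ["A", "A", "B"], "Label": [1, 2, 2]})
# vectorized equivalents of the counting loops above
print(clean_data["Company"].value_counts())
print(clean_data["Label"].value_counts().sort_index())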
Example 5
    def add_new_word(self,
                     threshold=0.8,
                     top=2000,
                     enrich=True,
                     source_from=1,
                     new=False):
        """
        Extract the `top` high-frequency new words from the corpus (or the
        test set) and add them to the event trigger words.
        :return: the enriched trigger words per event
        """
        if enrich:
            logger.info("Enriching the seed verbs to form trigger words...")
            # load high-frequency new words from the corpus
            if source_from == 1:
                corpora_split_content = self.corpora_split_content.copy()
            else:
                # load high-frequency new words from the test set
                # (keep the full DataFrame so the "Split_content" column can be read below)
                corpora_split_content = \
                    datapreprocess.read_file(path="../Data/pre_process/target_test_x.csv")
            model = Word2Vec.load(self.model_path)
            new_trigger_words = self.top_seed_trigger.copy()
            seed_event_vec = self.seed_event_vec
            all_word = []
            final_trigger_words = {}
            # sort the corpus words by frequency and keep the top ones
            for i in range(len(corpora_split_content.index)):
                all_word.extend(
                    literal_eval(corpora_split_content.iloc[i]["Split_content"]))
            high_frequency_words = self.sort_words(all_word, top=top)
            new_trigger_words_list = []
            for value in new_trigger_words.values():
                for wd in value:
                    new_trigger_words_list.append(wd[0])
            # add new words to the trigger lexicon
            if not new:
                for item in high_frequency_words:
                    if item[0] not in new_trigger_words_list:
                        cosine_value = []
                        for i in sorted(seed_event_vec.keys()):
                            if item[0] in model.wv:
                                vec_pair = [
                                    seed_event_vec[i], model.wv[item[0]]
                                ]
                                cosine_value.append(
                                    cosine_similarity(vec_pair)[1, 0])
                        if cosine_value and max(cosine_value) >= threshold:
                            event = cosine_value.index(max(cosine_value)) + 1
                            logger.info(
                                "Adding word: {word}, frequency: {frequency}, similarity: {simi}, event: {event}"
                                .format(word=item[0],
                                        frequency=item[1],
                                        simi=max(cosine_value),
                                        event=event))
                            new_trigger_words[event].append(item)
            else:
                # rebuild the trigger lexicon from scratch instead of extending the seeds
                new_trigger_words = {1: [], 2: [], 3: [], 4: [], 5: [], 6: []}
                for item in high_frequency_words:
                    cosine_value = []
                    for i in sorted(seed_event_vec.keys()):
                        if item[0] in model.wv:
                            vec_pair = [seed_event_vec[i], model.wv[item[0]]]
                            cosine_value.append(
                                cosine_similarity(vec_pair)[1, 0])
                    if cosine_value and max(cosine_value) >= threshold:
                        event = cosine_value.index(max(cosine_value)) + 1
                        logger.info(
                            "Adding word: {word}, frequency: {frequency}, similarity: {simi}, event: {event}".
                            format(word=item[0],
                                   frequency=item[1],
                                   simi=max(cosine_value),
                                   event=event))
                        new_trigger_words[event].append(item)

            datapreprocess.write_json_file(path=self.trigger_path +
                                           "event_words_add_new.json",
                                           item_data=new_trigger_words)
            # truncate every event to the size of the smallest event's word list
            high_top = min(len(v) for v in new_trigger_words.values())
            logger.info("Words per label: {num}".format(num=high_top))
            for key, value in new_trigger_words.items():
                final_trigger_words[key] = value[:high_top]
            datapreprocess.write_json_file(path=self.trigger_path +
                                           "final_event_words.json",
                                           item_data=final_trigger_words)
            datapreprocess.write_file(
                path=self.trigger_path + "final_event_word.csv",
                item_data=pd.DataFrame(final_trigger_words).T)
            return final_trigger_words
        else:
            return self.top_seed_trigger.copy()
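The core of `add_new_word` is assigning each high-frequency candidate word to the event whose seed vector is most cosine-similar, provided the similarity reaches `threshold`. A self-contained sketch of just that step (random vectors stand in for real Word2Vec embeddings, and `assign_event` is an illustrative helper, not part of the project):

import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def assign_event(word_vec, seed_event_vec, threshold=0.8):
    # compare the candidate vector against every event vector, as in add_new_word
    events = sorted(seed_event_vec.keys())
    sims = [cosine_similarity([seed_event_vec[e], word_vec])[1, 0] for e in events]
    best = int(np.argmax(sims))
    return events[best] if sims[best] >= threshold else None

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    seed_event_vec = {e: rng.normal(size=50) for e in range(1, 7)}
    candidate = seed_event_vec[3] + 0.05 * rng.normal(size=50)  # close to event 3
    print(assign_event(candidate, seed_event_vec))  # expected: 3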
Example 6
def form_test_data(item):
    # drop the sentence-vector column so only the text fields and labels are written out
    test_data = item.drop("Sentence_vec", axis=1)
    datapreprocess.write_file(path="../Data_2/pre_process/test_data_label.csv",
                              item_data=test_data)