Example 1 (score: 0)
    def generate_instances(self, sentences, child_conn):
        """Worker-process loop: drain sentences from the shared queue,
        extract relationship Tuples, and ship the collected instances
        back to the parent over ``child_conn`` when the queue is empty.
        """
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = []
        while True:
            try:
                raw = sentences.get_nowait()
            except queue.Empty:
                # Queue drained: report (pid, results) to the parent and stop.
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break

            # Periodic progress report (every ~500 remaining items).
            if sentences.qsize() % 500 == 0:
                print(multiprocessing.current_process(),
                      "Instances to process", sentences.qsize())

            sentence = Sentence(raw,
                                self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger,
                                self.config)

            for rel in sentence.relationships:
                instances.append(Tuple(rel.e1, rel.e2, rel.sentence,
                                       rel.before, rel.between, rel.after,
                                       self.config))
Example 2 (score: 0)
    def test_update_selectivity(self):
        """update_selectivity must classify a tuple as exactly one of
        positive / negative / unknown against the seed set."""
        bef_words = ['dummy']
        bet_words = ['dummy']
        aft_words = ['dummy']

        # (e1, e2, expected positive, expected negative, expected unknown)
        cases = [
            ('seed_1 ', 'seed_2 ', 1, 0, 0),  # matching seed pair
            ('seed_1', 'seed_5', 0, 1, 0),    # known e1, wrong e2
            ('seed_1', 'seed_3', 0, 1, 0),    # known e1, wrong e2
            ('seed_4', 'seed_5', 0, 0, 1),    # neither entity in the seeds
        ]

        for e1, e2, pos, neg, unk in cases:
            pattern = Pattern()
            t = Tuple(e1, e2, None, bef_words, bet_words, aft_words,
                      self.config)
            pattern.update_selectivity(t, self.config)
            self.assertEqual(pattern.positive, pos)
            self.assertEqual(pattern.negative, neg)
            self.assertEqual(pattern.unknown, unk)
Example 3 (score: 0)
    def test_update_confidence(self):
        """A tuple matching the seed direction should push the pattern's
        confidence above 0.5; the reversed direction should push it below.
        """
        bef_words = ['dummy']
        bet_words = ['dummy']
        aft_words = ['dummy']

        # positive: (e1, e2) in the expected seed order
        pattern = Pattern()
        t = Tuple(self.e1, self.e2, None, bef_words, bet_words, aft_words,
                  self.config)
        pattern.update_selectivity(t, self.config, self.baseline)
        pattern.update_confidence(self.config)
        # NOTE: removed a leftover debug print of pattern.p_values[0];
        # unit tests should assert, not print.
        self.assertGreater(pattern.confidence, .5)

        # negative: entities reversed
        pattern = Pattern()
        t = Tuple(self.e2, self.e1, None, bef_words, bet_words, aft_words,
                  self.config)
        pattern.update_selectivity(t, self.config, self.baseline)
        pattern.update_confidence(self.config)
        self.assertLess(pattern.confidence, .5)
Example 4 (score: 0)
    def test_update_selectivity(self):
        """After one update_selectivity call the pattern must hold exactly
        one p-value: the baseline shortest-path score for (e1, e2)."""
        bef_words = ['dummy']
        bet_words = ['dummy']
        aft_words = ['dummy']

        # positive case: a single tuple produces a single p-value
        pattern = Pattern()
        t = Tuple(self.e1, self.e2, None, bef_words, bet_words, aft_words,
                  self.config)
        pattern.update_selectivity(t, self.config, self.baseline)

        expected = self.baseline.shortest_path(self.e1, self.e2)
        self.assertEqual(len(pattern.p_values), 1)
        self.assertEqual(pattern.p_values[0], expected)
Example 5 (score: 0)
    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences where named
        entities are already tagged.

        Results are cached in "processed_tuples.pkl": when the cache exists it
        is loaded from disk instead of re-processing the sentences file, and
        the entity pairs are dumped to "temp.txt" for inspection.

        :param sentences_file: path to the input file, one sentence per line;
                               lines starting with '#' are skipped
        """
        if os.path.exists("processed_tuples.pkl"):

            # NOTE(review): pickle.load on a cache file this code wrote
            # itself — do not point this at untrusted data.
            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")

            # 'with' guarantees temp.txt is closed even if a write fails
            # (the original used a bare open()/close() pair).
            with open("temp.txt", "w", encoding='utf-8') as temp_file:
                for i in self.processed_tuples:
                    temp_file.write(i.e1 + '\t' + i.e2 + '\n')

        else:

            # load needed stuff, word2vec model and a pos-tagger
            self.config.read_word2vec()
            tagger = None

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        # flush so the progress dots show up immediately
                        sys.stdout.write(".")
                        sys.stdout.flush()

                    sentence = Sentence(line.strip(), self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger, self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)
                print("\n", len(self.processed_tuples), "tuples generated")

            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)
Example 6 (score: 0)
    def generate_tuples(self, data_dir: str):
        """
        Generate candidate relation tuples from the source data.

        Args:
            data_dir: directory holding, per article:
                      e.g. source text          __ data/round2/0.txt
                           NER results          __ data/round2/0_ner.pkl
                           sentence-split spans __ data/round2/0_sentence_split.pkl
        """

        # Step 1: load word2idx and emb_matrix
        self.config.load_word2idx_embmatrix()

        # Step 2: build candidate relation pairs
        instances = list()
        file_names = scan_files(data_dir)

        for file in file_names:
            passage = load_file(data_dir, file, "txt")  # type:str
            # 'with' closes the pickle files; the original passed open()
            # straight to pickle.load and leaked the file handles.
            with open(data_dir + file + "_sentence_split.pkl", "rb") as f:
                sent_split = pickle.load(f)  # type:List[tuple]
            with open(data_dir + file + "_ner.pkl", "rb") as f:
                ner_result = pickle.load(f)  # type:List[tuple]

            sent_split.sort(key=lambda x: x[0])

            # Step 2.1: split entities into e1-typed and e2-typed lists
            e1_entities, e2_entities = list(), list()
            for e in ner_result:
                # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
                if e[0] == self.config.e1_type:
                    e1_entities.append(e)
                elif e[0] == self.config.e2_type:
                    e2_entities.append(e)
            e1_entities.sort(key=lambda x: x[1])
            e2_entities.sort(key=lambda x: x[1])

            # Step 2.2: for each e1 find candidate e2's and build the
            # <BEF, BET, AFT, sequence_tag> context triple
            for e1 in e1_entities:
                e1_start, e1_end = e1[1], e1[2]
                cur_sentence_idx = -1
                for idx, s in enumerate(sent_split):
                    if s[0] <= e1_start and s[1] >= e1_end:
                        cur_sentence_idx = idx
                        break
                # Search window for e2: previous + current + next sentence.
                # NOTE(review): kept '> 1' from the original; it yields the
                # same index as '> 0' would, since 1 - 1 == 0 anyway.
                search_e2_start = sent_split[
                    cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
                search_e2_end = sent_split[
                    cur_sentence_idx + 1
                    if cur_sentence_idx < len(sent_split) - 1
                    else len(sent_split) - 1][1]

                for e2 in e2_entities:
                    e2_start, e2_end = e2[1], e2[2]
                    if e2_end < search_e2_start:
                        continue
                    if e2_start > search_e2_end:
                        # e2_entities are sorted by start: nothing further fits
                        break
                    if e2_start < search_e2_start or e2_end > search_e2_end:
                        continue

                    # The original's four branches collapse to two:
                    # passage[a:a] is "" for the adjacent-entity cases.
                    if e1_end <= e2_start:
                        # e1 before e2 (adjacent or separated)
                        sequence_tag = True
                        before = passage[search_e2_start:e1_start]
                        between = passage[e1_end:e2_start]
                        after = passage[e2_end:search_e2_end]
                    elif e2_end <= e1_start:
                        # e2 before e1 (adjacent or separated)
                        sequence_tag = False
                        before = passage[search_e2_start:e2_start]
                        between = passage[e2_end:e1_start]
                        after = passage[e1_end:search_e2_end]
                    else:
                        # overlapping spans: skipped by the original too
                        continue

                    instances.append(Tuple(e1[3], e2[3],
                                           sequence_tag=sequence_tag,
                                           before=before,
                                           between=between,
                                           after=after,
                                           config=self.config))

        # Step 3: persist to disk
        with open("./saved_model_files/RE_candidate_instances.pkl",
                  "wb") as f_out:
            pickle.dump(instances, f_out)