def generate_instances(self, sentences, child_conn):
    # Each process has its own NLTK PoS-tagger
    tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
    instances = list()
    while True:
        try:
            s = sentences.get_nowait()
            if sentences.qsize() % 500 == 0:
                print(multiprocessing.current_process(),
                      "Instances to process", sentences.qsize())

            sentence = Sentence(s, self.config.e1_type,
                                self.config.e2_type,
                                self.config.max_tokens_away,
                                self.config.min_tokens_away,
                                self.config.context_window_size,
                                tagger, self.config)

            for rel in sentence.relationships:
                t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                          rel.between, rel.after, self.config)
                instances.append(t)

        except queue.Empty:
            print(multiprocessing.current_process(), "Queue is Empty")
            pid = multiprocessing.current_process().pid
            child_conn.send((pid, instances))
            break
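generate_instances is written to be run as a worker process: each worker drains a shared multiprocessing.Queue of raw sentences and, once the queue is empty, sends its (pid, instances) result back over a Pipe connection. A minimal driver sketch under those assumptions (launch_workers and num_cpus are illustrative names, not part of the original code):

import multiprocessing

def launch_workers(self, raw_sentences, num_cpus=4):
    # Hypothetical driver for generate_instances(); names are illustrative.
    queue_in = multiprocessing.Queue()
    for s in raw_sentences:
        queue_in.put(s)

    # One one-way pipe per worker: the worker writes to the send end,
    # the parent reads (pid, instances) from the receive end.
    pipes = [multiprocessing.Pipe(False) for _ in range(num_cpus)]
    workers = [multiprocessing.Process(target=self.generate_instances,
                                       args=(queue_in, pipes[i][1]))
               for i in range(num_cpus)]
    for proc in workers:
        proc.start()

    # Collect each worker's instances, then join the processes.
    instances = []
    for recv_end, _ in pipes:
        _, worker_instances = recv_end.recv()
        instances.extend(worker_instances)
    for proc in workers:
        proc.join()
    return instances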
def test_update_selectivity(self):
    bef_words = ['dummy']
    bet_words = ['dummy']
    aft_words = ['dummy']

    # positive
    pattern = Pattern()
    t = Tuple('seed_1 ', 'seed_2 ', None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config)
    self.assertEqual(pattern.positive, 1)
    self.assertEqual(pattern.negative, 0)
    self.assertEqual(pattern.unknown, 0)

    # negative
    pattern = Pattern()
    t = Tuple('seed_1', 'seed_5', None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config)
    self.assertEqual(pattern.negative, 1)
    self.assertEqual(pattern.positive, 0)
    self.assertEqual(pattern.unknown, 0)

    # negative
    pattern = Pattern()
    t = Tuple('seed_1', 'seed_3', None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config)
    self.assertEqual(pattern.unknown, 0)
    self.assertEqual(pattern.positive, 0)
    self.assertEqual(pattern.negative, 1)

    # unknown
    pattern = Pattern()
    t = Tuple('seed_4', 'seed_5', None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config)
    self.assertEqual(pattern.negative, 0)
    self.assertEqual(pattern.positive, 0)
    self.assertEqual(pattern.unknown, 1)
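The assertions above only hold for a specific seed configuration: ('seed_1', 'seed_2') must be a positive seed pair, tuples that share e1 with it but carry a different e2 ('seed_5', 'seed_3') must count as negative, and a tuple whose e1 never appears among the seeds ('seed_4') must stay unknown. A hypothetical setUp sketch that makes this explicit (Seed, positive_seed_tuples and negative_seed_tuples are assumed names, and a real Config object would need more attributes than shown):

from collections import namedtuple
from types import SimpleNamespace

Seed = namedtuple('Seed', ['e1', 'e2'])  # stand-in for the project's seed class

def setUp(self):
    # Hypothetical fixture, not copied from the original tests: only the seed
    # sets needed to justify the assertions in test_update_selectivity().
    self.config = SimpleNamespace(
        positive_seed_tuples={Seed('seed_1', 'seed_2')},
        negative_seed_tuples={Seed('seed_1', 'seed_5'),
                              Seed('seed_1', 'seed_3')})
    # 'seed_4' is deliberately absent from both sets, so a tuple with
    # e1='seed_4' is counted as unknown.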
def test_update_confidence(self):
    bef_words = ['dummy']
    bet_words = ['dummy']
    aft_words = ['dummy']

    # positive
    pattern = Pattern()
    t = Tuple(self.e1, self.e2, None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config, self.baseline)
    pattern.update_confidence(self.config)
    print(pattern.p_values[0])
    self.assertGreater(pattern.confidence, .5)

    # negative
    pattern = Pattern()
    t = Tuple(self.e2, self.e1, None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config, self.baseline)
    pattern.update_confidence(self.config)
    self.assertLess(pattern.confidence, .5)
def test_update_selectivity(self):
    bef_words = ['dummy']
    bet_words = ['dummy']
    aft_words = ['dummy']

    # positive
    pattern = Pattern()
    t = Tuple(self.e1, self.e2, None, bef_words, bet_words, aft_words,
              self.config)
    pattern.update_selectivity(t, self.config, self.baseline)
    self.assertEqual(len(pattern.p_values), 1)
    self.assertEqual(pattern.p_values[0],
                     self.baseline.shortest_path(self.e1, self.e2))
def generate_tuples(self, sentences_file):
    """
    Generate Tuple instances from a text file of sentences in which the
    named entities are already tagged.

    :param sentences_file: path to the file with one tagged sentence per line
    """
    if os.path.exists("processed_tuples.pkl"):
        with open("processed_tuples.pkl", "rb") as f_in:
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f_in)
        print(len(self.processed_tuples), "tuples loaded")
        temp_file = open("temp.txt", "w", encoding='utf-8')
        for i in self.processed_tuples:
            temp_file.write(i.e1 + '\t' + i.e2 + '\n')
        temp_file.close()

    else:
        # load needed resources: the word2vec model and a PoS-tagger
        self.config.read_word2vec()
        tagger = None

        print("\nGenerating relationship instances from sentences")
        with open(sentences_file, encoding='utf-8') as f_sentences:
            count = 0
            for line in f_sentences:
                if line.startswith("#"):
                    continue
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")

                sentence = Sentence(line.strip(),
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger, self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    self.processed_tuples.append(t)

        print("\n", len(self.processed_tuples), "tuples generated")
        print("Writing generated tuples to disk")
        with open("processed_tuples.pkl", "wb") as f_out:
            pickle.dump(self.processed_tuples, f_out)
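A usage note, since the surrounding class is not shown here: the method acts as a cached pipeline stage, and the input format below is an illustrative example of pre-tagged sentences rather than a prescription taken from the original code.

# Illustrative call and input format (names outside this excerpt are assumptions):
#
#   extractor.generate_tuples("sentences.txt")
#
# where sentences.txt holds one sentence per line, with entities already wrapped
# in tags matching config.e1_type / config.e2_type, e.g.
#
#   The tech company <ORG>Soundcloud</ORG> is based in <LOC>Berlin</LOC> .
#
# Lines starting with '#' are skipped. The first run writes processed_tuples.pkl
# and temp.txt (one "e1<TAB>e2" pair per line); subsequent runs load the pickle
# instead of re-parsing the sentences file.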
def generate_tuples(self, data_dir: str):
    """
    Generate candidate Tuple instances from the source data.

    Args:
        data_dir: path where the data is stored, containing, e.g.:
            source article        __ data/round2/0.txt
            NER results           __ data/round2/0_ner.pkl
            sentence-split spans  __ data/round2/0_sentence_split.pkl
    """
    # Step 1: load word2idx and emb_matrix
    self.config.load_word2idx_embmatrix()

    # Step 2: generate candidate relation pairs
    instances = list()
    file_names = scan_files(data_dir)
    for file in file_names:
        passage = load_file(data_dir, file, "txt")  # type: str
        with open(data_dir + file + "_sentence_split.pkl", "rb") as f:
            sent_split = pickle.load(f)  # type: List[tuple]
        with open(data_dir + file + "_ner.pkl", "rb") as f:
            ner_result = pickle.load(f)  # type: List[tuple]
        sent_split.sort(key=lambda x: x[0])

        # Step 2.1: collect the entities of type e1 and of type e2
        e1_entities, e2_entities = list(), list()
        for e in ner_result:
            # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
            if e[0] == self.config.e1_type:
                e1_entities.append(e)
            elif e[0] == self.config.e2_type:
                e2_entities.append(e)
        e1_entities.sort(key=lambda x: x[1])
        e2_entities.sort(key=lambda x: x[1])

        # Step 2.2: for each e1, find candidate e2 entities and build the
        # <BEF, BET, AFT, sequence_tag> contexts
        for e1 in e1_entities:
            e1_start, e1_end = e1[1], e1[2]
            cur_sentence_idx = -1
            for idx, s in enumerate(sent_split):
                if s[0] <= e1_start and s[1] >= e1_end:
                    cur_sentence_idx = idx
                    break
            if cur_sentence_idx == -1:
                # the entity does not fall inside a single split sentence
                continue

            # The entity's position bounds the search window for e2:
            # previous sentence + current sentence + next sentence
            search_e2_start = sent_split[
                cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
            search_e2_end = sent_split[
                cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1
                else len(sent_split) - 1][1]

            for i in range(len(e2_entities)):
                e2 = e2_entities[i]
                e2_start = e2[1]
                e2_end = e2[2]
                if e2_end < search_e2_start:
                    continue
                elif e2_start > search_e2_end:
                    break
                elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                    if e1_end == e2_start:
                        # Case (1): e1 before e2, adjacent
                        before = passage[search_e2_start:e1_start]
                        between = ""
                        after = passage[e2_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=True,
                                  before=before, between=between, after=after,
                                  config=self.config)
                        instances.append(t)
                    elif e2_end == e1_start:
                        # Case (2): e1 after e2, adjacent
                        before = passage[search_e2_start:e2_start]
                        between = ""
                        after = passage[e1_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=False,
                                  before=before, between=between, after=after,
                                  config=self.config)
                        instances.append(t)
                    elif e1_end < e2_start:
                        # Case (3): e1 before e2, not adjacent
                        before = passage[search_e2_start:e1_start]
                        between = passage[e1_end:e2_start]
                        after = passage[e2_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=True,
                                  before=before, between=between, after=after,
                                  config=self.config)
                        instances.append(t)
                    elif e2_end < e1_start:
                        # Case (4): e1 after e2, not adjacent
                        before = passage[search_e2_start:e2_start]
                        between = passage[e2_end:e1_start]
                        after = passage[e1_end:search_e2_end]
                        t = Tuple(e1[3], e2[3], sequence_tag=False,
                                  before=before, between=between, after=after,
                                  config=self.config)
                        instances.append(t)

    # Step 3: persist the candidates to disk
    with open("./saved_model_files/RE_candidate_instances.pkl", "wb") as f_out:
        pickle.dump(instances, f_out)
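The on-disk contract implied by the code, summarized as a hypothetical layout sketch (the concrete spans and the call-site name are illustrative only):

# Expected inputs for one article id "0" under data_dir:
#
#   data/round2/0.txt                 -> the raw passage as plain text
#   data/round2/0_ner.pkl             -> [('Disease', 1, 10, '糖尿病下肢动脉病变'), ...]
#                                        4-tuples: (entity type, start, end, mention)
#   data/round2/0_sentence_split.pkl  -> [(0, 42), (42, 97), ...]
#                                        (start, end) character spans per sentence
#
#   extractor.generate_tuples("data/round2/")
#   # writes ./saved_model_files/RE_candidate_instances.pkl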