def gather_sizes_with_bootstrapping_patterns(cfg: Box, patterns, all_new_objects) -> DefaultDict[Tuple, list]:
    """Gather text, parse tuples and check if tuples include valid sizes."""
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)
    tuples = generate_tuples(randomString(), config, names=all_new_objects)
    config.visual = cfg.parameters.visual_at_inference
    candidate_tuples = extract_tuples(config, patterns, tuples)
    filtered_tuples = filter_tuples(candidate_tuples, cfg.parameters.dev_threshold)
    for t in candidate_tuples.keys():
        logger.info(t.sentence)
        logger.info(f"{t.e1} {t.e2}")
        logger.info(t.confidence)
        logger.info("\n")
    return filtered_tuples
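# Hedged usage sketch (not from the original source): how the helper above might be driven
# from a config.yml. The pattern pickle name and the object list below are assumptions made
# for illustration only; only gather_sizes_with_bootstrapping_patterns itself is from the code.
if __name__ == "__main__":
    import pickle
    import yaml
    from box import Box

    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
    # hypothetical: previously bootstrapped extraction patterns stored as a pickle
    with open("patterns.pkl", "rb") as f:
        patterns = pickle.load(f)
    new_objects = ["tiger", "canoe", "lighthouse"]  # hypothetical unseen objects
    filtered = gather_sizes_with_bootstrapping_patterns(cfg, patterns, new_objects)
    print(len(filtered), "candidate tuples passed the dev threshold")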
# Standard-library / third-party imports used by this class. Project-local classes
# (Config, Pattern, Sentence, Seed, Tuple) and the PRINT_PATTERNS / PRINT_TUPLES flags
# are assumed to be defined or imported elsewhere in the module.
import codecs
import math
import multiprocessing
import operator
import pickle
import queue
import sys
from collections import defaultdict

from gensim import matutils
from nltk.data import load
from numpy import asarray, dot


class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence, num_cores):
        if num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = num_cores
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.curr_iteration = 0
        self.patterns = list()
        self.patterns_index = dict()
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        # generate tuples instances from a text file with sentences
        # where named entities are already tagged

        # load word2vec model
        self.config.read_word2vec()

        # copy all sentences from input file into a Queue
        # shared by all processes
        manager = multiprocessing.Manager()
        queue = manager.Queue()
        print("\nLoading sentences from file")
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            if line.startswith("#"):
                continue
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")
            queue.put(line.strip())
        f_sentences.close()

        pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
        processes = [
            multiprocessing.Process(target=self.generate_instances,
                                    args=(queue, pipes[i][1]))
            for i in range(self.num_cpus)
        ]

        print("\nGenerating relationship instances from sentences")
        print("Running", len(processes), "processes")
        for proc in processes:
            proc.start()

        for i in range(len(pipes)):
            data = pipes[i][0].recv()
            child_instances = data[1]
            for x in child_instances:
                self.processed_tuples.append(x)

        for proc in processes:
            proc.join()

        print("\n", len(self.processed_tuples), "instances generated")
        print("Writing generated tuples to disk")
        f = open("processed_tuples.pkl", "wb")
        pickle.dump(self.processed_tuples, f)
        f.close()

    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(),
                          "Instances to process", sentences.qsize())

                sentence = Sentence(s,
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break

    def similarity_3_contexts(self, t, p):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + \
            self.config.beta * bet + \
            self.config.gamma * aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;
        #
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def cluster_tuples(self, matched_tuples):
        # single-pass clustering

        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            # having this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            # pickled data must be read in binary mode
            f = open(tuples, "rb")
            print("Loading pre-processed sentences", tuples)
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances: generate patterns
                print("\nClustering matched instances to generate patterns")
                if len(self.patterns) == 0:
                    self.cluster_tuples(matched_tuples)

                    # Eliminate patterns supported by less than
                    # 'min_pattern_support' tuples
                    new_patterns = [p for p in self.patterns
                                    if len(p.tuples) > self.config.min_pattern_support]
                    self.patterns = new_patterns

                else:
                    # Parallelize single-pass clustering
                    # Each tuple must be compared with each extraction pattern
                    # Map:
                    # - Divide the tuples into smaller lists,
                    #   according to the number of CPUs
                    # - Pass to each CPU a sub-list of tuples and all the
                    #   patterns; the comparison is done by each CPU
                    # Merge:
                    # - Each CPU sends to the parent process the updated
                    #   patterns and new patterns
                    # - Merge patterns based on a pattern_id
                    # - Cluster newly created patterns with single-pass clustering

                    # make a copy of the extraction patterns to be
                    # passed to each CPU
                    patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                    # distribute tuples over the different CPUs
                    chunks = [list() for _ in range(self.num_cpus)]
                    n_tuples_per_child = int(
                        math.ceil(float(len(matched_tuples)) / self.num_cpus))

                    print("\n#CPUS", self.num_cpus, '\t',
                          "Tuples per CPU", n_tuples_per_child)

                    chunk_n = 0
                    chunk_begin = 0
                    chunk_end = n_tuples_per_child

                    while chunk_n < self.num_cpus:
                        chunks[chunk_n] = matched_tuples[chunk_begin:chunk_end]
                        chunk_begin = chunk_end
                        chunk_end += n_tuples_per_child
                        chunk_n += 1

                    count = 0
                    for c in chunks:
                        print("CPU_" + str(count), " ", len(c), "tuples")
                        count += 1

                    pipes = [multiprocessing.Pipe(False)
                             for _ in range(self.num_cpus)]
                    processes = [
                        multiprocessing.Process(
                            target=self.cluster_tuples_parallel,
                            args=(patterns[i], chunks[i], pipes[i][1]))
                        for i in range(self.num_cpus)
                    ]

                    print("\nRunning", len(processes), "processes")
                    for proc in processes:
                        proc.start()

                    # Receive and merge all patterns by 'pattern_id':
                    # newly created patterns (new pattern_id) go into
                    # 'child_patterns' and are then merged
                    # by single-pass clustering between patterns
                    child_patterns = list()

                    for i in range(len(pipes)):
                        data = pipes[i][0].recv()
                        patterns = data[1]
                        for p_updated in patterns:
                            pattern_exists = False
                            for p_original in self.patterns:
                                if p_original.id == p_updated.id:
                                    p_original.tuples.update(p_updated.tuples)
                                    pattern_exists = True
                                    break
                            if pattern_exists is False:
                                child_patterns.append(p_updated)

                    for proc in processes:
                        proc.join()

                    print("\nSELF Patterns:")
                    for p in self.patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print("\nChild Patterns:")
                    for p in child_patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print(len(child_patterns), "newly created patterns")

                    # merge/aggregate similar patterns generated by
                    # the child processes

                    # start comparing smaller ones with greater ones
                    child_patterns.sort(key=lambda y: len(y.tuples), reverse=False)
                    count = 0
                    new_list = list(self.patterns)
                    for p1 in child_patterns:
                        print("\nNew Patterns", len(child_patterns),
                              "Processed", count)
                        print("New List", len(new_list))
                        print("Pattern:", p1.id, "Tuples:", len(p1.tuples))
                        max_similarity = 0
                        max_similarity_cluster = None
                        for p2 in new_list:
                            if p1 == p2:
                                continue
                            score = self.similarity_cluster(p1, p2)
                            if score > max_similarity:
                                max_similarity = score
                                max_similarity_cluster = p2
                        if max_similarity >= self.config.threshold_similarity:
                            for t in p1.tuples:
                                max_similarity_cluster.tuples.add(t)
                        else:
                            new_list.append(p1)
                        count += 1

                    # add merged patterns to main patterns structure
                    for p in new_list:
                        if p not in self.patterns:
                            self.patterns.append(p)

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                print("\n", len(self.patterns), "patterns generated")

                # merge equal tuples inside patterns to make
                # fewer comparisons when collecting instances
                for p in self.patterns:
                    # if only the BET context is being used,
                    # merge only based on BET contexts
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        p.merge_all_tuples_bet()

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words)
                        else:
                            for t in p.tuples:
                                print("BEF", t.bef_words)
                                print("BET", t.bet_words)
                                print("AFT", t.aft_words)
                                print("========")

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)
                # This was already collected and is stored in
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with
                # each extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("\nNumber of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on",
                      len(self.patterns), "extraction patterns")

                # create copies of the generated extraction patterns
                # to be passed to each process
                patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                # copy all tuples into a Queue shared by all processes
                manager = multiprocessing.Manager()
                queue = manager.Queue()
                for t in self.processed_tuples:
                    queue.put(t)

                # each distinct process receives as arguments:
                #   - a list, copy of all the original extraction patterns
                #   - a Queue of the tuples
                #   - a pipe to return the collected tuples and updated
                #     patterns to the parent process
                pipes = [multiprocessing.Pipe(False)
                         for _ in range(self.num_cpus)]
                processes = [
                    multiprocessing.Process(
                        target=self.find_instances,
                        args=(patterns[i], queue, pipes[i][1]))
                    for i in range(self.num_cpus)
                ]

                print("Running", len(processes), "processes")
                for proc in processes:
                    proc.start()

                # structures to store each process's altered patterns
                # and collected tuples
                patterns_updated = list()
                collected_tuples = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    child_pid = data[0]
                    patterns = data[1]
                    tuples = data[2]
                    print(child_pid, "patterns", len(patterns),
                          "tuples", len(tuples))
                    patterns_updated.extend(patterns)
                    collected_tuples.extend(tuples)

                for proc in processes:
                    proc.join()

                # Extraction patterns aggregation happens here:
                for p_updated in patterns_updated:
                    for p_original in self.patterns:
                        if p_original.id == p_updated.id:
                            p_original.positive += p_updated.positive
                            p_original.negative += p_updated.negative
                            p_original.unknown += p_updated.unknown

                # Index the patterns in a hashtable for later use
                for p in self.patterns:
                    self.patterns_index[p.id] = p

                # update all patterns' confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(p.id)
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # Candidate tuples aggregation happens here:
                print("Collecting generated candidate tuples")
                for e in collected_tuples:
                    t = e[0]
                    pattern_best = e[1]
                    sim_best = e[2]

                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it; if not,
                    # associate this pattern with it and the similarity score
                    if t in self.candidate_tuples:
                        t_patterns = self.candidate_tuples[t]
                        if t_patterns is not None:
                            if pattern_best not in [x[0] for x in t_patterns]:
                                self.candidate_tuples[t].append(
                                    (self.patterns_index[pattern_best.id], sim_best))

                    # if this tuple was not extracted before, associate this
                    # pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (self.patterns_index[pattern_best.id], sim_best))

                # update tuple confidence based on patterns' confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    if self.curr_iteration > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tl: tl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                # update seed set of tuples to use in next iteration
                # seeds = { T | conf(T) > instance_confidence }
                print("Adding tuples to seed with confidence >=" +
                      str(self.config.instance_confidence))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(matutils.unitvec(asarray(v_bet1)),
                                     matutils.unitvec(asarray(v_bet2)))
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1

        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize()) + '\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    # guard: 'score' is only assigned inside the loop below
                    score = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(matutils.unitvec(t.bet_vector),
                                            matutils.unitvec(asarray(p_bet_v)))
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if it's above the threshold, associate the pattern with it
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # Eliminate clusters supported by 5 or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
        child_conn.send((pid, new_patterns))
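# Hedged usage sketch (not part of the original class): a typical driver for the
# multiprocessing BREDS variant above. The input file names and threshold values are
# placeholders chosen for illustration; only "processed_tuples.pkl" and
# "relationships.txt" are names the class itself reads/writes.
if __name__ == "__main__":
    breds = BREDS("parameters.cfg",         # hypothetical config file
                  "seeds_positive.txt",     # hypothetical positive seed pairs
                  "seeds_negative.txt",     # hypothetical negative seed pairs
                  similarity=0.6,           # illustrative similarity threshold
                  confidence=0.7,           # illustrative instance confidence
                  num_cores=0)              # 0 -> use all available CPUs
    breds.generate_tuples("sentences.txt")  # NE-tagged sentences, one per line
    breds.init_bootstrap(tuples=None)       # run the bootstrapping iterations
    # extracted relationships end up in relationships.txt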
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences
        where named entities are already tagged

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):
            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")
            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                temp_file.write(i.e1 + '\t' + i.e2 + '\n')
            temp_file.close()

        else:
            # load needed resources: word2vec model and a PoS-tagger
            self.config.read_word2vec()
            tagger = None

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(),
                                        self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger,
                                        self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)

            print("\n", len(self.processed_tuples), "tuples generated")
            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w", encoding='utf-8')
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            # if t.passive_voice is False:
            #     f_output.write("passive voice: False\n")
            # elif t.passive_voice is True:
            #     f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            # pickled data must be read in binary mode
            f = open(tuples, "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                print(i)
                temp_file.write(str(i) + '\n')
            temp_file.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances to generate/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by less than
                # 'min_pattern_support' tuples
                new_patterns = [
                    p for p in self.patterns
                    if len(p.tuples) > self.config.min_pattern_support
                ]
                self.patterns = new_patterns

                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)
                # This was already collected and is stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("Number of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on extraction patterns")
                count = 0

                for t in self.processed_tuples:
                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    sim_best = 0
                    for extraction_pattern in self.patterns:
                        accept, score = self.similarity_all(t, extraction_pattern)
                        if accept is True:
                            extraction_pattern.update_selectivity(t, self.config)
                            if score > sim_best:
                                sim_best = score
                                pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this tuple was already extracted, check if this
                        # extraction pattern is already associated with it;
                        # if not, associate this pattern with it and store the
                        # similarity score
                        patterns = self.candidate_tuples[t]
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best))

                        # if this tuple was not extracted before,
                        # associate this pattern with the instance
                        # and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                # update all patterns' confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                        print("========")
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns' confidence
                print("\n\nCalculating tuples confidence")
                for idx, t in enumerate(list(self.candidate_tuples.keys())):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence
                    if idx > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tpl: tpl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                print("Adding tuples to seed with confidence >= {}".format(
                    str(self.config.instance_confidence)))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def cluster_tuples(self, matched_tuples):
        # this is a single-pass clustering
        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster having
            # this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)
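# Hedged standalone sketch (illustrative, not part of the class above): the tuple-confidence
# update used in init_bootstrap is a noisy-or over the patterns that extracted the tuple,
#   conf(t) = 1 - prod_i (1 - conf(p_i) * sim_i),
# optionally smoothed with the previous iteration's confidence via the wUpdt weight.
from typing import List, Tuple as TupleT

def noisy_or_confidence(pattern_matches: List[TupleT[float, float]],
                        old_confidence: float = 0.0,
                        w_updt: float = 0.5,
                        first_iteration: bool = True) -> float:
    """pattern_matches: (pattern_confidence, similarity) pairs for one candidate tuple."""
    remaining_doubt = 1.0
    for pattern_conf, sim in pattern_matches:
        remaining_doubt *= 1.0 - pattern_conf * sim
    confidence = 1.0 - remaining_doubt
    if not first_iteration:
        confidence = confidence * w_updt + old_confidence * (1.0 - w_updt)
    return confidence

# e.g. a tuple extracted by two patterns:
# noisy_or_confidence([(0.8, 0.9), (0.5, 0.6)]) == 1 - (1 - 0.72) * (1 - 0.30) ~= 0.804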
class BREDS:

    def __init__(self, args):
        if args.num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = args.num_cores
        self.processed_tuples = list()
        # when a missing key is looked up, return an empty list instead of raising KeyError
        self.candidate_tuples = defaultdict(list)
        self.config = Config(args.config_file, args.positive_seeds_file,
                             args.negative_seeds_file, args.similarity,
                             args.confidence)

    def generate_tuples(self, data_dir: str):
        """
        Generate candidate tuples from the source data, using multiple workers.

        Args:
            data_dir: directory where the data is stored, which includes, e.g.:
                source article          __ data/round2/0.txt
                NER results             __ data/round2/0_ner.pkl
                sentence-split results  __ data/round2/0_sentence_split.pkl
        """
        # Step 1: load word2idx and emb_matrix
        self.config.load_word2idx_embmatrix()

        # Step 2: generate candidate relation pairs
        instances = list()
        file_names = scan_files(data_dir)
        for file in file_names:
            passage = load_file(data_dir, file, "txt")  # type: str
            sent_split = pickle.load(
                open(data_dir + file + "_sentence_split.pkl", "rb"))  # type: List[tuple]
            ner_result = pickle.load(
                open(data_dir + file + "_ner.pkl", "rb"))  # type: List[tuple]
            sent_split.sort(key=lambda x: x[0])

            # Step 2.1: collect the entities belonging to e1 and e2
            e1_entities, e2_entities = list(), list()
            for e in ner_result:
                # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
                if e[0] == self.config.e1_type:
                    e1_entities.append(e)
                elif e[0] == self.config.e2_type:
                    e2_entities.append(e)
            e1_entities.sort(key=lambda x: x[1])
            e2_entities.sort(key=lambda x: x[1])

            # Step 2.2: for each e1, find candidate e2 entities and build the
            # <BEF, BET, AFT, sequence_tag> context tuple
            for e1 in e1_entities:
                e1_start, e1_end = e1[1], e1[2]
                cur_sentence_idx = -1
                for idx, s in enumerate(sent_split):
                    if s[0] <= e1_start and s[1] >= e1_end:
                        cur_sentence_idx = idx
                        break
                # the search window for e2 is determined by the position of the
                # current entity: previous sentence + current sentence + next sentence
                search_e2_start = sent_split[
                    cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
                search_e2_end = sent_split[
                    cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1
                    else len(sent_split) - 1][1]

                for i in range(len(e2_entities)):
                    e2 = e2_entities[i]
                    e2_start = e2[1]
                    e2_end = e2[2]
                    if e2_end < search_e2_start:
                        continue
                    elif e2_start > search_e2_end:
                        break
                    elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                        if e1_end == e2_start:
                            # case (1): e1 comes before e2 and is adjacent to it
                            before = passage[search_e2_start:e1_start]
                            between = ""
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=True,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e2_end == e1_start:
                            # case (2): e1 comes after e2 and is adjacent to it
                            before = passage[search_e2_start:e2_start]
                            between = ""
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=False,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e1_end < e2_start:
                            # case (3): e1 comes before e2, with text in between
                            before = passage[search_e2_start:e1_start]
                            between = passage[e1_end:e2_start]
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=True,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e2_end < e1_start:
                            # case (4): e1 comes after e2, with text in between
                            before = passage[search_e2_start:e2_start]
                            between = passage[e2_end:e1_start]
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=False,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)

        # Step 3: persist the candidate instances
        pickle.dump(
            instances,
            open("./saved_model_files/RE_candidate_instances.pkl", "wb"))

    def similarity_3_contexts(self, t: Tuple, p: Tuple) -> float:
        bef, bet, aft = 0, 0, 0
        # TODO: it seems the weighting parameters should be added here; not implemented yet
        return 0
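# The TODO above leaves similarity_3_contexts unimplemented. Below is a minimal sketch of what
# it could look like, assuming the tuples expose bef_vector/bet_vector/aft_vector attributes and
# the weights alpha/beta/gamma, mirroring the other BREDS variants in this file. This is an
# illustrative assumption, not the project's final implementation.
import numpy as np

def weighted_context_similarity(t, p, alpha=0.0, beta=1.0, gamma=0.0) -> float:
    """Cosine similarity of the three contexts, combined with weights alpha/beta/gamma."""
    def cos(v1, v2):
        if v1 is None or v2 is None:
            return 0.0
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        return float(np.dot(v1, v2) / denom) if denom else 0.0

    return (alpha * cos(t.bef_vector, p.bef_vector)
            + beta * cos(t.bet_vector, p.bet_vector)
            + gamma * cos(t.aft_vector, p.aft_vector))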
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    test_pairs, unseen_objects = comparison_dev_set(cfg)
    unseen_objects = [o.replace('_', " ") for o in unseen_objects]

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    word2vec_model = load_word2vec(cfg.parameters.word2vec_path)
    similar_words = find_similar_words(word2vec_model, unseen_objects, n_word2vec=200)

    # calc coverage and precision
    results = list()
    settings: List[BackoffSettings] = [
        # BackoffSettings(use_direct=True),
        # BackoffSettings(use_word2vec=True),
        # BackoffSettings(use_hypernyms=True),
        # BackoffSettings(use_hyponyms=True),
        # BackoffSettings(use_head_noun=True),
        # BackoffSettings(use_direct=True, use_word2vec=True),
        BackoffSettings(use_direct=True, use_word2vec=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True),
        # BackoffSettings(use_direct=True, use_head_noun=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True)
    ]
    golds = [p.larger for p in test_pairs]

    for setting in settings:
        preds = list()
        fractions_larger = list()
        notes = list()
        prop = VisualPropagation(G, config.visual_config)
        logger.info(f'\nRunning for setting {setting.print()}')
        comparer = Comparer(prop, setting, similar_words, objects)
        for test_pair in tqdm.tqdm(test_pairs):
            # TODO return confidence; use the higher one
            res_visual, fraction_larger, note = comparer.compare_visual_with_backoff(test_pair)
            fractions_larger.append(fraction_larger)
            preds.append(res_visual)
            notes.append(note)

        with open(f'visual_comparison_predictions_{setting.print()}.pkl', 'wb') as f:
            pickle.dump(list(zip(preds, fractions_larger, notes)), f)

        useful_counts = comparer.useful_paths_count
        tr = SymmetricalLogTransform(base=10, linthresh=1, linscale=1)
        ss = tr.transform([0., max(useful_counts) + 1])
        bins = tr.inverted().transform(np.linspace(*ss, num=100))
        fig, ax = plt.subplots()
        plt.hist(useful_counts, bins=bins)
        plt.xlabel('Number of useful paths')
        ax.set_xscale('symlog')
        plt.savefig(f'useful_paths{setting.print()}.png')

        useful_counts = np.array(useful_counts)
        logger.info(f'Number of objects with no useful path: '
                    f'{len(np.extract(useful_counts == 0, useful_counts))}')
        logger.info(f'Not recog count: {comparer.not_recognized_count}')
        logger.info(f'Total number of test cases: {len(golds)}')

        coverage, selectivity = coverage_accuracy_relational(golds, preds)
        logger.info(f'Coverage: {coverage}')
        logger.info(f'selectivity: {selectivity}')
        results.append(RelationalResult(setting.print(), selectivity, coverage))

        assert len(fractions_larger) == len(preds)
        corrects_not_none = list()
        diffs_not_none = list()
        for i, fraction_larger in enumerate(fractions_larger):
            gold = golds[i]
            res = preds[i]
            if fraction_larger is not None and fraction_larger != 0.5:
                fraction_larger_centered = fraction_larger - .5
                corrects_not_none.append(gold == res)
                diffs_not_none.append(abs(fraction_larger_centered))
                # TODO do something special for when fraction_larger_centered == 0

        regr_linear = Ridge(alpha=1.0)
        regr_linear.fit(np.reshape(diffs_not_none, (-1, 1)), corrects_not_none)
        with open('visual_confidence_model.pkl', 'wb') as f:
            pickle.dump(regr_linear, f)

        fig, ax = plt.subplots()
        bin_means, bin_edges, binnumber = stats.binned_statistic(
            diffs_not_none, corrects_not_none, 'mean', bins=20)
        bin_counts, _, _ = stats.binned_statistic(
            diffs_not_none, corrects_not_none, 'count', bins=20)
        x = np.linspace(min(diffs_not_none), max(diffs_not_none), 500)
        X = np.reshape(x, (-1, 1))
        plt.plot(x, regr_linear.predict(X), '-', label='linear ridge regression')

        minc = min(bin_counts)
        maxc = max(bin_counts)
        norm = colors.SymLogNorm(vmin=minc, vmax=maxc, linthresh=1)
        bin_counts_normalized = [norm(c) for c in bin_counts]
        logger.info(f'counts, norm: {list(zip(bin_counts, bin_counts_normalized))}')
        viridis = cm.get_cmap('viridis', 20)
        mins = bin_edges[:-1]
        maxs = bin_edges[1:]
        mask = ~np.isnan(bin_means)
        plt.hlines(np.extract(mask, bin_means),
                   np.extract(mask, mins), np.extract(mask, maxs),
                   colors=viridis(np.extract(mask, bin_counts_normalized)),
                   lw=5, label='binned statistic of data')
        sm = plt.cm.ScalarMappable(cmap=viridis, norm=norm)
        ticks = [10**1.5, 10**1.75, 10**2, 10**2.5]
        colorbar = plt.colorbar(sm, ticks=ticks)
        colorbar.ax.set_yticklabels(['10^1.5', '10^1.75', '10^2', '10^2.5'])
        colorbar.set_label('bin count')
        plt.ylim(-0.05, 1.05)
        plt.legend()
        plt.xlabel('Absolute fraction_larger')
        plt.ylabel('Selectivity')
        ax.set_xscale('linear')
        plt.savefig('fraction_larger_selectivity_linear.png')
        plt.show()

        correlation, _ = pearsonr(diffs_not_none, corrects_not_none)
        logger.info(f'Pearsons correlation: {correlation}')
        correlation_spearman, _ = spearmanr(np.array(diffs_not_none),
                                            b=np.array(corrects_not_none))
        logger.info(f'Spearman correlation: {correlation_spearman}')

    results_df = pd.DataFrame(results)
    results_df.to_csv('results_visual_backoff.csv')
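# Hedged follow-up sketch (not from the original source): the Ridge model pickled above maps
# |fraction_larger - 0.5| to an estimated probability that a visual comparison is correct.
# The helper below shows how such a model might be loaded and queried later; the file name
# matches the one written above, everything else is illustrative.
import pickle
import numpy as np

def visual_confidence(fraction_larger: float,
                      model_path: str = 'visual_confidence_model.pkl') -> float:
    """Return an estimated confidence for a single visual comparison."""
    with open(model_path, 'rb') as f:
        regr = pickle.load(f)
    margin = abs(fraction_larger - 0.5)
    pred = regr.predict(np.array([[margin]]))[0]
    return float(min(max(pred, 0.0), 1.0))  # clamp the regression output to [0, 1]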
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    input: DataFrame = pd.read_csv(cfg.path.dev)
    input = input.astype({'object': str})
    unseen_objects = list(input['object'])
    logger.info(f'Unseen objects: {unseen_objects}')

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    with open(cfg.path.final_seeds_cache) as f:
        numeric_seeds = json.load(f)
    numeric_seeds = dict((key.strip().replace(' ', '_'), value)
                         for (key, value) in numeric_seeds.items())
    # There is a 'rhine' in VG, which was included in VG as the river.
    # Fixing this manually, since it's in a lot of results.
    del numeric_seeds['rhine']

    point_predictions = dict()
    point_predictions_evenly = dict()
    point_predictions_svm = dict()
    prop = VisualPropagation(G, config.visual_config)
    for unseen_object in unseen_objects:
        unseen_object = unseen_object.replace(' ', '_')
        logger.info(f'Processing {unseen_object}')
        if unseen_object not in objects:
            logger.info(f'{unseen_object} not in visuals')
            point_predictions[unseen_object.replace('_', ' ')] = None
            point_predictions_evenly[unseen_object.replace('_', ' ')] = None
            point_predictions_svm[unseen_object.replace('_', ' ')] = None
            continue

        none_count = 0
        lower_bounds = set()
        upper_bounds = set()
        for numeric_seed in tqdm.tqdm(numeric_seeds.keys()):
            pair = Pair(unseen_object, numeric_seed)
            if pair.both_in_list(objects):
                fraction_larger, _ = prop.compare_pair(pair)
                if fraction_larger is None:
                    none_count += 1
                    continue
                if fraction_larger < .5:
                    upper_bounds.add(numeric_seed)
                if fraction_larger > .5:
                    lower_bounds.add(numeric_seed)
                logger.debug(f'{pair.e1} {pair.e2} fraction larger: {fraction_larger}')
            else:
                logger.debug(f'{pair.e1} or {pair.e2} not in VG. Objects: {objects}')

        lower_bounds_sizes = fill_sizes_list(lower_bounds, numeric_seeds)
        upper_bounds_sizes = fill_sizes_list(upper_bounds, numeric_seeds)
        # size = predict_size_with_bounds(lower_bounds_sizes, upper_bounds_sizes)
        size = iterativily_find_size(lower_bounds_sizes, upper_bounds_sizes)
        size_evenly = iterativily_find_size_evenly(lower_bounds_sizes, upper_bounds_sizes)
        size_svm = predict_size_with_bounds(lower_bounds_sizes, upper_bounds_sizes)
        point_predictions[unseen_object.replace('_', ' ')] = size
        point_predictions_evenly[unseen_object.replace('_', ' ')] = size_evenly
        point_predictions_svm[unseen_object.replace('_', ' ')] = size_svm

        logger.info(f'\nObject: {unseen_object}')
        logger.info(f'Size: {size}')
        logger.info(f'Size evenly: {size_evenly}')
        logger.info(f'Size svm: {size_svm}')
        logger.info(f"None count: {none_count} out of {len(numeric_seeds.keys())}")
        logger.info(
            f"Lower bounds (n={len(lower_bounds)}): mean: {np.mean(lower_bounds_sizes)} "
            f"median: {np.median(lower_bounds_sizes)}\n\t{lower_bounds}\n\t{lower_bounds_sizes}")
        logger.info(
            f"Upper bounds (n={len(upper_bounds)}): mean: {np.mean(upper_bounds_sizes)} "
            f"median: {np.median(upper_bounds_sizes)}\n\t{upper_bounds}\n\t{upper_bounds_sizes}")

    with open('point_predictions_visual_ranges.pkl', 'wb') as f:
        pickle.dump(point_predictions, f)
    with open('point_predictions_visual_ranges_evenly.pkl', 'wb') as f:
        pickle.dump(point_predictions_evenly, f)
    with open('point_predictions_visual_ranges_svm.pkl', 'wb') as f:
        pickle.dump(point_predictions_svm, f)
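# Hedged illustration (an assumption, not the repo's iterativily_find_size / SVM variants):
# one simple baseline for turning the collected bounds into a point estimate is the midpoint
# between the largest "smaller-than" seed size and the smallest "larger-than" seed size.
from typing import List, Optional

def midpoint_size_estimate(lower_bounds_sizes: List[float],
                           upper_bounds_sizes: List[float]) -> Optional[float]:
    """Return a point size estimate from seed sizes bounding the unseen object, or None."""
    if lower_bounds_sizes and upper_bounds_sizes:
        lo = max(lower_bounds_sizes)   # largest object judged smaller than the unseen object
        hi = min(upper_bounds_sizes)   # smallest object judged larger than the unseen object
        if lo <= hi:
            return (lo + hi) / 2.0
        # noisy comparisons can make the bounds overlap; fall back to the overall median
        all_sizes = sorted(lower_bounds_sizes + upper_bounds_sizes)
        return all_sizes[len(all_sizes) // 2]
    if lower_bounds_sizes:
        return max(lower_bounds_sizes)
    if upper_bounds_sizes:
        return min(upper_bounds_sizes)
    return None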
class BREDS(object): def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence, num_cores): if num_cores == 0: self.num_cpus = multiprocessing.cpu_count() else: self.num_cpus = num_cores self.processed_tuples = list() self.candidate_tuples = defaultdict(list) self.curr_iteration = 0 self.patterns = list() self.patterns_index = dict() self.config = Config(config_file, seeds_file, negative_seeds, similarity, confidence) def generate_tuples(self, sentences_file): # generate tuples instances from a text file with sentences # where named entities are already tagged # load word2vec model self.config.read_word2vec() # copy all sentences from input file into a Queue # shared by all processes manager = multiprocessing.Manager() queue = manager.Queue() print("\nLoading sentences from file") f_sentences = codecs.open(sentences_file, encoding='utf-8') count = 0 for line in f_sentences: if line.startswith("#"): continue count += 1 if count % 10000 == 0: sys.stdout.write(".") queue.put(line.strip()) f_sentences.close() pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)] processes = [ multiprocessing.Process(target=self.generate_instances, args=(queue, pipes[i][1])) for i in range(self.num_cpus) ] print("\nGenerating relationship instances from sentences") print("Running", len(processes), " processes") for proc in processes: proc.start() for i in range(len(pipes)): data = pipes[i][0].recv() child_instances = data[1] for x in child_instances: self.processed_tuples.append(x) for proc in processes: proc.join() print("\n", len(self.processed_tuples), "instances generated") print("Writing generated tuples to disk") f = open("processed_tuples.pkl", "wb") pickle.dump(self.processed_tuples, f) f.close() def generate_instances(self, sentences, child_conn): # Each process has its own NLTK PoS-tagger tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle') instances = list() while True: try: s = sentences.get_nowait() if sentences.qsize() % 500 == 0: print(multiprocessing.current_process(), \ "Instances to process", sentences.qsize()) sentence = Sentence(s, self.config.e1_type, self.config.e2_type, self.config.max_tokens_away, self.config.min_tokens_away, self.config.context_window_size, tagger, self.config) for rel in sentence.relationships: t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between, rel.after, self.config) instances.append(t) except queue.Empty: print(multiprocessing.current_process(), "Queue is Empty") pid = multiprocessing.current_process().pid child_conn.send((pid, instances)) break def similarity_3_contexts(self, t, p): (bef, bet, aft) = (0, 0, 0) if t.bef_vector is not None and p.bef_vector is not None: bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)) if t.bet_vector is not None and p.bet_vector is not None: bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)) if t.aft_vector is not None and p.aft_vector is not None: aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)) return self.config.alpha*bef + \ self.config.beta*bet + \ self.config.gamma*aft def similarity_all(self, t, extraction_pattern): # calculates the cosine similarity between all patterns part of a # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern # extracted from a sentence; # # returns the max similarity scores good = 0 bad = 0 max_similarity = 0 for p in list(extraction_pattern.tuples): score = self.similarity_3_contexts(t, p) if score > max_similarity: max_similarity = score if score 
>= self.config.threshold_similarity: good += 1 else: bad += 1 if good >= bad: return True, max_similarity else: return False, 0.0 def match_seeds_tuples(self): # checks if an extracted tuple matches seeds tuples matched_tuples = list() count_matches = dict() for t in self.processed_tuples: for s in self.config.positive_seed_tuples: if t.e1 == s.e1 and t.e2 == s.e2: matched_tuples.append(t) try: count_matches[(t.e1, t.e2)] += 1 except KeyError: count_matches[(t.e1, t.e2)] = 1 return count_matches, matched_tuples def cluster_tuples(self, matched_tuples): # single-Pass clustering # Initialize: if no patterns exist, first tuple goes to first cluster if len(self.patterns) == 0: c1 = Pattern(matched_tuples[0]) self.patterns.append(c1) count = 0 for t in matched_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() max_similarity = 0 max_similarity_cluster_index = 0 # go through all patterns(clusters of tuples) and find the one # with the highest similarity score for i in range(0, len(self.patterns), 1): extraction_pattern = self.patterns[i] accept, score = self.similarity_all(t, extraction_pattern) if accept is True and score > max_similarity: max_similarity = score max_similarity_cluster_index = i # if max_similarity < min_degree_match create a new cluster # having this tuple as the centroid if max_similarity < self.config.threshold_similarity: c = Pattern(t) self.patterns.append(c) # if max_similarity >= min_degree_match add to the cluster with # the highest similarity else: self.patterns[max_similarity_cluster_index].add_tuple(t) def write_relationships_to_disk(self): print("\nWriting extracted relationships to disk") f_output = open("relationships.txt", "w") tmp = sorted(list(self.candidate_tuples.keys()), reverse=True) for t in tmp: f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' + str(t.confidence) + '\n') f_output.write("sentence: " + t.sentence + '\n') f_output.write("pattern_bef: " + t.bef_words + '\n') f_output.write("pattern_bet: " + t.bet_words + '\n') f_output.write("pattern_aft: " + t.aft_words + '\n') if t.passive_voice is False: f_output.write("passive voice: False\n") elif t.passive_voice is True: f_output.write("passive voice: True\n") f_output.write("\n") f_output.close() def init_bootstrap(self, tuples): # starts a bootstrap iteration if tuples is not None: f = open(tuples, "r") print("Loading pre-processed sentences", tuples) self.processed_tuples = pickle.load(f) f.close() print(len(self.processed_tuples), "tuples loaded") self.curr_iteration = 0 while self.curr_iteration <= self.config.number_iterations: print("==========================================") print("\nStarting iteration", self.curr_iteration) print("\nLooking for seed matches of:") for s in self.config.positive_seed_tuples: print(s.e1, '\t', s.e2) # Looks for sentences matching the seed instances count_matches, matched_tuples = self.match_seeds_tuples() if len(matched_tuples) == 0: print("\nNo seed matches found") sys.exit(0) else: print("\nNumber of seed matches found") sorted_counts = sorted(list(count_matches.items()), key=operator.itemgetter(1), reverse=True) for t in sorted_counts: print(t[0][0], '\t', t[0][1], t[1]) print("\n", len(matched_tuples), "tuples matched") # Cluster the matched instances: generate patterns print("\nClustering matched instances to generate patterns") if len(self.patterns) == 0: self.cluster_tuples(matched_tuples) # Eliminate patterns supported by less than # 'min_pattern_support' tuples new_patterns = [ p for p in self.patterns if 
len(p.tuples) > self.config.min_pattern_support ] self.patterns = new_patterns else: # Parallelize single-pass clustering # Each tuple must be compared with each extraction pattern # Map: # - Divide the tuples into smaller lists, # accordingly to the number of CPUs # - Pass to each CPU a sub-list of tuples and all the # patterns, comparison is done by each CPU # Merge: # - Each CPU sends to the father process the updated # patterns and new patterns # - Merge patterns based on a pattern_id # - Cluster new created patterns with single-pass clustering # make a copy of the extraction patterns to be # passed to each CPU patterns = [ list(self.patterns) for _ in range(self.num_cpus) ] # distribute tuples per different CPUs chunks = [list() for _ in range(self.num_cpus)] n_tuples_per_child = int( math.ceil(float(len(matched_tuples)) / self.num_cpus)) print("\n#CPUS", self.num_cpus, '\t', \ "Tuples per CPU", n_tuples_per_child) chunk_n = 0 chunck_begin = 0 chunck_end = n_tuples_per_child while chunk_n < self.num_cpus: chunks[chunk_n] = matched_tuples[ chunck_begin:chunck_end] chunck_begin = chunck_end chunck_end += n_tuples_per_child chunk_n += 1 count = 0 for c in chunks: print("CPU_" + str(count), " ", len(c), "patterns") count += 1 pipes = [ multiprocessing.Pipe(False) for _ in range(self.num_cpus) ] processes = [ multiprocessing.Process( target=self.cluster_tuples_parallel, args=(patterns[i], chunks[i], pipes[i][1])) for i in range(self.num_cpus) ] print("\nRunning", len(processes), " processes") for proc in processes: proc.start() # Receive and merge all patterns by 'pattern_id' # new created patterns (new pattern_id) go into # 'child_patterns' and then are merged # by single-pass clustering between patterns child_patterns = list() for i in range(len(pipes)): data = pipes[i][0].recv() patterns = data[1] for p_updated in patterns: pattern_exists = False for p_original in self.patterns: if p_original.id == p_updated.id: p_original.tuples.update(p_updated.tuples) pattern_exists = True break if pattern_exists is False: child_patterns.append(p_updated) for proc in processes: proc.join() print("\nSELF Patterns:") for p in self.patterns: p.merge_all_tuples_bet() print('\n' + str(p.id)) if self.config.alpha == 0 and self.config.gamma == 0: for bet_words in p.bet_uniques_words: print("BET", bet_words.encode("utf8")) print("\nChild Patterns:") for p in child_patterns: p.merge_all_tuples_bet() print('\n' + str(p.id)) if self.config.alpha == 0 and self.config.gamma == 0: for bet_words in p.bet_uniques_words: print("BET", bet_words.encode("utf8")) print(len(child_patterns), "new created patterns") # merge/aggregate similar patterns generated by # the child processes # start comparing smaller ones with greater ones child_patterns.sort(key=lambda y: len(y.tuples), reverse=False) count = 0 new_list = list(self.patterns) for p1 in child_patterns: print("\nNew Patterns", len(child_patterns), \ "Processed", count) print("New List", len(new_list)) print("Pattern:", p1.id, "Tuples:", len(p1.tuples)) max_similarity = 0 max_similarity_cluster = None for p2 in new_list: if p1 == p2: continue score = self.similarity_cluster(p1, p2) if score > max_similarity: max_similarity = score max_similarity_cluster = p2 if max_similarity >= self.config.threshold_similarity: for t in p1.tuples: max_similarity_cluster.tuples.add(t) else: new_list.append(p1) count += 1 # add merged patterns to main patterns structure for p in new_list: if p not in self.patterns: self.patterns.append(p) if self.curr_iteration == 0 and 
len(self.patterns) == 0:
                print("No patterns generated")
                sys.exit(0)

            print("\n", len(self.patterns), "patterns generated")

            # merge equal tuples inside patterns to make
            # fewer comparisons when collecting instances
            for p in self.patterns:
                # if only the BET context is being used,
                # merge only based on BET contexts
                if self.config.alpha == 0 and self.config.gamma == 0:
                    p.merge_all_tuples_bet()

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    print('\n' + str(p.id))
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for bet_words in p.bet_uniques_words:
                            print("BET", bet_words)
                    else:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")

            # Look for sentences with occurrences of the seeds'
            # semantic types (e.g., ORG - LOC).
            # These were already collected and are stored in
            # self.processed_tuples
            #
            # Measure the similarity of each occurrence with
            # each extraction pattern and store each pattern that has a
            # similarity higher than a given threshold
            #
            # Each candidate tuple will then have a number of patterns
            # that extracted it, each with an associated degree of match.
            print("\nNumber of tuples to be analyzed:",
                  len(self.processed_tuples))
            print("\nCollecting instances based on",
                  len(self.patterns), "extraction patterns")

            # create copies of the generated extraction patterns
            # to be passed to each process
            patterns = [list(self.patterns) for _ in range(self.num_cpus)]

            # copy all tuples into a Queue shared by all processes
            manager = multiprocessing.Manager()
            queue = manager.Queue()
            for t in self.processed_tuples:
                queue.put(t)

            # each distinct process receives as arguments:
            #   - a list, copy of all the original extraction patterns
            #   - a Queue of the tuples
            #   - a pipe to return the collected tuples and updated
            #     patterns to the parent process
            pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
            processes = [
                multiprocessing.Process(target=self.find_instances,
                                        args=(patterns[i], queue, pipes[i][1]))
                for i in range(self.num_cpus)
            ]

            print("Running", len(processes), "processes")
            for proc in processes:
                proc.start()

            # structures to store each process's altered patterns
            # and collected tuples
            patterns_updated = list()
            collected_tuples = list()

            for i in range(len(pipes)):
                data = pipes[i][0].recv()
                child_pid = data[0]
                patterns = data[1]
                tuples = data[2]
                print(child_pid, "patterns", len(patterns),
                      "tuples", len(tuples))
                patterns_updated.extend(patterns)
                collected_tuples.extend(tuples)

            for proc in processes:
                proc.join()

            # Extraction patterns aggregation happens here:
            for p_updated in patterns_updated:
                for p_original in self.patterns:
                    if p_original.id == p_updated.id:
                        p_original.positive += p_updated.positive
                        p_original.negative += p_updated.negative
                        p_original.unknown += p_updated.unknown

            # index the patterns in a hashtable for later use
            for p in self.patterns:
                self.patterns_index[p.id] = p

            # update all patterns' confidence
            for p in self.patterns:
                p.update_confidence(self.config)

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    print(p.id)
                    print("Positive", p.positive)
                    print("Negative", p.negative)
                    print("Pattern Confidence", p.confidence)
                    print("\n")

            # Candidate tuples aggregation happens here:
            print("Collecting generated candidate tuples")
            for e in collected_tuples:
                t = e[0]
                pattern_best = e[1]
                sim_best = e[2]

                # if this tuple was already extracted, check if this
                # extraction pattern is already associated with it; if not,
                # associate this pattern with it and store the similarity score
                if t in self.candidate_tuples:
                    t_patterns = self.candidate_tuples[t]
                    if t_patterns is not None:
                        if pattern_best not in [x[0] for x in t_patterns]:
                            self.candidate_tuples[t].append(
                                (self.patterns_index[pattern_best.id],
                                 sim_best))

                # if this tuple was not extracted before, associate this
                # pattern with the instance and the similarity score
                else:
                    self.candidate_tuples[t].append(
                        (self.patterns_index[pattern_best.id], sim_best))

            # update tuples' confidence based on the patterns' confidence
            print("\n\nCalculating tuples confidence")
            for t in list(self.candidate_tuples.keys()):
                confidence = 1
                t.confidence_old = t.confidence
                for p in self.candidate_tuples.get(t):
                    confidence *= 1 - (p[0].confidence * p[1])
                t.confidence = 1 - confidence

                if self.curr_iteration > 0:
                    t.confidence = \
                        t.confidence * self.config.wUpdt + \
                        t.confidence_old * (1 - self.config.wUpdt)

            # sort tuples by confidence and print
            if PRINT_TUPLES is True:
                extracted_tuples = list(self.candidate_tuples.keys())
                tuples_sorted = sorted(extracted_tuples,
                                       key=lambda tl: tl.confidence,
                                       reverse=True)
                for t in tuples_sorted:
                    print(t.sentence)
                    print(t.e1, t.e2)
                    print(t.confidence)
                    print("\n")

            # update the seed set of tuples to use in the next iteration
            # seeds = { T | conf(T) > instance_confidence }
            print("Adding tuples to seed with confidence >= " +
                  str(self.config.instance_confidence))
            for t in list(self.candidate_tuples.keys()):
                if t.confidence >= self.config.instance_confidence:
                    seed = Seed(t.e1, t.e2)
                    self.config.positive_seed_tuples.add(seed)

            # increment the number of iterations
            self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(matutils.unitvec(asarray(v_bet1)),
                                     matutils.unitvec(asarray(v_bet2)))
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1
        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize()) + '\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(matutils.unitvec(t.bet_vector),
                                            matutils.unitvec(asarray(p_bet_v)))
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if it is above the similarity threshold,
                # associate the pattern with the tuple
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match, create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match, add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # eliminate clusters supported by five or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
        child_conn.send((pid, new_patterns))
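# Standalone sketch of the tuple-confidence update used in init_bootstrap
# above: a noisy-or over the (pattern confidence, similarity) pairs associated
# with a tuple, smoothed against the previous iteration's value via wUpdt.
# The helper name and the plain float pairs are illustrative stand-ins for the
# Pattern/Tuple objects used by the class.
def noisy_or_confidence(pattern_matches, old_confidence, w_updt, iteration):
    """pattern_matches: iterable of (pattern_confidence, similarity) pairs."""
    confidence = 1.0
    for pattern_confidence, similarity in pattern_matches:
        confidence *= 1 - (pattern_confidence * similarity)
    new_confidence = 1 - confidence
    if iteration > 0:
        # weighted average with the confidence from the previous iteration
        new_confidence = (new_confidence * w_updt +
                          old_confidence * (1 - w_updt))
    return new_confidence

# e.g. two patterns with confidences 0.8 and 0.6 and similarities 0.9 and 0.7:
# 1 - (1 - 0.8*0.9) * (1 - 0.6*0.7) = 1 - 0.28*0.58 = 0.8376 before smoothing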
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity,
                 confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences
        where named entities are already tagged

        :param sentences_file: path to the file with tagged sentences
        """
        if os.path.exists("processed_tuples.pkl"):
            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")
        else:
            # load the word2vec model and a PoS-tagger
            self.config.read_word2vec()
            tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(),
                                        self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger,
                                        self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)

            print("\n", len(self.processed_tuples), "tuples generated")
            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha*bef + self.config.beta*bet + \
            self.config.gamma*aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples that are part
        # of a cluster (i.e., an extraction pattern) and the vector of a
        # ReVerb pattern extracted from a sentence;
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0
        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1
        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            f = open(tuples, "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # look for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found:")
                sorted_counts = sorted(
                    list(count_matches.items()),
                    key=operator.itemgetter(1),
                    reverse=True
                )
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances to generate
                # new patterns or update existing ones
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by
                # 'min_pattern_support' tuples or fewer
                new_patterns = [p for p in self.patterns if len(p.tuples) >
                                self.config.min_pattern_support]
                self.patterns = new_patterns
                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC).
                # These were already collected and are stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
print("Number of tuples to be analyzed:", len(self.processed_tuples)) print("\nCollecting instances based on extraction patterns") count = 0 for t in self.processed_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() sim_best = 0 for extraction_pattern in self.patterns: accept, score = self.similarity_all( t, extraction_pattern ) if accept is True: extraction_pattern.update_selectivity( t, self.config ) if score > sim_best: sim_best = score pattern_best = extraction_pattern if sim_best >= self.config.threshold_similarity: # if this tuple was already extracted, check if this # extraction pattern is already associated with it, # if not, associate this pattern with it and store the # similarity score patterns = self.candidate_tuples[t] if patterns is not None: if pattern_best not in [x[0] for x in patterns]: self.candidate_tuples[t].append( (pattern_best, sim_best) ) # If this tuple was not extracted before # associate this pattern with the instance # and the similarity score else: self.candidate_tuples[t].append( (pattern_best, sim_best) ) # update all patterns confidence for p in self.patterns: p.update_confidence(self.config) if PRINT_PATTERNS is True: print("\nPatterns:") for p in self.patterns: for t in p.tuples: print("BEF", t.bef_words) print("BET", t.bet_words) print("AFT", t.aft_words) print("========") print("Positive", p.positive) print("Negative", p.negative) print("Unknown", p.unknown) print("Tuples", len(p.tuples)) print("Pattern Confidence", p.confidence) print("\n") # update tuple confidence based on patterns confidence print("\n\nCalculating tuples confidence") for t in list(self.candidate_tuples.keys()): confidence = 1 t.confidence_old = t.confidence for p in self.candidate_tuples.get(t): confidence *= 1 - (p[0].confidence * p[1]) t.confidence = 1 - confidence # sort tuples by confidence and print if PRINT_TUPLES is True: extracted_tuples = list(self.candidate_tuples.keys()) tuples_sorted = sorted(extracted_tuples, key=lambda tpl: tpl.confidence, reverse=True) for t in tuples_sorted: print(t.sentence) print(t.e1, t.e2) print(t.confidence) print("\n") print("Adding tuples to seed with confidence >= {}".format( str(self.config.instance_confidence))) for t in list(self.candidate_tuples.keys()): if t.confidence >= self.config.instance_confidence: seed = Seed(t.e1, t.e2) self.config.positive_seed_tuples.add(seed) # increment the number of iterations self.curr_iteration += 1 self.write_relationships_to_disk() def cluster_tuples(self, matched_tuples): # this is a single-pass clustering # Initialize: if no patterns exist, first tuple goes to first cluster if len(self.patterns) == 0: c1 = Pattern(matched_tuples[0]) self.patterns.append(c1) count = 0 for t in matched_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() max_similarity = 0 max_similarity_cluster_index = 0 # go through all patterns(clusters of tuples) and find the one # with the highest similarity score for i in range(0, len(self.patterns), 1): extraction_pattern = self.patterns[i] accept, score = self.similarity_all(t, extraction_pattern) if accept is True and score > max_similarity: max_similarity = score max_similarity_cluster_index = i # if max_similarity < min_degree_match create a new cluster having # this tuple as the centroid if max_similarity < self.config.threshold_similarity: c = Pattern(t) self.patterns.append(c) # if max_similarity >= min_degree_match add to the cluster with # the highest similarity else: 
self.patterns[max_similarity_cluster_index].add_tuple(t)
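# Minimal usage sketch for the class above. The configuration, seed and
# sentence file names, as well as the similarity/confidence values, are
# illustrative placeholders rather than settings from the original project.
if __name__ == "__main__":
    breds = BREDS(config_file="parameters.cfg",
                  seeds_file="seeds_positive.txt",
                  negative_seeds="seeds_negative.txt",
                  similarity=0.6,
                  confidence=0.7)

    # parse the tagged sentences, or reuse a cached processed_tuples.pkl
    breds.generate_tuples("sentences.txt")

    # run the bootstrapping loop; pass a pickle path instead of None to have
    # init_bootstrap load previously processed tuples directly
    breds.init_bootstrap(tuples=None)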