def read_negative_seeds(self, negative_seeds):
    for line in fileinput.input(negative_seeds):
        # skip comment lines and blank lines
        if line.startswith("#") or len(line) == 1:
            continue
        if line.startswith("e1"):
            self.e1_type = line.split(":")[1].strip()
        elif line.startswith("e2"):
            self.e2_type = line.split(":")[1].strip()
        else:
            e1 = line.split(";")[0].strip()
            e2 = line.split(";")[1].strip()
            seed = Seed(e1, e2)
            self.negative_seed_tuples.add(seed)
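
# Illustrative example of the seeds file layout that read_negative_seeds() parses.
# Only the structure is implied by the code above; the entity names and types
# below are made up, not taken from the project's actual seed files:
#
#   e1:ORG
#   e2:LOC
#   # comment lines and blank lines are ignored
#   Nokia;Espoo
#   Pfizer;New York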
def init_bootstrapp(self, tuples):
    if tuples is not None:
        # pickled data must be read in binary mode
        f = open(tuples, "rb")
        print("Loading pre-processed sentences", tuples)
        self.processed_tuples = pickle.load(f)
        f.close()
        print(len(self.processed_tuples), "tuples loaded")

    # start the bootstrap iterations
    i = 0
    while i <= self.config.number_iterations:
        print("\n=============================================")
        print("\nStarting iteration", i)
        print("\nLooking for seed matches of:")
        for s in self.config.seed_tuples:
            print(s.e1, '\t', s.e2)

        # Look for sentences matching the seed instances
        count_matches, matched_tuples = self.match_seeds_tuples(self)

        if len(matched_tuples) == 0:
            print("\nNo seed matches found")
            sys.exit(0)

        else:
            print("\nNumber of seed matches found:")
            sorted_counts = sorted(count_matches.items(),
                                   key=operator.itemgetter(1),
                                   reverse=True)
            for t in sorted_counts:
                print(t[0][0], '\t', t[0][1], t[1])

            # Cluster the matched instances: generate patterns/update patterns
            print("\nClustering matched instances to generate patterns")
            self.cluster_tuples(self, matched_tuples)

            # Eliminate patterns supported by fewer than 'min_pattern_support' tuples
            new_patterns = [p for p in self.patterns
                            if len(p.tuples) >= self.config.min_pattern_support]
            self.patterns = new_patterns
            print("\n", len(self.patterns), "patterns generated")
            if i == 0 and len(self.patterns) == 0:
                print("No patterns generated")
                sys.exit(0)

            # Look for sentences with occurrences of the seeds' semantic types (e.g., ORG - LOC).
            # These were already collected and are stored in: self.processed_tuples
            #
            # Measure the similarity of each occurrence with each extraction pattern
            # and store each pattern that has a similarity higher than a given threshold.
            #
            # Each candidate tuple will then have a number of patterns that helped generate it,
            # each with an associated degree of match. Snowball uses this information to
            # compute the confidence of each candidate tuple.
            print("\nCollecting instances based on extraction patterns")
            count = 0
            pattern_best = None
            for t in self.processed_tuples:
                count += 1
                if count % 1000 == 0:
                    sys.stdout.write(".")
                    sys.stdout.flush()
                sim_best = 0
                # TODO: the matching results from the previous iteration are never cleared;
                # do they keep influencing this iteration's pos/neg/unknown statistics?
                for extraction_pattern in self.patterns:
                    score = self.similarity(t, extraction_pattern)
                    if score > self.config.threshold_similarity:
                        extraction_pattern.update_selectivity(t, self.config)
                        if score > sim_best:
                            sim_best = score
                            pattern_best = extraction_pattern

                if sim_best >= self.config.threshold_similarity:
                    # if this instance was already extracted, check whether it was
                    # extracted by this extraction pattern
                    patterns = self.candidate_tuples[t]
                    if patterns is not None:
                        if pattern_best not in [x[0] for x in patterns]:
                            self.candidate_tuples[t].append((pattern_best, sim_best))

                    # if this instance was not extracted before, associate this
                    # extraction pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append((pattern_best, sim_best))

                # update extraction pattern confidence
                extraction_pattern.confidence_old = extraction_pattern.confidence
                extraction_pattern.update_confidence()

            # normalize patterns confidence:
            # find the maximum confidence value and divide all confidences by it
            max_confidence = 0
            for p in self.patterns:
                if p.confidence > max_confidence:
                    max_confidence = p.confidence

            if max_confidence > 0:
                for p in self.patterns:
                    p.confidence = float(p.confidence) / float(max_confidence)

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    p.merge_tuple_patterns()
                    print("Patterns:", len(p.tuples))
                    print("Positive", p.positive)
                    print("Negative", p.negative)
                    print("Unknown", p.unknown)
                    print("Tuples", len(p.tuples))
                    print("Pattern Confidence", p.confidence)
                    print("\n")

            # update tuple confidence based on patterns confidence
            print("\nCalculating tuples confidence")
            for t in self.candidate_tuples.keys():
                confidence = 1
                t.confidence_old = t.confidence
                for p in self.candidate_tuples.get(t):
                    confidence *= 1 - (p[0].confidence * p[1])
                t.confidence = 1 - confidence

                # use past confidence values to calculate the new confidence;
                # if parameter wUpdt < 0.5 the system trusts new examples less on each
                # iteration, which leads to more conservative patterns and has a damping effect
                if i > 0:
                    t.confidence = (t.confidence * self.config.wUpdt +
                                    t.confidence_old * (1 - self.config.wUpdt))

            # update the seed set of tuples to use in the next iteration
            # seeds = {T | Conf(T) > min_tuple_confidence}
            if i + 1 < self.config.number_iterations:
                print("Adding tuples to seed with confidence >= " +
                      str(self.config.instance_confidance))
                for t in self.candidate_tuples.keys():
                    if t.confidence >= self.config.instance_confidance:
                        seed = Seed(t.e1, t.e2)
                        self.config.seed_tuples.add(seed)

            # increment the number of iterations
            i += 1

    print("\nWriting extracted relationships to disk")
    f_output = open("relationships.txt", "w")
    tmp = sorted(self.candidate_tuples, key=lambda tpl: tpl.confidence, reverse=True)
    for t in tmp:
        f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' + str(t.confidence) + '\n')
        f_output.write("sentence: " + t.sentence + '\n')

        # write the patterns that extracted this tuple
        patterns = set()
        for pattern in self.candidate_tuples[t]:
            patterns.add(pattern[0])
        for p in patterns:
            p.merge_tuple_patterns()
            f_output.write("pattern_bet: " + ', '.join(p.tuple_patterns) + '\n')

        if t.passive_voice is False or t.passive_voice is None:
            f_output.write("passive voice: False\n")
        elif t.passive_voice is True:
            f_output.write("passive voice: True\n")
        f_output.write("\n")
    f_output.close()
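
# A minimal stand-alone sketch of the tuple-confidence update performed in
# init_bootstrapp() above: Conf(T) = 1 - prod_i(1 - Conf(P_i) * Sim(T, P_i)).
# The helper name and the (confidence, similarity) pair representation are
# illustrative only and are not part of the original code.
def _tuple_confidence_sketch(pattern_matches):
    """pattern_matches: iterable of (pattern_confidence, similarity) pairs."""
    not_confident = 1.0
    for pattern_confidence, similarity in pattern_matches:
        not_confident *= 1.0 - pattern_confidence * similarity
    return 1.0 - not_confident

# e.g. _tuple_confidence_sketch([(0.9, 0.8), (0.5, 0.6)])
#      = 1 - (1 - 0.72) * (1 - 0.30) = 1 - 0.28 * 0.7 = 0.804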