Example #1
 def read_negative_seeds(self, negative_seeds):
     # parses a negative seeds file (requires: import fileinput):
     # "#" lines are comments, "e1:"/"e2:" lines set the entity types,
     # every other line holds an "ENT1;ENT2" pair
     for line in fileinput.input(negative_seeds):
         if line.startswith("#") or len(line) == 1:
             continue
         if line.startswith("e1"):
             self.e1_type = line.split(":")[1].strip()
         elif line.startswith("e2"):
             self.e2_type = line.split(":")[1].strip()
         else:
             e1 = line.split(";")[0].strip()
             e2 = line.split(";")[1].strip()
             seed = Seed(e1, e2)
             self.negative_seed_tuples.add(seed)
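
A minimal sketch of the input file this parser expects, inferred from the branches above: "#" lines are comments, "e1:"/"e2:" lines declare the entity types, and every other line holds an entity pair split on ";". The file name and the pairs below are invented for illustration:

    # negative_seeds.txt (hypothetical)
    e1:ORG
    e2:LOC
    Nokia;Espoo
    Pfizer;Boston

Each pair line becomes a Seed(e1, e2) added to self.negative_seed_tuples.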
Example #2
    def init_bootstrapp(self, tuples):
        if tuples is not None:
            print("Loading pre-processed sentences", tuples)
            # pickle data must be read in binary mode
            with open(tuples, "rb") as f:
                self.processed_tuples = pickle.load(f)
            print(len(self.processed_tuples), "tuples loaded")

        # start the bootstrap iterations
        i = 0
        while i <= self.config.number_iterations:
            print("\n=============================================")
            print("\nStarting iteration", i)
            print("\nLooking for seed matches of:")
            for s in self.config.seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples(self)

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(count_matches.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])

                # Cluster the matched instances: generate patterns/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(self, matched_tuples)

                # Eliminate patterns supported by less than 'min_pattern_support' tuples
                new_patterns = [
                    p for p in self.patterns
                    if len(p.tuples) >= self.config.min_pattern_support
                ]
                self.patterns = new_patterns
                print("\n", len(self.patterns), "patterns generated")
                if i == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds' semantic types (e.g., ORG - LOC)
                # These were already collected and are stored in: self.processed_tuples
                #
                # Measure the similarity of each occurrence with each extraction pattern
                # and store each pattern that has a similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns that helped generate it,
                # each with an associated degree of match. Snowball uses this information
                # below to compute each tuple's confidence.
                print("\nCollecting instances based on extraction patterns")
                count = 0
                for t in self.processed_tuples:
                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    sim_best = 0
                    pattern_best = None
                    # TODO: matches from the previous iteration are not cleared,
                    # so do they keep influencing this round's pos/neg/unknown counts?
                    for extraction_pattern in self.patterns:
                        score = self.similarity(t, extraction_pattern)
                        if score > self.config.threshold_similarity:
                            extraction_pattern.update_selectivity(
                                t, self.config)
                        if score > sim_best:
                            sim_best = score
                            pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this instance was already extracted, check if it was by this extraction pattern
                        patterns = self.candidate_tuples[t]
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best))

                        # if this instance was not extracted before, associate this
                        # extraction pattern with the instance and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                # update each extraction pattern's confidence
                # (once per iteration, after all candidate tuples have been scored)
                for extraction_pattern in self.patterns:
                    extraction_pattern.confidence_old = extraction_pattern.confidence
                    extraction_pattern.update_confidence()

                # normalize patterns confidence:
                # find the maximum confidence value and divide all by it
                max_confidence = max(
                    (p.confidence for p in self.patterns), default=0)

                if max_confidence > 0:
                    for p in self.patterns:
                        p.confidence = float(p.confidence) / float(max_confidence)

                if PRINT_PATTERNS:
                    print("\nPatterns:")
                    for p in self.patterns:
                        p.merge_tuple_patterns()
                        print("Patterns:", len(p.tuples))
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns confidence
                print("\nCalculating tuples confidence")
                for t in self.candidate_tuples.keys():
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    # use past confidence values to calculate the new confidence:
                    # if wUpdt < 0.5 the system trusts new examples less on each
                    # iteration, which leads to more conservative patterns and has
                    # a damping effect
                    if i > 0:
                        t.confidence = t.confidence * self.config.wUpdt + t.confidence_old * (
                            1 - self.config.wUpdt)

                # update seed set of tuples to use in next iteration
                # seeds = { T | Conf(T) > min_tuple_confidence }
                if i + 1 < self.config.number_iterations:
                    print("Adding tuples to seed with confidence =>" +
                          str(self.config.instance_confidance))
                    for t in self.candidate_tuples.keys():
                        if t.confidence >= self.config.instance_confidance:
                            seed = Seed(t.e1, t.e2)
                            self.config.seed_tuples.add(seed)

                # increment the number of iterations
                i += 1

        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(self.candidate_tuples,
                     key=lambda tpl: tpl.confidence,
                     reverse=True)
        for t in tmp:
            # f_output.write("instance: "+t.e1.encode("utf8")+'\t'+t.e2.encode("utf8")+'\tscore:'+str(t.confidence)+'\n')
            # f_output.write("sentence: "+t.sentence.encode("utf8")+'\n')

            f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' +
                           str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')

            # write the patterns that extracted this tuple
            patterns = set()
            for pattern in self.candidate_tuples[t]:
                patterns.add(pattern[0])
            for p in patterns:
                p.merge_tuple_patterns()
                f_output.write("pattern_bet: " + ', '.join(p.tuple_patterns) +
                               '\n')
            if t.passive_voice is False or t.passive_voice is None:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()
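
The confidence arithmetic above (the noisy-or combination plus the wUpdt damping) can be isolated into a standalone sketch. This is an illustration only: the function name and the plain (pattern_confidence, similarity) float pairs are stand-ins for the Pattern objects and scores the method actually uses.

    def tuple_confidence(matches, wUpdt=0.5, confidence_old=None):
        # matches: list of (pattern_confidence, similarity) pairs in [0, 1]
        confidence = 1.0
        for pattern_confidence, similarity in matches:
            confidence *= 1 - pattern_confidence * similarity
        confidence = 1 - confidence
        # damping against the previous iteration's value (the i > 0 branch):
        # wUpdt < 0.5 trusts new evidence less on each round
        if confidence_old is not None:
            confidence = confidence * wUpdt + confidence_old * (1 - wUpdt)
        return confidence

    print(tuple_confidence([(0.9, 0.8), (0.6, 0.7)]))          # ~0.838
    print(tuple_confidence([(0.9, 0.8)], confidence_old=0.5))  # 0.61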