Example 1
    def read_seeds(self, seeds_file, holder):
        # Parse a seeds file: comment and blank lines are skipped,
        # "e1:<type>" / "e2:<type>" lines set the entity types, and
        # every other line holds an "<entity1>;<entity2>" seed pair
        for line in fileinput.input(seeds_file):
            if line.startswith("#") or len(line) == 1:
                continue
            if line.startswith("e1"):
                self.e1_type = line.split(":")[1].strip()
            elif line.startswith("e2"):
                self.e2_type = line.split(":")[1].strip()
            else:
                e1 = line.split(";")[0].strip()
                e2 = line.split(";")[1].strip()
                seed = Seed(e1, e2)
                holder.add(seed)
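The seeds file itself is not shown in the example; under the parsing rules above (comment and blank lines skipped, "e1:"/"e2:" lines giving the entity types, remaining lines giving entity pairs) a hypothetical input would look like this, with illustrative values:

    # seed pairs for an ORG-LOC relation (hypothetical example)
    e1:ORG
    e2:LOC
    Google;Mountain View
    Nokia;Espoo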
Example 2
    def init_bootstrap(self, tuples):

        # starts the bootstrap iterations

        if tuples is not None:
            # pickle files must be read in binary mode
            f = open(tuples, "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(count_matches.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])

                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances, to generate
                # patterns/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by 'min_pattern_support'
                # or fewer tuples
                new_patterns = [
                    p for p in self.patterns
                    if len(p.tuples) > self.config.min_pattern_support
                ]
                self.patterns = new_patterns

                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC).
                # These sentences were already collected and are stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store every pattern whose
                # similarity is higher than a given threshold.
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("Number of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on extraction patterns")
                count = 0

                for t in self.processed_tuples:

                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()

                    sim_best = 0
                    pattern_best = None
                    for extraction_pattern in self.patterns:
                        accept, score = self.similarity_all(
                            t, extraction_pattern)
                        if accept is True:
                            extraction_pattern.update_selectivity(
                                t, self.config)
                            if score > sim_best:
                                sim_best = score
                                pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this tuple was already extracted, check if this
                        # extraction pattern is already associated with it,
                        # if not, associate this pattern with it and store the
                        # similarity score
                        patterns = self.candidate_tuples[t]
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best))

                        # If this tuple was not extracted before,
                        # associate this pattern with the instance
                        # and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns confidence
                print("\n\nCalculating tuples confidence")
                for t in self.candidate_tuples.keys():
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = self.candidate_tuples.keys()
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tpl: tpl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                print("Adding tuples to seed with confidence >=" + \
                      str(self.config.instance_confidence))
                for t in self.candidate_tuples.keys():
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()
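Example 2 calls match_seeds_tuples() without showing it. From the way its return values are used above (a dict mapping each seed pair to its match count, and a list of the matched tuples), a plausible sketch of that method, not the project's actual implementation, is:

    def match_seeds_tuples(self):
        # compare every processed tuple against the positive seed pairs
        count_matches = dict()
        matched_tuples = list()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    count_matches[(t.e1, t.e2)] = \
                        count_matches.get((t.e1, t.e2), 0) + 1
        return count_matches, matched_tuples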
Example 3
    def init_bootstrap(self, tuples):

        # starts the bootstrap iterations

        if tuples is not None:
            # pickle files must be read in binary mode
            f = open(tuples, "rb")
            print "Loading pre-processed sentences", tuples
            self.processed_tuples = cPickle.load(f)
            f.close()
            print len(self.processed_tuples), "tuples loaded"

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print "=========================================="
            print "\nStarting iteration", self.curr_iteration
            print "\nLooking for seed matches of:"
            for s in self.config.positive_seed_tuples:
                print s.e1, '\t', s.e2

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print "\nNo seed matches found"
                sys.exit(0)

            else:
                print "\nNumber of seed matches found"
                sorted_counts = sorted(count_matches.items(),
                                       key=operator.itemgetter(1),
                                       reverse=True)

                for t in sorted_counts:
                    print t[0][0], '\t', t[0][1], t[1]
                print "\n", len(matched_tuples), "tuples matched"

                # Cluster the matched instances: generate patterns
                print "\nClustering matched instances to generate patterns"
                if len(self.patterns) == 0:
                    self.cluster_tuples(matched_tuples)

                    # Eliminate patterns supported by
                    # 'min_pattern_support' or fewer tuples
                    new_patterns = [
                        p for p in self.patterns
                        if len(p.tuples) > self.config.min_pattern_support
                    ]
                    self.patterns = new_patterns

                else:
                    # Parallelize single-pass clustering
                    # Each tuple must be compared with each extraction pattern

                    # Map:
                    # - Divide the tuples into smaller lists, according
                    #   to the number of CPUs
                    # - Pass to each CPU a sub-list of tuples and all the
                    #   patterns; the comparisons are done by each CPU

                    # Merge:
                    # - Each CPU sends the updated and newly created
                    #   patterns back to the parent process
                    # - Merge patterns based on their pattern_id
                    # - Cluster newly created patterns with
                    #   single-pass clustering

                    # make a copy of the extraction patterns to be
                    # passed to each CPU
                    patterns = [
                        list(self.patterns) for _ in range(self.num_cpus)
                    ]

                    # distribute tuples per different CPUs
                    chunks = [list() for _ in range(self.num_cpus)]
                    n_tuples_per_child = int(
                        math.ceil(float(len(matched_tuples)) / self.num_cpus))
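                    # ceil() guarantees every tuple is assigned; when the
                    # split is uneven the last chunk comes out shorter
                    # (possibly empty)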

                    print "\n#CPUS", self.num_cpus, '\t', \
                        "Tuples per CPU", n_tuples_per_child

                    chunk_n = 0
                    chunk_begin = 0
                    chunk_end = n_tuples_per_child

                    while chunk_n < self.num_cpus:
                        chunks[chunk_n] = matched_tuples[
                            chunk_begin:chunk_end]
                        chunk_begin = chunk_end
                        chunk_end += n_tuples_per_child
                        chunk_n += 1

                    count = 0
                    for c in chunks:
                        print "CPU_" + str(count), "  ", len(c), "patterns"
                        count += 1

                    pipes = [
                        multiprocessing.Pipe(False)
                        for _ in range(self.num_cpus)
                    ]
                    processes = [
                        multiprocessing.Process(
                            target=self.cluster_tuples_parallel,
                            args=(patterns[i], chunks[i], pipes[i][1]))
                        for i in range(self.num_cpus)
                    ]

                    print "\nRunning", len(processes), " processes"
                    for proc in processes:
                        proc.start()

                    # Receive and merge all patterns by 'pattern_id';
                    # newly created patterns (i.e., with a new pattern_id)
                    # go into 'child_patterns' and are then merged
                    # by single-pass clustering

                    child_patterns = list()

                    for i in range(len(pipes)):
                        data = pipes[i][0].recv()
                        patterns = data[1]
                        for p_updated in patterns:
                            pattern_exists = False
                            for p_original in self.patterns:
                                if p_original.id == p_updated.id:
                                    p_original.tuples.update(p_updated.tuples)
                                    pattern_exists = True
                                    break

                            if pattern_exists is False:
                                child_patterns.append(p_updated)

                    for proc in processes:
                        proc.join()

                    print "\nSELF Patterns:"
                    for p in self.patterns:
                        p.merge_all_tuples_bet()
                        print '\n' + str(p.id)
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print "BET", bet_words.encode("utf8")

                    print "\nChild Patterns:"
                    for p in child_patterns:
                        p.merge_all_tuples_bet()
                        print '\n' + str(p.id)
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print "BET", bet_words.encode("utf8")

                    print len(child_patterns), "newly created patterns"

                    # merge/aggregate similar patterns generated by
                    # the child processes

                    # start by comparing the smaller ones with the larger ones
                    child_patterns.sort(key=lambda y: len(y.tuples),
                                        reverse=False)
                    count = 0
                    new_list = list(self.patterns)
                    for p1 in child_patterns:
                        print "\nNew Patterns", len(child_patterns), \
                            "Processed", count
                        print "New List", len(new_list)
                        print "Pattern:", p1.id, "Tuples:", len(p1.tuples)
                        max_similarity = 0
                        max_similarity_cluster = None
                        for p2 in new_list:
                            if p1 == p2:
                                continue
                            score = self.similarity_cluster(p1, p2)
                            if score > max_similarity:
                                max_similarity = score
                                max_similarity_cluster = p2
                        if max_similarity >= self.config.threshold_similarity:
                            for t in p1.tuples:
                                max_similarity_cluster.tuples.add(t)
                        else:
                            new_list.append(p1)
                        count += 1

                    # add merged patterns to main patterns structure
                    for p in new_list:
                        if p not in self.patterns:
                            self.patterns.append(p)

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print "No patterns generated"
                    sys.exit(0)

                print "\n", len(self.patterns), "patterns generated"

                # merge equal tuples inside patterns to make
                # fewer comparisons when collecting instances
                for p in self.patterns:
                    # if only the BET context is being used,
                    # merge only based on BET contexts
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        p.merge_all_tuples_bet()

                if PRINT_PATTERNS is True:
                    print "\nPatterns:"
                    for p in self.patterns:
                        print '\n' + str(p.id)
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print "BET", bet_words
                        else:
                            for t in p.tuples:
                                print "BEF", t.bef_words
                                print "BET", t.bet_words
                                print "AFT", t.aft_words
                                print "========"

                # Look for sentences with occurrences of the
                # seeds' semantic types (e.g., ORG - LOC)

                # These sentences were already collected and are stored in
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with
                # each extraction pattern and store every pattern whose
                # similarity is higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print "\nNumber of tuples to be analyzed:", \
                    len(self.processed_tuples)

                print "\nCollecting instances based on", \
                    len(self.patterns), "extraction patterns"

                # create copies of generated extraction patterns
                # to be passed to each process
                patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                # copy all tuples into a Queue shared by all processes
                manager = multiprocessing.Manager()
                queue = manager.Queue()
                for t in self.processed_tuples:
                    queue.put(t)

                # each distinct process receives as arguments:
                #   - a list, copy of all the original extraction patterns
                #   - a Queue of the tuples
                #   - a pipe to return the collected tuples and updated
                #     patterns to the parent process

                pipes = [
                    multiprocessing.Pipe(False) for _ in range(self.num_cpus)
                ]
                processes = [
                    multiprocessing.Process(target=self.find_instances,
                                            args=(patterns[i], queue,
                                                  pipes[i][1]))
                    for i in range(self.num_cpus)
                ]

                print "Running", len(processes), " processes"
                for proc in processes:
                    proc.start()

                # structures to store each process altered patterns
                # and collected tuples
                patterns_updated = list()
                collected_tuples = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    child_pid = data[0]
                    patterns = data[1]
                    tuples = data[2]
                    print child_pid, "patterns", len(patterns), \
                        "tuples", len(tuples)
                    patterns_updated.extend(patterns)
                    collected_tuples.extend(tuples)

                for proc in processes:
                    proc.join()

                # Extraction patterns aggregation happens here:
                for p_updated in patterns_updated:
                    for p_original in self.patterns:
                        if p_original.id == p_updated.id:
                            p_original.positive += p_updated.positive
                            p_original.negative += p_updated.negative
                            p_original.unknown += p_updated.unknown

                # Index the patterns in a hashtable for later use
                for p in self.patterns:
                    self.patterns_index[p.id] = p

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print "\nPatterns:"
                    for p in self.patterns:
                        print p.id
                        print "Positive", p.positive
                        print "Negative", p.negative
                        print "Pattern Confidence", p.confidence
                        print "\n"

                # Candidate tuples aggregation happens here:
                print "Collecting generated candidate tuples"
                for e in collected_tuples:
                    t = e[0]
                    pattern_best = e[1]
                    sim_best = e[2]

                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it; if
                    # not, associate this pattern and the similarity score
                    if t in self.candidate_tuples:
                        t_patterns = self.candidate_tuples[t]
                        if t_patterns is not None:
                            if pattern_best not in [x[0] for x in t_patterns]:
                                self.candidate_tuples[t].append(
                                    (self.patterns_index[pattern_best.id],
                                     sim_best))

                    # if this tuple was not extracted before, associate this
                    # pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (self.patterns_index[pattern_best.id], sim_best))

                # update tuple confidence based on patterns confidence
                print "\n\nCalculating tuples confidence"
                for t in self.candidate_tuples.keys():
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    if self.curr_iteration > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = self.candidate_tuples.keys()
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tl: tl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print t.sentence
                        print t.e1, t.e2
                        print t.confidence
                        print "\n"

                # update seed set of tuples to use in next iteration
                # seeds = { T | conf(T) > instance_confidence }
                print "Adding tuples to seed with confidence >=" + \
                      str(self.config.instance_confidence)
                for t in self.candidate_tuples.keys():
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()
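Both versions rely on single-pass clustering (cluster_tuples(), and the merging of child_patterns in Example 3), which is never shown. A minimal sketch of the scheme, using a simplified stand-in Pattern class and an injected similarity function rather than the project's actual implementations:

    class Pattern(object):
        # stand-in: the real class also tracks an id, contexts,
        # and the positive/negative/unknown counters
        def __init__(self, t):
            self.tuples = {t}

    def single_pass_cluster(matched_tuples, patterns, similarity, threshold):
        # assign each tuple to the most similar existing pattern;
        # if no pattern clears the threshold, start a new one
        for t in matched_tuples:
            best_score, best_pattern = 0.0, None
            for p in patterns:
                score = similarity(t, p)
                if score > best_score:
                    best_score, best_pattern = score, p
            if best_pattern is not None and best_score >= threshold:
                best_pattern.tuples.add(t)
            else:
                patterns.append(Pattern(t))
        return patterns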