Example #1
 def __init__(self, config_file, seeds_file, negative_seeds, similarity,
              confidence):
     self.curr_iteration = 0
     self.patterns = list()
     self.processed_tuples = list()
     self.candidate_tuples = defaultdict(list)
     self.config = Config(config_file, seeds_file, negative_seeds,
                          similarity, confidence)
Example #2
 def __init__(self, args):
     if args.num_cores == 0:
         self.num_cpus = multiprocessing.cpu_count()
     else:
         self.num_cpus = args.num_cores
     self.processed_tuples = list()
     self.candidate_tuples = defaultdict(
         list)  # a missing key returns an empty list instead of a KeyError
     self.config = Config(args.config_file, args.positive_seeds_file,
                          args.negative_seeds_file, args.similarity,
                          args.confidence)
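
Several of these constructors use collections.defaultdict(list) for candidate_tuples: looking up a missing key yields a fresh empty list instead of raising KeyError, so patterns can be appended without an existence check. A minimal sketch of that behavior:

from collections import defaultdict

candidate_tuples = defaultdict(list)
key = ("e1", "e2")  # hypothetical tuple key
candidate_tuples[key].append(("pattern_1", 0.7))  # no KeyError on first access
print(candidate_tuples[key])  # [('pattern_1', 0.7)]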
Example #3
 def __init__(self, config_file, seeds_file, negative_seeds, similarity,
              confidence, num_cores):
     if num_cores == 0:
         self.num_cpus = multiprocessing.cpu_count()
     else:
         self.num_cpus = num_cores
     self.processed_tuples = list()
     self.candidate_tuples = defaultdict(list)
     self.curr_iteration = 0
     self.patterns = list()
     self.patterns_index = dict()
     self.config = Config(config_file, seeds_file, negative_seeds,
                          similarity, confidence)
Example #4
def gather_sizes_with_bootstrapping_patterns(cfg: Box, patterns, all_new_objects) -> DefaultDict[Tuple, list]:
    """Gather text, parse tuples and check if tuples include valid sizes."""
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)
    tuples = generate_tuples(randomString(), config, names=all_new_objects)

    config.visual = cfg.parameters.visual_at_inference

    candidate_tuples = extract_tuples(config, patterns, tuples)
    filtered_tuples = filter_tuples(candidate_tuples, cfg.parameters.dev_threshold)

    for t in candidate_tuples.keys():
        logger.info(t.sentence)
        logger.info(f"{t.e1} {t.e2}")
        logger.info(t.confidence)
        logger.info("\n")

    return filtered_tuples
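
A hedged usage sketch for the function above, assuming a config.yml in the Box layout used by the later examples, and that 'patterns' and 'all_new_objects' come from an earlier bootstrapping step:

import yaml
from box import Box

with open("config.yml", "r") as ymlfile:
    cfg = Box(yaml.safe_load(ymlfile))

# 'patterns' and 'all_new_objects' are assumed to be produced upstream
filtered = gather_sizes_with_bootstrapping_patterns(cfg, patterns, all_new_objects)
print(len(filtered), "candidate tuples passed the dev threshold")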
Example #5
 def __init__(self, config_file, seeds_file, negative_seeds, similarity,
              confidence, num_cores):
     if num_cores == 0:
         self.num_cpus = multiprocessing.cpu_count()
     else:
         self.num_cpus = num_cores
     self.processed_tuples = list()
     self.candidate_tuples = defaultdict(list)
     self.curr_iteration = 0
     self.patterns = list()
     self.patterns_index = dict()
     self.config = Config(config_file, seeds_file, negative_seeds,
                          similarity, confidence)
Example #6
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity,
                 confidence, num_cores):
        if num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = num_cores
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.curr_iteration = 0
        self.patterns = list()
        self.patterns_index = dict()
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):

        # generate tuple instances from a text file with sentences
        # where named entities are already tagged

        # load word2vec model
        self.config.read_word2vec()

        # copy all sentences from input file into a Queue
        # shared by all processes
        manager = multiprocessing.Manager()
        queue = manager.Queue()

        print("\nLoading sentences from file")
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            if line.startswith("#"):
                continue
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")
            queue.put(line.strip())
        f_sentences.close()

        pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
        processes = [
            multiprocessing.Process(target=self.generate_instances,
                                    args=(queue, pipes[i][1]))
            for i in range(self.num_cpus)]

        print("\nGenerating relationship instances from sentences")
        print("Running", len(processes), " processes")
        for proc in processes:
            proc.start()

        for i in range(len(pipes)):
            data = pipes[i][0].recv()
            child_instances = data[1]
            for x in child_instances:
                self.processed_tuples.append(x)

        for proc in processes:
            proc.join()

        print("\n", len(self.processed_tuples), "instances generated")
        print("Writing generated tuples to disk")
        f = open("processed_tuples.pkl", "wb")
        pickle.dump(self.processed_tuples, f)
        f.close()

    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(), \
                        "Instances to process", sentences.qsize())

                sentence = Sentence(s,
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break

    def similarity_3_contexts(self, t, p):
        (bef, bet, aft) = (0, 0, 0)
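
        # The returned score is a weighted sum of cosine similarities over
        # the BEFORE, BETWEEN and AFTER context vectors; the weights alpha,
        # beta and gamma are typically chosen to sum to 1.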

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(
                matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)
            )

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(
                matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)
            )

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(
                matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)
            )

        return self.config.alpha*bef + \
               self.config.beta*bet + \
               self.config.gamma*aft

    def similarity_all(self, t, extraction_pattern):

        # calculates the cosine similarity between a tuple and every ReVerb
        # pattern (tuple) in a cluster (i.e., extraction pattern);
        #
        # returns whether a majority of the scores are above the threshold,
        # along with the maximum similarity score

        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):

        # checks whether extracted tuples match the seed tuples

        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1
        return count_matches, matched_tuples

    def cluster_tuples(self, matched_tuples):

        # single-pass clustering

        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            # having this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1+'\t'+t.e2+'\tscore:'+str(t.confidence)+'\n')
            f_output.write("sentence: "+t.sentence+'\n')
            f_output.write("pattern_bef: "+t.bef_words+'\n')
            f_output.write("pattern_bet: "+t.bet_words+'\n')
            f_output.write("pattern_aft: "+t.aft_words+'\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):

        # starts a bootstrap iteration

        if tuples is not None:
            f = open(tuples, "r")
            print("Loading pre-processed sentences", tuples)
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(
                    list(count_matches.items()),
                    key=operator.itemgetter(1),
                    reverse=True
                )

                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances: generate patterns
                print("\nClustering matched instances to generate patterns")
                if len(self.patterns) == 0:
                    self.cluster_tuples(matched_tuples)

                    # Eliminate patterns supported by 'min_pattern_support'
                    # or fewer tuples
                    new_patterns = [p for p in self.patterns if len(p.tuples) >
                                    self.config.min_pattern_support]
                    self.patterns = new_patterns

                else:
                    # Parallelize single-pass clustering
                    # Each tuple must be compared with each extraction pattern

                    # Map:
                    # - Divide the tuples into smaller lists,
                    # according to the number of CPUs
                    # - Pass to each CPU a sub-list of tuples and all the
                    # patterns; the comparison is done by each CPU

                    # Merge:
                    # - Each CPU sends the updated patterns and new patterns
                    # to the parent process
                    # - Merge patterns based on a pattern_id
                    # - Cluster newly created patterns with single-pass
                    # clustering

                    # make a copy of the extraction patterns to be
                    # passed to each CPU
                    patterns = [
                        list(self.patterns) for _ in range(self.num_cpus)
                        ]

                    # distribute tuples per different CPUs
                    chunks = [list() for _ in range(self.num_cpus)]
                    n_tuples_per_child = int(
                        math.ceil(float(len(matched_tuples)) / self.num_cpus)
                    )

                    print("\n#CPUS", self.num_cpus, '\t', \
                        "Tuples per CPU", n_tuples_per_child)

                    chunk_n = 0
                    chunk_begin = 0
                    chunk_end = n_tuples_per_child

                    while chunk_n < self.num_cpus:
                        chunks[chunk_n] = matched_tuples[
                                          chunk_begin:chunk_end
                                          ]
                        chunk_begin = chunk_end
                        chunk_end += n_tuples_per_child
                        chunk_n += 1

                    count = 0
                    for c in chunks:
                        print("CPU_"+str(count), "  ", len(c), "tuples")
                        count += 1

                    pipes = [
                        multiprocessing.Pipe(False)
                        for _ in range(self.num_cpus)
                        ]
                    processes = [
                        multiprocessing.Process(
                            target=self.cluster_tuples_parallel,
                            args=(patterns[i], chunks[i], pipes[i][1]))
                        for i in range(self.num_cpus)
                        ]

                    print("\nRunning", len(processes), " processes")
                    for proc in processes:
                        proc.start()

                    # Receive and merge all patterns by 'pattern_id';
                    # newly created patterns (new pattern_id) go into
                    # 'child_patterns' and are then merged
                    # by single-pass clustering between patterns

                    child_patterns = list()

                    for i in range(len(pipes)):
                        data = pipes[i][0].recv()
                        patterns = data[1]
                        for p_updated in patterns:
                            pattern_exists = False
                            for p_original in self.patterns:
                                if p_original.id == p_updated.id:
                                    p_original.tuples.update(p_updated.tuples)
                                    pattern_exists = True
                                    break

                            if pattern_exists is False:
                                child_patterns.append(p_updated)

                    for proc in processes:
                        proc.join()

                    print("\nSELF Patterns:")
                    for p in self.patterns:
                        p.merge_all_tuples_bet()
                        print('\n'+str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print("\nChild Patterns:")
                    for p in child_patterns:
                        p.merge_all_tuples_bet()
                        print('\n'+str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print(len(child_patterns), "newly created patterns")

                    # merge/aggregate similar patterns generated by
                    # the child processes

                    # start by comparing the smaller ones against the larger ones
                    child_patterns.sort(key=lambda y: len(y.tuples),
                                        reverse=False)
                    count = 0
                    new_list = list(self.patterns)
                    for p1 in child_patterns:
                        print("\nNew Patterns", len(child_patterns), \
                            "Processed", count)
                        print("New List", len(new_list))
                        print("Pattern:", p1.id, "Tuples:", len(p1.tuples))
                        max_similarity = 0
                        max_similarity_cluster = None
                        for p2 in new_list:
                            if p1 == p2:
                                continue
                            score = self.similarity_cluster(p1, p2)
                            if score > max_similarity:
                                max_similarity = score
                                max_similarity_cluster = p2
                        if max_similarity >= self.config.threshold_similarity:
                            for t in p1.tuples:
                                max_similarity_cluster.tuples.add(t)
                        else:
                            new_list.append(p1)
                        count += 1

                    # add merged patterns to main patterns structure
                    for p in new_list:
                        if p not in self.patterns:
                            self.patterns.append(p)

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                print("\n", len(self.patterns), "patterns generated")

                # merge equal tuples inside patterns to make
                # less comparisons in collecting instances
                for p in self.patterns:
                    # if only the BET context is being used,
                    # merge only based on BET contexts
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        p.merge_all_tuples_bet()

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print('\n'+str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words)
                        else:
                            for t in p.tuples:
                                print("BEF", t.bef_words)
                                print("BET", t.bet_words)
                                print("AFT", t.aft_words)
                                print("========")

                # Look for sentences with occurrence of
                # seeds semantic types (e.g., ORG - LOC)

                # These were already collected and are stored in
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with
                # each extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("\nNumber of tuples to be analyzed:", \
                    len(self.processed_tuples))

                print("\nCollecting instances based on", \
                    len(self.patterns), "extraction patterns")

                # create copies of generated extraction patterns
                # to be passed to each process
                patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                # copy all tuples into a Queue shared by all processes
                manager = multiprocessing.Manager()
                queue = manager.Queue()
                for t in self.processed_tuples:
                    queue.put(t)

                # each distinct process receives as arguments:
                #   - a list, copy of all the original extraction patterns
                #   - a Queue of the tuples
                #   - a pipe to return the collected tuples and updated
                #     patterns to the parent process

                pipes = [
                    multiprocessing.Pipe(False) for _ in range(self.num_cpus)
                    ]
                processes = [
                    multiprocessing.Process(
                        target=self.find_instances,
                        args=(patterns[i], queue, pipes[i][1]))
                    for i in range(self.num_cpus)
                    ]

                print("Running", len(processes), " processes")
                for proc in processes:
                    proc.start()

                # structures to store each process altered patterns
                # and collected tuples
                patterns_updated = list()
                collected_tuples = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    child_pid = data[0]
                    patterns = data[1]
                    tuples = data[2]
                    print(child_pid, "patterns", len(patterns), \
                        "tuples", len(tuples))
                    patterns_updated.extend(patterns)
                    collected_tuples.extend(tuples)

                for proc in processes:
                    proc.join()

                # Extraction patterns aggregation happens here:
                for p_updated in patterns_updated:
                    for p_original in self.patterns:
                        if p_original.id == p_updated.id:
                            p_original.positive += p_updated.positive
                            p_original.negative += p_updated.negative
                            p_original.unknown += p_updated.unknown

                # Index the patterns in a hashtable for later use
                for p in self.patterns:
                    self.patterns_index[p.id] = p

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(p.id)
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # Candidate tuples aggregation happens here:
                print("Collecting generated candidate tuples")
                for e in collected_tuples:
                    t = e[0]
                    pattern_best = e[1]
                    sim_best = e[2]

                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it, if not,
                    # associate this pattern with it and similarity score
                    if t in self.candidate_tuples:
                        t_patterns = self.candidate_tuples[t]
                        if t_patterns is not None:
                            if pattern_best not in [x[0] for x in t_patterns]:
                                self.candidate_tuples[t].append(
                                    (self.patterns_index[pattern_best.id],
                                     sim_best)
                                )

                    # if this tuple was not extracted before, associate this
                    # pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (self.patterns_index[pattern_best.id], sim_best)
                        )

                # update tuple confidence based on patterns confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence
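                    # noisy-or combination: e.g. two supporting patterns with
                    # confidences 0.8 and 0.6 at full similarity give
                    # 1 - (1 - 0.8) * (1 - 0.6) = 0.92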

                    if self.curr_iteration > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(
                        extracted_tuples,
                        key=lambda tl: tl.confidence,
                        reverse=True
                    )
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                # update seed set of tuples to use in next iteration
                # seeds = { T | conf(T) > instance_confidence }
                print("Adding tuples to seed with confidence >=" + \
                      str(self.config.instance_confidence))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(
                            matutils.unitvec(asarray(v_bet1)),
                            matutils.unitvec(asarray(v_bet2))
                        )
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1

        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize())+'\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    score = 0  # guard against 'score' being unbound below
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(
                                    matutils.unitvec(t.bet_vector),
                                    matutils.unitvec(asarray(p_bet_v))
                                )
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if it's above the threshold, associate the pattern with the tuple
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count, \
                    "tuples processed")

            # go through all patterns (clusters of tuples) and find the one with
            # the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # Eliminate clusters supported by five or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
        child_conn.send((pid, new_patterns))
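
A possible driver for the class above, assuming the file layout of the original BREDS scripts (file names and thresholds here are illustrative, not prescribed):

if __name__ == "__main__":
    breds = BREDS("parameters.cfg", "seeds_positive.txt",
                  "seeds_negative.txt", similarity=0.6,
                  confidence=0.7, num_cores=4)
    breds.generate_tuples("sentences.txt")
    breds.init_bootstrap(tuples=None)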
Example #7
class BREDS(object):
    def __init__(self, config_file, seeds_file, negative_seeds, similarity,
                 confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences where named
        entities are already tagged.

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):

            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")

            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                temp_file.write(i.e1 + '\t' + i.e2 + '\n')
            temp_file.close()

        else:

            # load needed stuff, word2vec model and a pos-tagger
            self.config.read_word2vec()
            tagger = None

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(), self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger, self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)
                print("\n", len(self.processed_tuples), "tuples generated")

            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft

    def similarity_all(self, t, extraction_pattern):

        # calculates the cosine similarity between a tuple and every ReVerb
        # pattern (tuple) in a cluster (i.e., extraction pattern);

        # returns whether a majority of the scores are above the threshold,
        # along with the maximum similarity score

        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks whether extracted tuples match the seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w", encoding='utf-8')
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' +
                           str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            # if t.passive_voice is False:
            #     f_output.write("passive voice: False\n")
            # elif t.passive_voice is True:
            #     f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):

        # starts a bootstrap iteration

        if tuples is not None:
            f = open(tuples, "r")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                print(i)
                temp_file.write(str(i) + '\n')
            temp_file.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])

                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances, to generate
                # patterns/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by 'min_pattern_support'
                # or fewer tuples
                new_patterns = [
                    p for p in self.patterns
                    if len(p.tuples) > self.config.min_pattern_support
                ]
                self.patterns = new_patterns

                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrence of seeds
                # semantic types (e.g., ORG - LOC)
                # These were already collected and are stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("Number of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on extraction patterns")
                count = 0

                for t in self.processed_tuples:

                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()

                    sim_best = 0
                    for extraction_pattern in self.patterns:
                        accept, score = self.similarity_all(
                            t, extraction_pattern)
                        if accept is True:
                            extraction_pattern.update_selectivity(
                                t, self.config)
                            if score > sim_best:
                                sim_best = score
                                pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this tuple was already extracted, check if this
                        # extraction pattern is already associated with it,
                        # if not, associate this pattern with it and store the
                        # similarity score
                        patterns = self.candidate_tuples[t]
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best))

                        # If this tuple was not extracted before
                        # associate this pattern with the instance
                        # and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    if self.curr_iteration > 0:
                        t.confidence = t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tpl: tpl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                print("Adding tuples to seed with confidence >= {}".format(
                    str(self.config.instance_confidence)))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def cluster_tuples(self, matched_tuples):
        # single-pass clustering
        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster having
            #  this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)
Example #8
class BREDS:
    def __init__(self, args):
        if args.num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = args.num_cores
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(
            list)  # a missing key returns an empty list instead of a KeyError
        self.config = Config(args.config_file, args.positive_seeds_file,
                             args.negative_seeds_file, args.similarity,
                             args.confidence)

    def generate_tuples(self, data_dir: str):
        """
        用于从源数据中,用多线程的方式生成tuples
        Args:
            data_dir: 数据存储的路径,其中包括:
                      eg. 源文章名称    __ data/round2/0.txt
                          NER结果名称   __ data/round2/0_ner.pkl
                          文章分句结果   __ data/round2/0_sentence_split.pkl
        """

        # Step 1: load word2idx and emb_matrix
        self.config.load_word2idx_embmatrix()

        # Step 2: generate candidate relation pairs
        instances = list()
        file_names = scan_files(data_dir)

        for file in file_names:
            passage = load_file(data_dir, file, "txt")  # type:str
            sent_split = pickle.load(
                open(data_dir + file + "_sentence_split.pkl",
                     "rb"))  # type:List[tuple]
            ner_result = pickle.load(open(data_dir + file + "_ner.pkl",
                                          "rb"))  # type:List[tuple]

            sent_split.sort(key=lambda x: x[0])

            # Step 2.1: collect the entities of types e1 and e2
            e1_entities, e2_entities = list(), list()
            for e in ner_result:
                # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
                if e[0] == self.config.e1_type:
                    e1_entities.append(e)
                elif e[0] == self.config.e2_type:
                    e2_entities.append(e)
            e1_entities.sort(key=lambda x: x[1])
            e2_entities.sort(key=lambda x: x[1])

            # Step 2.2: for each e1, find candidate e2 entities and build <BEF, BET, AFT, sequence_tag>
            for e1 in e1_entities:
                e1_start, e1_end = e1[1], e1[2]
                cur_sentence_idx = -1
                for idx, s in enumerate(sent_split):
                    if s[0] <= e1_start and s[1] >= e1_end:
                        cur_sentence_idx = idx
                        break
                # bound the e2 search window by the current entity's position:
                # previous sentence + current sentence + next sentence
                search_e2_start = sent_split[
                    cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
                search_e2_end = sent_split[
                    cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1
                    else len(sent_split) - 1][1]

                for i in range(len(e2_entities)):
                    e2 = e2_entities[i]
                    e2_start = e2[1]
                    e2_end = e2[2]
                    if e2_end < search_e2_start:
                        continue
                    elif e2_start > search_e2_end:
                        break
                    elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                        if e1_end == e2_start:
                            # Case (1): e1 immediately precedes e2
                            before = passage[search_e2_start:e1_start]
                            between = ""
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=True,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e2_end == e1_start:
                            # Case (2): e1 immediately follows e2
                            before = passage[search_e2_start:e2_start]
                            between = ""
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=False,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e1_end < e2_start:
                            # Case (3): e1 precedes e2, not adjacent
                            before = passage[search_e2_start:e1_start]
                            between = passage[e1_end:e2_start]
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=True,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)
                        elif e2_end < e1_start:
                            # Case (4): e1 follows e2, not adjacent
                            before = passage[search_e2_start:e2_start]
                            between = passage[e2_end:e1_start]
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3],
                                      e2[3],
                                      sequence_tag=False,
                                      before=before,
                                      between=between,
                                      after=after,
                                      config=self.config)
                            instances.append(t)

        # Step 3: persist the instances to disk
        pickle.dump(
            instances,
            open("./saved_model_files/RE_candidate_instances.pkl", "wb"))

    def similarity_3_contexts(self, t: Tuple, p: Tuple) -> float:
        bef, bet, aft = 0, 0, 0
        # TODO: it seems parameters still need to be added here
        return 0
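
The stub above leaves the context weighting unimplemented. A sketch of what the full version might look like, mirroring the weighted-cosine variant in the earlier examples (it assumes these Tuple objects carry bef/bet/aft vectors and that alpha, beta and gamma are set on the config):

    def similarity_3_contexts(self, t: Tuple, p: Tuple) -> float:
        bef, bet, aft = 0, 0, 0
        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))
        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))
        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))
        return (self.config.alpha * bef + self.config.beta * bet +
                self.config.gamma * aft)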
Example #9
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    test_pairs, unseen_objects = comparison_dev_set(cfg)
    unseen_objects = [o.replace('_', " ") for o in unseen_objects]

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    word2vec_model = load_word2vec(cfg.parameters.word2vec_path)
    similar_words = find_similar_words(word2vec_model, unseen_objects, n_word2vec=200)

    # calc coverage and precision
    results = list()
    settings: List[BackoffSettings] = [
        # BackoffSettings(use_direct=True),
        # BackoffSettings(use_word2vec=True),
        # BackoffSettings(use_hypernyms=True),
        # BackoffSettings(use_hyponyms=True),
        # BackoffSettings(use_head_noun=True),
        # BackoffSettings(use_direct=True, use_word2vec=True),
        BackoffSettings(use_direct=True, use_word2vec=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True),
        # BackoffSettings(use_direct=True, use_head_noun=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True)
    ]
    golds = [p.larger for p in test_pairs]

    for setting in settings:
        preds = list()
        fractions_larger = list()
        notes = list()

        prop = VisualPropagation(G, config.visual_config)
        logger.info(f'\nRunning for setting {setting.print()}')
        comparer = Comparer(prop, setting, similar_words, objects)
        for test_pair in tqdm.tqdm(test_pairs):
            # TODO return confidence; use the higher one
            res_visual, fraction_larger, note = comparer.compare_visual_with_backoff(test_pair)
            fractions_larger.append(fraction_larger)
            preds.append(res_visual)
            notes.append(note)

        with open(f'visual_comparison_predictions_{setting.print()}.pkl', 'wb') as f:
            pickle.dump(list(zip(preds, fractions_larger, notes)), f)

        useful_counts = comparer.useful_paths_count
        tr = SymmetricalLogTransform(base=10, linthresh=1, linscale=1)
        ss = tr.transform([0., max(useful_counts) + 1])
        bins = tr.inverted().transform(np.linspace(*ss, num=100))
        fig, ax = plt.subplots()
        plt.hist(useful_counts, bins=bins)
        plt.xlabel('Number of useful paths')
        ax.set_xscale('symlog')
        plt.savefig(f'useful_paths{setting.print()}.png')

        useful_counts = np.array(useful_counts)
        logger.info(f'Number of objects with no useful path: {len(np.extract(useful_counts == 0, useful_counts))}')
        logger.info(f'Not recog count: {comparer.not_recognized_count}')

        logger.info(f'Total number of test cases: {len(golds)}')
        coverage, selectivity = coverage_accuracy_relational(golds, preds)
        logger.info(f'Coverage: {coverage}')
        logger.info(f'selectivity: {selectivity}')

        results.append(RelationalResult(setting.print(), selectivity, coverage))

        assert len(fractions_larger) == len(preds)
        corrects_not_none = list()
        diffs_not_none = list()
        for i, fraction_larger in enumerate(fractions_larger):
            gold = golds[i]
            res = preds[i]
            if fraction_larger is not None and fraction_larger != 0.5:
                fraction_larger_centered = fraction_larger - .5
                corrects_not_none.append(gold == res)
                diffs_not_none.append(abs(fraction_larger_centered))
        # TODO do something special for when fraction_larger_centered == 0

        regr_linear = Ridge(alpha=1.0)
        regr_linear.fit(np.reshape(diffs_not_none, (-1, 1)), corrects_not_none)
        with open('visual_confidence_model.pkl', 'wb') as f:
            pickle.dump(regr_linear, f)
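
        # A minimal sketch (not part of the pipeline shown here) of reusing the
        # pickled Ridge model at inference time; the further the vote fraction
        # sits from the 0.5 decision boundary, the more the prediction can be
        # trusted:
        #
        #     with open('visual_confidence_model.pkl', 'rb') as f_in:
        #         confidence_model = pickle.load(f_in)
        #     confidence = confidence_model.predict([[abs(fraction_larger - 0.5)]])[0]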

        fig, ax = plt.subplots()
        bin_means, bin_edges, binnumber = stats.binned_statistic(diffs_not_none, corrects_not_none, 'mean',
                                                                 bins=20)
        bin_counts, _, _ = stats.binned_statistic(diffs_not_none, corrects_not_none, 'count',
                                                  bins=20)
        x = np.linspace(min(diffs_not_none), max(diffs_not_none), 500)
        X = np.reshape(x, (-1, 1))
        plt.plot(x, regr_linear.predict(X), '-', label='linear ridge regression')

        minc = min(bin_counts)
        maxc = max(bin_counts)
        norm = colors.SymLogNorm(vmin=minc, vmax=maxc, linthresh=1)
        bin_counts_normalized = [norm(c) for c in bin_counts]
        logger.info(f'counts, norm: {list(zip(bin_counts, bin_counts_normalized))}')
        viridis = cm.get_cmap('viridis', 20)

        mins = bin_edges[:-1]
        maxs = bin_edges[1:]
        mask = ~np.isnan(bin_means)
        plt.hlines(np.extract(mask, bin_means), np.extract(mask, mins), np.extract(mask, maxs),
                   colors=viridis(np.extract(mask, bin_counts_normalized)), lw=5,
                   label='binned statistic of data')
        sm = plt.cm.ScalarMappable(cmap=viridis, norm=norm)
        ticks = [10**1.5, 10**1.75, 10**2, 10**2.5]
        colorbar = plt.colorbar(sm, ticks=ticks)
        colorbar.ax.set_yticklabels(['10^1.5', '10^1.75', '10^2', '10^2.5'])
        colorbar.set_label('bin count')
        plt.ylim(-0.05, 1.05)
        plt.legend()
        plt.xlabel('Absolute fraction_larger')
        plt.ylabel('Selectivity')
        ax.set_xscale('linear')
        plt.savefig('fraction_larger_selectivity_linear.png')
        plt.show()

        correlation, _ = pearsonr(diffs_not_none, corrects_not_none)
        logger.info(f'Pearsons correlation: {correlation}')

        correlation_spearman, _ = spearmanr(np.array(diffs_not_none), b=np.array(corrects_not_none))
        logger.info(f'Spearman correlation: {correlation_spearman}')

    results_df = pd.DataFrame(results)
    results_df.to_csv('results_visual_backoff.csv')
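
# For reference, coverage_accuracy_relational() is used above but not shown.
# A minimal sketch of the metric it appears to compute (an assumption, not the
# actual implementation): coverage is the fraction of test pairs that received
# a prediction at all, selectivity the accuracy on that answered subset.
def coverage_accuracy_relational_sketch(golds, preds):
    answered = [(g, p) for g, p in zip(golds, preds) if p is not None]
    coverage = len(answered) / len(golds) if golds else 0.0
    selectivity = sum(g == p for g, p in answered) / len(answered) if answered else 0.0
    return coverage, selectivity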
Example #10
0
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects,
                                 cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    input_df: DataFrame = pd.read_csv(cfg.path.dev)
    input_df = input_df.astype({'object': str})
    unseen_objects = list(input_df['object'])
    logger.info(f'Unseen objects: {unseen_objects}')

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    with open(cfg.path.final_seeds_cache) as f:
        numeric_seeds = json.load(f)

    numeric_seeds = dict((key.strip().replace(' ', '_'), value)
                         for (key, value) in numeric_seeds.items())
    # 'rhine' is in VG, but it was included as the river; remove it manually,
    # since it shows up in a lot of results
    del numeric_seeds['rhine']

    point_predictions = dict()
    point_predictions_evenly = dict()
    point_predictions_svm = dict()
    prop = VisualPropagation(G, config.visual_config)
    for unseen_object in unseen_objects:
        unseen_object = unseen_object.replace(' ', '_')
        logger.info(f'Processing {unseen_object}')
        if unseen_object not in objects:
            logger.info(f'{unseen_object} not in visuals')
            point_predictions[unseen_object.replace('_', ' ')] = None
            point_predictions_evenly[unseen_object.replace('_', ' ')] = None
            point_predictions_svm[unseen_object.replace('_', ' ')] = None
            continue
        none_count = 0
        lower_bounds = set()
        upper_bounds = set()
        for numeric_seed in tqdm.tqdm(numeric_seeds.keys()):
            pair = Pair(unseen_object, numeric_seed)
            if pair.both_in_list(objects):
                fraction_larger, _ = prop.compare_pair(pair)
                if fraction_larger is None:
                    none_count += 1
                    continue
                if fraction_larger < .5:
                    upper_bounds.add(numeric_seed)
                if fraction_larger > .5:
                    lower_bounds.add(numeric_seed)
                logger.debug(
                    f'{pair.e1} {pair.e2} fraction larger: {fraction_larger}')
            else:
                logger.debug(
                    f'{pair.e1} or {pair.e2} not in VG. Objects: {objects}')

        lower_bounds_sizes = fill_sizes_list(lower_bounds, numeric_seeds)
        upper_bounds_sizes = fill_sizes_list(upper_bounds, numeric_seeds)

        # size = predict_size_with_bounds(lower_bounds_sizes, upper_bounds_sizes)
        size = iterativily_find_size(lower_bounds_sizes, upper_bounds_sizes)
        size_evenly = iterativily_find_size_evenly(lower_bounds_sizes,
                                                   upper_bounds_sizes)
        size_svm = predict_size_with_bounds(lower_bounds_sizes,
                                            upper_bounds_sizes)

        point_predictions[unseen_object.replace('_', ' ')] = size
        point_predictions_evenly[unseen_object.replace('_', ' ')] = size_evenly
        point_predictions_svm[unseen_object.replace('_', ' ')] = size_svm
        logger.info(f'\nObject: {unseen_object}')
        logger.info(f'Size: {size}')
        logger.info(f'Size evenly: {size_evenly}')
        logger.info(f'Size svm: {size_svm}')
        logger.info(
            f"None count: {none_count} out of {len(numeric_seeds.keys())}")
        logger.info(
            f"Lower bounds (n={len(lower_bounds)}): mean: {np.mean(lower_bounds_sizes)} median: {np.median(lower_bounds_sizes)}\n\t{lower_bounds}\n\t{lower_bounds_sizes}"
        )
        logger.info(
            f"Upper bounds (n={len(upper_bounds)}): mean: {np.mean(upper_bounds_sizes)} median: {np.median(upper_bounds_sizes)}\n\t{upper_bounds}\n\t{upper_bounds_sizes}"
        )

    with open('point_predictions_visual_ranges.pkl', 'wb') as f:
        pickle.dump(point_predictions, f)

    with open('point_predictions_visual_ranges_evenly.pkl', 'wb') as f:
        pickle.dump(point_predictions_evenly, f)

    with open('point_predictions_visual_ranges_svm.pkl', 'wb') as f:
        pickle.dump(point_predictions_svm, f)
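
# The point estimators used above (iterativily_find_size,
# iterativily_find_size_evenly, predict_size_with_bounds) are imported from
# elsewhere and not shown. A minimal hypothetical stand-in that squeezes a
# point estimate between the two bound lists:
import numpy as np

def point_estimate_from_bounds(lower_bounds_sizes, upper_bounds_sizes):
    # geometric mean of the tightest pair of bounds; fall back to the median
    # of whichever side is available
    if lower_bounds_sizes and upper_bounds_sizes:
        return float(np.sqrt(max(lower_bounds_sizes) * min(upper_bounds_sizes)))
    if lower_bounds_sizes:
        return float(np.median(lower_bounds_sizes))
    if upper_bounds_sizes:
        return float(np.median(upper_bounds_sizes))
    return None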
Example #11
0
class BREDS(object):
    def __init__(self, config_file, seeds_file, negative_seeds, similarity,
                 confidence, num_cores):
        if num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = num_cores
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.curr_iteration = 0
        self.patterns = list()
        self.patterns_index = dict()
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):

        # generate tuples instances from a text file with sentences
        # where named entities are already tagged

        # load word2vec model
        self.config.read_word2vec()

        # copy all sentences from input file into a Queue
        # shared by all processes
        manager = multiprocessing.Manager()
        queue = manager.Queue()

        print("\nLoading sentences from file")
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            if line.startswith("#"):
                continue
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")
            queue.put(line.strip())
        f_sentences.close()

        pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
        processes = [
            multiprocessing.Process(target=self.generate_instances,
                                    args=(queue, pipes[i][1]))
            for i in range(self.num_cpus)
        ]
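
        # Pipe(False) creates one-way pipes: pipes[i][0] is the receiving end,
        # kept by the parent, and pipes[i][1] the sending end passed to worker i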

        print("\nGenerating relationship instances from sentences")
        print("Running", len(processes), " processes")
        for proc in processes:
            proc.start()

        for i in range(len(pipes)):
            data = pipes[i][0].recv()
            child_instances = data[1]
            for x in child_instances:
                self.processed_tuples.append(x)

        for proc in processes:
            proc.join()

        print("\n", len(self.processed_tuples), "instances generated")
        print("Writing generated tuples to disk")
        f = open("processed_tuples.pkl", "wb")
        pickle.dump(self.processed_tuples, f)
        f.close()

    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(),
                          "Instances to process", sentences.qsize())

                sentence = Sentence(s, self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size, tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break

    def similarity_3_contexts(self, t, p):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha*bef + \
               self.config.beta*bet + \
               self.config.gamma*aft
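
        # e.g., with hypothetical weights alpha=0.2, beta=0.6, gamma=0.2 and
        # context similarities bef=0.1, bet=0.9, aft=0.2, the score is
        # 0.2*0.1 + 0.6*0.9 + 0.2*0.2 = 0.60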

    def similarity_all(self, t, extraction_pattern):

        # calculates the cosine similarity between all patterns part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;
        #
        # returns the max similarity scores

        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0
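
        # i.e., a tuple matches an extraction pattern when at least half of the
        # pattern's scored tuples clear threshold_similarity, and the reported
        # score is the single best match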

    def match_seeds_tuples(self):

        # checks if an extracted tuple matches seeds tuples

        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1
        return count_matches, matched_tuples

    def cluster_tuples(self, matched_tuples):

        # single-pass clustering

        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns(clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            # having this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)
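
    # Single-pass clustering trace, with a hypothetical threshold_similarity
    # of 0.7:
    #   t1 -> no clusters yet, seeds cluster c1
    #   t2 -> best score 0.82 against c1 -> added to c1
    #   t3 -> best score 0.41 against c1 -> below 0.7, seeds cluster c2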

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' +
                           str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):

        # starts a bootstrap iteration

        if tuples is not None:
            f = open(tuples, "r")
            print("Loading pre-processed sentences", tuples)
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)

                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances: generate patterns
                print("\nClustering matched instances to generate patterns")
                if len(self.patterns) == 0:
                    self.cluster_tuples(matched_tuples)

                    # Eliminate patterns supported by 'min_pattern_support'
                    # or fewer tuples
                    new_patterns = [
                        p for p in self.patterns
                        if len(p.tuples) > self.config.min_pattern_support
                    ]
                    self.patterns = new_patterns

                else:
                    # Parallelize single-pass clustering
                    # Each tuple must be compared with each extraction pattern

                    # Map:
                    # - Divide the tuples into smaller lists,
                    # accordingly to the number of CPUs
                    # - Pass to each CPU a sub-list of tuples and all the
                    # patterns, comparison is done by each CPU

                    # Merge:
                    # - Each CPU sends to the father process the updated
                    # patterns and new patterns
                    # - Merge patterns based on a pattern_id
                    # - Cluster new created patterns with single-pass clustering

                    # make a copy of the extraction patterns to be
                    # passed to each CPU
                    patterns = [
                        list(self.patterns) for _ in range(self.num_cpus)
                    ]

                    # distribute tuples per different CPUs
                    chunks = [list() for _ in range(self.num_cpus)]
                    n_tuples_per_child = int(
                        math.ceil(float(len(matched_tuples)) / self.num_cpus))

                    print("\n#CPUS", self.num_cpus, '\t', \
                        "Tuples per CPU", n_tuples_per_child)

                    chunk_n = 0
                    chunck_begin = 0
                    chunck_end = n_tuples_per_child

                    while chunk_n < self.num_cpus:
                        chunks[chunk_n] = matched_tuples[
                            chunck_begin:chunck_end]
                        chunck_begin = chunck_end
                        chunck_end += n_tuples_per_child
                        chunk_n += 1

                    count = 0
                    for c in chunks:
                        print("CPU_" + str(count), "  ", len(c), "patterns")
                        count += 1

                    pipes = [
                        multiprocessing.Pipe(False)
                        for _ in range(self.num_cpus)
                    ]
                    processes = [
                        multiprocessing.Process(
                            target=self.cluster_tuples_parallel,
                            args=(patterns[i], chunks[i], pipes[i][1]))
                        for i in range(self.num_cpus)
                    ]

                    print("\nRunning", len(processes), " processes")
                    for proc in processes:
                        proc.start()

                    # Receive and merge all patterns by 'pattern_id';
                    # newly created patterns (with a new pattern_id) go into
                    # 'child_patterns' and are then merged between themselves
                    # by single-pass clustering

                    child_patterns = list()

                    for i in range(len(pipes)):
                        data = pipes[i][0].recv()
                        patterns = data[1]
                        for p_updated in patterns:
                            pattern_exists = False
                            for p_original in self.patterns:
                                if p_original.id == p_updated.id:
                                    p_original.tuples.update(p_updated.tuples)
                                    pattern_exists = True
                                    break

                            if pattern_exists is False:
                                child_patterns.append(p_updated)

                    for proc in processes:
                        proc.join()

                    print("\nSELF Patterns:")
                    for p in self.patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print("\nChild Patterns:")
                    for p in child_patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print(len(child_patterns), "new created patterns")

                    # merge/aggregate similar patterns generated by
                    # the child processes

                    # start comparing smaller ones with greater ones
                    child_patterns.sort(key=lambda y: len(y.tuples),
                                        reverse=False)
                    count = 0
                    new_list = list(self.patterns)
                    for p1 in child_patterns:
                        print("\nNew Patterns", len(child_patterns), \
                            "Processed", count)
                        print("New List", len(new_list))
                        print("Pattern:", p1.id, "Tuples:", len(p1.tuples))
                        max_similarity = 0
                        max_similarity_cluster = None
                        for p2 in new_list:
                            if p1 == p2:
                                continue
                            score = self.similarity_cluster(p1, p2)
                            if score > max_similarity:
                                max_similarity = score
                                max_similarity_cluster = p2
                        if max_similarity >= self.config.threshold_similarity:
                            for t in p1.tuples:
                                max_similarity_cluster.tuples.add(t)
                        else:
                            new_list.append(p1)
                        count += 1

                    # add merged patterns to main patterns structure
                    for p in new_list:
                        if p not in self.patterns:
                            self.patterns.append(p)

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                print("\n", len(self.patterns), "patterns generated")

                # merge equal tuples inside patterns to make
                # less comparisons in collecting instances
                for p in self.patterns:
                    # if only the BET context is being used,
                    # merge only based on BET contexts
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        p.merge_all_tuples_bet()

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words)
                        else:
                            for t in p.tuples:
                                print("BEF", t.bef_words)
                                print("BET", t.bet_words)
                                print("AFT", t.aft_words)
                                print("========")

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)

                # These were already collected and are stored in
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with
                # each extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("\nNumber of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on",
                      len(self.patterns), "extraction patterns")

                # create copies of generated extraction patterns
                # to be passed to each process
                patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                # copy all tuples into a Queue shared by all processes
                manager = multiprocessing.Manager()
                queue = manager.Queue()
                for t in self.processed_tuples:
                    queue.put(t)

                # each distinct process receives as arguments:
                #   - a list, copy of all the original extraction patterns
                #   - a Queue of the tuples
                #   - a pipe to return the collected tuples and updated
                #     patterns to the parent process

                pipes = [
                    multiprocessing.Pipe(False) for _ in range(self.num_cpus)
                ]
                processes = [
                    multiprocessing.Process(target=self.find_instances,
                                            args=(patterns[i], queue,
                                                  pipes[i][1]))
                    for i in range(self.num_cpus)
                ]

                print("Running", len(processes), " processes")
                for proc in processes:
                    proc.start()

                # structures to store each process altered patterns
                # and collected tuples
                patterns_updated = list()
                collected_tuples = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    child_pid = data[0]
                    patterns = data[1]
                    tuples = data[2]
                    print(child_pid, "patterns", len(patterns), \
                        "tuples", len(tuples))
                    patterns_updated.extend(patterns)
                    collected_tuples.extend(tuples)

                for proc in processes:
                    proc.join()

                # Extraction patterns aggregation happens here:
                for p_updated in patterns_updated:
                    for p_original in self.patterns:
                        if p_original.id == p_updated.id:
                            p_original.positive += p_updated.positive
                            p_original.negative += p_updated.negative
                            p_original.unknown += p_updated.unknown

                # Index the patterns in a hash table for later use
                for p in self.patterns:
                    self.patterns_index[p.id] = p

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(p.id)
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # Candidate tuples aggregation happens here:
                print("Collecting generated candidate tuples")
                for e in collected_tuples:
                    t = e[0]
                    pattern_best = e[1]
                    sim_best = e[2]

                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it, if not,
                    # associate this pattern with it and similarity score
                    if t in self.candidate_tuples:
                        t_patterns = self.candidate_tuples[t]
                        if t_patterns is not None:
                            if pattern_best not in [x[0] for x in t_patterns]:
                                self.candidate_tuples[t].append(
                                    (self.patterns_index[pattern_best.id],
                                     sim_best))

                    # if this tuple was not extracted before, associate this
                    # pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (self.patterns_index[pattern_best.id], sim_best))

                # update tuple confidence based on patterns confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    if self.curr_iteration > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)
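
                    # e.g., two matching patterns with (pattern confidence,
                    # similarity) of (0.8, 0.9) and (0.5, 0.6) give
                    # 1 - (1 - 0.72) * (1 - 0.30) = 0.804; with wUpdt = 0.5 and
                    # an old confidence of 0.6, this smooths to
                    # 0.804 * 0.5 + 0.6 * 0.5 = 0.702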

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tl: tl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                # update seed set of tuples to use in next iteration
                # seeds = { T | conf(T) > instance_confidence }
                print("Adding tuples to seed with confidence >=" + \
                      str(self.config.instance_confidence))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(matutils.unitvec(asarray(v_bet1)),
                                     matutils.unitvec(asarray(v_bet2)))
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1

        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " + str(instances.qsize()) +
                        '\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    # track this pattern's best score explicitly, instead of
                    # relying on the value left over from the last iteration
                    best_score = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(matutils.unitvec(t.bet_vector),
                                            matutils.unitvec(asarray(p_bet_v)))
                                best_score = max(best_score, score)
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if best_score > max_similarity:
                            max_similarity = best_score
                            pattern_best = p

                # if its above a threshold associated the pattern with it
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # go through all patterns(clusters of tuples) and find the one with
            # the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # Eliminate clusters supported by five or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ",
              len(new_patterns))
        child_conn.send((pid, new_patterns))
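
# A minimal driver for the class above (hypothetical file names; the real
# entry point is not part of this example):
def run_bootstrap_sketch():
    breds = BREDS("parameters.cfg", "seeds_positive.txt", "seeds_negative.txt",
                  similarity=0.6, confidence=0.6, num_cores=0)
    breds.generate_tuples("sentences.txt")
    breds.init_bootstrap(tuples=None)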
Example #12
0
 def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence):
     self.curr_iteration = 0
     self.patterns = list()
     self.processed_tuples = list()
     self.candidate_tuples = defaultdict(list)
     self.config = Config(config_file, seeds_file, negative_seeds, similarity, confidence)
Example #13
0
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds, similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences where named entities are
        already tagged

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):

            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")

        else:

            # load needed stuff, word2vec model and a pos-tagger
            self.config.read_word2vec()
            tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
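            # the tagger resource must be available locally, e.g. fetched once
            # with nltk.download('maxent_treebank_pos_tagger')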

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(),
                                        self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size, tagger,
                                        self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2,
                                  rel.sentence, rel.before, rel.between, rel.after,
                                  self.config)
                        self.processed_tuples.append(t)
                print("\n", len(self.processed_tuples), "tuples generated")

            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector))

        return self.config.alpha*bef + self.config.beta*bet + self.config.gamma*aft

    def similarity_all(self, t, extraction_pattern):

        # calculates the cosine similarity between all patterns part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;

        # returns the max similarity scores

        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seeds tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1+'\t'+t.e2+'\tscore:'+str(t.confidence)+'\n')
            f_output.write("sentence: "+t.sentence+'\n')
            f_output.write("pattern_bef: "+t.bef_words+'\n')
            f_output.write("pattern_bet: "+t.bet_words+'\n')
            f_output.write("pattern_aft: "+t.aft_words+'\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):

        # starts a bootstrap iteration

        if tuples is not None:
            print("\nLoading processed tuples from disk...")
            # pickle files must be opened in binary mode
            with open(tuples, "rb") as f:
                self.processed_tuples = pickle.load(f)
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(
                    list(count_matches.items()),
                    key=operator.itemgetter(1),
                    reverse=True
                )
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])

                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances, to generate
                # patterns/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by 'min_pattern_support'
                # or fewer tuples
                new_patterns = [p for p in self.patterns if len(p.tuples) >
                                self.config.min_pattern_support]
                self.patterns = new_patterns

                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)
                # These were already collected and are stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("Number of tuples to be analyzed:", len(self.processed_tuples))

                print("\nCollecting instances based on extraction patterns")
                count = 0

                for t in self.processed_tuples:

                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()

                    sim_best = 0
                    pattern_best = None
                    for extraction_pattern in self.patterns:
                        accept, score = self.similarity_all(
                            t, extraction_pattern
                        )
                        if accept is True:
                            extraction_pattern.update_selectivity(
                                t, self.config
                            )
                            if score > sim_best:
                                sim_best = score
                                pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this tuple was already extracted, check if this
                        # extraction pattern is already associated with it,
                        # if not, associate this pattern with it and store the
                        # similarity score
                        patterns = self.candidate_tuples[t]
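                        # note: candidate_tuples is a defaultdict(list), so this
                        # lookup never returns None; the else branch below is
                        # kept for parity with non-defaultdict versions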
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best)
                                )

                        # If this tuple was not extracted before
                        # associate this pattern with the instance
                        # and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best)
                            )

                # update all patterns confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples, key=lambda tpl: tpl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                print("Adding tuples to seed with confidence >= {}".format(
                    str(self.config.instance_confidence)))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def cluster_tuples(self, matched_tuples):
        # this is single-pass clustering
        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns(clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster having
            #  this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)