def gather_sizes_with_bootstrapping_patterns(cfg: Box, patterns, all_new_objects) -> DefaultDict[Tuple, list]:
    """Gather text, parse tuples and check if tuples include valid sizes."""
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)
    tuples = generate_tuples(randomString(), config, names=all_new_objects)
    config.visual = cfg.parameters.visual_at_inference
    candidate_tuples = extract_tuples(config, patterns, tuples)
    filtered_tuples = filter_tuples(candidate_tuples, cfg.parameters.dev_threshold)
    for t in candidate_tuples.keys():
        logger.info(t.sentence)
        logger.info(f"{t.e1} {t.e2}")
        logger.info(t.confidence)
        logger.info("\n")
    return filtered_tuples
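# Hedged usage sketch (not from the original source): how the helper above might be driven
# from a config.yml. The pattern pickle name and the object list below are assumptions made
# for illustration only; only gather_sizes_with_bootstrapping_patterns itself is from the code.
if __name__ == "__main__":
    import pickle
    import yaml
    from box import Box

    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
    # hypothetical: previously bootstrapped extraction patterns stored as a pickle
    with open("patterns.pkl", "rb") as f:
        patterns = pickle.load(f)
    new_objects = ["tiger", "canoe", "lighthouse"]  # hypothetical unseen objects
    filtered = gather_sizes_with_bootstrapping_patterns(cfg, patterns, new_objects)
    print(len(filtered), "candidate tuples passed the dev threshold")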
# Standard-library / third-party imports used by this class. Project-local classes
# (Config, Pattern, Sentence, Seed, Tuple) and the PRINT_PATTERNS / PRINT_TUPLES flags
# are assumed to be defined or imported elsewhere in the module.
import codecs
import math
import multiprocessing
import operator
import pickle
import queue
import sys
from collections import defaultdict

from gensim import matutils
from nltk.data import load
from numpy import asarray, dot


class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence, num_cores):
        if num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = num_cores
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.curr_iteration = 0
        self.patterns = list()
        self.patterns_index = dict()
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        # generate tuples instances from a text file with sentences
        # where named entities are already tagged

        # load word2vec model
        self.config.read_word2vec()

        # copy all sentences from input file into a Queue
        # shared by all processes
        manager = multiprocessing.Manager()
        queue = manager.Queue()
        print("\nLoading sentences from file")
        f_sentences = codecs.open(sentences_file, encoding='utf-8')
        count = 0
        for line in f_sentences:
            if line.startswith("#"):
                continue
            count += 1
            if count % 10000 == 0:
                sys.stdout.write(".")
            queue.put(line.strip())
        f_sentences.close()

        pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
        processes = [
            multiprocessing.Process(target=self.generate_instances,
                                    args=(queue, pipes[i][1]))
            for i in range(self.num_cpus)
        ]

        print("\nGenerating relationship instances from sentences")
        print("Running", len(processes), "processes")
        for proc in processes:
            proc.start()

        for i in range(len(pipes)):
            data = pipes[i][0].recv()
            child_instances = data[1]
            for x in child_instances:
                self.processed_tuples.append(x)

        for proc in processes:
            proc.join()

        print("\n", len(self.processed_tuples), "instances generated")
        print("Writing generated tuples to disk")
        f = open("processed_tuples.pkl", "wb")
        pickle.dump(self.processed_tuples, f)
        f.close()

    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(),
                          "Instances to process", sentences.qsize())

                sentence = Sentence(s,
                                    self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size,
                                    tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break

    def similarity_3_contexts(self, t, p):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + \
            self.config.beta * bet + \
            self.config.gamma * aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;
        #
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def cluster_tuples(self, matched_tuples):
        # single-pass clustering

        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            # having this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            # pickled data must be read in binary mode
            f = open(tuples, "rb")
            print("Loading pre-processed sentences", tuples)
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances: generate patterns
                print("\nClustering matched instances to generate patterns")
                if len(self.patterns) == 0:
                    self.cluster_tuples(matched_tuples)

                    # Eliminate patterns supported by less than
                    # 'min_pattern_support' tuples
                    new_patterns = [p for p in self.patterns
                                    if len(p.tuples) > self.config.min_pattern_support]
                    self.patterns = new_patterns

                else:
                    # Parallelize single-pass clustering
                    # Each tuple must be compared with each extraction pattern
                    # Map:
                    # - Divide the tuples into smaller lists,
                    #   according to the number of CPUs
                    # - Pass to each CPU a sub-list of tuples and all the
                    #   patterns; the comparison is done by each CPU
                    # Merge:
                    # - Each CPU sends to the parent process the updated
                    #   patterns and new patterns
                    # - Merge patterns based on a pattern_id
                    # - Cluster newly created patterns with single-pass clustering

                    # make a copy of the extraction patterns to be
                    # passed to each CPU
                    patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                    # distribute tuples over the different CPUs
                    chunks = [list() for _ in range(self.num_cpus)]
                    n_tuples_per_child = int(
                        math.ceil(float(len(matched_tuples)) / self.num_cpus))

                    print("\n#CPUS", self.num_cpus, '\t',
                          "Tuples per CPU", n_tuples_per_child)

                    chunk_n = 0
                    chunk_begin = 0
                    chunk_end = n_tuples_per_child

                    while chunk_n < self.num_cpus:
                        chunks[chunk_n] = matched_tuples[chunk_begin:chunk_end]
                        chunk_begin = chunk_end
                        chunk_end += n_tuples_per_child
                        chunk_n += 1

                    count = 0
                    for c in chunks:
                        print("CPU_" + str(count), " ", len(c), "tuples")
                        count += 1

                    pipes = [multiprocessing.Pipe(False)
                             for _ in range(self.num_cpus)]
                    processes = [
                        multiprocessing.Process(
                            target=self.cluster_tuples_parallel,
                            args=(patterns[i], chunks[i], pipes[i][1]))
                        for i in range(self.num_cpus)
                    ]

                    print("\nRunning", len(processes), "processes")
                    for proc in processes:
                        proc.start()

                    # Receive and merge all patterns by 'pattern_id':
                    # newly created patterns (new pattern_id) go into
                    # 'child_patterns' and are then merged
                    # by single-pass clustering between patterns
                    child_patterns = list()

                    for i in range(len(pipes)):
                        data = pipes[i][0].recv()
                        patterns = data[1]
                        for p_updated in patterns:
                            pattern_exists = False
                            for p_original in self.patterns:
                                if p_original.id == p_updated.id:
                                    p_original.tuples.update(p_updated.tuples)
                                    pattern_exists = True
                                    break
                            if pattern_exists is False:
                                child_patterns.append(p_updated)

                    for proc in processes:
                        proc.join()

                    print("\nSELF Patterns:")
                    for p in self.patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print("\nChild Patterns:")
                    for p in child_patterns:
                        p.merge_all_tuples_bet()
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words.encode("utf8"))

                    print(len(child_patterns), "newly created patterns")

                    # merge/aggregate similar patterns generated by
                    # the child processes

                    # start comparing smaller ones with greater ones
                    child_patterns.sort(key=lambda y: len(y.tuples), reverse=False)
                    count = 0
                    new_list = list(self.patterns)
                    for p1 in child_patterns:
                        print("\nNew Patterns", len(child_patterns),
                              "Processed", count)
                        print("New List", len(new_list))
                        print("Pattern:", p1.id, "Tuples:", len(p1.tuples))
                        max_similarity = 0
                        max_similarity_cluster = None
                        for p2 in new_list:
                            if p1 == p2:
                                continue
                            score = self.similarity_cluster(p1, p2)
                            if score > max_similarity:
                                max_similarity = score
                                max_similarity_cluster = p2
                        if max_similarity >= self.config.threshold_similarity:
                            for t in p1.tuples:
                                max_similarity_cluster.tuples.add(t)
                        else:
                            new_list.append(p1)
                        count += 1

                    # add merged patterns to main patterns structure
                    for p in new_list:
                        if p not in self.patterns:
                            self.patterns.append(p)

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                print("\n", len(self.patterns), "patterns generated")

                # merge equal tuples inside patterns to make
                # fewer comparisons when collecting instances
                for p in self.patterns:
                    # if only the BET context is being used,
                    # merge only based on BET contexts
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        p.merge_all_tuples_bet()

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print('\n' + str(p.id))
                        if self.config.alpha == 0 and self.config.gamma == 0:
                            for bet_words in p.bet_uniques_words:
                                print("BET", bet_words)
                        else:
                            for t in p.tuples:
                                print("BEF", t.bef_words)
                                print("BET", t.bet_words)
                                print("AFT", t.aft_words)
                                print("========")

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)
                # This was already collected and is stored in
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with
                # each extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("\nNumber of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on",
                      len(self.patterns), "extraction patterns")

                # create copies of the generated extraction patterns
                # to be passed to each process
                patterns = [list(self.patterns) for _ in range(self.num_cpus)]

                # copy all tuples into a Queue shared by all processes
                manager = multiprocessing.Manager()
                queue = manager.Queue()
                for t in self.processed_tuples:
                    queue.put(t)

                # each distinct process receives as arguments:
                #   - a list, copy of all the original extraction patterns
                #   - a Queue of the tuples
                #   - a pipe to return the collected tuples and updated
                #     patterns to the parent process
                pipes = [multiprocessing.Pipe(False)
                         for _ in range(self.num_cpus)]
                processes = [
                    multiprocessing.Process(
                        target=self.find_instances,
                        args=(patterns[i], queue, pipes[i][1]))
                    for i in range(self.num_cpus)
                ]

                print("Running", len(processes), "processes")
                for proc in processes:
                    proc.start()

                # structures to store each process's altered patterns
                # and collected tuples
                patterns_updated = list()
                collected_tuples = list()

                for i in range(len(pipes)):
                    data = pipes[i][0].recv()
                    child_pid = data[0]
                    patterns = data[1]
                    tuples = data[2]
                    print(child_pid, "patterns", len(patterns),
                          "tuples", len(tuples))
                    patterns_updated.extend(patterns)
                    collected_tuples.extend(tuples)

                for proc in processes:
                    proc.join()

                # Extraction patterns aggregation happens here:
                for p_updated in patterns_updated:
                    for p_original in self.patterns:
                        if p_original.id == p_updated.id:
                            p_original.positive += p_updated.positive
                            p_original.negative += p_updated.negative
                            p_original.unknown += p_updated.unknown

                # Index the patterns in a hashtable for later use
                for p in self.patterns:
                    self.patterns_index[p.id] = p

                # update all patterns' confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(p.id)
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # Candidate tuples aggregation happens here:
                print("Collecting generated candidate tuples")
                for e in collected_tuples:
                    t = e[0]
                    pattern_best = e[1]
                    sim_best = e[2]

                    # if this tuple was already extracted, check if this
                    # extraction pattern is already associated with it; if not,
                    # associate this pattern with it and the similarity score
                    if t in self.candidate_tuples:
                        t_patterns = self.candidate_tuples[t]
                        if t_patterns is not None:
                            if pattern_best not in [x[0] for x in t_patterns]:
                                self.candidate_tuples[t].append(
                                    (self.patterns_index[pattern_best.id], sim_best))

                    # if this tuple was not extracted before, associate this
                    # pattern with the instance and the similarity score
                    else:
                        self.candidate_tuples[t].append(
                            (self.patterns_index[pattern_best.id], sim_best))

                # update tuple confidence based on patterns' confidence
                print("\n\nCalculating tuples confidence")
                for t in list(self.candidate_tuples.keys()):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence

                    if self.curr_iteration > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tl: tl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                # update seed set of tuples to use in next iteration
                # seeds = { T | conf(T) > instance_confidence }
                print("Adding tuples to seed with confidence >=" +
                      str(self.config.instance_confidence))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(matutils.unitvec(asarray(v_bet1)),
                                     matutils.unitvec(asarray(v_bet2)))
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1

        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize()) + '\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    # guard: 'score' is only assigned inside the loop below
                    score = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(matutils.unitvec(t.bet_vector),
                                            matutils.unitvec(asarray(p_bet_v)))
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if it's above the threshold, associate the pattern with it
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # Eliminate clusters supported by 5 or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
        child_conn.send((pid, new_patterns))
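# Hedged usage sketch (not part of the original class): a typical driver for the
# multiprocessing BREDS variant above. The input file names and threshold values are
# placeholders chosen for illustration; only "processed_tuples.pkl" and
# "relationships.txt" are names the class itself reads/writes.
if __name__ == "__main__":
    breds = BREDS("parameters.cfg",         # hypothetical config file
                  "seeds_positive.txt",     # hypothetical positive seed pairs
                  "seeds_negative.txt",     # hypothetical negative seed pairs
                  similarity=0.6,           # illustrative similarity threshold
                  confidence=0.7,           # illustrative instance confidence
                  num_cores=0)              # 0 -> use all available CPUs
    breds.generate_tuples("sentences.txt")  # NE-tagged sentences, one per line
    breds.init_bootstrap(tuples=None)       # run the bootstrapping iterations
    # extracted relationships end up in relationships.txt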
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences
        where named entities are already tagged

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):
            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")
            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                temp_file.write(i.e1 + '\t' + i.e2 + '\n')
            temp_file.close()

        else:
            # load needed resources: word2vec model and a PoS-tagger
            self.config.read_word2vec()
            tagger = None

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(),
                                        self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger,
                                        self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)

            print("\n", len(self.processed_tuples), "tuples generated")
            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha * bef + self.config.beta * bet + self.config.gamma * aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples part of a
        # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern
        # extracted from a sentence;
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0

        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1

        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w", encoding='utf-8')
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            # if t.passive_voice is False:
            #     f_output.write("passive voice: False\n")
            # elif t.passive_voice is True:
            #     f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            # pickled data must be read in binary mode
            f = open(tuples, "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            temp_file = open("temp.txt", "w", encoding='utf-8')
            for i in self.processed_tuples:
                print(i)
                temp_file.write(str(i) + '\n')
            temp_file.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # Looks for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found")
                sorted_counts = sorted(list(count_matches.items()),
                                       key=operator.itemgetter(1),
                                       reverse=True)
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances to generate/update patterns
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by less than
                # 'min_pattern_support' tuples
                new_patterns = [
                    p for p in self.patterns
                    if len(p.tuples) > self.config.min_pattern_support
                ]
                self.patterns = new_patterns

                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC)
                # This was already collected and is stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
                print("Number of tuples to be analyzed:",
                      len(self.processed_tuples))

                print("\nCollecting instances based on extraction patterns")
                count = 0

                for t in self.processed_tuples:
                    count += 1
                    if count % 1000 == 0:
                        sys.stdout.write(".")
                        sys.stdout.flush()
                    sim_best = 0
                    for extraction_pattern in self.patterns:
                        accept, score = self.similarity_all(t, extraction_pattern)
                        if accept is True:
                            extraction_pattern.update_selectivity(t, self.config)
                            if score > sim_best:
                                sim_best = score
                                pattern_best = extraction_pattern

                    if sim_best >= self.config.threshold_similarity:
                        # if this tuple was already extracted, check if this
                        # extraction pattern is already associated with it;
                        # if not, associate this pattern with it and store the
                        # similarity score
                        patterns = self.candidate_tuples[t]
                        if patterns is not None:
                            if pattern_best not in [x[0] for x in patterns]:
                                self.candidate_tuples[t].append(
                                    (pattern_best, sim_best))

                        # if this tuple was not extracted before,
                        # associate this pattern with the instance
                        # and the similarity score
                        else:
                            self.candidate_tuples[t].append(
                                (pattern_best, sim_best))

                # update all patterns' confidence
                for p in self.patterns:
                    p.update_confidence(self.config)

                if PRINT_PATTERNS is True:
                    print("\nPatterns:")
                    for p in self.patterns:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                        print("========")
                        print("Positive", p.positive)
                        print("Negative", p.negative)
                        print("Unknown", p.unknown)
                        print("Tuples", len(p.tuples))
                        print("Pattern Confidence", p.confidence)
                        print("\n")

                # update tuple confidence based on patterns' confidence
                print("\n\nCalculating tuples confidence")
                for idx, t in enumerate(list(self.candidate_tuples.keys())):
                    confidence = 1
                    t.confidence_old = t.confidence
                    for p in self.candidate_tuples.get(t):
                        confidence *= 1 - (p[0].confidence * p[1])
                    t.confidence = 1 - confidence
                    if idx > 0:
                        t.confidence = \
                            t.confidence * self.config.wUpdt + \
                            t.confidence_old * (1 - self.config.wUpdt)

                # sort tuples by confidence and print
                if PRINT_TUPLES is True:
                    extracted_tuples = list(self.candidate_tuples.keys())
                    tuples_sorted = sorted(extracted_tuples,
                                           key=lambda tpl: tpl.confidence,
                                           reverse=True)
                    for t in tuples_sorted:
                        print(t.sentence)
                        print(t.e1, t.e2)
                        print(t.confidence)
                        print("\n")

                print("Adding tuples to seed with confidence >= {}".format(
                    str(self.config.instance_confidence)))
                for t in list(self.candidate_tuples.keys()):
                    if t.confidence >= self.config.instance_confidence:
                        seed = Seed(t.e1, t.e2)
                        self.config.positive_seed_tuples.add(seed)

                # increment the number of iterations
                self.curr_iteration += 1

        self.write_relationships_to_disk()

    def cluster_tuples(self, matched_tuples):
        # this is a single-pass clustering
        # Initialize: if no patterns exist, first tuple goes to first cluster
        if len(self.patterns) == 0:
            c1 = Pattern(matched_tuples[0])
            self.patterns.append(c1)

        count = 0
        for t in matched_tuples:
            count += 1
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            max_similarity = 0
            max_similarity_cluster_index = 0

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            for i in range(0, len(self.patterns), 1):
                extraction_pattern = self.patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match create a new cluster having
            # this tuple as the centroid
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                self.patterns.append(c)

            # if max_similarity >= min_degree_match add to the cluster with
            # the highest similarity
            else:
                self.patterns[max_similarity_cluster_index].add_tuple(t)
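# Hedged standalone sketch (illustrative, not part of the class above): the tuple-confidence
# update used in init_bootstrap is a noisy-or over the patterns that extracted the tuple,
#   conf(t) = 1 - prod_i (1 - conf(p_i) * sim_i),
# optionally smoothed with the previous iteration's confidence via the wUpdt weight.
from typing import List, Tuple as TupleT

def noisy_or_confidence(pattern_matches: List[TupleT[float, float]],
                        old_confidence: float = 0.0,
                        w_updt: float = 0.5,
                        first_iteration: bool = True) -> float:
    """pattern_matches: (pattern_confidence, similarity) pairs for one candidate tuple."""
    remaining_doubt = 1.0
    for pattern_conf, sim in pattern_matches:
        remaining_doubt *= 1.0 - pattern_conf * sim
    confidence = 1.0 - remaining_doubt
    if not first_iteration:
        confidence = confidence * w_updt + old_confidence * (1.0 - w_updt)
    return confidence

# e.g. a tuple extracted by two patterns:
# noisy_or_confidence([(0.8, 0.9), (0.5, 0.6)]) == 1 - (1 - 0.72) * (1 - 0.30) ~= 0.804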
class BREDS:

    def __init__(self, args):
        if args.num_cores == 0:
            self.num_cpus = multiprocessing.cpu_count()
        else:
            self.num_cpus = args.num_cores
        self.processed_tuples = list()
        # when a missing key is looked up, return an empty list instead of raising KeyError
        self.candidate_tuples = defaultdict(list)
        self.config = Config(args.config_file, args.positive_seeds_file,
                             args.negative_seeds_file, args.similarity,
                             args.confidence)

    def generate_tuples(self, data_dir: str):
        """
        Generate candidate tuples from the source data, using multiple workers.

        Args:
            data_dir: directory where the data is stored, which includes, e.g.:
                source article          __ data/round2/0.txt
                NER results             __ data/round2/0_ner.pkl
                sentence-split results  __ data/round2/0_sentence_split.pkl
        """
        # Step 1: load word2idx and emb_matrix
        self.config.load_word2idx_embmatrix()

        # Step 2: generate candidate relation pairs
        instances = list()
        file_names = scan_files(data_dir)
        for file in file_names:
            passage = load_file(data_dir, file, "txt")  # type: str
            sent_split = pickle.load(
                open(data_dir + file + "_sentence_split.pkl", "rb"))  # type: List[tuple]
            ner_result = pickle.load(
                open(data_dir + file + "_ner.pkl", "rb"))  # type: List[tuple]
            sent_split.sort(key=lambda x: x[0])

            # Step 2.1: collect the entities belonging to e1 and e2
            e1_entities, e2_entities = list(), list()
            for e in ner_result:
                # e is a 4-tuple, e.g. ('Disease', 1, 10, '糖尿病下肢动脉病变')
                if e[0] == self.config.e1_type:
                    e1_entities.append(e)
                elif e[0] == self.config.e2_type:
                    e2_entities.append(e)
            e1_entities.sort(key=lambda x: x[1])
            e2_entities.sort(key=lambda x: x[1])

            # Step 2.2: for each e1, find candidate e2 entities and build the
            # <BEF, BET, AFT, sequence_tag> context tuple
            for e1 in e1_entities:
                e1_start, e1_end = e1[1], e1[2]
                cur_sentence_idx = -1
                for idx, s in enumerate(sent_split):
                    if s[0] <= e1_start and s[1] >= e1_end:
                        cur_sentence_idx = idx
                        break
                # the search window for e2 is determined by the position of the
                # current entity: previous sentence + current sentence + next sentence
                search_e2_start = sent_split[
                    cur_sentence_idx - 1 if cur_sentence_idx > 1 else 0][0]
                search_e2_end = sent_split[
                    cur_sentence_idx + 1 if cur_sentence_idx < len(sent_split) - 1
                    else len(sent_split) - 1][1]

                for i in range(len(e2_entities)):
                    e2 = e2_entities[i]
                    e2_start = e2[1]
                    e2_end = e2[2]
                    if e2_end < search_e2_start:
                        continue
                    elif e2_start > search_e2_end:
                        break
                    elif e2_start >= search_e2_start and e2_end <= search_e2_end:
                        if e1_end == e2_start:
                            # case (1): e1 comes before e2 and is adjacent to it
                            before = passage[search_e2_start:e1_start]
                            between = ""
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=True,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e2_end == e1_start:
                            # case (2): e1 comes after e2 and is adjacent to it
                            before = passage[search_e2_start:e2_start]
                            between = ""
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=False,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e1_end < e2_start:
                            # case (3): e1 comes before e2, with text in between
                            before = passage[search_e2_start:e1_start]
                            between = passage[e1_end:e2_start]
                            after = passage[e2_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=True,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)
                        elif e2_end < e1_start:
                            # case (4): e1 comes after e2, with text in between
                            before = passage[search_e2_start:e2_start]
                            between = passage[e2_end:e1_start]
                            after = passage[e1_end:search_e2_end]
                            t = Tuple(e1[3], e2[3], sequence_tag=False,
                                      before=before, between=between,
                                      after=after, config=self.config)
                            instances.append(t)

        # Step 3: persist the candidate instances
        pickle.dump(
            instances,
            open("./saved_model_files/RE_candidate_instances.pkl", "wb"))

    def similarity_3_contexts(self, t: Tuple, p: Tuple) -> float:
        bef, bet, aft = 0, 0, 0
        # TODO: it seems the weighting parameters should be added here; not implemented yet
        return 0
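# The TODO above leaves similarity_3_contexts unimplemented. Below is a minimal sketch of what
# it could look like, assuming the tuples expose bef_vector/bet_vector/aft_vector attributes and
# the weights alpha/beta/gamma, mirroring the other BREDS variants in this file. This is an
# illustrative assumption, not the project's final implementation.
import numpy as np

def weighted_context_similarity(t, p, alpha=0.0, beta=1.0, gamma=0.0) -> float:
    """Cosine similarity of the three contexts, combined with weights alpha/beta/gamma."""
    def cos(v1, v2):
        if v1 is None or v2 is None:
            return 0.0
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        return float(np.dot(v1, v2) / denom) if denom else 0.0

    return (alpha * cos(t.bef_vector, p.bef_vector)
            + beta * cos(t.bet_vector, p.bet_vector)
            + gamma * cos(t.aft_vector, p.aft_vector))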
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    test_pairs, unseen_objects = comparison_dev_set(cfg)
    unseen_objects = [o.replace('_', " ") for o in unseen_objects]

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    word2vec_model = load_word2vec(cfg.parameters.word2vec_path)
    similar_words = find_similar_words(word2vec_model, unseen_objects, n_word2vec=200)

    # calc coverage and precision
    results = list()
    settings: List[BackoffSettings] = [
        # BackoffSettings(use_direct=True),
        # BackoffSettings(use_word2vec=True),
        # BackoffSettings(use_hypernyms=True),
        # BackoffSettings(use_hyponyms=True),
        # BackoffSettings(use_head_noun=True),
        # BackoffSettings(use_direct=True, use_word2vec=True),
        BackoffSettings(use_direct=True, use_word2vec=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hypernyms=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True),
        # BackoffSettings(use_direct=True, use_head_noun=True),
        # BackoffSettings(use_direct=True, use_hyponyms=True)
    ]
    golds = [p.larger for p in test_pairs]

    for setting in settings:
        preds = list()
        fractions_larger = list()
        notes = list()
        prop = VisualPropagation(G, config.visual_config)
        logger.info(f'\nRunning for setting {setting.print()}')
        comparer = Comparer(prop, setting, similar_words, objects)
        for test_pair in tqdm.tqdm(test_pairs):
            # TODO return confidence; use the higher one
            res_visual, fraction_larger, note = comparer.compare_visual_with_backoff(test_pair)
            fractions_larger.append(fraction_larger)
            preds.append(res_visual)
            notes.append(note)

        with open(f'visual_comparison_predictions_{setting.print()}.pkl', 'wb') as f:
            pickle.dump(list(zip(preds, fractions_larger, notes)), f)

        useful_counts = comparer.useful_paths_count
        tr = SymmetricalLogTransform(base=10, linthresh=1, linscale=1)
        ss = tr.transform([0., max(useful_counts) + 1])
        bins = tr.inverted().transform(np.linspace(*ss, num=100))
        fig, ax = plt.subplots()
        plt.hist(useful_counts, bins=bins)
        plt.xlabel('Number of useful paths')
        ax.set_xscale('symlog')
        plt.savefig(f'useful_paths{setting.print()}.png')

        useful_counts = np.array(useful_counts)
        logger.info(f'Number of objects with no useful path: '
                    f'{len(np.extract(useful_counts == 0, useful_counts))}')
        logger.info(f'Not recog count: {comparer.not_recognized_count}')
        logger.info(f'Total number of test cases: {len(golds)}')

        coverage, selectivity = coverage_accuracy_relational(golds, preds)
        logger.info(f'Coverage: {coverage}')
        logger.info(f'selectivity: {selectivity}')
        results.append(RelationalResult(setting.print(), selectivity, coverage))

        assert len(fractions_larger) == len(preds)
        corrects_not_none = list()
        diffs_not_none = list()
        for i, fraction_larger in enumerate(fractions_larger):
            gold = golds[i]
            res = preds[i]
            if fraction_larger is not None and fraction_larger != 0.5:
                fraction_larger_centered = fraction_larger - .5
                corrects_not_none.append(gold == res)
                diffs_not_none.append(abs(fraction_larger_centered))
                # TODO do something special for when fraction_larger_centered == 0

        regr_linear = Ridge(alpha=1.0)
        regr_linear.fit(np.reshape(diffs_not_none, (-1, 1)), corrects_not_none)
        with open('visual_confidence_model.pkl', 'wb') as f:
            pickle.dump(regr_linear, f)

        fig, ax = plt.subplots()
        bin_means, bin_edges, binnumber = stats.binned_statistic(
            diffs_not_none, corrects_not_none, 'mean', bins=20)
        bin_counts, _, _ = stats.binned_statistic(
            diffs_not_none, corrects_not_none, 'count', bins=20)
        x = np.linspace(min(diffs_not_none), max(diffs_not_none), 500)
        X = np.reshape(x, (-1, 1))
        plt.plot(x, regr_linear.predict(X), '-', label='linear ridge regression')

        minc = min(bin_counts)
        maxc = max(bin_counts)
        norm = colors.SymLogNorm(vmin=minc, vmax=maxc, linthresh=1)
        bin_counts_normalized = [norm(c) for c in bin_counts]
        logger.info(f'counts, norm: {list(zip(bin_counts, bin_counts_normalized))}')
        viridis = cm.get_cmap('viridis', 20)
        mins = bin_edges[:-1]
        maxs = bin_edges[1:]
        mask = ~np.isnan(bin_means)
        plt.hlines(np.extract(mask, bin_means),
                   np.extract(mask, mins), np.extract(mask, maxs),
                   colors=viridis(np.extract(mask, bin_counts_normalized)),
                   lw=5, label='binned statistic of data')
        sm = plt.cm.ScalarMappable(cmap=viridis, norm=norm)
        ticks = [10**1.5, 10**1.75, 10**2, 10**2.5]
        colorbar = plt.colorbar(sm, ticks=ticks)
        colorbar.ax.set_yticklabels(['10^1.5', '10^1.75', '10^2', '10^2.5'])
        colorbar.set_label('bin count')
        plt.ylim(-0.05, 1.05)
        plt.legend()
        plt.xlabel('Absolute fraction_larger')
        plt.ylabel('Selectivity')
        ax.set_xscale('linear')
        plt.savefig('fraction_larger_selectivity_linear.png')
        plt.show()

        correlation, _ = pearsonr(diffs_not_none, corrects_not_none)
        logger.info(f'Pearsons correlation: {correlation}')
        correlation_spearman, _ = spearmanr(np.array(diffs_not_none),
                                            b=np.array(corrects_not_none))
        logger.info(f'Spearman correlation: {correlation_spearman}')

    results_df = pd.DataFrame(results)
    results_df.to_csv('results_visual_backoff.csv')
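# Hedged follow-up sketch (not from the original source): the Ridge model pickled above maps
# |fraction_larger - 0.5| to an estimated probability that a visual comparison is correct.
# The helper below shows how such a model might be loaded and queried later; the file name
# matches the one written above, everything else is illustrative.
import pickle
import numpy as np

def visual_confidence(fraction_larger: float,
                      model_path: str = 'visual_confidence_model.pkl') -> float:
    """Return an estimated confidence for a single visual comparison."""
    with open(model_path, 'rb') as f:
        regr = pickle.load(f)
    margin = abs(fraction_larger - 0.5)
    pred = regr.predict(np.array([[margin]]))[0]
    return float(min(max(pred, 0.0), 1.0))  # clamp the regression output to [0, 1]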
def main():
    with open("config.yml", "r") as ymlfile:
        cfg = Box(yaml.safe_load(ymlfile))
        # cfg = Box(yaml.safe_load(ymlfile), default_box=True, default_box_attr=None)

    # TODO check whether the objects aren't in the bootstrapped objects
    visual_config = VisualConfig(cfg.path.vg_objects, cfg.path.vg_objects_anchors)
    config = Config(cfg, visual_config)

    input: DataFrame = pd.read_csv(cfg.path.dev)
    input = input.astype({'object': str})
    unseen_objects = list(input['object'])
    logger.info(f'Unseen objects: {unseen_objects}')

    visual_config = config.visual_config
    objects = list(visual_config.entity_to_synsets.keys())
    logger.info(f'Objects: {objects}')
    G = build_cooccurrence_graph(objects, visual_config)

    with open(cfg.path.final_seeds_cache) as f:
        numeric_seeds = json.load(f)
    numeric_seeds = dict((key.strip().replace(' ', '_'), value)
                         for (key, value) in numeric_seeds.items())
    # There is a 'rhine' in VG, which was included in VG as the river.
    # Fixing this manually, since it's in a lot of results.
    del numeric_seeds['rhine']

    point_predictions = dict()
    point_predictions_evenly = dict()
    point_predictions_svm = dict()
    prop = VisualPropagation(G, config.visual_config)
    for unseen_object in unseen_objects:
        unseen_object = unseen_object.replace(' ', '_')
        logger.info(f'Processing {unseen_object}')
        if unseen_object not in objects:
            logger.info(f'{unseen_object} not in visuals')
            point_predictions[unseen_object.replace('_', ' ')] = None
            point_predictions_evenly[unseen_object.replace('_', ' ')] = None
            point_predictions_svm[unseen_object.replace('_', ' ')] = None
            continue

        none_count = 0
        lower_bounds = set()
        upper_bounds = set()
        for numeric_seed in tqdm.tqdm(numeric_seeds.keys()):
            pair = Pair(unseen_object, numeric_seed)
            if pair.both_in_list(objects):
                fraction_larger, _ = prop.compare_pair(pair)
                if fraction_larger is None:
                    none_count += 1
                    continue
                if fraction_larger < .5:
                    upper_bounds.add(numeric_seed)
                if fraction_larger > .5:
                    lower_bounds.add(numeric_seed)
                logger.debug(f'{pair.e1} {pair.e2} fraction larger: {fraction_larger}')
            else:
                logger.debug(f'{pair.e1} or {pair.e2} not in VG. Objects: {objects}')

        lower_bounds_sizes = fill_sizes_list(lower_bounds, numeric_seeds)
        upper_bounds_sizes = fill_sizes_list(upper_bounds, numeric_seeds)
        # size = predict_size_with_bounds(lower_bounds_sizes, upper_bounds_sizes)
        size = iterativily_find_size(lower_bounds_sizes, upper_bounds_sizes)
        size_evenly = iterativily_find_size_evenly(lower_bounds_sizes, upper_bounds_sizes)
        size_svm = predict_size_with_bounds(lower_bounds_sizes, upper_bounds_sizes)
        point_predictions[unseen_object.replace('_', ' ')] = size
        point_predictions_evenly[unseen_object.replace('_', ' ')] = size_evenly
        point_predictions_svm[unseen_object.replace('_', ' ')] = size_svm

        logger.info(f'\nObject: {unseen_object}')
        logger.info(f'Size: {size}')
        logger.info(f'Size evenly: {size_evenly}')
        logger.info(f'Size svm: {size_svm}')
        logger.info(f"None count: {none_count} out of {len(numeric_seeds.keys())}")
        logger.info(
            f"Lower bounds (n={len(lower_bounds)}): mean: {np.mean(lower_bounds_sizes)} "
            f"median: {np.median(lower_bounds_sizes)}\n\t{lower_bounds}\n\t{lower_bounds_sizes}")
        logger.info(
            f"Upper bounds (n={len(upper_bounds)}): mean: {np.mean(upper_bounds_sizes)} "
            f"median: {np.median(upper_bounds_sizes)}\n\t{upper_bounds}\n\t{upper_bounds_sizes}")

    with open('point_predictions_visual_ranges.pkl', 'wb') as f:
        pickle.dump(point_predictions, f)
    with open('point_predictions_visual_ranges_evenly.pkl', 'wb') as f:
        pickle.dump(point_predictions_evenly, f)
    with open('point_predictions_visual_ranges_svm.pkl', 'wb') as f:
        pickle.dump(point_predictions_svm, f)
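# Hedged illustration (an assumption, not the repo's iterativily_find_size / SVM variants):
# one simple baseline for turning the collected bounds into a point estimate is the midpoint
# between the largest "smaller-than" seed size and the smallest "larger-than" seed size.
from typing import List, Optional

def midpoint_size_estimate(lower_bounds_sizes: List[float],
                           upper_bounds_sizes: List[float]) -> Optional[float]:
    """Return a point size estimate from seed sizes bounding the unseen object, or None."""
    if lower_bounds_sizes and upper_bounds_sizes:
        lo = max(lower_bounds_sizes)   # largest object judged smaller than the unseen object
        hi = min(upper_bounds_sizes)   # smallest object judged larger than the unseen object
        if lo <= hi:
            return (lo + hi) / 2.0
        # noisy comparisons can make the bounds overlap; fall back to the overall median
        all_sizes = sorted(lower_bounds_sizes + upper_bounds_sizes)
        return all_sizes[len(all_sizes) // 2]
    if lower_bounds_sizes:
        return max(lower_bounds_sizes)
    if upper_bounds_sizes:
        return min(upper_bounds_sizes)
    return None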
class BREDS(object): def __init__(self, config_file, seeds_file, negative_seeds, similarity, confidence, num_cores): if num_cores == 0: self.num_cpus = multiprocessing.cpu_count() else: self.num_cpus = num_cores self.processed_tuples = list() self.candidate_tuples = defaultdict(list) self.curr_iteration = 0 self.patterns = list() self.patterns_index = dict() self.config = Config(config_file, seeds_file, negative_seeds, similarity, confidence) def generate_tuples(self, sentences_file): # generate tuples instances from a text file with sentences # where named entities are already tagged # load word2vec model self.config.read_word2vec() # copy all sentences from input file into a Queue # shared by all processes manager = multiprocessing.Manager() queue = manager.Queue() print("\nLoading sentences from file") f_sentences = codecs.open(sentences_file, encoding='utf-8') count = 0 for line in f_sentences: if line.startswith("#"): continue count += 1 if count % 10000 == 0: sys.stdout.write(".") queue.put(line.strip()) f_sentences.close() pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)] processes = [ multiprocessing.Process(target=self.generate_instances, args=(queue, pipes[i][1])) for i in range(self.num_cpus) ] print("\nGenerating relationship instances from sentences") print("Running", len(processes), " processes") for proc in processes: proc.start() for i in range(len(pipes)): data = pipes[i][0].recv() child_instances = data[1] for x in child_instances: self.processed_tuples.append(x) for proc in processes: proc.join() print("\n", len(self.processed_tuples), "instances generated") print("Writing generated tuples to disk") f = open("processed_tuples.pkl", "wb") pickle.dump(self.processed_tuples, f) f.close() def generate_instances(self, sentences, child_conn): # Each process has its own NLTK PoS-tagger tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle') instances = list() while True: try: s = sentences.get_nowait() if sentences.qsize() % 500 == 0: print(multiprocessing.current_process(), \ "Instances to process", sentences.qsize()) sentence = Sentence(s, self.config.e1_type, self.config.e2_type, self.config.max_tokens_away, self.config.min_tokens_away, self.config.context_window_size, tagger, self.config) for rel in sentence.relationships: t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before, rel.between, rel.after, self.config) instances.append(t) except queue.Empty: print(multiprocessing.current_process(), "Queue is Empty") pid = multiprocessing.current_process().pid child_conn.send((pid, instances)) break def similarity_3_contexts(self, t, p): (bef, bet, aft) = (0, 0, 0) if t.bef_vector is not None and p.bef_vector is not None: bef = dot(matutils.unitvec(t.bef_vector), matutils.unitvec(p.bef_vector)) if t.bet_vector is not None and p.bet_vector is not None: bet = dot(matutils.unitvec(t.bet_vector), matutils.unitvec(p.bet_vector)) if t.aft_vector is not None and p.aft_vector is not None: aft = dot(matutils.unitvec(t.aft_vector), matutils.unitvec(p.aft_vector)) return self.config.alpha*bef + \ self.config.beta*bet + \ self.config.gamma*aft def similarity_all(self, t, extraction_pattern): # calculates the cosine similarity between all patterns part of a # cluster (i.e., extraction pattern) and the vector of a ReVerb pattern # extracted from a sentence; # # returns the max similarity scores good = 0 bad = 0 max_similarity = 0 for p in list(extraction_pattern.tuples): score = self.similarity_3_contexts(t, p) if score > max_similarity: max_similarity = score if score 
>= self.config.threshold_similarity: good += 1 else: bad += 1 if good >= bad: return True, max_similarity else: return False, 0.0 def match_seeds_tuples(self): # checks if an extracted tuple matches seeds tuples matched_tuples = list() count_matches = dict() for t in self.processed_tuples: for s in self.config.positive_seed_tuples: if t.e1 == s.e1 and t.e2 == s.e2: matched_tuples.append(t) try: count_matches[(t.e1, t.e2)] += 1 except KeyError: count_matches[(t.e1, t.e2)] = 1 return count_matches, matched_tuples def cluster_tuples(self, matched_tuples): # single-Pass clustering # Initialize: if no patterns exist, first tuple goes to first cluster if len(self.patterns) == 0: c1 = Pattern(matched_tuples[0]) self.patterns.append(c1) count = 0 for t in matched_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() max_similarity = 0 max_similarity_cluster_index = 0 # go through all patterns(clusters of tuples) and find the one # with the highest similarity score for i in range(0, len(self.patterns), 1): extraction_pattern = self.patterns[i] accept, score = self.similarity_all(t, extraction_pattern) if accept is True and score > max_similarity: max_similarity = score max_similarity_cluster_index = i # if max_similarity < min_degree_match create a new cluster # having this tuple as the centroid if max_similarity < self.config.threshold_similarity: c = Pattern(t) self.patterns.append(c) # if max_similarity >= min_degree_match add to the cluster with # the highest similarity else: self.patterns[max_similarity_cluster_index].add_tuple(t) def write_relationships_to_disk(self): print("\nWriting extracted relationships to disk") f_output = open("relationships.txt", "w") tmp = sorted(list(self.candidate_tuples.keys()), reverse=True) for t in tmp: f_output.write("instance: " + t.e1 + '\t' + t.e2 + '\tscore:' + str(t.confidence) + '\n') f_output.write("sentence: " + t.sentence + '\n') f_output.write("pattern_bef: " + t.bef_words + '\n') f_output.write("pattern_bet: " + t.bet_words + '\n') f_output.write("pattern_aft: " + t.aft_words + '\n') if t.passive_voice is False: f_output.write("passive voice: False\n") elif t.passive_voice is True: f_output.write("passive voice: True\n") f_output.write("\n") f_output.close() def init_bootstrap(self, tuples): # starts a bootstrap iteration if tuples is not None: f = open(tuples, "r") print("Loading pre-processed sentences", tuples) self.processed_tuples = pickle.load(f) f.close() print(len(self.processed_tuples), "tuples loaded") self.curr_iteration = 0 while self.curr_iteration <= self.config.number_iterations: print("==========================================") print("\nStarting iteration", self.curr_iteration) print("\nLooking for seed matches of:") for s in self.config.positive_seed_tuples: print(s.e1, '\t', s.e2) # Looks for sentences matching the seed instances count_matches, matched_tuples = self.match_seeds_tuples() if len(matched_tuples) == 0: print("\nNo seed matches found") sys.exit(0) else: print("\nNumber of seed matches found") sorted_counts = sorted(list(count_matches.items()), key=operator.itemgetter(1), reverse=True) for t in sorted_counts: print(t[0][0], '\t', t[0][1], t[1]) print("\n", len(matched_tuples), "tuples matched") # Cluster the matched instances: generate patterns print("\nClustering matched instances to generate patterns") if len(self.patterns) == 0: self.cluster_tuples(matched_tuples) # Eliminate patterns supported by less than # 'min_pattern_support' tuples new_patterns = [ p for p in self.patterns if 
len(p.tuples) > self.config.min_pattern_support ] self.patterns = new_patterns else: # Parallelize single-pass clustering # Each tuple must be compared with each extraction pattern # Map: # - Divide the tuples into smaller lists, # accordingly to the number of CPUs # - Pass to each CPU a sub-list of tuples and all the # patterns, comparison is done by each CPU # Merge: # - Each CPU sends to the father process the updated # patterns and new patterns # - Merge patterns based on a pattern_id # - Cluster new created patterns with single-pass clustering # make a copy of the extraction patterns to be # passed to each CPU patterns = [ list(self.patterns) for _ in range(self.num_cpus) ] # distribute tuples per different CPUs chunks = [list() for _ in range(self.num_cpus)] n_tuples_per_child = int( math.ceil(float(len(matched_tuples)) / self.num_cpus)) print("\n#CPUS", self.num_cpus, '\t', \ "Tuples per CPU", n_tuples_per_child) chunk_n = 0 chunck_begin = 0 chunck_end = n_tuples_per_child while chunk_n < self.num_cpus: chunks[chunk_n] = matched_tuples[ chunck_begin:chunck_end] chunck_begin = chunck_end chunck_end += n_tuples_per_child chunk_n += 1 count = 0 for c in chunks: print("CPU_" + str(count), " ", len(c), "patterns") count += 1 pipes = [ multiprocessing.Pipe(False) for _ in range(self.num_cpus) ] processes = [ multiprocessing.Process( target=self.cluster_tuples_parallel, args=(patterns[i], chunks[i], pipes[i][1])) for i in range(self.num_cpus) ] print("\nRunning", len(processes), " processes") for proc in processes: proc.start() # Receive and merge all patterns by 'pattern_id' # new created patterns (new pattern_id) go into # 'child_patterns' and then are merged # by single-pass clustering between patterns child_patterns = list() for i in range(len(pipes)): data = pipes[i][0].recv() patterns = data[1] for p_updated in patterns: pattern_exists = False for p_original in self.patterns: if p_original.id == p_updated.id: p_original.tuples.update(p_updated.tuples) pattern_exists = True break if pattern_exists is False: child_patterns.append(p_updated) for proc in processes: proc.join() print("\nSELF Patterns:") for p in self.patterns: p.merge_all_tuples_bet() print('\n' + str(p.id)) if self.config.alpha == 0 and self.config.gamma == 0: for bet_words in p.bet_uniques_words: print("BET", bet_words.encode("utf8")) print("\nChild Patterns:") for p in child_patterns: p.merge_all_tuples_bet() print('\n' + str(p.id)) if self.config.alpha == 0 and self.config.gamma == 0: for bet_words in p.bet_uniques_words: print("BET", bet_words.encode("utf8")) print(len(child_patterns), "new created patterns") # merge/aggregate similar patterns generated by # the child processes # start comparing smaller ones with greater ones child_patterns.sort(key=lambda y: len(y.tuples), reverse=False) count = 0 new_list = list(self.patterns) for p1 in child_patterns: print("\nNew Patterns", len(child_patterns), \ "Processed", count) print("New List", len(new_list)) print("Pattern:", p1.id, "Tuples:", len(p1.tuples)) max_similarity = 0 max_similarity_cluster = None for p2 in new_list: if p1 == p2: continue score = self.similarity_cluster(p1, p2) if score > max_similarity: max_similarity = score max_similarity_cluster = p2 if max_similarity >= self.config.threshold_similarity: for t in p1.tuples: max_similarity_cluster.tuples.add(t) else: new_list.append(p1) count += 1 # add merged patterns to main patterns structure for p in new_list: if p not in self.patterns: self.patterns.append(p) if self.curr_iteration == 0 and 
len(self.patterns) == 0:
                print("No patterns generated")
                sys.exit(0)

            print("\n", len(self.patterns), "patterns generated")

            # merge equal tuples inside patterns to make
            # fewer comparisons when collecting instances
            for p in self.patterns:
                # if only the BET context is being used,
                # merge only based on BET contexts
                if self.config.alpha == 0 and self.config.gamma == 0:
                    p.merge_all_tuples_bet()

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    print('\n' + str(p.id))
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for bet_words in p.bet_uniques_words:
                            print("BET", bet_words)
                    else:
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")

            # Look for sentences with occurrences of the seeds'
            # semantic types (e.g., ORG - LOC).
            # These were already collected and are stored in
            # self.processed_tuples
            #
            # Measure the similarity of each occurrence with
            # each extraction pattern and store each pattern that has a
            # similarity higher than a given threshold
            #
            # Each candidate tuple will then have a number of patterns
            # that extracted it, each with an associated degree of match.
            print("\nNumber of tuples to be analyzed:",
                  len(self.processed_tuples))
            print("\nCollecting instances based on",
                  len(self.patterns), "extraction patterns")

            # create copies of the generated extraction patterns
            # to be passed to each process
            patterns = [list(self.patterns) for _ in range(self.num_cpus)]

            # copy all tuples into a Queue shared by all processes
            manager = multiprocessing.Manager()
            queue = manager.Queue()
            for t in self.processed_tuples:
                queue.put(t)

            # each distinct process receives as arguments:
            #   - a list, copy of all the original extraction patterns
            #   - a Queue of the tuples
            #   - a pipe to return the collected tuples and updated
            #     patterns to the parent process
            pipes = [multiprocessing.Pipe(False) for _ in range(self.num_cpus)]
            processes = [
                multiprocessing.Process(target=self.find_instances,
                                        args=(patterns[i], queue, pipes[i][1]))
                for i in range(self.num_cpus)
            ]

            print("Running", len(processes), "processes")
            for proc in processes:
                proc.start()

            # structures to store each process's altered patterns
            # and collected tuples
            patterns_updated = list()
            collected_tuples = list()

            for i in range(len(pipes)):
                data = pipes[i][0].recv()
                child_pid = data[0]
                patterns = data[1]
                tuples = data[2]
                print(child_pid, "patterns", len(patterns),
                      "tuples", len(tuples))
                patterns_updated.extend(patterns)
                collected_tuples.extend(tuples)

            for proc in processes:
                proc.join()

            # Extraction patterns aggregation happens here:
            for p_updated in patterns_updated:
                for p_original in self.patterns:
                    if p_original.id == p_updated.id:
                        p_original.positive += p_updated.positive
                        p_original.negative += p_updated.negative
                        p_original.unknown += p_updated.unknown

            # index the patterns in a hashtable for later use
            for p in self.patterns:
                self.patterns_index[p.id] = p

            # update all patterns' confidence
            for p in self.patterns:
                p.update_confidence(self.config)

            if PRINT_PATTERNS is True:
                print("\nPatterns:")
                for p in self.patterns:
                    print(p.id)
                    print("Positive", p.positive)
                    print("Negative", p.negative)
                    print("Pattern Confidence", p.confidence)
                    print("\n")

            # Candidate tuples aggregation happens here:
            print("Collecting generated candidate tuples")
            for e in collected_tuples:
                t = e[0]
                pattern_best = e[1]
                sim_best = e[2]

                # if this tuple was already extracted, check if this
                # extraction pattern is already associated with it; if not,
                # associate this pattern with it and store the similarity score
                if t in self.candidate_tuples:
                    t_patterns = self.candidate_tuples[t]
                    if t_patterns is not None:
                        if pattern_best not in [x[0] for x in t_patterns]:
                            self.candidate_tuples[t].append(
                                (self.patterns_index[pattern_best.id],
                                 sim_best))

                # if this tuple was not extracted before, associate this
                # pattern with the instance and the similarity score
                else:
                    self.candidate_tuples[t].append(
                        (self.patterns_index[pattern_best.id], sim_best))

            # update tuples' confidence based on the patterns' confidence
            print("\n\nCalculating tuples confidence")
            for t in list(self.candidate_tuples.keys()):
                confidence = 1
                t.confidence_old = t.confidence
                for p in self.candidate_tuples.get(t):
                    confidence *= 1 - (p[0].confidence * p[1])
                t.confidence = 1 - confidence

                if self.curr_iteration > 0:
                    t.confidence = \
                        t.confidence * self.config.wUpdt + \
                        t.confidence_old * (1 - self.config.wUpdt)

            # sort tuples by confidence and print
            if PRINT_TUPLES is True:
                extracted_tuples = list(self.candidate_tuples.keys())
                tuples_sorted = sorted(extracted_tuples,
                                       key=lambda tl: tl.confidence,
                                       reverse=True)
                for t in tuples_sorted:
                    print(t.sentence)
                    print(t.e1, t.e2)
                    print(t.confidence)
                    print("\n")

            # update the seed set of tuples to use in the next iteration
            # seeds = { T | conf(T) > instance_confidence }
            print("Adding tuples to seed with confidence >= " +
                  str(self.config.instance_confidence))
            for t in list(self.candidate_tuples.keys()):
                if t.confidence >= self.config.instance_confidence:
                    seed = Seed(t.e1, t.e2)
                    self.config.positive_seed_tuples.add(seed)

            # increment the number of iterations
            self.curr_iteration += 1

        self.write_relationships_to_disk()

    def similarity_cluster(self, p1, p2):
        count = 0
        score = 0
        if self.config.alpha == 0 and self.config.gamma == 0:
            p1.merge_all_tuples_bet()
            p2.merge_all_tuples_bet()
            for v_bet1 in p1.bet_uniques_vectors:
                for v_bet2 in p2.bet_uniques_vectors:
                    if v_bet1 is not None and v_bet2 is not None:
                        score += dot(matutils.unitvec(asarray(v_bet1)),
                                     matutils.unitvec(asarray(v_bet2)))
                        count += 1
        else:
            for t1 in p1.tuples:
                for t2 in p2.tuples:
                    score += self.similarity_3_contexts(t1, t2)
                    count += 1
        return float(score) / float(count)

    def find_instances(self, patterns, instances, child_conn):
        updated_patterns = list()
        candidate_tuples = list()
        while True:
            try:
                t = instances.get_nowait()
                if instances.qsize() % 500 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " Instances to process: " +
                        str(instances.qsize()) + '\n')
                    sys.stdout.flush()

                # measure similarity towards every extraction pattern
                max_similarity = 0
                pattern_best = None
                for p in patterns:
                    good = 0
                    bad = 0
                    if self.config.alpha == 0 and self.config.gamma == 0:
                        for p_bet_v in list(p.bet_uniques_vectors):
                            if t.bet_vector is not None and p_bet_v is not None:
                                score = dot(matutils.unitvec(t.bet_vector),
                                            matutils.unitvec(asarray(p_bet_v)))
                                if score >= self.config.threshold_similarity:
                                    good += 1
                                else:
                                    bad += 1

                    if good > bad:
                        p.update_selectivity(t, self.config)
                        if score > max_similarity:
                            max_similarity = score
                            pattern_best = p

                # if it is above the similarity threshold,
                # associate the pattern with the tuple
                if max_similarity >= self.config.threshold_similarity:
                    candidate_tuples.append((t, pattern_best, max_similarity))

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                for p in patterns:
                    updated_patterns.append(p)
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, updated_patterns, candidate_tuples))
                break

    def cluster_tuples_parallel(self, patterns, matched_tuples, child_conn):
        updated_patterns = list(patterns)
        count = 0
        for t in matched_tuples:
            count += 1
            if count % 500 == 0:
                print(multiprocessing.current_process(), count,
                      "tuples processed")

            # go through all patterns (clusters of tuples) and find the one
            # with the highest similarity score
            max_similarity = 0
            max_similarity_cluster_index = 0
            for i in range(0, len(updated_patterns)):
                extraction_pattern = updated_patterns[i]
                accept, score = self.similarity_all(t, extraction_pattern)
                if accept is True and score > max_similarity:
                    max_similarity = score
                    max_similarity_cluster_index = i

            # if max_similarity < min_degree_match, create a new cluster
            if max_similarity < self.config.threshold_similarity:
                c = Pattern(t)
                updated_patterns.append(c)

            # if max_similarity >= min_degree_match, add to the cluster with
            # the highest similarity
            else:
                updated_patterns[max_similarity_cluster_index].add_tuple(t)

        # eliminate clusters supported by five or fewer tuples
        new_patterns = [p for p in updated_patterns if len(p.tuples) > 5]
        pid = multiprocessing.current_process().pid
        print(multiprocessing.current_process(), "Patterns: ", len(new_patterns))
        child_conn.send((pid, new_patterns))
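# Standalone sketch of the tuple-confidence update used in init_bootstrap
# above: a noisy-or over the (pattern confidence, similarity) pairs associated
# with a tuple, smoothed against the previous iteration's value via wUpdt.
# The helper name and the plain float pairs are illustrative stand-ins for the
# Pattern/Tuple objects used by the class.
def noisy_or_confidence(pattern_matches, old_confidence, w_updt, iteration):
    """pattern_matches: iterable of (pattern_confidence, similarity) pairs."""
    confidence = 1.0
    for pattern_confidence, similarity in pattern_matches:
        confidence *= 1 - (pattern_confidence * similarity)
    new_confidence = 1 - confidence
    if iteration > 0:
        # weighted average with the confidence from the previous iteration
        new_confidence = (new_confidence * w_updt +
                          old_confidence * (1 - w_updt))
    return new_confidence

# e.g. two patterns with confidences 0.8 and 0.6 and similarities 0.9 and 0.7:
# 1 - (1 - 0.8*0.9) * (1 - 0.6*0.7) = 1 - 0.28*0.58 = 0.8376 before smoothing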
class BREDS(object):

    def __init__(self, config_file, seeds_file, negative_seeds, similarity,
                 confidence):
        self.curr_iteration = 0
        self.patterns = list()
        self.processed_tuples = list()
        self.candidate_tuples = defaultdict(list)
        self.config = Config(config_file, seeds_file, negative_seeds,
                             similarity, confidence)

    def generate_tuples(self, sentences_file):
        """
        Generate tuple instances from a text file with sentences
        where named entities are already tagged

        :param sentences_file: path to the file with tagged sentences
        """
        if os.path.exists("processed_tuples.pkl"):
            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")
        else:
            # load the word2vec model and a PoS-tagger
            self.config.read_word2vec()
            tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(),
                                        self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger,
                                        self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)

            print("\n", len(self.processed_tuples), "tuples generated")
            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)

    def similarity_3_contexts(self, p, t):
        (bef, bet, aft) = (0, 0, 0)

        if t.bef_vector is not None and p.bef_vector is not None:
            bef = dot(matutils.unitvec(t.bef_vector),
                      matutils.unitvec(p.bef_vector))

        if t.bet_vector is not None and p.bet_vector is not None:
            bet = dot(matutils.unitvec(t.bet_vector),
                      matutils.unitvec(p.bet_vector))

        if t.aft_vector is not None and p.aft_vector is not None:
            aft = dot(matutils.unitvec(t.aft_vector),
                      matutils.unitvec(p.aft_vector))

        return self.config.alpha*bef + self.config.beta*bet + \
            self.config.gamma*aft

    def similarity_all(self, t, extraction_pattern):
        # calculates the cosine similarity between all tuples that are part
        # of a cluster (i.e., an extraction pattern) and the vector of a
        # ReVerb pattern extracted from a sentence;
        # returns the max similarity score
        good = 0
        bad = 0
        max_similarity = 0
        for p in list(extraction_pattern.tuples):
            score = self.similarity_3_contexts(t, p)
            if score > max_similarity:
                max_similarity = score
            if score >= self.config.threshold_similarity:
                good += 1
            else:
                bad += 1

        if good >= bad:
            return True, max_similarity
        else:
            return False, 0.0

    def match_seeds_tuples(self):
        # checks if an extracted tuple matches seed tuples
        matched_tuples = list()
        count_matches = dict()
        for t in self.processed_tuples:
            for s in self.config.positive_seed_tuples:
                if t.e1 == s.e1 and t.e2 == s.e2:
                    matched_tuples.append(t)
                    try:
                        count_matches[(t.e1, t.e2)] += 1
                    except KeyError:
                        count_matches[(t.e1, t.e2)] = 1
        return count_matches, matched_tuples

    def write_relationships_to_disk(self):
        print("\nWriting extracted relationships to disk")
        f_output = open("relationships.txt", "w")
        tmp = sorted(list(self.candidate_tuples.keys()), reverse=True)
        for t in tmp:
            f_output.write("instance: " + t.e1 + '\t' + t.e2 +
                           '\tscore:' + str(t.confidence) + '\n')
            f_output.write("sentence: " + t.sentence + '\n')
            f_output.write("pattern_bef: " + t.bef_words + '\n')
            f_output.write("pattern_bet: " + t.bet_words + '\n')
            f_output.write("pattern_aft: " + t.aft_words + '\n')
            if t.passive_voice is False:
                f_output.write("passive voice: False\n")
            elif t.passive_voice is True:
                f_output.write("passive voice: True\n")
            f_output.write("\n")
        f_output.close()

    def init_bootstrap(self, tuples):
        # starts a bootstrap iteration
        if tuples is not None:
            f = open(tuples, "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        self.curr_iteration = 0
        while self.curr_iteration <= self.config.number_iterations:
            print("==========================================")
            print("\nStarting iteration", self.curr_iteration)
            print("\nLooking for seed matches of:")
            for s in self.config.positive_seed_tuples:
                print(s.e1, '\t', s.e2)

            # look for sentences matching the seed instances
            count_matches, matched_tuples = self.match_seeds_tuples()

            if len(matched_tuples) == 0:
                print("\nNo seed matches found")
                sys.exit(0)

            else:
                print("\nNumber of seed matches found:")
                sorted_counts = sorted(
                    list(count_matches.items()),
                    key=operator.itemgetter(1),
                    reverse=True
                )
                for t in sorted_counts:
                    print(t[0][0], '\t', t[0][1], t[1])
                print("\n", len(matched_tuples), "tuples matched")

                # Cluster the matched instances to generate
                # new patterns or update existing ones
                print("\nClustering matched instances to generate patterns")
                self.cluster_tuples(matched_tuples)

                # Eliminate patterns supported by
                # 'min_pattern_support' tuples or fewer
                new_patterns = [p for p in self.patterns if len(p.tuples) >
                                self.config.min_pattern_support]
                self.patterns = new_patterns
                print("\n", len(self.patterns), "patterns generated")

                if PRINT_PATTERNS is True:
                    count = 1
                    print("\nPatterns:")
                    for p in self.patterns:
                        print(count)
                        for t in p.tuples:
                            print("BEF", t.bef_words)
                            print("BET", t.bet_words)
                            print("AFT", t.aft_words)
                            print("========")
                            print("\n")
                        count += 1

                if self.curr_iteration == 0 and len(self.patterns) == 0:
                    print("No patterns generated")
                    sys.exit(0)

                # Look for sentences with occurrences of the seeds'
                # semantic types (e.g., ORG - LOC).
                # These were already collected and are stored in:
                # self.processed_tuples
                #
                # Measure the similarity of each occurrence with each
                # extraction pattern and store each pattern that has a
                # similarity higher than a given threshold
                #
                # Each candidate tuple will then have a number of patterns
                # that extracted it, each with an associated degree of match.
print("Number of tuples to be analyzed:", len(self.processed_tuples)) print("\nCollecting instances based on extraction patterns") count = 0 for t in self.processed_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() sim_best = 0 for extraction_pattern in self.patterns: accept, score = self.similarity_all( t, extraction_pattern ) if accept is True: extraction_pattern.update_selectivity( t, self.config ) if score > sim_best: sim_best = score pattern_best = extraction_pattern if sim_best >= self.config.threshold_similarity: # if this tuple was already extracted, check if this # extraction pattern is already associated with it, # if not, associate this pattern with it and store the # similarity score patterns = self.candidate_tuples[t] if patterns is not None: if pattern_best not in [x[0] for x in patterns]: self.candidate_tuples[t].append( (pattern_best, sim_best) ) # If this tuple was not extracted before # associate this pattern with the instance # and the similarity score else: self.candidate_tuples[t].append( (pattern_best, sim_best) ) # update all patterns confidence for p in self.patterns: p.update_confidence(self.config) if PRINT_PATTERNS is True: print("\nPatterns:") for p in self.patterns: for t in p.tuples: print("BEF", t.bef_words) print("BET", t.bet_words) print("AFT", t.aft_words) print("========") print("Positive", p.positive) print("Negative", p.negative) print("Unknown", p.unknown) print("Tuples", len(p.tuples)) print("Pattern Confidence", p.confidence) print("\n") # update tuple confidence based on patterns confidence print("\n\nCalculating tuples confidence") for t in list(self.candidate_tuples.keys()): confidence = 1 t.confidence_old = t.confidence for p in self.candidate_tuples.get(t): confidence *= 1 - (p[0].confidence * p[1]) t.confidence = 1 - confidence # sort tuples by confidence and print if PRINT_TUPLES is True: extracted_tuples = list(self.candidate_tuples.keys()) tuples_sorted = sorted(extracted_tuples, key=lambda tpl: tpl.confidence, reverse=True) for t in tuples_sorted: print(t.sentence) print(t.e1, t.e2) print(t.confidence) print("\n") print("Adding tuples to seed with confidence >= {}".format( str(self.config.instance_confidence))) for t in list(self.candidate_tuples.keys()): if t.confidence >= self.config.instance_confidence: seed = Seed(t.e1, t.e2) self.config.positive_seed_tuples.add(seed) # increment the number of iterations self.curr_iteration += 1 self.write_relationships_to_disk() def cluster_tuples(self, matched_tuples): # this is a single-pass clustering # Initialize: if no patterns exist, first tuple goes to first cluster if len(self.patterns) == 0: c1 = Pattern(matched_tuples[0]) self.patterns.append(c1) count = 0 for t in matched_tuples: count += 1 if count % 1000 == 0: sys.stdout.write(".") sys.stdout.flush() max_similarity = 0 max_similarity_cluster_index = 0 # go through all patterns(clusters of tuples) and find the one # with the highest similarity score for i in range(0, len(self.patterns), 1): extraction_pattern = self.patterns[i] accept, score = self.similarity_all(t, extraction_pattern) if accept is True and score > max_similarity: max_similarity = score max_similarity_cluster_index = i # if max_similarity < min_degree_match create a new cluster having # this tuple as the centroid if max_similarity < self.config.threshold_similarity: c = Pattern(t) self.patterns.append(c) # if max_similarity >= min_degree_match add to the cluster with # the highest similarity else: 
self.patterns[max_similarity_cluster_index].add_tuple(t)
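# Minimal usage sketch for the class above. The configuration, seed and
# sentence file names, as well as the similarity/confidence values, are
# illustrative placeholders rather than settings from the original project.
if __name__ == "__main__":
    breds = BREDS(config_file="parameters.cfg",
                  seeds_file="seeds_positive.txt",
                  negative_seeds="seeds_negative.txt",
                  similarity=0.6,
                  confidence=0.7)

    # parse the tagged sentences, or reuse a cached processed_tuples.pkl
    breds.generate_tuples("sentences.txt")

    # run the bootstrapping loop; pass a pickle path instead of None to have
    # init_bootstrap load previously processed tuples directly
    breds.init_bootstrap(tuples=None)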