Example #1
    def generate_instances(self, sentences, child_conn):
        # Each process has its own NLTK PoS-tagger
        tagger = load('taggers/maxent_treebank_pos_tagger/english.pickle')
        instances = list()
        while True:
            try:
                s = sentences.get_nowait()
                if sentences.qsize() % 500 == 0:
                    print(multiprocessing.current_process(),
                          "Instances to process", sentences.qsize())

                sentence = Sentence(s, self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size, tagger,
                                    self.config)

                for rel in sentence.relationships:
                    t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                              rel.between, rel.after, self.config)
                    instances.append(t)

            except queue.Empty:
                print(multiprocessing.current_process(), "Queue is Empty")
                pid = multiprocessing.current_process().pid
                child_conn.send((pid, instances))
                break
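The worker above drains a shared sentence queue and returns its accumulated Tuple list to the parent through a pipe. Below is a minimal driver sketch, assuming a BREDS-style object that exposes generate_instances(); the run_workers name and the worker count are illustrative and not part of the original code.

# Hypothetical driver for the worker above: fill a multiprocessing.Queue with
# raw sentences and collect each worker's Tuple list over a Pipe, matching the
# child_conn.send((pid, instances)) protocol used in generate_instances().
import multiprocessing

def run_workers(extractor, raw_sentences, num_workers=4):
    sentences = multiprocessing.Queue()
    for s in raw_sentences:
        sentences.put(s)

    pipes = [multiprocessing.Pipe(duplex=False) for _ in range(num_workers)]
    procs = [multiprocessing.Process(target=extractor.generate_instances,
                                     args=(sentences, child_conn))
             for _, child_conn in pipes]
    for proc in procs:
        proc.start()

    instances = []
    for parent_conn, _ in pipes:
        # receive before join() so large payloads do not block the pipe
        pid, worker_instances = parent_conn.recv()
        instances.extend(worker_instances)

    for proc in procs:
        proc.join()
    return instances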
Example #2
    def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences where named entities are
        already tagged

        :param sentences_file:
        """
        if os.path.exists("processed_tuples.pkl"):

            with open("processed_tuples.pkl", "rb") as f_in:
                print("\nLoading processed tuples from disk...")
                self.processed_tuples = pickle.load(f_in)
            print(len(self.processed_tuples), "tuples loaded")

            with open("temp.txt", "w", encoding='utf-8') as temp_file:
                for i in self.processed_tuples:
                    temp_file.write(i.e1 + '\t' + i.e2 + '\n')

        else:

            # load the word2vec model; no PoS-tagger is needed here
            self.config.read_word2vec()
            tagger = None

            print("\nGenerating relationship instances from sentences")
            with open(sentences_file, encoding='utf-8') as f_sentences:
                count = 0
                for line in f_sentences:
                    if line.startswith("#"):
                        continue
                    count += 1
                    if count % 10000 == 0:
                        sys.stdout.write(".")

                    sentence = Sentence(line.strip(), self.config.e1_type,
                                        self.config.e2_type,
                                        self.config.max_tokens_away,
                                        self.config.min_tokens_away,
                                        self.config.context_window_size,
                                        tagger, self.config)

                    for rel in sentence.relationships:
                        t = Tuple(rel.e1, rel.e2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        self.processed_tuples.append(t)
                print("\n", len(self.processed_tuples), "tuples generated")

            print("Writing generated tuples to disk")
            with open("processed_tuples.pkl", "wb") as f_out:
                pickle.dump(self.processed_tuples, f_out)
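For reference, a sketch of the input generate_tuples expects: one sentence per line, with each entity wrapped in XML-style tags matching e1_type and e2_type from the config, and lines starting with '#' skipped. The file name and the ORG/LOC tags below are illustrative assumptions.

# Hypothetical tagged-sentences file; '#' lines are ignored by generate_tuples.
with open("sentences_tagged.txt", "w", encoding="utf-8") as f_out:
    f_out.write("# sample corpus with pre-tagged named entities\n")
    f_out.write("<ORG>Nokia</ORG> is headquartered in <LOC>Espoo</LOC>.\n")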
Example #3
def process_corpus(task_queue, g_dash, e1_type, e2_type):
    count = 0
    added = 0
    while True:
        try:
            if count % 25000 == 0:
                print(multiprocessing.current_process(), "In Queue",
                      task_queue.qsize(), "Total added: ", added)
            line = task_queue.get_nowait()
            s = Sentence(line.strip(), e1_type, e2_type, MAX_TOKENS_AWAY,
                         MIN_TOKENS_AWAY, CONTEXT_WINDOW)
            for r in s.relationships:
                tokens = word_tokenize(r.between)
                if all(x in not_valid for x in tokens):
                    continue
                elif "," in tokens and tokens[0] != ',':
                    continue
                else:
                    g_dash.append(r)
                    added += 1
            count += 1
        except queue.Empty:
            break
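process_corpus depends on a few module-level names that the snippet does not show. A hedged sketch of plausible definitions (the concrete values and the stop-word choice are assumptions): MIN_TOKENS_AWAY and MAX_TOKENS_AWAY bound the number of tokens between the two entities, CONTEXT_WINDOW sets the size of the before/after context, and not_valid is a set of tokens that on their own cannot express a relation.

# Hypothetical module-level configuration used by process_corpus above.
from nltk.corpus import stopwords

MIN_TOKENS_AWAY = 1   # at least one token between the two entities
MAX_TOKENS_AWAY = 6   # at most six tokens between them
CONTEXT_WINDOW = 2    # tokens kept before the first and after the second entity
not_valid = set(stopwords.words('english')) | {",", "(", ")", ";", "."}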
Example #4
    def test_update_selectivity(self):
        sentence = Sentence('A tiger is 2.1 meters', 'OBJECT', 'NUMBER', 20, 0,
                            2, 'tiger', self.tagger, self.config)
        self.assertEqual(len(sentence.relationships), 1)
        print(sentence.relationships)
Example #5
def proximity_pmi_a(e1_type, e2_type, task_queue, index, results, not_found,
                    rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            try:
                r = task_queue.get_nowait()
                count += 1
                if count % 50 == 0:
                    print(multiprocessing.current_process(),
                          "To Process", task_queue.qsize(),
                          "Correct found:", len(results))

                # build the tagged entity strings used to query the index
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # First count the proximity (MAX_TOKENS_AWAY) occurrences
                # of entities r.ent1 and r.ent2
                q1 = spans.SpanNear2([t1, t3],
                                     slop=MAX_TOKENS_AWAY,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a
                # valid relational word
                hits_with_r = 0
                hits_without_r = 0
                fact_bet_words_tokens = word_tokenize(r.bet_words)
                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW)
                    for s_r in s.relationships:
                        if r.ent1.decode("utf8") == s_r.ent1 and \
                                        r.ent2.decode("utf8") == s_r.ent2:
                            unigrams_bef_words = word_tokenize(s_r.before)
                            unigrams_bet_words = word_tokenize(s_r.between)
                            unigrams_aft_words = word_tokenize(s_r.after)
                            bigrams_rel_words = extract_bigrams(s_r.between)

                            if fact_bet_words_tokens == unigrams_bet_words:
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_bef_words):
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_bet_words):
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_aft_words):
                                hits_with_r += 1

                            elif rel_words_bigrams == bigrams_rel_words:
                                hits_with_r += 1
                            else:
                                hits_without_r += 1

                if hits_with_r > 0 and hits_without_r > 0:
                    pmi = float(hits_with_r) / float(hits_without_r)
                    if pmi >= PMI:
                        results.append(r)

                    else:
                        not_found.append(r)

                else:
                    not_found.append(r)

            except queue.Empty:
                break
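Note that the acceptance score computed above is a plain hit ratio rather than a textbook PMI: a candidate relation is kept when hits_with_r / hits_without_r reaches the PMI threshold. A small numeric sketch, with an assumed threshold value:

# Hypothetical illustration of the acceptance rule used in proximity_pmi_a.
PMI = 0.8                                # assumed global threshold
hits_with_r, hits_without_r = 12, 10     # counts gathered from the index hits
if hits_with_r > 0 and hits_without_r > 0:
    score = hits_with_r / hits_without_r    # 1.2 >= 0.8 -> candidate accepted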
Example #6
def proximity_pmi_rel_word(e1_type, e2_type, task_queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            try:
                r = task_queue.get_nowait()
                if count % 50 == 0:
                    print("\n", multiprocessing.current_process(),
                          "In Queue", task_queue.qsize(),
                          "Total Matched: ", len(results))
                if (r.ent1, r.ent2) not in all_in_database:
                    # if it's not in the database, calculate the PMI
                    entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                    entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                    t1 = query.Term('sentence', entity1)
                    t3 = query.Term('sentence', entity2)

                    # Entities proximity query without relational words
                    q1 = spans.SpanNear2([t1, t3],
                                         slop=distance,
                                         ordered=True,
                                         mindist=1)
                    hits = searcher.search(q1, limit=q_limit)

                    # Entities proximity considering relational words
                    # From the results above count how many contain a
                    # valid relational word

                    hits_with_r = 0
                    hits_without_r = 0
                    for s in hits:
                        sentence = s.get("sentence")
                        s = Sentence(sentence, e1_type, e2_type,
                                     MAX_TOKENS_AWAY, MIN_TOKENS_AWAY,
                                     CONTEXT_WINDOW)

                        for s_r in s.relationships:
                            if r.ent1.decode("utf8") == s_r.ent1 and \
                                            r.ent2.decode("utf8") == s_r.ent2:

                                unigrams_rel_words = word_tokenize(s_r.between)
                                bigrams_rel_words = extract_bigrams(
                                    s_r.between)

                                if all(x in not_valid
                                       for x in unigrams_rel_words):
                                    hits_without_r += 1
                                    continue
                                elif any(x in rel_words_unigrams
                                         for x in unigrams_rel_words):

                                    hits_with_r += 1

                                elif any(x in rel_words_bigrams
                                         for x in bigrams_rel_words):

                                    hits_with_r += 1
                                else:
                                    hits_without_r += 1

                    if hits_with_r > 0 and hits_without_r > 0:
                        pmi = float(hits_with_r) / float(hits_without_r)
                        if pmi >= PMI:
                            # relation expressed in the passive voice
                            # ("... by"): swap the entity order
                            if word_tokenize(s_r.between)[-1] == 'by':
                                s_r.ent1, s_r.ent2 = s_r.ent2, s_r.ent1
                            results.append(r)

                count += 1
            except queue.Empty:
                break