Example #1
def process_corpus(queue, g_dash, e1_type, e2_type):
    # Worker: pull tagged sentences from the shared queue and append their
    # relationship instances to the shared list g_dash.
    while True:
        line = queue.get_nowait()
        s = Sentence(line.strip(), e1_type, e2_type, MAX_TOKENS_AWAY,
                     MIN_TOKENS_AWAY, CONTEXT_WINDOW)
        for r in s.relationships:
            if r.between == " , " or r.between == " ( " or r.between == " ) ":
                continue
            else:
                g_dash.append(r)
        if queue.empty() is True:
            break
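
A minimal driver sketch for the worker above (an assumption, not part of the example): it fills a managed queue with sentence lines, starts several worker processes, and collects the extracted relationships. The Sentence class and the MAX_TOKENS_AWAY / MIN_TOKENS_AWAY / CONTEXT_WINDOW constants are taken to come from the surrounding module.

import multiprocessing

def run_process_corpus(lines, e1_type, e2_type, num_workers=4):
    # hypothetical driver: fill a shared queue, run the workers, gather results
    manager = multiprocessing.Manager()
    job_queue = manager.Queue()
    g_dash = manager.list()
    for line in lines:
        job_queue.put(line)
    workers = [multiprocessing.Process(target=process_corpus,
                                       args=(job_queue, g_dash, e1_type, e2_type))
               for _ in range(num_workers)]
    for proc in workers:
        proc.start()
    for proc in workers:
        proc.join()
    return list(g_dash)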
Example #2
    def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences
        where named entities are already tagged
        """
        try:
            # load previously processed tuples from the cache file, if it exists;
            # open() raises IOError/FileNotFoundError when it does not
            f = open("processed_tuples.pkl", "rb")
            print("\nLoading processed tuples from disk...")
            self.processed_tuples = pickle.load(f)
            f.close()
            print(len(self.processed_tuples), "tuples loaded")

        except IOError:
            print("\nGenerating relationship instances from sentences")
            print("\n read sentences ......")
            f_sentences = codecs.open(sentences_file, encoding='utf-8')
            print("\n read sentences ......")
            count = 0
            for line in f_sentences:
                count += 1
                if count % 10000 == 0:
                    # progress indicator; do not abort reading the file here
                    sys.stdout.write(".")
                sentence = Sentence(line.strip(), self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size)

                for rel in sentence.relationships:
                    if rel.arg1type == self.config.e1_type and rel.arg2type == self.config.e2_type:
                        bef_tokens = word_tokenize(rel.before)
                        bet_tokens = word_tokenize(rel.between)
                        aft_tokens = word_tokenize(rel.after)
                        # keep the tuple only if at least one context is non-empty
                        if not (len(bef_tokens) == 0 and len(bet_tokens) == 0
                                and len(aft_tokens) == 0):
                            t = Tuple.Tuple(rel.ent1, rel.ent2, rel.sentence,
                                            rel.before, rel.between, rel.after,
                                            self.config)
                            self.processed_tuples.append(t)
            f_sentences.close()

            print("\n", len(self.processed_tuples), "relationships generated")
            print("Dumping relationships to file")
            f = open("processed_tuples.pkl", "wb")
            pickle.dump(self.processed_tuples, f)
            f.close()
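
The method above follows a cache-or-generate pattern: it tries to unpickle the result of a previous run and only re-parses the sentences file when the cache is missing. A tiny standalone illustration of the same pattern (the cache name and the compute function here are made up):

import pickle

def load_or_compute(cache_path, compute):
    # try the cached result first; fall back to computing and caching it
    try:
        with open(cache_path, "rb") as handle:
            return pickle.load(handle)
    except (IOError, OSError):
        result = compute()
        with open(cache_path, "wb") as handle:
            pickle.dump(result, handle)
        return result

tuples = load_or_compute("processed_tuples.pkl", lambda: ["t1", "t2"])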
Example #3
    def generate_tuples(self, sentences_file):
        """
        Generate tuples instances from a text file with sentences
        where named entities are already tagged
        """
        try:
            # open the cache in binary mode ("rb") to match the "wb" dump below
            f = open("processed_tuples.pkl", "rb")
            print "\nLoading processed tuples from disk..."
            self.processed_tuples = cPickle.load(f)
            f.close()
            print len(self.processed_tuples), "tuples loaded"

        except IOError:
            print "\nGenerating relationship instances from sentences"
            f_sentences = codecs.open(sentences_file, encoding='utf-8')
            count = 0
            for line in f_sentences:
                count += 1
                if count % 10000 == 0:
                    sys.stdout.write(".")
                sentence = Sentence(line.strip(), self.config.e1_type,
                                    self.config.e2_type,
                                    self.config.max_tokens_away,
                                    self.config.min_tokens_away,
                                    self.config.context_window_size)
                for rel in sentence.relationships:
                    if rel.arg1type == self.config.e1_type and rel.arg2type == self.config.e2_type:
                        t = Tuple(rel.ent1, rel.ent2, rel.sentence, rel.before,
                                  rel.between, rel.after, self.config)
                        if len(t.patterns_vectors) >= 1:
                            self.processed_tuples.append(t)
            f_sentences.close()

            print "\n", len(self.processed_tuples), "tuples generated"
            print "Writing generated tuples to disk"
            f = open("processed_tuples.pkl", "wb")
            cPickle.dump(self.processed_tuples, f)
            f.close()
Example #4
def get_sentences(sentences, freebase, results):
    # Worker: keep only sentences whose two tagged entities both appear in the
    # freebase dictionary; matching sentences go into the shared results list.
    count = 0
    while True:
        sentence = sentences.get_nowait()
        count += 1
        s = Sentence(sentence.strip(), None, None, MAX_TOKENS_AWAY,
                     MIN_TOKENS_AWAY, CONTEXT_WINDOW)
        for r in s.relationships:
            if r.between == " , " or r.between == " ( " or r.between == " ) ":
                continue
            else:
                try:
                    freebase[r.ent1]
                    freebase[r.ent2]
                except KeyError:
                    continue
            results.append(sentence)
        if count % 50000 == 0:
            print multiprocessing.current_process(), "queue size", sentences.qsize()
        if sentences.empty():
            break
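
The worker above relies on checking sentences.empty() to stop; an alternative sketch (an assumption, not the example's approach) drains the queue by catching the Empty exception raised by get_nowait(), which avoids the final emptiness check:

import queue   # the module is named Queue in Python 2

def drain(jobs, handle):
    # pull items until get_nowait() raises Empty, signalling an exhausted queue
    while True:
        try:
            item = jobs.get_nowait()
        except queue.Empty:
            break
        handle(item)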
Example #5
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results, rel_words):
    """
    #TODO: proximity_pmi with relation specific given relational words
    :param e1_type:
    :param e2_type:
    :param queue:
    :param index:
    :param results:
    :param rel_word:
    :return:
    """
    """
    sentences with tagged entities are indexed in whoosh
    perform the following query
    ent1 NEAR:X r NEAR:X ent2
    X is the maximum number of words between the query elements.
    """
    idx = open_dir(index)
    count = 0
    distance = 9
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            count += 1
            if count % 50 == 0:
                print multiprocessing.current_process(), "In Queue", \
                    queue.qsize(), "Total Matched: ", len(results)
            r = queue.get_nowait()
            if (r.ent1, r.ent2) not in all_in_freebase:
                # if its not in the database calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3],
                                     slop=distance,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a relational word
                hits_with_r = 0
                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW)
                    for s_r in s.relationships:
                        if (r.ent1.decode("utf8") == s_r.ent1
                                and r.ent2.decode("utf8") == s_r.ent2):
                            for rel in rel_words:
                                if rel in r.between:
                                    hits_with_r += 1
                                    break

                    if not len(hits) >= hits_with_r:
                        print "ERROR!"
                        print "hits", len(hits)
                        print "hits_with_r", hits_with_r
                        print entity1, '\t', entity2
                        print "\n"
                        sys.exit(0)

                if len(hits) > 0:
                    pmi = float(hits_with_r) / float(len(hits))
                    if pmi > PMI:
                        results.append(r)
                        """
                        if isinstance(r, ExtractedFact):
                            print r.ent1, '\t', r.patterns, '\t', r.ent2, pmi
                        elif isinstance(r, Relationship):
                            print r.ent1, '\t', r.between, '\t', r.ent2, pmi
                        """
                if queue.empty():
                    break
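
The score computed above is simply the fraction of proximity hits whose between-context contains one of the relational words, compared against a global PMI threshold; a tiny standalone illustration with made-up numbers:

hits_with_r = 12        # hits whose context contains a relational word
total_hits = 40         # all proximity hits for the entity pair
pmi = float(hits_with_r) / float(total_hits)   # 0.3
PMI_THRESHOLD = 0.7                            # stands in for the global PMI constant
accepted = pmi > PMI_THRESHOLD                 # False: the candidate is discarded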