Code Example #1
def query_a(r, list_a, e1_type, e2_type, index):
    idx = open_dir(index)
    entity1 = "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">"
    entity2 = "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">"
    t1 = query.Term("sentence", entity1)
    t2 = query.Term("sentence", r.patterns)
    t3 = query.Term("sentence", entity2)
    q1 = spans.SpanNear2([t1, t2, t3], slop=5, ordered=True)
    q2 = spans.SpanNear2([t1, t3], slop=5, ordered=True)

    with idx.searcher() as searcher:
        entities_r = searcher.search(q1)
        entities = searcher.search(q2)
        # TODO: apply stemming or normalization to the word used in the query
        if len(entities) > 0:
            pmi = float(len(entities_r)) / float(len(entities))
            # TODO: what is the best threshold value?
            if pmi >= 0.5:
                #print entity1, '\t', r.patterns, '\t', entity2, pmi
                list_a.append(r)
Code Example #2
def calculate_pmi(e1, e2, r):
    # NOTE: this queries an index built over the whole AFT corpus
    """
    Sentences with tagged entities are indexed in Whoosh; perform the query
        ent1 NEAR:X r NEAR:X ent2
    where X is the maximum number of words between the query elements.
    """
    idx = open_dir("index")
    t1 = query.Term("sentence", e1)
    t2 = query.Term("sentence", r)
    t3 = query.Term("sentence", e2)
    q1 = spans.SpanNear2([t1, t2, t3], slop=5, ordered=False)
    q2 = spans.SpanNear2([t1, t3], slop=5, ordered=False)

    with idx.searcher() as searcher:
        entities_r = searcher.search(q1)
        entities = searcher.search(q2)
        print len(entities_r)
        print len(entities)
        pmi = float(len(entities_r)) / float(len(entities))

    idx.close()
    return pmi
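
The functions above assume a Whoosh index already built in the "index" directory, with each entity-tagged sentence indexed in a single "sentence" field. A minimal sketch of how such an index might be created is shown below; the schema, analyzer, directory name, and example sentence are assumptions inferred from the queries in these examples, not code from the original projects.

# Minimal index-building sketch (assumptions noted above). The tokenizer keeps
# <TYPE>...</TYPE> tags as single tokens so that query.Term("sentence", entity1)
# can match a whole tagged entity, and the TEXT field keeps positions so that
# SpanNear2 queries and s.get("sentence") both work.
import os
from whoosh.index import create_in
from whoosh.fields import Schema, TEXT
from whoosh.analysis import RegexTokenizer

analyzer = RegexTokenizer(r'<[A-Z]+>[^<]+</[A-Z]+>|\w+')
schema = Schema(sentence=TEXT(stored=True, analyzer=analyzer))

if not os.path.isdir("index"):
    os.mkdir("index")
ix = create_in("index", schema)

writer = ix.writer()
writer.add_document(
    sentence=u"<PER>John Smith</PER> works for <ORG>Acme Corp</ORG> .")
writer.commit()
ix.close()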
Code Example #3
File: test_spans.py  Project: mukulhase/Scholar-Gate
def test_spannear2():
    schema = fields.Schema(id=fields.STORED, text=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(id="a", text=u"alfa echo")
            w.add_document(id="b", text=u"alfa bravo echo")
            w.add_document(id="c", text=u"alfa bravo charlie echo")
            w.add_document(id="d", text=u"alfa bravo charlie delta echo")
            w.add_document(id="e", text=u"alfa bravo charlie fox delta echo")
            w.add_document(id="f", text=u"charlie delta echo fox golf hotel")

        with ix.searcher() as s:
            q = spans.SpanNear2([Term("text", "bravo"), Term("text", "echo")],
                                slop=3)
            assert q.estimate_size(s.reader()) == 4

            ids = "".join(sorted(hit["id"] for hit in s.search(q)))
            assert ids == "bcd"
Code Example #4
def proximity_pmi_a(e1_type, e2_type, queue, index, results, not_found,
                    rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                count += 1
                if count % 50 == 0:
                    print multiprocessing.current_process(), \
                        "To Process", queue.qsize(), \
                        "Correct found:", len(results)

                # if it's not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # First count the proximity (MAX_TOKENS_AWAY) occurrences
                # of entities r.e1 and r.e2
                q1 = spans.SpanNear2([t1, t3],
                                     slop=MAX_TOKENS_AWAY,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a
                # valid relational word
                hits_with_r = 0
                hits_without_r = 0
                fact_bet_words_tokens = word_tokenize(r.bet_words)
                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW)
                    for s_r in s.relationships:
                        if r.ent1.decode("utf8") == s_r.ent1 and \
                                        r.ent2.decode("utf8") == s_r.ent2:
                            unigrams_bef_words = word_tokenize(s_r.before)
                            unigrams_bet_words = word_tokenize(s_r.between)
                            unigrams_aft_words = word_tokenize(s_r.after)
                            bigrams_rel_words = extract_bigrams(s_r.between)

                            if fact_bet_words_tokens == unigrams_bet_words:
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_bef_words):
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_bet_words):
                                hits_with_r += 1

                            elif any(x in rel_words_unigrams
                                     for x in unigrams_aft_words):
                                hits_with_r += 1

                            elif rel_words_bigrams == bigrams_rel_words:
                                hits_with_r += 1
                            else:
                                hits_without_r += 1

                if hits_with_r > 0 and hits_without_r > 0:
                    pmi = float(hits_with_r) / float(hits_without_r)
                    if pmi >= PMI:
                        results.append(r)

                    else:
                        not_found.append(r)

                else:
                    not_found.append(r)

            except Queue.Empty:
                break
Code Example #5
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                if count % 50 == 0:
                    print "\n", multiprocessing.current_process(), \
                        "In Queue", queue.qsize(), \
                        "Total Matched: ", len(results)
                if (r.ent1, r.ent2) not in all_in_database:
                    # if it's not in the database, calculate the PMI
                    entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                    entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                    t1 = query.Term('sentence', entity1)
                    t3 = query.Term('sentence', entity2)

                    # Entities proximity query without relational words
                    q1 = spans.SpanNear2([t1, t3],
                                         slop=distance,
                                         ordered=True,
                                         mindist=1)
                    hits = searcher.search(q1, limit=q_limit)

                    # Entities proximity considering relational words
                    # From the results above count how many contain a
                    # valid relational word

                    hits_with_r = 0
                    hits_without_r = 0
                    for s in hits:
                        sentence = s.get("sentence")
                        s = Sentence(sentence, e1_type, e2_type,
                                     MAX_TOKENS_AWAY, MIN_TOKENS_AWAY,
                                     CONTEXT_WINDOW)

                        for s_r in s.relationships:
                            if r.ent1.decode("utf8") == s_r.ent1 and \
                                            r.ent2.decode("utf8") == s_r.ent2:

                                unigrams_rel_words = word_tokenize(s_r.between)
                                bigrams_rel_words = extract_bigrams(
                                    s_r.between)

                                if all(x in not_valid
                                       for x in unigrams_rel_words):
                                    hits_without_r += 1
                                    continue
                                elif any(x in rel_words_unigrams
                                         for x in unigrams_rel_words):

                                    hits_with_r += 1

                                elif any(x in rel_words_bigrams
                                         for x in bigrams_rel_words):

                                    hits_with_r += 1
                                else:
                                    hits_without_r += 1

                    if hits_with_r > 0 and hits_without_r > 0:
                        pmi = float(hits_with_r) / float(hits_without_r)
                        if pmi >= PMI:
                            if word_tokenize(s_r.between)[-1] == 'by':
                                tmp = s_r.ent2
                                s_r.ent2 = s_r.ent1
                                s_r.ent1 = tmp
                            results.append(r)

                count += 1
            except Queue.Empty:
                break
Code Example #6
def query_thread(queue, database, g_minus_d, e1_type, e2_type, index):
    idx = open_dir(index)
    regex_tokenize = re.compile(r'\w+|-|<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    tokenizer = RegexTokenizer(regex_tokenize)
    stopper = StopFilter()
    count = 0

    with idx.searcher() as searcher:
        while True:
            r = queue.get_nowait()
            count += 1
            if count % 25000 == 0:
                print multiprocessing.current_process(), count, queue.qsize()

            if len(database[(r.ent1, r.ent2)]) == 0:
                # if it's not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                terms = list()
                for token in stopper(
                        tokenizer((r.between.decode("utf8")), renumber=True)):
                    terms.append(query.Term("sentence", token.text))

                #print terms
                t1 = query.Term("sentence", entity1)
                t3 = query.Term("sentence", entity2)

                query_terms = list()
                query_terms.append(t1)
                for t in terms:
                    query_terms.append(t)
                query_terms.append(t3)

                q1 = spans.SpanNear2(query_terms, slop=2, ordered=True)
                q2 = spans.SpanNear2([t1, t3], slop=8, ordered=True)
                entities_r = searcher.search(q1)
                entities = searcher.search(q2)
                """
                print query_terms, len(entities_r)
                print [t1, t3], len(entities)
                print "\n"
                """

                #print entity1, '\t', r.between, '\t', entity2, len(entities_r), len(entities)

                try:
                    assert not len(entities_r) > len(entities)
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print query_terms
                    print [t1, t3]

                if len(entities) > 0:
                    pmi = float(len(entities_r)) / float(len(entities))
                    if pmi >= 0.5:
                        #print entity1, '\t', r.between, '\t', entity2, pmi
                        g_minus_d.append(r)

                if queue.empty() is True:
                    break
Code Example #7
def proximity_pmi_a(e1_type, e2_type, queue, index, results, not_found,
                    rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    # cache to store already evaluated triples
    cache_correct = set()
    cache_incorrect = set()

    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                t = Triple(r.e1, r.e2, r.bet_words)

                count += 1
                if count % 50 == 0:
                    sys.stdout.write(
                        str(multiprocessing.current_process()) +
                        " To Process: " + str(queue.qsize()) +
                        " Correct found: " + str(len(results)) + '\n')
                    sys.stdout.flush()

                if t in cache_correct:
                    results.append(r)
                    continue

                if t in cache_incorrect:
                    not_found.append(r)
                    continue

                # relational phrase/word of the relationship/triple
                fact_bet_words_tokens = word_tokenize(r.bet_words)

                entity1 = "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # First count the proximity (MAX_TOKENS_AWAY) occurrences of entities r.e1 and r.e2
                q1 = spans.SpanNear2([t1, t3],
                                     slop=MAX_TOKENS_AWAY,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational word
                # From the results above count how many contain the relational word/phrase
                hits_with_r = 0

                for s in hits:
                    sentence = s.get("sentence")
                    start_e1 = sentence.rindex(entity1)
                    end_e1 = start_e1 + len(entity1)
                    start_e2 = sentence.rindex(entity2)
                    bet = sentence[end_e1:start_e2]
                    bet_tokens = word_tokenize(bet)

                    if not (MIN_TOKENS_AWAY <= len(bet_tokens) <=
                            MAX_TOKENS_AWAY):
                        continue

                    if fact_bet_words_tokens == bet_tokens:
                        hits_with_r += 1

                assert len(hits) >= hits_with_r

                if len(hits) > 0:
                    pmi = float(hits_with_r) / float(len(hits))
                    if pmi >= PMI:
                        results.append(r)
                        cache_correct.add(t)
                        """
                        print "**VALID**:", entity1, '\t', entity2
                        print "hits_without_r ", float(hits_without_r)
                        print "hits_with_r ", float(hits_with_r)
                        print "discarded", discarded
                        print "Index hits", len(hits)
                        print "PMI", pmi
                        print r.sentence
                        print r.bet_words
                        print
                        """

                    else:
                        # check against a list
                        if r.bet_words.strip() in rel_words_unigrams:
                            results.append(r)
                            cache_correct.add(t)

                        elif r.bet_words.strip() in rel_words_bigrams:
                            results.append(r)
                            cache_correct.add(t)

                        else:
                            not_found.append(r)
                            cache_incorrect.add(t)
                            """
                            print "**INVALID**:"
                            print 'ExtractedFact:', entity1, '\t', entity2
                            print r.bet_words.strip()
                            print
                            """
                else:
                    not_found.append(r)
                    cache_incorrect.add(t)

            except Queue.Empty:
                break
Code Example #8
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results,
                           rel_words_unigrams, rel_words_bigrams):
    idx = open_dir(index)
    count = 0
    distance = MAX_TOKENS_AWAY
    with idx.searcher() as searcher:
        while True:
            try:
                r = queue.get_nowait()
                if count % 50 == 0:
                    print multiprocessing.current_process(
                    ), "In Queue", queue.qsize(), "Total Matched: ", len(
                        results)

                # TODO: build a cache

                t1 = query.Term(
                    'sentence',
                    "<" + e1_type + ">" + r.e1 + "</" + e1_type + ">")
                t3 = query.Term(
                    'sentence',
                    "<" + e2_type + ">" + r.e2 + "</" + e2_type + ">")

                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3],
                                     slop=distance,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a relational word
                hits_with_r = 0
                total_hits = 0

                for s in hits:
                    sentence = s.get("sentence")
                    s = SentenceEvaluation(sentence, e1_type, e2_type,
                                           MAX_TOKENS_AWAY, MIN_TOKENS_AWAY,
                                           CONTEXT_WINDOW, stopwords)
                    for s_r in s.relationships:
                        if r.e1.decode("utf8") == s_r.e1 and r.e2.decode(
                                "utf8") == s_r.e2:
                            total_hits += 1
                            unigrams_rel_words = s_r.between
                            bigrams_rel_words = extract_bigrams(' '.join(
                                s_r.between))
                            if any(x in rel_words_unigrams
                                   for x in unigrams_rel_words):
                                hits_with_r += 1
                                continue

                            if any(x in rel_words_bigrams
                                   for x in bigrams_rel_words):
                                hits_with_r += 1
                                continue

                assert total_hits >= hits_with_r

                if total_hits > 0:
                    pmi = float(hits_with_r) / float(total_hits)
                    if pmi >= PMI:
                        results.append(r)
                count += 1
            except Queue.Empty:
                break
Code Example #9
 # find all combinations of our three words with itertools.product, which takes a
 # list of lists and gives all combinations, e.g. A1,B1,C1, A1,B1,C2, ..., An,Bn,Cn, as a list
 exhaustive_combinations = list(itertools.product(*list_containing_word_and_variant_lists))
 
 #now we need only iterate through exhaustive_combinations, searching for each, and writing any hits to outfile
 for search_terms in exhaustive_combinations:
 
     #list of query components will start empty, we'll populate it, then submit our query
     list_of_query_components = []
     
     #iterate through search terms and add each to our list_of_query_components
     for term in search_terms:
         query_component = Term("content", term)
         list_of_query_components.append(query_component)
         
     # now take all of those query components and submit them to the
     # spans.SpanNear2 function, which (loosely speaking) facilitates proximity
     # search with high recall and low precision, which is why we iterate through
     # all results and keep only the true matches in the process_results()
     # function defined above and called below
     q = spans.SpanNear2(list_of_query_components, slop=proximity_value, ordered=False)
      
     # by default the results object contains at most the first 10 matching
     # documents; to get more, use the limit keyword, e.g.
     # results = searcher.search(q, limit=20). Printing the "results" object is
     # handy because it gives the runtime for each query, and terms=True lets us
     # determine which of the search terms each hit contains
     results = searcher.search(q, limit=None, terms=True)
     
     # the following line allows retrieving hits from farther into the document
     # than 32K characters (if character 32,001 begins a new word that matches the
     # query, we catch that hit with this line but would miss it without it)
     results.fragmenter.charlimit = None
     
     # add the current results to a bag of results for our three terms (e.g. find
     # all results for all variant spellings of "so so now", then count up the
     # total number of times that series of words yielded hits)
     if results:
         if proximity_search_desired == 1:
             process_results_with_proximity_function(search_terms, results, proximity_value)
         
         if exact_search_desired == 1:
             process_results_with_exact_function(search_terms, results)
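
The fragment above depends on names defined earlier in its script (list_containing_word_and_variant_lists, searcher, proximity_value, the process_results_* functions, and the *_desired flags). As a small self-contained illustration of the combination step it describes, with made-up spelling-variant lists standing in for the real ones:

# Illustrative only: tiny invented variant lists standing in for
# list_containing_word_and_variant_lists from the fragment above.
import itertools

word_and_variant_lists = [
    ["so", "soe"],      # spellings of the first word
    ["so", "soe"],      # spellings of the second word
    ["now", "nowe"],    # spellings of the third word
]

# itertools.product yields every combination across the lists:
# ('so', 'so', 'now'), ('so', 'so', 'nowe'), ..., ('soe', 'soe', 'nowe')
exhaustive_combinations = list(itertools.product(*word_and_variant_lists))
print(len(exhaustive_combinations))  # 2 * 2 * 2 = 8 combinations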
         
Code Example #10
def proximity_pmi(e1_type, e2_type, queue, index, results):
    """
    sentences with tagged entities are indexed in whoosh
    perform the following query
    ent1 NEAR:X r NEAR:X ent2
    X is the maximum number of words between the query elements.
    """
    tokenize = re.compile(r'\w+(?:-\w+)+|<[A-Z]+>[^<]+</[A-Z]+>|\w+', re.U)
    entity = re.compile(r'<[A-Z]+>[^<]+</[A-Z]+>', re.U)
    idx = open_dir(index)
    count = 0
    distance = 9
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            count += 1
            if count % 50 == 0:
                print multiprocessing.current_process(
                ), "In Queue", queue.qsize(), "Total Matched: ", len(results)
            r = queue.get_nowait()
            n_1 = set()
            n_2 = set()
            n_3 = set()
            # if it's not in the database, calculate the proximity PMI
            if (r.ent1, r.ent2) not in all_in_freebase:
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3],
                                     slop=distance,
                                     ordered=True,
                                     mindist=1)
                hits_1 = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                if isinstance(r, ExtractedFact):
                    tokens_rel = re.findall(tokenize, r.patterns)

                elif isinstance(r, Relationship):
                    tokens_rel = re.findall(tokenize, r.between)

                token_terms = list()
                for t in tokens_rel:
                    if re.search(entity, t) is None:
                        token_terms.append(query.Term('sentence', t))

                l1 = [t for t in token_terms]
                l1.insert(0, t1)
                l2 = [t for t in token_terms]
                l2.append(t3)

                q2 = spans.SpanNear2(l1,
                                     slop=distance - 1,
                                     ordered=True,
                                     mindist=1)
                hits_2 = searcher.search(q2, limit=q_limit)

                q3 = spans.SpanNear2(l2,
                                     slop=distance - 1,
                                     ordered=True,
                                     mindist=1)
                hits_3 = searcher.search(q3, limit=q_limit)

                for d in hits_1:
                    n_1.add(d.get("sentence"))

                for d in hits_2:
                    n_2.add(d.get("sentence"))

                for d in hits_3:
                    n_3.add(d.get("sentence"))

                entities_occurr = len(hits_1)
                entities_occurr_with_r = len(
                    n_1.intersection(n_2).intersection(n_3))

                try:
                    assert not entities_occurr_with_r > entities_occurr
                except AssertionError, e:
                    print e
                    print r.sentence
                    print r.ent1
                    print r.ent2
                    print q1, len(hits_1)
                    print q2, len(hits_2)
                    print q3, len(hits_3)
                    print "intersection", len(
                        n_1.intersection(n_2).intersection(n_3))
                    sys.exit(0)

                if float(entities_occurr) > 0:
                    if float(entities_occurr
                             ) > 1 and entities_occurr_with_r > 1:
                        pmi = float(entities_occurr_with_r) / float(
                            entities_occurr)
                        if pmi > PMI:
                            """
                            # TODO: some of the things here are false, for example: 'chief'
                            if isinstance(r, ExtractedFact):
                                print r.ent1, '\t', r.patterns, '\t', r.ent2, pmi
                            elif isinstance(r, Relationship):
                                print r.ent1, '\t', r.between, '\t', r.ent2, pmi
                            """
                            results.append(r)

                if queue.empty() is True:
                    break
Code Example #11
def proximity_pmi_rel_word(e1_type, e2_type, queue, index, results, rel_words):
    """
    #TODO: proximity_pmi with relation specific given relational words
    :param e1_type:
    :param e2_type:
    :param queue:
    :param index:
    :param results:
    :param rel_word:
    :return:
    """
    """
    sentences with tagged entities are indexed in whoosh
    perform the following query
    ent1 NEAR:X r NEAR:X ent2
    X is the maximum number of words between the query elements.
    """
    idx = open_dir(index)
    count = 0
    distance = 9
    q_limit = 500
    with idx.searcher() as searcher:
        while True:
            count += 1
            if count % 50 == 0:
                print multiprocessing.current_process(
                ), "In Queue", queue.qsize(), "Total Matched: ", len(results)
            r = queue.get_nowait()
            if (r.ent1, r.ent2) not in all_in_freebase:
                # if it's not in the database, calculate the PMI
                entity1 = "<" + e1_type + ">" + r.ent1 + "</" + e1_type + ">"
                entity2 = "<" + e2_type + ">" + r.ent2 + "</" + e2_type + ">"
                t1 = query.Term('sentence', entity1)
                t3 = query.Term('sentence', entity2)

                # Entities proximity query without relational words
                q1 = spans.SpanNear2([t1, t3],
                                     slop=distance,
                                     ordered=True,
                                     mindist=1)
                hits = searcher.search(q1, limit=q_limit)

                # Entities proximity considering relational words
                # From the results above count how many contain a relational word
                hits_with_r = 0
                for s in hits:
                    sentence = s.get("sentence")
                    s = Sentence(sentence, e1_type, e2_type, MAX_TOKENS_AWAY,
                                 MIN_TOKENS_AWAY, CONTEXT_WINDOW)
                    for s_r in s.relationships:
                        if r.ent1.decode("utf8") == s_r.ent1 and r.ent2.decode(
                                "utf8") == s_r.ent2:
                            for rel in rel_words:
                                if rel in r.between:
                                    hits_with_r += 1
                                    break

                    if not len(hits) >= hits_with_r:
                        print "ERROR!"
                        print "hits", len(hits)
                        print "hits_with_r", hits_with_r
                        print entity1, '\t', entity2
                        print "\n"
                        sys.exit(0)

                if float(len(hits)) > 0:
                    pmi = float(hits_with_r) / float(len(hits))
                    if pmi > PMI:
                        results.append(r)
                        """
                        if isinstance(r, ExtractedFact):
                            print r.ent1, '\t', r.patterns, '\t', r.ent2, pmi
                        elif isinstance(r, Relationship):
                            print r.ent1, '\t', r.between, '\t', r.ent2, pmi
                        """
                if queue.empty() is True:
                    break