コード例 #1
0
def get_candidate_answers(question, text, W2vecextractor, q_verb, sgraphs, useWord2Vec=False, useVerb=True):
    """Score every sentence of ``text`` against the question and rank them.

    Exactly one scoring strategy is applied, selected by the two flags:

    * ``useWord2Vec`` and not ``useVerb``: cosine similarity between
      ``W2vecextractor.sent2vec(question)`` and ``sent2vec`` of each sentence.
    * ``useWord2Vec`` and ``useVerb``: cosine similarity between
      ``W2vecextractor.word2v(q_verb)`` and ``word2v`` of the ``'word'``
      entry that ``find_main`` returns for the sentence's graph.
    * otherwise (``useWord2Vec`` false): size of the intersection between
      the ``get_bow`` results for the question and for the sentence, both
      built with NLTK's English stop-word list.

    Args:
        question: The question text.
        text: The passage to search; split into sentences with
            ``nltk.sent_tokenize``.
        W2vecextractor: Object providing ``sent2vec`` and ``word2v``; only
            used when ``useWord2Vec`` is true.
        q_verb: Main verb of the question; only used in the verb strategy.
        sgraphs: Per-sentence graphs, indexed by sentence position, so it
            must line up with the output of ``nltk.sent_tokenize(text)``;
            only used in the verb strategy.
        useWord2Vec: Select an embedding-based strategy instead of word
            overlap.
        useVerb: With ``useWord2Vec``, compare main verbs rather than whole
            sentences.

    Returns:
        A list of ``(score, sentence_index, sentence)`` tuples sorted by
        score from highest to lowest. The list is empty when ``text``
        contains no sentences.
    """
    sentences = nltk.sent_tokenize(text)
    candidate_answers = []

    if useWord2Vec and not useVerb:
        q_feat = W2vecextractor.sent2vec(question)
        for i, sent in enumerate(sentences):
            a_feat = W2vecextractor.sent2vec(sent)
            # Similarity between the whole question and the whole sentence.
            dist = cosine_similarity(q_feat, a_feat)
            candidate_answers.append((dist, i, sent))

    # This must be ``elif``: with a plain ``if`` here, the ``else`` below
    # belongs only to this test, so the sentence-vector mode above would fall
    # through into the word-overlap branch and append a second set of scores
    # on a different scale to the same list.
    elif useWord2Vec and useVerb:
        q_feat = W2vecextractor.word2v(q_verb)
        for i, sent in enumerate(sentences):
            s_verb = find_main(sgraphs[i])['word']
            a_feat = W2vecextractor.word2v(s_verb)
            # Similarity between the main verbs of question and sentence.
            dist = cosine_similarity(q_feat, a_feat)
            candidate_answers.append((dist, i, sent))

    else:
        # The stop-word list is only needed by the word-overlap strategy.
        stopwords = set(nltk.corpus.stopwords.words("english"))
        qbow = get_bow(question, stopwords)
        for i, sent in enumerate(sentences):
            sbow = get_bow(sent, stopwords)
            # Number of words shared by the question and the sentence
            # (``&`` is set intersection).
            overlap = len(qbow & sbow)
            candidate_answers.append((overlap, i, sent))

    # sorted() is ascending by default, so reverse it to put the best score
    # first; the sort is stable, so tied sentences keep document order.
    return sorted(candidate_answers, key=lambda x: x[0], reverse=True)
コード例 #2
0
def baseline_word2vec_verb(question, sentences, stopwords, W2vecextractor, q_verb, sgraphs):
    """Return the sentence whose main verb is most similar to ``q_verb``.

    Each sentence is scored by the cosine similarity between
    ``W2vecextractor.word2v(q_verb)`` and ``word2v`` of the ``'word'`` entry
    that ``find_main`` returns for the matching entry of ``sgraphs``. The
    root words are printed as they are processed.

    ``question`` and ``stopwords`` are accepted but not used by this
    function. ``sgraphs`` is indexed by sentence position and must therefore
    be aligned with ``sentences``.

    Returns:
        The top-ranked sentence.

    Raises:
        IndexError: If ``sentences`` is empty.
    """
    question_vec = W2vecextractor.word2v(q_verb)
    print("ROOT of question: " + str(q_verb))

    scored = []
    for idx, sentence in enumerate(sentences):
        root_word = find_main(sgraphs[idx])['word']
        print("ROOT of sentence: " + str(root_word))
        sentence_vec = W2vecextractor.word2v(root_word)

        # Both vectors are wrapped in a list before the call, and only the
        # first row of the result is kept as the score.
        similarity = cosine_similarity([question_vec], [sentence_vec])
        scored.append((similarity[0], sentence))

    # Highest score first; the winning sentence is the second tuple field.
    scored.sort(key=operator.itemgetter(0), reverse=True)
    return scored[0][1]
コード例 #3
0
            qgraphs = read_dep_parses(fname + ".questions.dep")

            for j in range(0, len(questions)):
                qname = "{0}-{1}".format(fname, j + 1)
                if qname in questions:
                    print("QuestionID: " + qname)
                    question = questions[qname]['Question']
                    print(question)
                    qtypes = questions[qname]['Type']

                    # Get the question dep graph
                    qgraph = qgraphs[i]

                    # Get main verb in the question
                    q_verb = find_main(qgraph)['word']

                    answer = None
                    # qtypes can be "Story", "Sch", "Sch | Story"
                    for qt in qtypes.split("|"):
                        qt = qt.strip().lower()
                        # These are the text data where you can look for answers.
                        raw_text = data_dict[qt]
                        par_text = data_dict[qt + ".par"]
                        dep_text = data_dict[qt + ".dep"]

                        # get the applicable dep file for finding the answer
                        ans_dep_file = fname + "." + str(qt) + ".dep"

                        # get the dep graphs for all sentences in the answer file
                        sgraphs = read_dep_parses(ans_dep_file)