def test_on_train(path_model, n_combination_question = 3, n_combination_answer = 3, n_word_question = 5):
    """Run the word2vec answer picker over the training set and print accuracy.

    Reads 'data/training_set.tsv' (tab-separated, lower-cased on read). The
    first line is skipped (presumably a header). For every other row,
    column 1 is the question, column 2 the expected answer letter and
    columns 3+ the answer choices (only the first four are scored).

    For each row the n_word_question question words with the lowest count in
    util.load_d_word_count() are kept. Every index combination returned by
    util.combination_index() for those words, and for the unique words of each
    choice, is turned into a summed vector via get_vector_from_model(); the
    choice that produces the highest dot product is the prediction.

    Args:
        path_model: path handed to gensim.models.Word2Vec.load().
        n_combination_question: second argument of util.combination_index()
            for the question words.
        n_combination_answer: second argument of util.combination_index()
            for the choice words.
        n_word_question: how many of the rarest question words to keep.

    Returns:
        None. Per-row details, the running accuracy and the final accuracy
        are printed to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/training_set.tsv'
    # Rows are lower-cased on read, so the expected answer is lower-case too.
    letters = 'abcd'
    n_total = 0
    n_correct = 0
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if a row raises.
    with open(path_train) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Skip the first line *before* counting, so that n_total only
                # counts question rows and the accuracy is not deflated.
                continue
            n_total += 1
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            answer = lst[2]
            lst_choice = [l.split(' ') for l in lst[3:]]
            d = {}
            for word in question:
                word = word.strip('?').strip('.').strip(',').strip('!')
                # Words missing from the count table are treated as count 0.
                d[word] = d_word_count.get(word, 0)
            # Only consider the words with lowest frequency.
            sort = sorted(d.items(), key=lambda dd: dd[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u), n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            words_focus_question = []
            words_focus_choice = []
            for com_q in lst_com_q:
                vec_q = np.sum([get_vector_from_model(model, question_u[i]) for i in com_q], axis=0)
                # Slice instead of range(4) so a row with fewer than four
                # choices does not raise IndexError.
                for i_choice, choice in enumerate(lst_choice[:len(letters)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum([get_vector_from_model(model, choice_u[i]) for i in com_c], axis=0)
                        score = vec_q.dot(vec_c)
                        # Strict '>' keeps the first best-scoring pair on ties.
                        if score > best_score:
                            best_score = score
                            answer_p = letters[i_choice]
                            words_focus_question = [question_u[i] for i in com_q]
                            words_focus_choice = [choice_u[i] for i in com_c]

            if answer == answer_p:
                n_correct += 1
            print(' '.join(lst) + '\n' + "Focus Words in Question: " + ' '.join(words_focus_question) + '\n' + "Focus Words in Choice: " + ' '.join(words_focus_choice))
            print('Predicted Answer: ' + answer_p)
            print(str(n_correct) + ' / ' + str(n_total) + '\t' + str(n_correct * 1.0 / n_total))
    # Guard against a file with no question rows.
    print(n_correct * 1.0 / n_total if n_total else 0.0)
def test_on_validation(path_model, n_combination_question = 3, n_combination_answer = 3, n_word_question = 5):
    """Predict an answer letter for every validation row and print "id,letter".

    Reads 'data/validation_set.tsv' (tab-separated, lower-cased on read). The
    first line is skipped (presumably a header). For every other row,
    column 0 is echoed as the row id, column 1 is the question and columns 2+
    are the answer choices (only the first four are scored).

    The question words are normalised with util.norm_word() and the
    n_word_question words with the lowest count in util.load_d_word_count()
    are kept. Every index combination returned by util.combination_index()
    for those words, and for the unique words of each choice, is turned into a
    summed vector via get_vector_from_model(); the choice that produces the
    highest dot product is the prediction.

    Args:
        path_model: path handed to gensim.models.Word2Vec.load().
        n_combination_question: currently ignored, see the review note below.
        n_combination_answer: currently ignored, see the review note below.
        n_word_question: currently ignored, see the review note below.

    Returns:
        None. One "<id>,<letter>" line per row is printed to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/validation_set.tsv'
    # NOTE(review): these assignments overwrite whatever the caller passed, so
    # the three tuning parameters have no effect. They are kept as-is so that
    # existing output does not change; remove them (and pick matching
    # defaults) once the intended values are confirmed.
    n_combination_question = 4
    n_combination_answer = 3
    n_word_question = 5
    letters = 'ABCD'
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if a row raises.
    with open(path_train) as f_validation:
        for index, line in enumerate(f_validation):
            if index == 0:
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]
            d = {}
            for word in question:
                word = util.norm_word(word)
                # Words missing from the count table are treated as count 0.
                d[word] = d_word_count.get(word, 0)
            # Keep only the least frequent question words.
            sort = sorted(d.items(), key=lambda dd: dd[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u), n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            for com_q in lst_com_q:
                vec_q = np.sum([get_vector_from_model(model, question_u[i]) for i in com_q], axis=0)
                # Slice instead of range(4) so a row with fewer than four
                # choices does not raise IndexError.
                for i_choice, choice in enumerate(lst_choice[:len(letters)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum([get_vector_from_model(model, choice_u[i]) for i in com_c], axis=0)
                        score = vec_q.dot(vec_c)
                        # Strict '>' keeps the first best-scoring pair on ties.
                        if score > best_score:
                            best_score = score
                            answer_p = letters[i_choice]
            print("%s,%s" % (lst[0], answer_p))
Example #3
0
def get_max_occurence(lst_set_sentence, question, lst_choice, d_word_count,
                      set_stopword, n_word_question, n_combination_question,
                      n_combination_answer):
    """Pick the choice whose words score highest with the question's words.

    The question string is split on whitespace, each word is normalised with
    util.norm_word() and duplicates are dropped. Each choice string is treated
    the same way and additionally has the words in set_stopword removed.
    Every index combination returned by util.combination_index() for the
    question words is paired with every combination for each choice, and the
    pair is scored with get_cooccurence(). The best-scoring pair wins; on
    ties the first pair encountered is kept.

    Args:
        lst_set_sentence: passed through unchanged to get_cooccurence().
        question: the question as a single whitespace-separated string.
        lst_choice: iterable of choice strings; the first four map to the
            letters 'A'..'D'.
        d_word_count: unused. Accepted only to keep the signature stable.
        set_stopword: words removed from every choice before scoring.
        n_word_question: unused. Accepted only to keep the signature stable.
        n_combination_question: second argument of util.combination_index()
            for the question words.
        n_combination_answer: second argument of util.combination_index()
            for the choice words.

    Returns:
        A tuple (answer_p, MAX, lst_word_focus_q, lst_word_focus_c): the
        predicted letter ('' if nothing was scored), the best score (-1 if
        nothing was scored) and the question/choice words that produced it.
    """
    # NOTE(review): earlier code here ranked question words by d_word_count
    # and truncated to n_word_question, but it iterated over the *characters*
    # of `question` and its result was never used. It has been removed; wire a
    # word-level version in deliberately if that filtering is wanted.
    answer_p = ''
    MAX = -1
    lst_word_focus_q = []
    lst_word_focus_c = []
    letters = 'ABCD'
    lst_word_question_u = list(set(map(util.norm_word, question.split())))
    # The filtered choice words do not depend on the question combination, so
    # compute them once instead of once per combination.
    # Stop words are filtered so they do not contribute to the co-occurrence.
    lst_lst_word_choice_u = [
        list(set(map(util.norm_word, choice.split())).difference(set_stopword))
        for choice in lst_choice
    ]
    lst_com_q = util.combination_index(len(lst_word_question_u),
                                       n_combination_question)
    for com_q in lst_com_q:
        lst_word_question = [lst_word_question_u[i] for i in com_q]
        for index_c, lst_word_choice_u in enumerate(lst_lst_word_choice_u):
            lst_com_c = util.combination_index(len(lst_word_choice_u),
                                               n_combination_answer)
            for com_c in lst_com_c:
                lst_word_choice = [lst_word_choice_u[i] for i in com_c]
                n_cooccurence = get_cooccurence(lst_set_sentence,
                                                lst_word_question,
                                                lst_word_choice)
                if n_cooccurence > MAX:
                    MAX = n_cooccurence
                    lst_word_focus_q = lst_word_question
                    lst_word_focus_c = lst_word_choice
                    # Choices past the fourth have no letter; the previous
                    # letter is left in place for them.
                    if index_c < len(letters):
                        answer_p = letters[index_c]

    return answer_p, MAX, lst_word_focus_q, lst_word_focus_c
def get_max_occurence(lst_set_sentence, question, lst_choice, d_word_count, set_stopword, n_word_question, n_combination_question, n_combination_answer):
    """Pick the choice whose words score highest with the question's words.

    The question string is split on whitespace, each word is normalised with
    util.norm_word() and duplicates are dropped. Each choice string is treated
    the same way and additionally has the words in set_stopword removed.
    Every index combination returned by util.combination_index() for the
    question words is paired with every combination for each choice, and the
    pair is scored with get_cooccurence(). The best-scoring pair wins; on
    ties the first pair encountered is kept.

    Args:
        lst_set_sentence: passed through unchanged to get_cooccurence().
        question: the question as a single whitespace-separated string.
        lst_choice: iterable of choice strings; the first four map to the
            letters 'A'..'D'.
        d_word_count: unused. Accepted only to keep the signature stable.
        set_stopword: words removed from every choice before scoring.
        n_word_question: unused. Accepted only to keep the signature stable.
        n_combination_question: second argument of util.combination_index()
            for the question words.
        n_combination_answer: second argument of util.combination_index()
            for the choice words.

    Returns:
        A tuple (answer_p, MAX, lst_word_focus_q, lst_word_focus_c): the
        predicted letter ('' if nothing was scored), the best score (-1 if
        nothing was scored) and the question/choice words that produced it.
    """
    # NOTE(review): earlier code here ranked question words by d_word_count
    # and truncated to n_word_question, but it iterated over the *characters*
    # of `question` and its result was never used. It has been removed; wire a
    # word-level version in deliberately if that filtering is wanted.
    answer_p = ''
    MAX = -1
    lst_word_focus_q = []
    lst_word_focus_c = []
    letters = 'ABCD'
    lst_word_question_u = list(set(map(util.norm_word, question.split())))
    # The filtered choice words do not depend on the question combination, so
    # compute them once instead of once per combination.
    # Stop words are filtered so they do not contribute to the co-occurrence.
    lst_lst_word_choice_u = [
        list(set(map(util.norm_word, choice.split())).difference(set_stopword))
        for choice in lst_choice
    ]
    lst_com_q = util.combination_index(len(lst_word_question_u), n_combination_question)
    for com_q in lst_com_q:
        lst_word_question = [lst_word_question_u[i] for i in com_q]
        for index_c, lst_word_choice_u in enumerate(lst_lst_word_choice_u):
            lst_com_c = util.combination_index(len(lst_word_choice_u), n_combination_answer)
            for com_c in lst_com_c:
                lst_word_choice = [lst_word_choice_u[i] for i in com_c]
                n_cooccurence = get_cooccurence(lst_set_sentence, lst_word_question, lst_word_choice)
                if n_cooccurence > MAX:
                    MAX = n_cooccurence
                    lst_word_focus_q = lst_word_question
                    lst_word_focus_c = lst_word_choice
                    # Choices past the fourth have no letter; the previous
                    # letter is left in place for them.
                    if index_c < len(letters):
                        answer_p = letters[index_c]

    return answer_p, MAX, lst_word_focus_q, lst_word_focus_c
Example #5
0
def test_on_train(path_model,
                  n_combination_question=3,
                  n_combination_answer=3,
                  n_word_question=5):
    """Run the word2vec answer picker over the training set and print accuracy.

    Reads 'data/training_set.tsv' (tab-separated, lower-cased on read). The
    first line is skipped (presumably a header). For every other row,
    column 1 is the question, column 2 the expected answer letter and
    columns 3+ the answer choices (only the first four are scored).

    For each row the n_word_question question words with the lowest count in
    util.load_d_word_count() are kept. Every index combination returned by
    util.combination_index() for those words, and for the unique words of each
    choice, is turned into a summed vector via get_vector_from_model(); the
    choice that produces the highest dot product is the prediction.

    Args:
        path_model: path handed to gensim.models.Word2Vec.load().
        n_combination_question: second argument of util.combination_index()
            for the question words.
        n_combination_answer: second argument of util.combination_index()
            for the choice words.
        n_word_question: how many of the rarest question words to keep.

    Returns:
        None. Per-row details, the running accuracy and the final accuracy
        are printed to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/training_set.tsv'
    # Rows are lower-cased on read, so the expected answer is lower-case too.
    letters = 'abcd'
    n_total = 0
    n_correct = 0
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if a row raises.
    with open(path_train) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Skip the first line *before* counting, so that n_total only
                # counts question rows and the accuracy is not deflated.
                continue
            n_total += 1
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            answer = lst[2]
            lst_choice = [l.split(' ') for l in lst[3:]]
            d = {}
            for word in question:
                word = word.strip('?').strip('.').strip(',').strip('!')
                # Words missing from the count table are treated as count 0.
                d[word] = d_word_count.get(word, 0)
            # Only consider the words with lowest frequency.
            sort = sorted(d.items(), key=lambda dd: dd[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u),
                                               n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            words_focus_question = []
            words_focus_choice = []
            for com_q in lst_com_q:
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q],
                    axis=0)
                # Slice instead of range(4) so a row with fewer than four
                # choices does not raise IndexError.
                for i_choice, choice in enumerate(lst_choice[:len(letters)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(
                        len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum(
                            [get_vector_from_model(model, choice_u[i])
                             for i in com_c],
                            axis=0)
                        score = vec_q.dot(vec_c)
                        # Strict '>' keeps the first best-scoring pair on ties.
                        if score > best_score:
                            best_score = score
                            answer_p = letters[i_choice]
                            words_focus_question = [
                                question_u[i] for i in com_q
                            ]
                            words_focus_choice = [choice_u[i] for i in com_c]

            if answer == answer_p:
                n_correct += 1
            print(' '.join(lst) + '\n' + "Focus Words in Question: " +
                  ' '.join(words_focus_question) + '\n' +
                  "Focus Words in Choice: " + ' '.join(words_focus_choice))
            print('Predicted Answer: ' + answer_p)
            print(str(n_correct) + ' / ' + str(n_total) + '\t' +
                  str(n_correct * 1.0 / n_total))
    # Guard against a file with no question rows.
    print(n_correct * 1.0 / n_total if n_total else 0.0)
Example #6
0
def test_on_validation(path_model,
                       n_combination_question=3,
                       n_combination_answer=3,
                       n_word_question=5):
    """Predict an answer letter for every validation row and print "id,letter".

    Reads 'data/validation_set.tsv' (tab-separated, lower-cased on read). The
    first line is skipped (presumably a header). For every other row,
    column 0 is echoed as the row id, column 1 is the question and columns 2+
    are the answer choices (only the first four are scored).

    The question words are normalised with util.norm_word() and the
    n_word_question words with the lowest count in util.load_d_word_count()
    are kept. Every index combination returned by util.combination_index()
    for those words, and for the unique words of each choice, is turned into a
    summed vector via get_vector_from_model(); the choice that produces the
    highest dot product is the prediction.

    Args:
        path_model: path handed to gensim.models.Word2Vec.load().
        n_combination_question: currently ignored, see the review note below.
        n_combination_answer: currently ignored, see the review note below.
        n_word_question: currently ignored, see the review note below.

    Returns:
        None. One "<id>,<letter>" line per row is printed to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/validation_set.tsv'
    # NOTE(review): these assignments overwrite whatever the caller passed, so
    # the three tuning parameters have no effect. They are kept as-is so that
    # existing output does not change; remove them (and pick matching
    # defaults) once the intended values are confirmed.
    n_combination_question = 4
    n_combination_answer = 3
    n_word_question = 5
    letters = 'ABCD'
    d_word_count = util.load_d_word_count()
    # The context manager closes the file even if a row raises.
    with open(path_train) as f_validation:
        for index, line in enumerate(f_validation):
            if index == 0:
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]
            d = {}
            for word in question:
                word = util.norm_word(word)
                # Words missing from the count table are treated as count 0.
                d[word] = d_word_count.get(word, 0)
            # Keep only the least frequent question words.
            sort = sorted(d.items(), key=lambda dd: dd[1])
            question_u = [s[0] for s in sort[:n_word_question]]
            lst_com_q = util.combination_index(len(question_u),
                                               n_combination_question)
            best_score = float('-inf')
            answer_p = ''
            for com_q in lst_com_q:
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q],
                    axis=0)
                # Slice instead of range(4) so a row with fewer than four
                # choices does not raise IndexError.
                for i_choice, choice in enumerate(lst_choice[:len(letters)]):
                    choice_u = list(set(choice))
                    lst_com_choice = util.combination_index(
                        len(choice_u), n_combination_answer)
                    for com_c in lst_com_choice:
                        vec_c = np.sum(
                            [get_vector_from_model(model, choice_u[i])
                             for i in com_c],
                            axis=0)
                        score = vec_q.dot(vec_c)
                        # Strict '>' keeps the first best-scoring pair on ties.
                        if score > best_score:
                            best_score = score
                            answer_p = letters[i_choice]
            print("%s,%s" % (lst[0], answer_p))