import gensim
import numpy as np

import util  # project-local helpers: load_d_word_count, combination_index, norm_word

# Note: get_vector_from_model and get_cooccurence are defined elsewhere in this module.


def test_on_train(path_model, n_combination_question=3, n_combination_answer=3, n_word_question=5):
    """Answer the labelled training questions with the word2vec model and report accuracy."""
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/training_set.tsv'
    n_total = 0
    n_correct = 0
    # Stop-word list kept from an earlier filtering pass; unused in the frequency-based
    # selection below.
    set_stopword = ('a', 'an', 'the', 'are', 'is', 'that', 'this', 'will',
                    'is', 'are', 'was', 'were', 'of', 'for')
    d_word_count = util.load_d_word_count()
    for index, line in enumerate(open(path_train)):
        if index == 0:
            continue  # skip the header row
        n_total += 1
        lst = line.lower().strip('\n').split('\t')
        question = lst[1].split(' ')
        answer = lst[2]
        lst_choice = [l.split(' ') for l in lst[3:]]
        # Rank the question words by corpus frequency; unseen words count as 0.
        d = {}
        for word in question:
            word = word.strip('?').strip('.').strip(',').strip('!')
            d[word] = d_word_count.get(word, 0)
        # Only consider the words with the lowest frequency.
        sort = sorted(d.iteritems(), key=lambda dd: dd[1])
        question_u = [s[0] for s in sort[:n_word_question]]
        lst_com_q = util.combination_index(len(question_u), n_combination_question)
        max_score = -1000000
        answer_p = ''
        words_focus_question = []
        words_focus_choice = []
        # Score every (question-word combination, choice-word combination) pair by the
        # dot product of the summed word vectors and keep the best-scoring choice.
        for com_q in lst_com_q:
            vec_q = np.sum([get_vector_from_model(model, question_u[i]) for i in com_q], axis=0)
            for i_choice in range(4):
                choice_u = list(set(lst_choice[i_choice]))
                lst_com_choice = util.combination_index(len(choice_u), n_combination_answer)
                for com_c in lst_com_choice:
                    vec_c = np.sum([get_vector_from_model(model, choice_u[i]) for i in com_c], axis=0)
                    score = vec_q.dot(vec_c)
                    if score > max_score:
                        max_score = score
                        words_focus_question = [question_u[i] for i in com_q]
                        words_focus_choice = [choice_u[i] for i in com_c]
                        answer_p = 'abcd'[i_choice]
        if answer == answer_p:
            n_correct += 1
        print ' '.join(lst) + '\n' \
            + 'Focus Words in Question: ' + ' '.join(words_focus_question) + '\n' \
            + 'Focus Words in Choice: ' + ' '.join(words_focus_choice)
        print 'Predicted Answer: ' + answer_p
        print str(n_correct) + ' / ' + str(n_total) + '\t' + str(n_correct * 1.0 / n_total)
    print n_correct * 1.0 / n_total
def test_on_validation(path_model, n_combination_question=3, n_combination_answer=3, n_word_question=5):
    """Answer the unlabelled validation questions and print one "id,answer" line per question."""
    model = gensim.models.Word2Vec.load(path_model)
    path_validation = 'data/validation_set.tsv'
    # These hard-coded values override the keyword arguments for the validation run.
    n_combination_question = 4
    n_combination_answer = 3
    n_word_question = 5
    d_word_count = util.load_d_word_count()
    for index, line in enumerate(open(path_validation)):
        if index == 0:
            continue  # skip the header row
        lst = line.lower().strip('\n').split('\t')
        question = lst[1].split(' ')
        lst_choice = [l.split(' ') for l in lst[2:]]
        # Rank the question words by corpus frequency; unseen words count as 0.
        d = {}
        for word in question:
            word = util.norm_word(word)
            d[word] = d_word_count.get(word, 0)
        sort = sorted(d.iteritems(), key=lambda dd: dd[1])
        question_u = [s[0] for s in sort[:n_word_question]]
        lst_com_q = util.combination_index(len(question_u), n_combination_question)
        max_score = -1000000
        answer_p = ''
        # Score every (question-word combination, choice-word combination) pair and
        # keep the best-scoring choice.
        for com_q in lst_com_q:
            vec_q = np.sum([get_vector_from_model(model, question_u[i]) for i in com_q], axis=0)
            for i_choice in range(4):
                choice_u = list(set(lst_choice[i_choice]))
                lst_com_choice = util.combination_index(len(choice_u), n_combination_answer)
                for com_c in lst_com_choice:
                    vec_c = np.sum([get_vector_from_model(model, choice_u[i]) for i in com_c], axis=0)
                    score = vec_q.dot(vec_c)
                    if score > max_score:
                        max_score = score
                        answer_p = 'ABCD'[i_choice]
        print "%s,%s" % (lst[0], answer_p)
def get_max_occurence(lst_set_sentence, question, lst_choice, d_word_count, set_stopword,
                      n_word_question, n_combination_question, n_combination_answer):
    """Pick the choice whose word combinations co-occur most often with the question words.

    Returns (predicted answer letter, max co-occurrence count, focus question words,
    focus choice words).
    """
    answer_p = ''
    max_cooccurence = -1
    lst_word_focus_q = []
    lst_word_focus_c = []
    lst_word_question_u = list(set(map(util.norm_word, question.split())))
    # Rank the question words by corpus frequency (rarest first). Note that the
    # resulting question_u list is not used below: the combinations are built from
    # all normalised question words in lst_word_question_u.
    d = {}
    for word in question.split():
        word = util.norm_word(word)
        d[word] = d_word_count.get(word, 0)
    sort = sorted(d.iteritems(), key=lambda dd: dd[1])
    question_u = [s[0] for s in sort[:n_word_question]]
    lst_com_q = util.combination_index(len(lst_word_question_u), n_combination_question)
    for com_q in lst_com_q:
        lst_word_question = [lst_word_question_u[i] for i in com_q]
        for index_c, choice in enumerate(lst_choice):
            # Filter stop words out of the choice so they do not contribute to the
            # co-occurrence count.
            lst_word_choice_u = list(set(map(util.norm_word, choice.split())).difference(set_stopword))
            lst_com_c = util.combination_index(len(lst_word_choice_u), n_combination_answer)
            for com_c in lst_com_c:
                lst_word_choice = [lst_word_choice_u[i] for i in com_c]
                n_cooccurence = get_cooccurence(lst_set_sentence, lst_word_question, lst_word_choice)
                if n_cooccurence > max_cooccurence:
                    max_cooccurence = n_cooccurence
                    lst_word_focus_q = lst_word_question
                    lst_word_focus_c = lst_word_choice
                    answer_p = 'ABCD'[index_c]
    return answer_p, max_cooccurence, lst_word_focus_q, lst_word_focus_c
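# Illustrative call to get_max_occurence (not part of the original file). The inputs
# below are made up for the sketch: lst_set_sentence is assumed to be one set of
# normalised words per corpus sentence, and get_cooccurence is assumed to count how
# often the question-word and choice-word groups appear in the same sentence.
def _example_get_max_occurence():
    lst_set_sentence = [set(['plants', 'use', 'sunlight', 'to', 'make', 'food']),
                        set(['photosynthesis', 'happens', 'in', 'plants'])]
    question = 'which process do plants use to make food from sunlight'
    lst_choice = ['photosynthesis', 'respiration', 'digestion', 'evaporation']
    set_stopword = ('a', 'an', 'the', 'of', 'for', 'to', 'do')
    d_word_count = util.load_d_word_count()
    answer_p, n_max, focus_q, focus_c = get_max_occurence(
        lst_set_sentence, question, lst_choice, d_word_count, set_stopword,
        n_word_question=5, n_combination_question=2, n_combination_answer=1)
    print 'Predicted: %s (co-occurrence %d)' % (answer_p, n_max)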
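# Minimal entry-point sketch (not part of the original script). It assumes a trained
# Word2Vec model has been saved at 'model/word2vec.model' (hypothetical path) and that
# the data/*.tsv files referenced above are in place.
if __name__ == '__main__':
    path_model = 'model/word2vec.model'
    # Report running accuracy and focus words on the labelled training set.
    test_on_train(path_model, n_combination_question=3, n_combination_answer=3, n_word_question=5)
    # Print "id,answer" predictions for the unlabelled validation set.
    test_on_validation(path_model)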