def search_train():  # Too Slow
    """Score every training question with get_max_occurence and print accuracy.

    Loads the corpus file into a list of normalized word sets (one per line,
    keeping only lines with at least five distinct words), then walks
    'data/training_set.tsv', skipping the header row.  For each question the
    predicted answer returned by get_max_occurence is compared with the
    labelled answer, and the question, both answers and the running accuracy
    are printed.

    Takes no arguments and returns nothing; all output goes to stdout.
    """
    # A tuple (with duplicates) despite the name; handed to
    # get_max_occurence unchanged.
    set_stopword = ('a', 'an', 'the', 'are', 'is', 'that', 'this', 'will',
                    'is', 'are', 'was', 'were', 'of', 'for')
    path = 'data/training_set.tsv'
    path_data = ('data/wikipedia_content_based_on_ck_12_keyword_v1/'
                 'wikipedia_content_based_on_ck_12_keyword_v1.txt')
    d_word_count = util.load_d_word_count()
    n_word_question = 5
    n_combination_question = 3
    n_combination_answer = 3

    print('Begin load all sentences')
    lst_set_sentence = []
    # Context managers make sure both files are closed, even on error.
    with open(path_data) as f_data:
        for line in f_data:
            set_sentence = set(map(util.norm_word, line.strip('\n').split()))
            if len(set_sentence) >= 5:
                lst_set_sentence.append(set_sentence)
    print('End load all sentences')
    print(len(lst_set_sentence))

    n_correct = 0
    with open(path) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Header row.
                continue
            lst = line.strip('\n').split('\t')
            question = lst[1]
            answer = lst[2]
            lst_choice = lst[3:]
            # Only the predicted answer is needed; the other three returned
            # values are ignored.
            answer_p, _, _, _ = get_max_occurence(
                lst_set_sentence, question, lst_choice, d_word_count,
                set_stopword, n_word_question, n_combination_question,
                n_combination_answer)
            if answer_p == answer:
                n_correct += 1
            # The header occupies index 0, so `index` itself is the number of
            # questions processed so far (previously index + 1 was used,
            # which counted the header as a question).
            n_total = index
            print(' '.join(lst))
            print('Answer: ' + answer)
            print('Answer_p:  %s' % (answer_p,))
            print('%s %s %s' % (n_correct, n_total,
                                n_correct * 1.0 / n_total))
def search_train():  # Too Slow
    """Score every training question with get_max_occurence and print accuracy.

    Loads the corpus file into a list of normalized word sets (one per line,
    keeping only lines with at least five distinct words), then walks
    'data/training_set.tsv', skipping the header row.  For each question the
    predicted answer returned by get_max_occurence is compared with the
    labelled answer, and the question, both answers and the running accuracy
    are printed.

    Takes no arguments and returns nothing; all output goes to stdout.
    """
    # A tuple (with duplicates) despite the name; handed to
    # get_max_occurence unchanged.
    set_stopword = ('a', 'an', 'the', 'are', 'is', 'that', 'this', 'will',
                    'is', 'are', 'was', 'were', 'of', 'for')
    path = 'data/training_set.tsv'
    path_data = ('data/wikipedia_content_based_on_ck_12_keyword_v1/'
                 'wikipedia_content_based_on_ck_12_keyword_v1.txt')
    d_word_count = util.load_d_word_count()
    n_word_question = 5
    n_combination_question = 3
    n_combination_answer = 3

    print('Begin load all sentences')
    lst_set_sentence = []
    # Context managers make sure both files are closed, even on error.
    with open(path_data) as f_data:
        for line in f_data:
            set_sentence = set(map(util.norm_word, line.strip('\n').split()))
            if len(set_sentence) >= 5:
                lst_set_sentence.append(set_sentence)
    print('End load all sentences')
    print(len(lst_set_sentence))

    n_correct = 0
    with open(path) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Header row.
                continue
            lst = line.strip('\n').split('\t')
            question = lst[1]
            answer = lst[2]
            lst_choice = lst[3:]
            # Only the predicted answer is needed; the other three returned
            # values are ignored.
            answer_p, _, _, _ = get_max_occurence(
                lst_set_sentence, question, lst_choice, d_word_count,
                set_stopword, n_word_question, n_combination_question,
                n_combination_answer)
            if answer_p == answer:
                n_correct += 1
            # The header occupies index 0, so `index` itself is the number of
            # questions processed so far (previously index + 1 was used,
            # which counted the header as a question).
            n_total = index
            print(' '.join(lst))
            print('Answer: ' + answer)
            print('Answer_p:  %s' % (answer_p,))
            print('%s %s %s' % (n_correct, n_total,
                                n_correct * 1.0 / n_total))
def test_on_train(path_model, n_combination_question=3,
                  n_combination_answer=3, n_word_question=5):
    """Predict answers for the training set with a Word2Vec model.

    For every row of 'data/training_set.tsv' (header skipped, text
    lower-cased) the question words are ranked by their count in
    util.load_d_word_count() and the ``n_word_question`` lowest-count words
    are kept.  Every index combination produced by util.combination_index
    over those words is summed into a vector and compared, by dot product,
    with the summed vector of every word combination of each choice.  The
    choice owning the highest-scoring pair is the prediction.

    Args:
        path_model: path passed to gensim.models.Word2Vec.load.
        n_combination_question: second argument to util.combination_index
            for the question words.
        n_combination_answer: second argument to util.combination_index
            for the words of each choice.
        n_word_question: number of lowest-count question words kept.

    Prints each question with its focus words, the prediction and the
    running accuracy, then the final accuracy.  Returns nothing.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/training_set.tsv'
    d_word_count = util.load_d_word_count()
    # Labels are lower case because each line is lower-cased before parsing.
    choice_labels = 'abcd'
    n_total = 0
    n_correct = 0

    with open(path_train) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Header row: not a question, so it must not be counted.
                continue
            n_total += 1
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            answer = lst[2]
            lst_choice = [l.split(' ') for l in lst[3:]]

            d = {}
            for word in question:
                # The strips run one after another, in this order.
                word = word.strip('?').strip('.').strip(',').strip('!')
                d[word] = d_word_count.get(word, 0)
            # Only consider the words with the lowest frequency.
            question_u = sorted(d, key=d.get)[:n_word_question]

            # The choice-side vectors do not depend on the question
            # combination, so build them once per row.
            # NOTE(review): assumes get_vector_from_model returns the same
            # vector for the same word on every call -- confirm.
            # zip() limits scoring to the first four choices and tolerates
            # rows that have fewer.
            candidates = []
            for label, choice in zip(choice_labels, lst_choice):
                choice_u = list(set(choice))
                for com_c in util.combination_index(len(choice_u),
                                                    n_combination_answer):
                    vec_c = np.sum(
                        [get_vector_from_model(model, choice_u[i])
                         for i in com_c], axis=0)
                    candidates.append(
                        (label, [choice_u[i] for i in com_c], vec_c))

            best_score = -1000000
            answer_p = ''
            words_focus_question = []
            words_focus_choice = []
            for com_q in util.combination_index(len(question_u),
                                                n_combination_question):
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q], axis=0)
                for label, words_c, vec_c in candidates:
                    score = vec_q.dot(vec_c)
                    # Strict '>' keeps the first best pair on ties.
                    if score > best_score:
                        best_score = score
                        answer_p = label
                        words_focus_question = [question_u[i] for i in com_q]
                        words_focus_choice = words_c

            if answer == answer_p:
                n_correct += 1
            print(' '.join(lst) + '\n' + "Focus Words in Question: "
                  + ' '.join(words_focus_question) + '\n'
                  + "Focus Words in Choice: " + ' '.join(words_focus_choice))
            print('Predicted Answer: ' + answer_p)
            print(str(n_correct) + ' / ' + str(n_total) + '\t'
                  + str(n_correct * 1.0 / n_total))

    # Guard against a file with no question rows.
    print(n_correct * 1.0 / n_total if n_total else 0.0)
def test_on_validation(path_model, n_combination_question=3,
                       n_combination_answer=3, n_word_question=5):
    """Print 'id,answer' predictions for the validation set.

    For every row of 'data/validation_set.tsv' (header skipped, text
    lower-cased) the question words are normalized with util.norm_word,
    ranked by their count in util.load_d_word_count(), and the lowest-count
    words are kept.  Summed vectors of question-word combinations are
    compared, by dot product, with summed vectors of word combinations of
    each choice; the choice owning the best pair is printed as 'A'-'D'
    after the row id.

    Args:
        path_model: path passed to gensim.models.Word2Vec.load.
        n_combination_question: currently ignored, see the note below.
        n_combination_answer: currently ignored, see the note below.
        n_word_question: currently ignored, see the note below.

    Returns nothing; one line per question is written to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_validation = 'data/validation_set.tsv'
    # NOTE(review): these assignments overwrite the keyword arguments, so
    # values passed by callers have no effect.  They are kept because
    # honouring the arguments would change the effective default of
    # n_combination_question from 4 to 3 and therefore the predictions.
    n_combination_question = 4
    n_combination_answer = 3
    n_word_question = 5
    d_word_count = util.load_d_word_count()
    choice_labels = 'ABCD'

    with open(path_validation) as f_validation:
        for index, line in enumerate(f_validation):
            if index == 0:
                # Header row.
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]

            d = {}
            for word in question:
                word = util.norm_word(word)
                d[word] = d_word_count.get(word, 0)
            # Keep the words with the lowest counts.
            question_u = sorted(d, key=d.get)[:n_word_question]

            # The choice-side vectors do not depend on the question
            # combination, so build them once per row.
            # NOTE(review): assumes get_vector_from_model returns the same
            # vector for the same word on every call -- confirm.
            # zip() limits scoring to the first four choices and tolerates
            # rows that have fewer.
            candidates = []
            for label, choice in zip(choice_labels, lst_choice):
                choice_u = list(set(choice))
                for com_c in util.combination_index(len(choice_u),
                                                    n_combination_answer):
                    vec_c = np.sum(
                        [get_vector_from_model(model, choice_u[i])
                         for i in com_c], axis=0)
                    candidates.append((label, vec_c))

            best_score = -1000000
            answer_p = ''
            for com_q in util.combination_index(len(question_u),
                                                n_combination_question):
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q], axis=0)
                for label, vec_c in candidates:
                    score = vec_q.dot(vec_c)
                    # Strict '>' keeps the first best pair on ties.
                    if score > best_score:
                        best_score = score
                        answer_p = label

            print("%s,%s" % (lst[0], answer_p))
def test_on_train(path_model,
                  n_combination_question=3,
                  n_combination_answer=3,
                  n_word_question=5):
    """Predict answers for the training set with a Word2Vec model.

    For every row of 'data/training_set.tsv' (header skipped, text
    lower-cased) the question words are ranked by their count in
    util.load_d_word_count() and the ``n_word_question`` lowest-count words
    are kept.  Every index combination produced by util.combination_index
    over those words is summed into a vector and compared, by dot product,
    with the summed vector of every word combination of each choice.  The
    choice owning the highest-scoring pair is the prediction.

    Args:
        path_model: path passed to gensim.models.Word2Vec.load.
        n_combination_question: second argument to util.combination_index
            for the question words.
        n_combination_answer: second argument to util.combination_index
            for the words of each choice.
        n_word_question: number of lowest-count question words kept.

    Prints each question with its focus words, the prediction and the
    running accuracy, then the final accuracy.  Returns nothing.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_train = 'data/training_set.tsv'
    d_word_count = util.load_d_word_count()
    # Labels are lower case because each line is lower-cased before parsing.
    choice_labels = 'abcd'
    n_total = 0
    n_correct = 0

    with open(path_train) as f_train:
        for index, line in enumerate(f_train):
            if index == 0:
                # Header row: not a question, so it must not be counted.
                continue
            n_total += 1
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            answer = lst[2]
            lst_choice = [l.split(' ') for l in lst[3:]]

            d = {}
            for word in question:
                # The strips run one after another, in this order.
                word = word.strip('?').strip('.').strip(',').strip('!')
                d[word] = d_word_count.get(word, 0)
            # Only consider the words with the lowest frequency.
            question_u = sorted(d, key=d.get)[:n_word_question]

            # The choice-side vectors do not depend on the question
            # combination, so build them once per row.
            # NOTE(review): assumes get_vector_from_model returns the same
            # vector for the same word on every call -- confirm.
            # zip() limits scoring to the first four choices and tolerates
            # rows that have fewer.
            candidates = []
            for label, choice in zip(choice_labels, lst_choice):
                choice_u = list(set(choice))
                for com_c in util.combination_index(len(choice_u),
                                                    n_combination_answer):
                    vec_c = np.sum(
                        [get_vector_from_model(model, choice_u[i])
                         for i in com_c], axis=0)
                    candidates.append(
                        (label, [choice_u[i] for i in com_c], vec_c))

            best_score = -1000000
            answer_p = ''
            words_focus_question = []
            words_focus_choice = []
            for com_q in util.combination_index(len(question_u),
                                                n_combination_question):
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q], axis=0)
                for label, words_c, vec_c in candidates:
                    score = vec_q.dot(vec_c)
                    # Strict '>' keeps the first best pair on ties.
                    if score > best_score:
                        best_score = score
                        answer_p = label
                        words_focus_question = [question_u[i] for i in com_q]
                        words_focus_choice = words_c

            if answer == answer_p:
                n_correct += 1
            print(' '.join(lst) + '\n' + "Focus Words in Question: "
                  + ' '.join(words_focus_question) + '\n'
                  + "Focus Words in Choice: " + ' '.join(words_focus_choice))
            print('Predicted Answer: ' + answer_p)
            print(str(n_correct) + ' / ' + str(n_total) + '\t'
                  + str(n_correct * 1.0 / n_total))

    # Guard against a file with no question rows.
    print(n_correct * 1.0 / n_total if n_total else 0.0)
def test_on_validation(path_model,
                       n_combination_question=3,
                       n_combination_answer=3,
                       n_word_question=5):
    """Print 'id,answer' predictions for the validation set.

    For every row of 'data/validation_set.tsv' (header skipped, text
    lower-cased) the question words are normalized with util.norm_word,
    ranked by their count in util.load_d_word_count(), and the lowest-count
    words are kept.  Summed vectors of question-word combinations are
    compared, by dot product, with summed vectors of word combinations of
    each choice; the choice owning the best pair is printed as 'A'-'D'
    after the row id.

    Args:
        path_model: path passed to gensim.models.Word2Vec.load.
        n_combination_question: currently ignored, see the note below.
        n_combination_answer: currently ignored, see the note below.
        n_word_question: currently ignored, see the note below.

    Returns nothing; one line per question is written to stdout.
    """
    model = gensim.models.Word2Vec.load(path_model)
    path_validation = 'data/validation_set.tsv'
    # NOTE(review): these assignments overwrite the keyword arguments, so
    # values passed by callers have no effect.  They are kept because
    # honouring the arguments would change the effective default of
    # n_combination_question from 4 to 3 and therefore the predictions.
    n_combination_question = 4
    n_combination_answer = 3
    n_word_question = 5
    d_word_count = util.load_d_word_count()
    choice_labels = 'ABCD'

    with open(path_validation) as f_validation:
        for index, line in enumerate(f_validation):
            if index == 0:
                # Header row.
                continue
            lst = line.lower().strip('\n').split('\t')
            question = lst[1].split(' ')
            lst_choice = [l.split(' ') for l in lst[2:]]

            d = {}
            for word in question:
                word = util.norm_word(word)
                d[word] = d_word_count.get(word, 0)
            # Keep the words with the lowest counts.
            question_u = sorted(d, key=d.get)[:n_word_question]

            # The choice-side vectors do not depend on the question
            # combination, so build them once per row.
            # NOTE(review): assumes get_vector_from_model returns the same
            # vector for the same word on every call -- confirm.
            # zip() limits scoring to the first four choices and tolerates
            # rows that have fewer.
            candidates = []
            for label, choice in zip(choice_labels, lst_choice):
                choice_u = list(set(choice))
                for com_c in util.combination_index(len(choice_u),
                                                    n_combination_answer):
                    vec_c = np.sum(
                        [get_vector_from_model(model, choice_u[i])
                         for i in com_c], axis=0)
                    candidates.append((label, vec_c))

            best_score = -1000000
            answer_p = ''
            for com_q in util.combination_index(len(question_u),
                                                n_combination_question):
                vec_q = np.sum(
                    [get_vector_from_model(model, question_u[i])
                     for i in com_q], axis=0)
                for label, vec_c in candidates:
                    score = vec_q.dot(vec_c)
                    # Strict '>' keeps the first best pair on ties.
                    if score > best_score:
                        best_score = score
                        answer_p = label

            print("%s,%s" % (lst[0], answer_p))