def load_questions(args):
    """
    Run ``gen_pos`` over every test question and pickle the results.

    Rows are obtained from ``read_csv_data.read_csv_data()``. Each test row
    is passed to ``gen_pos`` and the collected outputs are written, as one
    pickled list, to ``pos/test_pos.pkl``.

    Args:
        args: ArgumentParser arguments; only ``args.limit`` is read here.

    Returns:
        None
    """
    train_reader, test_reader = read_csv_data.read_csv_data()
    train = list(train_reader)
    test = list(test_reader)

    # The shuffled / truncated train list is not consumed below. The calls
    # are kept because random.shuffle advances the shared RNG state.
    random.shuffle(train)
    if args.limit > 0:
        train = train[0:args.limit]

    # Interactive breakpoint kept from the original workflow.
    pdb.set_trace()

    questions_pos = []
    num_questions = float(len(test))
    for ind, question in enumerate(test):
        questions_pos.append(gen_pos(question))
        # Report ind + 1 so the meter ends at 100% instead of stopping one
        # step short.
        sys.stdout.write("Parse Progress: %f%% \r"
                         % ((ind + 1) * 100 / num_questions))
        sys.stdout.flush()

    filepath = 'pos/test_pos.pkl'
    # Close the handle deterministically so the pickle is fully on disk
    # before the breakpoint below is reached.
    with open(filepath, 'wb') as out_file:
        pickle.dump(questions_pos, out_file)

    # Interactive breakpoint kept from the original workflow.
    pdb.set_trace()
def load_questions(args):
    """
    Run ``gen_pos`` over every test question and pickle the results.

    Rows are obtained from ``read_csv_data.read_csv_data()``. Each test row
    is passed to ``gen_pos`` and the collected outputs are written, as one
    pickled list, to ``pos/test_pos.pkl``.

    Args:
        args: ArgumentParser arguments; only ``args.limit`` is read here.

    Returns:
        None
    """
    train_reader, test_reader = read_csv_data.read_csv_data()
    train = list(train_reader)
    test = list(test_reader)

    # The shuffled / truncated train list is not consumed below. The calls
    # are kept because random.shuffle advances the shared RNG state.
    random.shuffle(train)
    if args.limit > 0:
        train = train[0:args.limit]

    # Interactive breakpoint kept from the original workflow.
    pdb.set_trace()

    questions_pos = []
    num_questions = float(len(test))
    for ind, question in enumerate(test):
        questions_pos.append(gen_pos(question))
        # Report ind + 1 so the meter ends at 100% instead of stopping one
        # step short.
        sys.stdout.write("Parse Progress: %f%% \r"
                         % ((ind + 1) * 100 / num_questions))
        sys.stdout.flush()

    filepath = 'pos/test_pos.pkl'
    # Close the handle deterministically so the pickle is fully on disk
    # before the breakpoint below is reached.
    with open(filepath, 'wb') as out_file:
        pickle.dump(questions_pos, out_file)

    # Interactive breakpoint kept from the original workflow.
    pdb.set_trace()
def read_answers():
    """
    Collect every distinct answer string from the train and test data.

    Each row's ``answerA`` .. ``answerD`` fields are gathered. It is assumed
    that each answer is the title of a wikipedia page.

    Args:
        None

    Returns:
        list of distinct answer strings
    """
    answer_keys = ("answerA", "answerB", "answerC", "answerD")
    train_reader, test_reader = read_csv_data.read_csv_data()

    # Two separate sets are built and then unioned, one per data split.
    train_answers = set()
    for record in train_reader:
        train_answers.update(record[key] for key in answer_keys)

    test_answers = set()
    for record in test_reader:
        test_answers.update(record[key] for key in answer_keys)

    distinct_answers = train_answers | test_answers
    return list(distinct_answers)
def answer_questions(args):
    """
    Answer questions on the real-deal dataset by doing the following:
        1. Extract (or load) feature strings for the training and test set
        2. Parse the feature strings to compute feature vectors.
        3. Fit an LDA topic model on the category-augmented feature strings.
        4. Score the train set with ``test_xval`` and print the result.
        5. Predict the test set with ``make_predictions`` and write the
           predictions to ``../data/our_answers.csv``.

    Args:
        args: ArgumentParser arguments defined in __main__. Only
            ``args.load`` is read: when falsy the feature strings are
            computed and cached to ``../data/realdeal_feat_strings.pkl``,
            otherwise they are loaded from that file.

    Returns:
        None
    """
    # NOTE(review): pickle must only be used on trusted files; these are
    # presumably locally generated artifacts -- confirm.
    with open('../data/wiki_pages_dict.pkl', 'rb') as pkl_file:
        pages_dict = pickle.load(pkl_file)

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        print("len(train) = {}, len(test) = {}"
              .format(len(train), len(test)))
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(train + test, print_info=True)
        with open('../data/realdeal_feat_strings.pkl', 'wb') as pkl_file:
            pickle.dump((train, test, fs, fv, analyzer, feat), pkl_file,
                        pickle.HIGHEST_PROTOCOL)
    else:
        # load pre-computed feature strings
        print("Loading precomputed feature strings for real-deal train and test:")
        with open('../data/realdeal_feat_strings.pkl', 'rb') as pkl_file:
            train, test, fs, fv, analyzer, feat = pickle.load(pkl_file)

    X = feat.compute_feats(fs)
    X = X.tocsr()  # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # running into memory issues, so release this guy
    feat = None

    # try some LDA stuff
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_feat = topic_model.Featurizer(analyzer, pages_dict)
    # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(train + test, fs, fv, pages_dict)
    # adding these seems to hurt public test performance.
    # Does slightly better on Xval
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    print("Evaluating train data (overfitting!):")
    acc_train = test_xval(train, X, fv,
                          scorer=similarity.Scorer.cosine, topics=topics,
                          print_info=True)
    print("Train accuracy = {}\n".format(acc_train))

    print("Making predictions for test data:")
    our_answers = make_predictions(test, X, fv,
                                   scorer=similarity.Scorer.cosine,
                                   topics=topics, print_info=True)

    answer_file = "../data/our_answers.csv"
    print("Writing predictions to {}:".format(answer_file))
    # The with-block guarantees the CSV is flushed and closed on exit.
    with open(answer_file, 'w') as out_file:
        writer = csv.DictWriter(out_file, ["id", "correctAnswer"])
        writer.writeheader()
        # zip (rather than itertools.izip) runs on both Python 2 and 3.
        for question, answer in zip(test, our_answers):
            writer.writerow({"id": question['id'], "correctAnswer": answer})
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the x-val train/test split
        2. Parse the feature strings to compute feature vectors.
        3. Fit an LDA topic model on the category-augmented feature strings.
        4. Use ``compute_scores`` to build per-answer score features and
           train an ``SGDClassifier`` (log loss) on the x-val train part.
        5. Print train and test accuracy from ``compute_accuracy``.

    Args:
        args: ArgumentParser arguments defined in __main__. ``args.load``
            selects between computing and loading the cached feature
            strings (``../data/xval_feat_strings.pkl``). ``args.limit`` and
            ``args.split`` are read only when computing.

    Returns:
        None
    """
    # NOTE(review): pickle must only be used on trusted files; these are
    # presumably locally generated artifacts -- confirm.
    with open('../data/wiki_pages_dict.pkl', 'rb') as pkl_file:
        pages_dict = pickle.load(pkl_file)

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)
        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(
            train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        with open('../data/xval_feat_strings.pkl', 'wb') as pkl_file:
            pickle.dump((trainx, testx, fs, fv, analyzer, feat), pkl_file,
                        pickle.HIGHEST_PROTOCOL)
    else:
        # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        with open('../data/xval_feat_strings.pkl', 'rb') as pkl_file:
            trainx, testx, fs, fv, analyzer, feat = pickle.load(pkl_file)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr()  # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # try some LDA stuff
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    # compute similarity for each question and each answer (of 4)
    # use this as X (e.g. NLP similarity, LDA similarity)
    # binary classification with LR (i.e. is the answer right or not)
    print("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(
        trainx, X, fv,
        scorer=similarity.Scorer.cosine, topics=topics, train=True,
        print_info=True)

    print("Training LR")
    # standardizing; the scaler fitted here is reused for the test scores
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)
    # no alpha is passed, so the regularization weight is whatever
    # SGDClassifier uses by default
    lr = sklearn.linear_model.SGDClassifier(
        loss='log', penalty='l2',
        n_iter=100, shuffle=True, fit_intercept=True,
        class_weight={0: .1, 1: .9})
    lr.fit(X_lr_train, y_lr_train)
    print(lr.coef_)
    print(lr.intercept_)

    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    X_lr_test = compute_scores(
        testx, X, fv,
        scorer=similarity.Scorer.cosine, topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print("Test accuracy = {}\n".format(acc_testx))
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
        1. Extract (or load) feature strings for the x-val train/test split
        2. Parse the feature strings to compute feature vectors.
        3. Fit an LDA topic model on the category-augmented feature strings.
        4. Print the values returned by ``test_xval`` for the x-val train
           and test parts as train / test accuracy.

    Args:
        args: ArgumentParser arguments defined in __main__. ``args.load``
            selects between computing and loading the cached feature
            strings (``../data/xval_feat_strings.pkl``). ``args.limit`` and
            ``args.split`` are read only when computing.

    Returns:
        None
    """
    # NOTE(review): pickle must only be used on trusted files; these are
    # presumably locally generated artifacts -- confirm.
    with open('../data/wiki_pages_dict.pkl', 'rb') as pkl_file:
        pages_dict = pickle.load(pkl_file)

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)
        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(
            train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))
        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        with open('../data/xval_feat_strings.pkl', 'wb') as pkl_file:
            pickle.dump((trainx, testx, fs, fv, analyzer, feat), pkl_file,
                        pickle.HIGHEST_PROTOCOL)
    else:
        # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        with open('../data/xval_feat_strings.pkl', 'rb') as pkl_file:
            trainx, testx, fs, fv, analyzer, feat = pickle.load(pkl_file)
        #XXX use this one instead
        #feat = None
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)

    ## Here we do some cross-validation
    X = feat.compute_feats(fs)
    X = X.tocsr()  # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # try some LDA stuff
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # use the same feature strings as similarity
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    print("Evaluating train data:")
    acc_trainx = test_xval(trainx, X, fv,
                           scorer=similarity.Scorer.cosine, topics=topics,
                           print_info=True)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    acc_testx = test_xval(testx, X, fv,
                          scorer=similarity.Scorer.cosine, topics=topics,
                          print_info=True)
    print("Test accuracy = {}\n".format(acc_testx))
def answer_xval(args):
    """
    Answer train + test questions via ``question_features2`` /
    ``answer_question`` and pickle the result of ``pickle_ans``.

    Steps visible in this function:
        1. Load the wiki pages dict, the train/test POS pickles and
           ``pos/sim_prob_dict.pkl``; read ``pos/our_answers.csv``.
        2. Read the train/test rows (only when ``args.load`` is falsy).
        3. For every row, call ``question_features2`` on a POS entry and
           store its second return value in ``ans_types``.
        4. Build ``pred_list`` (question id -> ``answer_question`` result,
           or ``[]``).
        5. Print accuracy over the train rows that got a non-empty
           prediction, then dump ``pickle_ans(pred_list, use_data)`` to
           ``pos/metric_dict_10_90.pkl``.

    NOTE(review): the indentation below was reconstructed from a
    whitespace-collapsed copy of this file; confirm the nesting of each
    statement against version history before relying on it.

    Args:
        args: ArgumentParser arguments defined in __main__
            (``args.load``, ``args.limit`` and ``args.split`` are read).

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos
    # example_d is loaded but not referenced again in this function.
    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))
    row_num = 0
    # old_ans is filled here; its only other mentions below are commented out.
    old_ans = []
    with open('pos/our_answers.csv', 'rb') as csvfile:
        ans_reader = csv.reader(csvfile, delimiter=',')
        for row in ans_reader:
            # row 0 is skipped -- presumably the CSV header; verify.
            if row_num > 0:
                old_ans.append({'id':row[0],'correctAnswer':row[1]})
            row_num += 1
    # NOTE(review): train / all_data are bound only in this branch, so the
    # code after it appears to need args.load to be falsy -- confirm.
    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        # all_data is built before the shuffle / limit applied to train.
        all_data = train + test
        random.shuffle(train)
        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        # trainx / testx are only used for the length print below.
        trainx, testx = utils.split_list(train, (args.split, 100.-args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
            .format(len(trainx), len(testx)))
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)
        #print ("Computing feature strings:")
        #fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
    #####################################
    #use_data = train
    #use_pos = train_pos
    use_data = all_data
    use_pos = all_pos
    ind = 0
    num_this = 0  # not referenced again in this function
    ans_types = {}
    num_q = 0
    old_relevant = []  # only referenced in commented-out code below
    #for kk in trainx:
    for kk in use_data:
        # Linear scan for the POS entry with the same id. If none matches,
        # kk_pos is left as the last element of use_pos.
        for kk_pos in use_pos:
        #for kk_pos in train_pos:
            if kk_pos['id'] == kk['id']:
                break
        #for kk_old in old_ans:
        #    if kk_old['id'] == kk['id']:
        #        break
        #ans_types.append(question_features2(kk))
        #ans_types.append(question_features2(kk_pos))
        # ans_types is keyed by k; the lookup further down tests question
        # ids against these keys -- presumably k is the question id; verify.
        [k,t] = question_features2(kk_pos)
        if k != 0:
            ans_types[k] = t
            num_q += 1
            #old_relevant.append(kk_old)
        ind += 1
        sys.stdout.write("Parse Progress: %f%% \r" % (ind*100/float(len(use_data))) )
        sys.stdout.flush()
    # Counts falsy keys of ans_types; num_empty is not used afterwards.
    num_empty = 0
    for ans in ans_types:
        if not(ans):
            num_empty += 1
    pred_list = {}
    ind = 0
    max_ind = len(use_data)
    for kk in range(0,len(use_data)):
        #if ind > max_ind:
            #break
        if use_data[kk]['id'] in ans_types.keys():
            ind += 1
            pred_list[use_data[kk]['id']] = answer_question(use_data[kk], \
                    ans_types[use_data[kk]['id']], pages_dict)
        else:
            ind += 1
            # no entry in ans_types for this question: record an empty prediction
            pred_list[use_data[kk]['id']] = []
        sys.stdout.write("Parse Progress: %f%% \r" % (ind*100/max_ind) )
        sys.stdout.flush()
    # The triple-quoted string below is an unused expression statement
    # holding an older version of the loop above.
    ''' for kk in range(0,len(ans_types)): if ind > max_ind: break if (ans_types[kk]): ind += 1 #pred_list.append(google_ans(trainx[kk], ans_types[kk])) #pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict)) pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict)) else: ind += 1 pred_list.append([]) sys.stdout.write("Parse Progress: %f%% \r" % (ind*100/max_ind) ) sys.stdout.flush() '''
    corr = 0
    total = 0
    # Only train rows are scored, since the comparison reads 'correctAnswer'.
    for p in range(0,len(train)):
        q_key = train[p]['id']
        if q_key in pred_list.keys():
            # empty predictions are skipped, so total counts answered questions
            if pred_list[q_key]:
                if pred_list[q_key] == train[p]['correctAnswer']:
                #if pred_list[p] == old_relevant[p]['correctAnswer']:
                    corr += 1
                # NOTE(review): placement of this increment (per answered
                # question) is inferred from the collapsed source -- confirm.
                total +=1
    # NOTE(review): divides by total, which is zero if nothing was answered.
    print ('Performance: ' + str(corr/float(total)))
    print ('Fraction Answered: ' + str(float(total)/float(len(use_data))))
    final_answers = pickle_ans(pred_list, use_data)
    # interactive breakpoint before the results are written
    pdb.set_trace()
    filepath = 'pos/metric_dict_10_90.pkl'
    pickle.dump(final_answers,open(filepath, 'wb'))
def answer_xval(args):
    """
    Answer train + test questions via ``question_features2`` /
    ``answer_question`` and pickle the result of ``pickle_ans``.

    Steps visible in this function:
        1. Load the wiki pages dict, the train/test POS pickles and
           ``pos/sim_prob_dict.pkl``; read ``pos/our_answers.csv``.
        2. Read the train/test rows (only when ``args.load`` is falsy).
        3. For every row, call ``question_features2`` on a POS entry and
           store its second return value in ``ans_types``.
        4. Build ``pred_list`` (question id -> ``answer_question`` result,
           or ``[]``).
        5. Print accuracy over the train rows that got a non-empty
           prediction, then dump ``pickle_ans(pred_list, use_data)`` to
           ``pos/metric_dict_10_90.pkl``.

    NOTE(review): the indentation below was reconstructed from a
    whitespace-collapsed copy of this file; confirm the nesting of each
    statement against version history before relying on it.

    Args:
        args: ArgumentParser arguments defined in __main__
            (``args.load``, ``args.limit`` and ``args.split`` are read).

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))
    train_pos = pickle.load(open('pos/train_pos.pkl', 'rb'))
    test_pos = pickle.load(open('pos/test_pos.pkl', 'rb'))
    all_pos = train_pos + test_pos
    # example_d is loaded but not referenced again in this function.
    example_d = pickle.load(open('pos/sim_prob_dict.pkl', 'rb'))
    row_num = 0
    # old_ans is filled here; its only other mentions below are commented out.
    old_ans = []
    with open('pos/our_answers.csv', 'rb') as csvfile:
        ans_reader = csv.reader(csvfile, delimiter=',')
        for row in ans_reader:
            # row 0 is skipped -- presumably the CSV header; verify.
            if row_num > 0:
                old_ans.append({'id': row[0], 'correctAnswer': row[1]})
            row_num += 1
    # NOTE(review): train / all_data are bound only in this branch, so the
    # code after it appears to need args.load to be falsy -- confirm.
    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        # all_data is built before the shuffle / limit applied to train.
        all_data = train + test
        random.shuffle(train)
        # split train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        # trainx / testx are only used for the length print below.
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print ("len(xval_train) = {}, len(xval_test) = {}"\
            .format(len(trainx), len(testx)))
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)
        #print ("Computing feature strings:")
        #fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
    #####################################
    #use_data = train
    #use_pos = train_pos
    use_data = all_data
    use_pos = all_pos
    ind = 0
    num_this = 0  # not referenced again in this function
    ans_types = {}
    num_q = 0
    old_relevant = []  # only referenced in commented-out code below
    #for kk in trainx:
    for kk in use_data:
        # Linear scan for the POS entry with the same id. If none matches,
        # kk_pos is left as the last element of use_pos.
        for kk_pos in use_pos:
        #for kk_pos in train_pos:
            if kk_pos['id'] == kk['id']:
                break
        #for kk_old in old_ans:
        #    if kk_old['id'] == kk['id']:
        #        break
        #ans_types.append(question_features2(kk))
        #ans_types.append(question_features2(kk_pos))
        # ans_types is keyed by k; the lookup further down tests question
        # ids against these keys -- presumably k is the question id; verify.
        [k, t] = question_features2(kk_pos)
        if k != 0:
            ans_types[k] = t
            num_q += 1
            #old_relevant.append(kk_old)
        ind += 1
        sys.stdout.write("Parse Progress: %f%% \r" % (ind * 100 / float(len(use_data))))
        sys.stdout.flush()
    # Counts falsy keys of ans_types; num_empty is not used afterwards.
    num_empty = 0
    for ans in ans_types:
        if not (ans):
            num_empty += 1
    pred_list = {}
    ind = 0
    max_ind = len(use_data)
    for kk in range(0, len(use_data)):
        #if ind > max_ind:
            #break
        if use_data[kk]['id'] in ans_types.keys():
            ind += 1
            pred_list[use_data[kk]['id']] = answer_question(use_data[kk], \
                    ans_types[use_data[kk]['id']], pages_dict)
        else:
            ind += 1
            # no entry in ans_types for this question: record an empty prediction
            pred_list[use_data[kk]['id']] = []
        sys.stdout.write("Parse Progress: %f%% \r" % (ind * 100 / max_ind))
        sys.stdout.flush()
    # The triple-quoted string below is an unused expression statement
    # holding an older version of the loop above.
    ''' for kk in range(0,len(ans_types)): if ind > max_ind: break if (ans_types[kk]): ind += 1 #pred_list.append(google_ans(trainx[kk], ans_types[kk])) #pred_list.append(answer_question(trainx[kk], ans_types[kk], pages_dict)) pred_list.append(answer_question(use_data[kk], ans_types[kk], pages_dict)) else: ind += 1 pred_list.append([]) sys.stdout.write("Parse Progress: %f%% \r" % (ind*100/max_ind) ) sys.stdout.flush() '''
    corr = 0
    total = 0
    # Only train rows are scored, since the comparison reads 'correctAnswer'.
    for p in range(0, len(train)):
        q_key = train[p]['id']
        if q_key in pred_list.keys():
            # empty predictions are skipped, so total counts answered questions
            if pred_list[q_key]:
                if pred_list[q_key] == train[p]['correctAnswer']:
                #if pred_list[p] == old_relevant[p]['correctAnswer']:
                    corr += 1
                # NOTE(review): placement of this increment (per answered
                # question) is inferred from the collapsed source -- confirm.
                total += 1
    # NOTE(review): divides by total, which is zero if nothing was answered.
    print('Performance: ' + str(corr / float(total)))
    print('Fraction Answered: ' + str(float(total) / float(len(use_data))))
    final_answers = pickle_ans(pred_list, use_data)
    # interactive breakpoint before the results are written
    pdb.set_trace()
    filepath = 'pos/metric_dict_10_90.pkl'
    pickle.dump(final_answers, open(filepath, 'wb'))