def answer_questions(args):
    """
    Answer questions on the real-deal dataset by doing the following:
    1. Extract (or load) feature strings for the training and test set.
    2. Parse the feature strings to compute feature vectors.
    3. ???
    4. Profit.

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    if not args.load:
        train_reader, test_reader = read_csv_data.read_csv_data()
        train = list(train_reader)
        test = list(test_reader)
        print("len(train) = {}, len(test) = {}".format(len(train), len(test)))

        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(train + test, print_info=True)
        pickle.dump((train, test, fs, fv, analyzer, feat),
                    open('../data/realdeal_feat_strings.pkl', 'wb'),
                    pickle.HIGHEST_PROTOCOL)
    else:  # load pre-computed feature strings
        print("Loading precomputed feature strings for real-deal train and test:")
        train, test, fs, fv, analyzer, feat = \
            pickle.load(open('../data/realdeal_feat_strings.pkl', 'rb'))

    # Parse the feature strings into a sparse feature matrix.
    X = feat.compute_feats(fs)
    X = X.tocsr()     # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # Running into memory issues, so release the featurizer.
    feat = None

    # Try some LDA stuff.
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_feat = topic_model.Featurizer(analyzer, pages_dict)
    # Use the same feature strings as similarity. Adding wiki categories seems
    # to hurt public test performance, though it does slightly better on xval.
    tm_fs = topic_model.add_wiki_categories(train + test, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    print("Evaluating train data (overfitting!):")
    acc_train = test_xval(train, X, fv, scorer=similarity.Scorer.cosine,
                          topics=topics, print_info=True)
    print("Train accuracy = {}\n".format(acc_train))

    print("Making predictions for test data:")
    our_answers = make_predictions(test, X, fv, scorer=similarity.Scorer.cosine,
                                   topics=topics, print_info=True)

    answer_file = "../data/our_answers.csv"
    print("Writing predictions to {}:".format(answer_file))
    o = csv.DictWriter(open(answer_file, 'w'), ["id", "correctAnswer"])
    o.writeheader()
    for q, a in zip(test, our_answers):
        o.writerow({"id": q['id'], "correctAnswer": a})
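
# A minimal sketch of the argparse driver that the docstrings here refer to
# ("defined in __main__"). This is an assumption about the real driver, not a
# copy of it: the flag names simply mirror the attributes these functions read
# from `args` (args.load above; args.limit and args.split in the xval
# functions below), and the defaults are illustrative only.
#
# if __name__ == "__main__":
#     import argparse
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--load', action='store_true',
#                         help='load precomputed feature strings')
#     parser.add_argument('--limit', type=int, default=-1,
#                         help='cap the number of training questions (xval only)')
#     parser.add_argument('--split', type=float, default=80.,
#                         help='percent of questions used for xval training')
#     answer_questions(parser.parse_args())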
def answer_xval(args):
    """
    Answer questions on a cross-validation dataset by doing the following:
    1. Extract (or load) feature strings for the training and test set.
    2. Parse the feature strings to compute feature vectors.
    3. ???
    4. Profit.

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)  # shuffle before splitting train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))

        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                    open('../data/xval_feat_strings.pkl', 'wb'),
                    pickle.HIGHEST_PROTOCOL)
    else:  # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
            pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
        # XXX use this one instead
        #feat = None
        #analyzer = similarity.Analyzer()
        #feat = similarity.Featurizer(analyzer, pages_dict)

    # Parse the feature strings into a sparse feature matrix.
    X = feat.compute_feats(fs)
    X = X.tocsr()     # might already be CSR
    X.sort_indices()  # needed for cosine-type measures

    # Try some LDA stuff.
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # Use the same feature strings as similarity.
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    print("Evaluating train data:")
    acc_trainx = test_xval(trainx, X, fv, scorer=similarity.Scorer.cosine,
                           topics=topics, print_info=True)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    acc_testx = test_xval(testx, X, fv, scorer=similarity.Scorer.cosine,
                          topics=topics, print_info=True)
    print("Test accuracy = {}\n".format(acc_testx))
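
# `utils.split_list` is defined elsewhere; the sketch below is an assumption
# about its behavior, inferred from the call above: split a list into
# consecutive chunks whose sizes are given as percentage weights.
def split_list_sketch(items, percents):
    """Split `items` into len(percents) consecutive chunks sized by percent."""
    chunks, start = [], 0
    for p in percents[:-1]:
        end = start + int(round(len(items) * p / 100.0))
        chunks.append(items[start:end])
        start = end
    chunks.append(items[start:])  # last chunk takes the remainder
    return chunks
# e.g. trainx, testx = split_list_sketch(train, (80., 20.)) gives an 80/20 split.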
def answer_xval_lr(args):
    """
    Answer questions on a cross-validation dataset, using a logistic-regression
    classifier over the similarity scores, by doing the following:
    1. Extract (or load) feature strings for the training and test set.
    2. Parse the feature strings to compute feature vectors.
    3. ???
    4. Profit.

    Args:
        args: ArgumentParser arguments defined in __main__

    Returns:
        None
    """
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    if not args.load:
        train_reader, _ = read_csv_data.read_csv_data()
        train = list(train_reader)
        random.shuffle(train)  # shuffle before splitting train for X-val
        if args.limit > 0:
            train = train[0:args.limit]
        trainx, testx = utils.split_list(train, (args.split, 100. - args.split))
        print("len(xval_train) = {}, len(xval_test) = {}"
              .format(len(trainx), len(testx)))

        analyzer = similarity.Analyzer()
        feat = similarity.Featurizer(analyzer, pages_dict)

        print("Computing feature strings:")
        fs, fv = feat.compute_feat_strings(trainx + testx, print_info=True)
        pickle.dump((trainx, testx, fs, fv, analyzer, feat),
                    open('../data/xval_feat_strings.pkl', 'wb'),
                    pickle.HIGHEST_PROTOCOL)
    else:  # load pre-computed feature strings
        print("Loading precomputed feature strings for trainx and testx:")
        trainx, testx, fs, fv, analyzer, feat = \
            pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))

    # Parse the feature strings into a sparse feature matrix.
    X = feat.compute_feats(fs)
    X = X.tocsr()     # might already be CSR
    X.sort_indices()  # needed for cosine-type measures
    #X_scaler = sklearn.preprocessing.StandardScaler(with_mean=False, with_std=True)
    #X = X_scaler.fit_transform(X)

    # Try some LDA stuff.
    print("Training LDA topic model")
    topic_mod = lda.LDA(n_topics=20, n_iter=150)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # Use the same feature strings as similarity.
    tm_fs = topic_model.add_wiki_categories(trainx + testx, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic
    #topics_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    #topics = topics_scaler.fit_transform(topics)

    # Compute a similarity score for each question and each answer (of 4) and
    # use these as X (e.g. NLP similarity, LDA similarity), then do binary
    # classification with LR (i.e. is the answer right or not).
    print("Evaluating train data:")
    X_lr_train, y_lr_train = compute_scores(trainx, X, fv,
                                            scorer=similarity.Scorer.cosine,
                                            topics=topics, train=True,
                                            print_info=True)

    print("Training LR")
    # Standardize the LR features.
    lr_scaler = sklearn.preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_lr_train = lr_scaler.fit_transform(X_lr_train)
    # alpha sets the weight on the regularization term
    lr = sklearn.linear_model.SGDClassifier(loss='log', penalty='l2',
                                            n_iter=100, shuffle=True,
                                            fit_intercept=True,
                                            class_weight={0: .1, 1: .9})
    lr.fit(X_lr_train, y_lr_train)
    #lr.coef_[0,0] = 0.75
    #lr.coef_[0,1] = 0.25
    #lr.intercept_[0] = 0.0
    print(lr.coef_)
    print(lr.intercept_)

    our_answers = lr_make_predictions(X_lr_train, lr)
    acc_trainx = compute_accuracy(trainx, our_answers)
    print("Train accuracy = {}\n".format(acc_trainx))

    print("Evaluating test data:")
    X_lr_test = compute_scores(testx, X, fv, scorer=similarity.Scorer.cosine,
                               topics=topics, print_info=True)
    X_lr_test = lr_scaler.transform(X_lr_test)
    our_answers = lr_make_predictions(X_lr_test, lr)
    acc_testx = compute_accuracy(testx, our_answers)
    print("Test accuracy = {}\n".format(acc_testx))
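
# `lr_make_predictions` is defined elsewhere in this module; the sketch below
# is an assumption about what it does, inferred from the calls above: each
# question contributes 4 consecutive rows to X_lr, so take the LR probability
# of class 1 ("this answer is correct") per row and pick the argmax within
# each group of 4. The A-D answer letters are also an assumption.
def lr_make_predictions_sketch(X_lr, lr):
    probs = lr.predict_proba(X_lr)[:, 1]  # P(answer is correct) per candidate
    per_question = probs.reshape(-1, 4)   # one row of 4 candidates per question
    return ["ABCD"[i] for i in np.argmax(per_question, axis=1)]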
def make_sim_and_topic_dicts():
    pages_dict = pickle.load(open('../data/wiki_pages_dict.pkl', 'rb'))

    #print("Loading precomputed feature strings for trainx and testx:")
    #train, test, fs, fv, analyzer, feat = \
    #    pickle.load(open('../data/xval_feat_strings.pkl', 'rb'))
    print("Loading precomputed feature strings for real-deal train and test:")
    train, test, fs, fv, analyzer, feat = \
        pickle.load(open('../data/realdeal_feat_strings.pkl', 'rb'))

    # Parse the feature strings into a sparse feature matrix.
    X = feat.compute_feats(fs)
    X = X.tocsr()     # might already be CSR
    X.sort_indices()  # needed for cosine-type measures
    feat = None  # release memory

    # Try some LDA stuff.
    print("Training LDA topic model")
    #topic_mod = lda.LDA(n_topics=20, n_iter=150)
    topic_mod = lda.LDA(n_topics=20, n_iter=500)
    tm_analyzer = topic_model.Analyzer()
    tm_feat = topic_model.Featurizer(tm_analyzer, pages_dict)
    # Use the same feature strings as similarity.
    tm_fs = topic_model.add_wiki_categories(train + test, fs, fv, pages_dict)
    topic_X = tm_feat.compute_feats(tm_fs)
    topics = topic_mod.fit_transform(topic_X)  # gives probabilities for each topic

    # Compute a score for each question and each answer (of 4): the NLP
    # similarity feeds column 0 and the LDA topic similarity feeds column 1.
    # Soft-max each question's 4 scores into per-answer probabilities.
    print("Evaluating train data:")
    #X_lr_train = compute_scores(train, X, fv, scorer=similarity.Scorer.cosine,
    #                            topics=None, train=False, print_info=True)
    X_lr_train = compute_scores(train, X, fv, scorer=similarity.Scorer.cosine,
                                topics=topics, train=False, print_info=True)
    sim_prob_dict = dict()
    topic_prob_dict = dict()
    for ind, q in enumerate(train):
        arr = np.exp(X_lr_train[4*ind:4*ind+4, 0])  # soft-max
        sim_prob_dict[q['id']] = arr / sum(arr)
        arr = np.exp(X_lr_train[4*ind:4*ind+4, 1])
        topic_prob_dict[q['id']] = arr / sum(arr)
        #print(q['id'], sim_prob_dict[q['id']], topic_prob_dict[q['id']])

    print("Evaluating test data:")
    X_lr_test = compute_scores(test, X, fv, scorer=similarity.Scorer.cosine,
                               topics=topics, train=False, print_info=True)
    for ind, q in enumerate(test):
        arr = np.exp(X_lr_test[4*ind:4*ind+4, 0])  # soft-max
        sim_prob_dict[q['id']] = arr / sum(arr)
        arr = np.exp(X_lr_test[4*ind:4*ind+4, 1])
        topic_prob_dict[q['id']] = arr / sum(arr)
        #print(q['id'], sim_prob_dict[q['id']], topic_prob_dict[q['id']])

    pickle.dump(sim_prob_dict, open('../data/sim_prob_dict.pkl', 'wb'),
                pickle.HIGHEST_PROTOCOL)
    pickle.dump(topic_prob_dict, open('../data/topic_prob_dict.pkl', 'wb'),
                pickle.HIGHEST_PROTOCOL)
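
# The soft-max computations above repeat one pattern; a small helper like this
# sketch (hypothetical, not part of the original module) would factor it out.
# Subtracting the max before exponentiating is the usual numerically stable
# form and leaves the result unchanged.
def softmax_sketch(scores):
    """Soft-max a question's 4 answer scores into probabilities."""
    arr = np.exp(scores - np.max(scores))  # shift by max for stability
    return arr / arr.sum()
# e.g. sim_prob_dict[q['id']] = softmax_sketch(X_lr_train[4*ind:4*ind+4, 0])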