def test_position_2(self):
    country_destination = 'FR'
    predictions = ['US', 'FR']
    res = evaluation.ndcg(country_destination, predictions)
    print res
    self.assertAlmostEqual(res, 0.6309,
                           msg='A second-best prediction does not give 0.6309',
                           delta=0.0001)
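The `evaluation.ndcg` under test is not included in this section. A minimal sketch consistent with both this test and `test_fr` below, assuming binary relevance and a rank cutoff (the default `k=5` is a guess), might look like:

import math

def ndcg(country_destination, predictions, k=5):
    # Hypothetical implementation matching the tests: with binary
    # relevance the ideal DCG is 1, so NDCG@k reduces to
    # 1 / log2(hit_rank + 1) for the position of the correct item.
    for rank, pred in enumerate(predictions[:k], start=1):
        if pred == country_destination:
            return 1.0 / math.log(rank + 1, 2)
    return 0.0

A hit at rank 1 gives 1/log2(2) = 1, and a hit at rank 2 gives 1/log2(3) ≈ 0.6309, matching both assertions.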
def query_ndcg(group_df):
    data = trim(group_df)
    score = ndcg(data["judgment"],
                 indices=data["result_position"],
                 ideal_judgments=data["ideals"].iloc[0][:ndcg_at]) if len(data) > 0 else 0.0
    return score
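For context, a hedged usage sketch of `query_ndcg`: it is written to be applied per query group, so with judged results in a DataFrame it could be driven like the following (the `query_id` column name is an assumption, and `trim`, `ndcg`, and `ndcg_at` come from the enclosing module):

# Hypothetical usage: mean NDCG across queries; column names are guesses.
per_query = judged_df.groupby("query_id").apply(query_ndcg)
mean_ndcg = per_query.mean()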
topK_true_rank_by_single_feature_lists[single_feature_idx].append(int(true_rank_got))
topK_output_dict[single_feature_name_list[single_feature_idx]] = [
    topK_aux_lang_by_single_feature_lists[single_feature_idx],
    topK_true_rank_by_single_feature_lists[single_feature_idx]]

with open(os.path.join(output_dir, "topK_" + lang_set[task_lang_idx] + ".json"), "w") as f:
    json.dump(topK_output_dict, f)

# Compute NDCG
print("[DEBUG] y_train =", y_train)
print("[DEBUG] y_test =", y_test)
true_rel_exp = y_test[qg_start_idx:qg_start_idx + int(qg_size)]
relevance_sorted_true = -np.sort(-true_rel_exp)  # ideal ordering: descending true relevance
relevance_sorted_lgbm = y_test[qg_start_idx + best_aux_idx]
NDCG = evaluation.ndcg(relevance_sorted_lgbm, PRINT_TOP_K, relevance_sorted_true)
NDCG_output_dict["LambdaRank"]["task_lang"].append(lang_set[task_lang_idx])
NDCG_output_dict["LambdaRank"]["NDCG_list"].append(NDCG)

for single_feature_idx in range(len(single_feature_name_list)):
    if sort_sign_list[single_feature_idx] != 0:
        relevance_sorted_single_feature = y_test[
            qg_start_idx + best_aux_idx_by_single_feature_lists[single_feature_idx]]
        NDCG_single_feature = evaluation.ndcg(
            relevance_sorted_single_feature, PRINT_TOP_K, relevance_sorted_true)
        NDCG_output_dict[single_feature_name_list[single_feature_idx]]["task_lang"].append(
            lang_set[task_lang_idx])
        NDCG_output_dict[single_feature_name_list[single_feature_idx]]["NDCG_list"].append(
            NDCG_single_feature)
    else:
        # Remove the unused feature item placeholder
        if single_feature_name_list[single_feature_idx] in NDCG_output_dict:
            del NDCG_output_dict[single_feature_name_list[single_feature_idx]]

qg_start_idx += int(qg_size)
true_rank_list_from_size.append(int(test_lang_pair[qg_start_idx + best_aux_idx_from_size[i], 2]))

print("Top", PRINT_TOP_K, "auxiliary languages for '%s'" % task_lang,
      "are:", aux_lang_list, "with true ranks", true_rank_list)
print("Task language data size = %d, auxiliary language data sizes =" % int(task_size),
      aux_size_list)
print("Using only data size, the top", PRINT_TOP_K, "auxiliary languages are:",
      aux_lang_list_from_size, "with true ranks", true_rank_list_from_size)

relevance_sorted_lgbm = y_test[qg_start_idx + best_aux_idx]
print("[DEBUG] y_train =", y_train)
print("[DEBUG] y_test =", y_test)
true_rel_exp = y_test[qg_start_idx:qg_start_idx + int(qg_size)]
relevance_sorted_true = -np.sort(-true_rel_exp)
NDCG = evaluation.ndcg(relevance_sorted_lgbm, PRINT_TOP_K, relevance_sorted_true)
print("My calculation of model NDCG@3 =", NDCG)
my_NDCG_list.append(NDCG)

# NDCG, using only data size
relevance_sorted_size = y_test[qg_start_idx + best_aux_idx_from_size]
NDCG_size = evaluation.ndcg(relevance_sorted_size, PRINT_TOP_K, relevance_sorted_true)
print("Using only dataset size, NDCG@3 =", NDCG_size)
NDCG_size_list.append(NDCG_size)

qg_start_idx += int(qg_size)

t = range(1, 10 + 1)
s = []
error = []
print(plot_list)
for i in zip(*plot_list):
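The two snippets above call a three-argument `evaluation.ndcg(relevance, k, ideal_relevance)` whose module is not shown. A minimal sketch of that signature, assuming the common exponential gain 2^rel − 1 (a linear gain is equally plausible), would be:

import numpy as np

def ndcg(relevance, k, ideal_relevance):
    # Hypothetical stand-in for the three-argument evaluation.ndcg above:
    # DCG@k of the model-selected true relevances, normalized by DCG@k of
    # the descending-sorted ("ideal") relevances. The exponential gain is
    # an assumption.
    def dcg(rel):
        rel = np.asarray(rel, dtype=float)[:k]
        return np.sum((2.0 ** rel - 1.0) / np.log2(np.arange(2, rel.size + 2)))
    ideal = dcg(ideal_relevance)
    return dcg(relevance) / ideal if ideal > 0 else 0.0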
def standard_experiment(train_df, test_df, feature_names, args):
    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"    # annotate

    # Clip training set, if necessary
    if 0 < args.limit_data < len(train_df):
        print "Clipping training set to %d comments" % args.limit_data
        train_df = train_df[:args.limit_data]

    # Split into X, y for regression
    target = args.target
    train_X = train_df.filter(feature_names).as_matrix().astype(np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(np.float)       # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(np.float)    # test data
    test_y = test_df.filter([target]).as_matrix().astype(np.float)         # ground truth

    # For compatibility, make 1D
    train_y = train_y.reshape((-1,))
    test_y = test_y.reshape((-1,))

    print "Training set: %d examples" % (train_X.shape[0],)
    print "Test set: %d examples" % (test_X.shape[0],)
    print "Selected %d features" % (len(feature_names),)
    print 'Features: %s' % (' '.join(feature_names))

    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)

    if args.classifier != 'baseline':
        if args.stock_params:
            if args.classifier == 'svr':
                print "Initializing SVR model"
                clf = SVR(**STANDARD_PARAMS['svr'])
            elif args.classifier == 'rf':
                print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed
                clf = RandomForestRegressor(random_state=args.rfseed,
                                            **STANDARD_PARAMS['rf'])
            elif args.classifier == 'elasticnet':
                print "Initializing ElasticNet model"
                clf = ElasticNet(max_iter=10000, **STANDARD_PARAMS['elasticnet'])
            else:
                raise ValueError("Invalid classifier '%s' specified." % args.classifier)
        else:
            ##
            # Run Grid Search / 10xv on training/dev set
            start = time.time()
            print "== Finding optimal classifier using Grid Search =="
            params, clf = train_optimal_classifier(train_X, train_y,
                                                   classifier=args.classifier,
                                                   rfseed=args.rfseed,
                                                   quickmode=args.quickmode)
            print "Optimal parameters: " + json.dumps(params, indent=4)
            if hasattr(clf, "support_vectors_"):
                print 'Number of support vectors: %d' % len(clf.support_vectors_)
            print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)

        if hasattr(clf, 'random_state'):
            clf.set_params(random_state=args.rfseed)
        clf.fit(train_X, train_y)
        params = clf.get_params()

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting
    max_K = 20
    eval_func = lambda data: evaluation.ndcg(data, max_K,
                                             target=args.ndcg_target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    ##
    # Predict scores for training set
    result_label = "pred_%s" % args.target  # e.g. pred_score
    if args.classifier != 'baseline':
        train_pred = clf.predict(train_X)
    else:
        # baseline: post order
        train_pred = -1 * train_df['position_rank']
    train_df[result_label] = train_pred
    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_train = eval_func(train_df)
    ndcg_train = eval_func(train_df[train_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)

    ##
    # Predict scores for test set
    if args.classifier != 'baseline':
        test_pred = clf.predict(test_X)
    else:
        # baseline: post order
        test_pred = -1 * test_df['position_rank']
    test_df[result_label] = test_pred
    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    # ndcg_test = eval_func(test_df)
    ndcg_test = eval_func(test_df[test_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    ##
    # Save model to disk
    if args.savename and (args.classifier != 'baseline'):
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(clf, f)

    ##
    # Get feature importance, if possible
    if args.savename and (args.classifier != 'baseline'):
        feature_importances = get_feature_importance(clf, args.classifier,
                                                     feature_names=feature_names,
                                                     sorted=True)
        saveas = args.savename + ".topfeatures.txt"
        print "== Recording top features to %s ==" % saveas
        # np.savetxt(saveas, feature_importances)
        # with open(saveas, 'w') as f:
        #     json.dump(feature_importances, f, indent=2)
        with open(saveas, 'w') as f:
            maxlen = max([len(fname) for fname in feature_importances[0]])
            f.write("# Model: %s\n" % args.classifier)
            f.write("# Params: %s\n" % json.dumps(params))
            for fname, val in zip(*feature_importances):
                f.write("%s %.06f\n" % (fname.ljust(maxlen), val))
            f.flush()

    ##
    # Save data to HDF5
    if args.savename:
        # Save score predictions
        fields = ["self_id", "parent_id", 'cid', 'sid', 'set',
                  args.target, result_label]
        if not args.ndcg_target in fields:
            fields.append(args.ndcg_target)
        saveas = args.savename + ".scores.h5"
        print "== Saving raw predictions as %s ==" % saveas
        outdf = pd.concat([train_df[fields], test_df[fields]],
                          ignore_index=True)
        outdf.to_hdf(saveas, 'data')

        if args.savefull:
            # Concatenate train, test
            df = pd.concat([train_df, test_df], ignore_index=True)
            print "== Exporting data to HDF5 =="
            saveas = args.savename + ".data.h5"
            df.to_hdf(saveas, "data")
            print "  [saved as %s]" % saveas

        # Save NDCG calculations
        dd = {'k': range(1, max_K + 1),
              'method': [args.ndcg_weight] * max_K,
              'ndcg_train': ndcg_train,
              'ndcg_test': ndcg_test}
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
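The `eval_func` used throughout these experiments delegates to a DataFrame-based `evaluation.ndcg(data, max_K, target=..., result_label=..., fav_func=...)` whose module is not included, and which evidently returns a sequence of NDCG@k scores for k = 1..max_K. A rough sketch of what such a variant could look like, with a guessed grouping key and `fav_func` treated as a map from true target values to nonnegative gains, follows:

import numpy as np

def ndcg_frame(data, max_K, target, result_label, fav_func, group_key="sid"):
    # Hypothetical stand-in for evaluation.ndcg (real module not shown).
    # Groups rows by a guessed key, ranks each group by the predicted
    # score in result_label, converts true target values to gains via
    # fav_func, and returns mean NDCG@k for k = 1..max_K across groups.
    per_k = np.zeros(max_K)
    groups = list(data.groupby(group_key))
    for _, g in groups:
        gains = np.asarray(fav_func(g[target].values), dtype=float)
        disc = 1.0 / np.log2(np.arange(2, len(g) + 2))
        dcg = np.cumsum(gains[np.argsort(-g[result_label].values)] * disc)
        idcg = np.cumsum(np.sort(gains)[::-1] * disc)
        ratio = np.divide(dcg, idcg, out=np.zeros_like(dcg), where=idcg > 0)
        if len(ratio) < max_K:
            # NDCG@k is constant once k exceeds the group size
            ratio = np.pad(ratio, (0, max_K - len(ratio)), mode="edge")
        per_k += ratio[:max_K]
    return per_k / max(len(groups), 1)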
def crossdomain_experiment(home_df, test_df, feature_names, args, cv_folds=10):
    target = args.target
    result_label = "pred_%s" % args.target  # e.g. pred_score
    home_df['set'] = "home"  # annotate
    test_df['set'] = "test"  # annotate

    # Prep for result storage
    home_df[result_label] = np.zeros(len(home_df))
    test_df[result_label] = np.zeros(len(test_df))

    test_X_master = test_df.filter(feature_names).as_matrix().astype(np.float)  # test data
    test_y = test_df.filter([target]).as_matrix().astype(np.float)              # ground truth
    test_y = test_y.reshape((-1,))

    print "Selected %d features" % (len(feature_names),)
    print 'Features: %s' % (' '.join(feature_names))

    feature_importances = pd.DataFrame(index=feature_names)

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting
    max_K = 20
    eval_func = lambda data: evaluation.ndcg(data, max_K,
                                             target=args.ndcg_target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    train_ncomments = np.zeros(cv_folds)
    train_nsubs = np.zeros(cv_folds)

    ##
    # Cross-validation for training set
    # train/dev from train_df
    # test each on whole test set
    sids = home_df.sid.unique()
    kf = KFold(len(sids), cv_folds)
    for foldidx, (train_sids_idx, dev_sids_idx) in enumerate(kf):
        print ">> Fold [%d] <<" % foldidx
        # Collect actual SIDs
        train_sids = set(sids[train_sids_idx])
        dev_sids = set(sids[dev_sids_idx])
        # Filter rows by SID match
        train_df = home_df[home_df.sid.map(lambda x: x in train_sids)]
        dev_df = home_df[home_df.sid.map(lambda x: x in dev_sids)]

        # Clip training set, if necessary
        if 0 < args.limit_data < len(train_df):
            print "Clipping training set to %d comments" % args.limit_data
            train_df = train_df[:args.limit_data]

        train_nsubs[foldidx] = len(train_sids)
        train_ncomments[foldidx] = len(train_df)

        # Split into X, y for regression
        train_X = train_df.filter(feature_names).as_matrix().astype(np.float)  # training data
        train_y = train_df.filter([target]).as_matrix().astype(np.float)       # training labels
        dev_X = dev_df.filter(feature_names).as_matrix().astype(np.float)      # dev data
        dev_y = dev_df.filter([target]).as_matrix().astype(np.float)           # dev labels

        # For compatibility, make 1D
        train_y = train_y.reshape((-1,))
        dev_y = dev_y.reshape((-1,))

        print "Training set: %d examples" % (train_X.shape[0],)
        print "Dev set: %d examples" % (dev_X.shape[0],)
        print "Test set: %d examples" % (test_X_master.shape[0],)

        ##
        # Preprocessing: scale data, keep SVM happy
        scaler = preprocessing.StandardScaler()
        train_X = scaler.fit_transform(train_X)   # faster than fit, transform separately
        dev_X = scaler.transform(dev_X)           # scale dev set
        test_X = scaler.transform(test_X_master)  # scale test set

        ##
        # Build classifier from pre-specified parameters
        if args.classifier == 'svr':
            print "Initializing SVR model"
            clf = SVR(**STANDARD_PARAMS['svr'])
        elif args.classifier == 'rf':
            print "Initializing RandomForestRegressor model, seed=%d" % args.rfseed
            clf = RandomForestRegressor(random_state=args.rfseed,
                                        **STANDARD_PARAMS['rf'])
        elif args.classifier == 'elasticnet':
            print "Initializing ElasticNet model"
            clf = ElasticNet(max_iter=10000, **STANDARD_PARAMS['elasticnet'])
        else:
            raise ValueError("Invalid classifier '%s' specified." % args.classifier)

        clf.fit(train_X, train_y)

        ##
        # Predict scores for dev set
        dev_pred = clf.predict(dev_X)
        # dev_df[result_label] = dev_pred
        home_df.loc[dev_df.index, result_label] = dev_pred

        ##
        # Predict scores for test set
        # Average comment scores across each CV fold
        test_pred = clf.predict(test_X)
        test_df[result_label] += (1.0 / cv_folds) * test_pred

        ##
        # Extract feature importances
        features, importance = get_feature_importance(clf, args.classifier,
                                                      feature_names=feature_names,
                                                      sorted=True)
        feature_importances["fold_%d" % foldidx] = pd.Series(data=importance,
                                                             index=features)

    print 'Performance on dev data (NDCG with %s weighting, min %d comments)' % (
        args.ndcg_weight, args.min_posts_ndcg)
    # ndcg_dev = eval_func(home_df)
    ndcg_dev = eval_func(home_df[home_df.parent_nchildren >= args.min_posts_ndcg])
    for i, score in enumerate(ndcg_dev, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    # print 'Karma MSE: %.5f' % mean_squared_error(dev_y, dev_pred)

    print 'Performance on test data (NDCG with %s weighting, min %d comments)' % (
        args.ndcg_weight, args.min_posts_ndcg)
    ndcg_test = eval_func(test_df[test_df.parent_nchildren >= args.min_posts_ndcg])
    # ndcg_test = eval_func(test_df)
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    # print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    mu = np.mean(train_nsubs)
    s = np.std(train_nsubs)
    print("Training set size: %.02f +/- %.02f subs" % (mu, s)),
    mu = np.mean(train_ncomments)
    s = np.std(train_ncomments)
    print "[%.02f +/- %.02f comments]" % (mu, s)

    ##
    # Save data to HDF5
    if args.savename:
        # Save score predictions
        fields = ["self_id", "parent_id", 'cid', 'sid', 'set',
                  args.target, result_label]
        if not args.ndcg_target in fields:
            fields.append(args.ndcg_target)
        saveas = args.savename + ".scores.h5"
        print "== Saving raw predictions as %s ==" % saveas
        outdf = pd.concat([home_df[fields], test_df[fields]],
                          ignore_index=True)
        outdf.to_hdf(saveas, 'data')

        # Save feature importances
        saveas = args.savename + ".featurescores.h5"
        print "== Recording feature scores to %s ==" % saveas
        feature_importances.to_hdf(saveas, 'data')

        # Save NDCG calculations
        dd = {'k': range(1, max_K + 1),
              'method': [args.ndcg_weight] * max_K,
              'ndcg_dev': ndcg_dev,
              'ndcg_test': ndcg_test}
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
def main(args):
    # Load Data File
    data = pd.read_hdf(args.datafile, 'data')
    print 'Original data dims: ' + str(data.shape)

    if args.list_features:
        print '\n'.join(data.columns.values)
        exit(0)

    # Select features and trim data so all features are present
    feature_names = args.features
    data = clean_data(data, feature_names)
    print 'Cleaned data dims: ' + str(data.shape)

    # Split into train, test
    # and select training target
    target = args.target
    train_df, test_df = split_data(data, args.limit_data, args.test_fraction)
    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"    # annotate

    # Split into X, y for regression
    train_X = train_df.filter(feature_names).as_matrix().astype(np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(np.float)       # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(np.float)    # test data
    test_y = test_df.filter([target]).as_matrix().astype(np.float)         # ground truth

    # import pdb
    # pdb.set_trace()

    # For compatibility, make 1D
    train_y = train_y.reshape((-1,))
    test_y = test_y.reshape((-1,))

    print "Training set: %d examples" % (train_X.shape[0],)
    print "Test set: %d examples" % (test_X.shape[0],)
    print "Selected %d features" % (len(feature_names),)
    print 'Features: %s' % (' '.join(feature_names))

    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)

    ##
    # Run Grid Search / 10xv on training/dev set
    start = time.time()
    print "== Finding optimal classifier using Grid Search =="
    params, svr = train_optimal_classifier(train_X, train_y)
    print params
    print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting
    max_K = 20
    eval_func = lambda data: evaluation.ndcg(data, max_K, target=target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    ##
    # Predict scores for training set
    result_label = "pred_%s" % target  # e.g. pred_score
    train_pred = svr.predict(train_X)
    train_df[result_label] = train_pred
    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    ndcg_train = eval_func(train_df)
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)

    ##
    # Predict scores for test set
    test_pred = svr.predict(test_X)
    test_df[result_label] = test_pred
    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    ndcg_test = eval_func(test_df)
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    ##
    # Save model to disk
    if args.savename:
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(svr, f)

    ##
    # Save data to HDF5
    if args.savename:
        # Concatenate train, test
        df = pd.concat([train_df, test_df], ignore_index=True)
        print "== Exporting data to HDF5 =="
        saveas = args.savename + ".data.h5"
        df.to_hdf(saveas, "data")
        print "  [saved as %s]" % saveas

        # Save NDCG calculations
        dd = {'k': range(1, max_K + 1),
              'method': [args.ndcg_weight] * max_K,
              'ndcg_train': ndcg_train,
              'ndcg_test': ndcg_test}
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
def standard_experiment(train_df, test_df, feature_names, args):
    train_df['set'] = "train"  # annotate
    test_df['set'] = "test"    # annotate

    # Split into X, y for regression
    target = args.target
    train_X = train_df.filter(feature_names).as_matrix().astype(np.float)  # training data
    train_y = train_df.filter([target]).as_matrix().astype(np.float)       # training labels
    test_X = test_df.filter(feature_names).as_matrix().astype(np.float)    # test data
    test_y = test_df.filter([target]).as_matrix().astype(np.float)         # ground truth

    # For compatibility, make 1D
    train_y = train_y.reshape((-1,))
    test_y = test_y.reshape((-1,))

    print "Training set: %d examples" % (train_X.shape[0],)
    print "Test set: %d examples" % (test_X.shape[0],)
    print "Selected %d features" % (len(feature_names),)
    print 'Features: %s' % (' '.join(feature_names))

    ##
    # Preprocessing: scale data, keep SVM happy
    scaler = preprocessing.StandardScaler()
    train_X = scaler.fit_transform(train_X)  # faster than fit, transform separately
    test_X = scaler.transform(test_X)

    ##
    # Run Grid Search / 10xv on training/dev set
    start = time.time()
    print "== Finding optimal classifier using Grid Search =="
    params, clf = train_optimal_classifier(train_X, train_y,
                                           classifier=args.classifier,
                                           quickmode=args.quickmode)
    print "Optimal parameters: " + json.dumps(params, indent=4)
    if hasattr(clf, "support_vectors_"):
        print 'Number of support vectors: %d' % len(clf.support_vectors_)
    print "Took %.2f minutes to train" % ((time.time() - start) / 60.0)

    ##
    # Set up evaluation function
    if args.ndcg_weight == 'target':
        favfunc = evaluation.fav_target  # score weighting
    else:
        favfunc = evaluation.fav_linear  # rank weighting
    max_K = 20
    eval_func = lambda data: evaluation.ndcg(data, max_K,
                                             target=args.target,
                                             result_label=result_label,
                                             fav_func=favfunc)

    ##
    # Predict scores for training set
    result_label = "pred_%s" % args.target  # e.g. pred_score
    train_pred = clf.predict(train_X)
    train_df[result_label] = train_pred
    print 'Performance on training data (NDCG with %s weighting)' % args.ndcg_weight
    ndcg_train = eval_func(train_df)
    for i, score in enumerate(ndcg_train, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(train_y, train_pred)

    ##
    # Predict scores for test set
    test_pred = clf.predict(test_X)
    test_df[result_label] = test_pred
    print 'Performance on test data (NDCG with %s weighting)' % args.ndcg_weight
    ndcg_test = eval_func(test_df)
    for i, score in enumerate(ndcg_test, start=1):
        print '\tNDCG@%d: %.5f' % (i, score)
    print 'Karma MSE: %.5f' % mean_squared_error(test_y, test_pred)

    ##
    # Save model to disk
    if args.savename:
        import cPickle as pickle
        saveas = args.savename + ".model.pkl"
        print "== Saving model as %s ==" % saveas
        with open(saveas, 'w') as f:
            pickle.dump(clf, f)

    ##
    # Save data to HDF5
    if args.savename:
        # Save score predictions
        fields = ["self_id", "parent_id", args.target, result_label]
        saveas = [args.savename + ".scores.train.csv",
                  args.savename + ".scores.test.csv"]
        print "== Saving raw predictions as %s, %s ==" % tuple(saveas)
        train_df[fields].to_csv(saveas[0])
        test_df[fields].to_csv(saveas[1])

        if args.savefull:
            # Concatenate train, test
            df = pd.concat([train_df, test_df], ignore_index=True)
            print "== Exporting data to HDF5 =="
            saveas = args.savename + ".data.h5"
            df.to_hdf(saveas, "data")
            print "  [saved as %s]" % saveas

        # Save NDCG calculations
        dd = {'k': range(1, max_K + 1),
              'method': [args.ndcg_weight] * max_K,
              'ndcg_train': ndcg_train,
              'ndcg_test': ndcg_test}
        resdf = pd.DataFrame(dd)
        saveas = args.savename + ".results.csv"
        print "== Saving results to %s ==" % saveas
        resdf.to_csv(saveas)
def test_fr(self):
    country_destination = 'FR'
    predictions = ['FR', 'US']
    self.assertEqual(evaluation.ndcg(country_destination, predictions), 1,
                     'A correct prediction does not give 1')