def map_features(votes, reviews, users, users_sim, users_conn, trusts):
  """ Maps all features related to each vote, in the same order.

      Args:
        votes: list of vote dictionaries.
        reviews: dictionary of review dictionaries indexed by review id.
        users: dictionary of user dictionaries indexed by user id.
        users_sim: dictionary of user similarity dictionaries indexed by a
          2-tuple of user ids.
        users_conn: dictionary of user connection dictionaries indexed by a
          2-tuple of user ids.
        trusts: networkx DiGraph with trust network.

      Returns:
        A dictionary of features indexed by related entity name (e.g.: voter)
          and containing a list with a feature array for each vote.
  """
  avg_user = compute_avg_user(users)
  avg_sim = compute_avg_model(users_sim)
  avg_conn = compute_avg_model(users_conn)
  features = {'review': [], 'author': [], 'voter': [], 'sim': [], 'conn': []}
  for vote in votes:
    r_id, a_id, v_id = vote['review'], vote['author'], vote['voter']
    # Review features.
    r_feat = map_review_features(reviews[r_id])
    features['review'].append(r_feat)
    # Author features, falling back to the average user (mean imputation).
    author = users[a_id] if a_id in users else avg_user
    a_feat = map_author_features(author, avg_user)
    features['author'].append(a_feat)
    # Voter features, falling back to the average user (mean imputation).
    voter = users[v_id] if v_id in users else avg_user
    v_feat = map_voter_features(voter, avg_user)
    features['voter'].append(v_feat)
    # Similarity features, only for (author, voter) pairs marked as similar.
    if v_id in users and a_id in users[v_id]['similars']:
      if (a_id, v_id) in users_sim:
        sim = users_sim[(a_id, v_id)]
        sim_feat = map_users_sim_features(sim, avg_sim)
        features['sim'].append(sim_feat)
    # Connection features, only for pairs connected in the trust network.
    if v_id in trusts and a_id in trusts[v_id]:
      if (a_id, v_id) in users_conn:
        conn = users_conn[(a_id, v_id)]
        conn_feat = map_users_conn_features(conn, avg_conn)
        features['conn'].append(conn_feat)
  return features
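# compute_avg_user and compute_avg_model are defined elsewhere in the
# repository; the sketch below only illustrates the kind of mean imputation
# map_features relies on. It assumes each model is a dictionary of numeric
# features, which may not match the real implementation.
def _avg_model_sketch(models):
  """ Hypothetical sketch: averages every numeric feature across models. """
  from collections import defaultdict
  sums = defaultdict(float)
  counts = defaultdict(int)
  for model in models.itervalues():
    for feature, value in model.iteritems():
      sums[feature] += value
      counts[feature] += 1
  return dict((f, sums[f] / counts[f]) for f in sums)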
def main():
  """ Main method, which performs prediction and outputs to file.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    print 'Modeling'
    X_train = model_dyad(train, sim, conn, avg_sim, avg_conn)
    X_val = model_dyad(val, sim, conn, avg_sim, avg_conn)
    X_test = model_dyad(test, sim, conn, avg_sim, avg_conn)
    train_reviews = set([v['review'] for v in train])
    test_reviews = set([v['review'] for v in val]) \
        .union(set([v['review'] for v in test]))
    # train, test: same file, different scaling
    X_item_train, item_train_key, X_item_test, item_test_key = \
        model_items(reviews, users, train_reviews, test_reviews, avg_user)
    train_users = set([v['voter'] for v in train])
    test_users = set([v['voter'] for v in val]) \
        .union(set([v['voter'] for v in test]))
    X_user_train, user_train_key, X_user_test, user_test_key = \
        model_users(users, train_users, test_users, avg_user)
    print 'Scaling'
    dyad_scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(dyad_scaler, X_train)
    X_val = scale_features(dyad_scaler, X_val)
    X_test = scale_features(dyad_scaler, X_test)
    item_scaler = fit_scaler('minmax', X_item_train)
    X_item_train = scale_features(item_scaler, X_item_train)
    X_item_test = scale_features(item_scaler, X_item_test)
    user_scaler = fit_scaler('minmax', X_user_train)
    X_user_train = scale_features(user_scaler, X_user_train)
    X_user_test = scale_features(user_scaler, X_user_test)
    X_item = vstack((X_item_train, X_item_test))
    item_key = item_train_key + item_test_key
    X_user = vstack((X_user_train, X_user_test))
    user_key = user_train_key + user_test_key
    print 'Outputting model'
    output_dyad('train', train, X_train, i)
    output_dyad('val', val, X_val, i)
    output_dyad('test', test, X_test, i)
    output_entity('item', X_item, item_key, i)
    output_entity('user', X_user, user_key, i)
    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d '
          '%s') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
      print getoutput(('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s train') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j,
          _DATA_DIR))
      predfile = open('%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i, j),
          'r')
      pred = [float(p.strip()) for p in predfile]
      predfile.close()
      truth = [v['vote'] for v in train]
      print len(pred)
      print len(truth)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      print 'Predicting in validation'
      print getoutput(('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s val') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
      print 'Predicting in test'
      print getoutput(('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s test') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
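# calculate_rmse comes from the evaluation utilities. This is only a minimal
# sketch of the standard RMSE formula it is assumed to compute,
# sqrt(mean((pred - truth)^2)); the helper name below is hypothetical.
def _rmse_sketch(pred, truth):
  """ Hypothetical sketch of RMSE between two equally sized lists. """
  from math import sqrt
  total = sum((p - t) ** 2 for p, t in zip(pred, truth))
  return sqrt(total / float(len(pred)))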
def main():
  """ Predicts votes by applying LambdaMART technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()
    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -train '
          '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat '
          '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d -shrinkage '
          '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j, _T,
          _L, _ALPHA))
      print 'Evaluating in train'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat '
          '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') %
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_train-%s-%d-%d.dat' % (_DATA_DIR,
          _CONF_STR, i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in train_index]
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, train_truth, RANK_SIZE))
      print 'Predicting in validation'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat '
          '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') %
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in val_index]
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
          'w')
      for p in pred:
        print >> output, p
      output.close()
      print 'Predicting in test'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat '
          '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') %
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_test-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in test_index]
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i,
          j), 'w')
      for p in pred:
        print >> output, p
      output.close()
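# output_model is defined elsewhere; it writes the features in the
# SVM-light / RankLib input format ("<label> qid:<q> 1:<v1> 2:<v2> ...") and
# returns a mapping from the original vote order to the output row order.
# The sketch below is an assumption about that contract, not the project's
# actual code.
def _output_model_sketch(X, y, qid, outfile):
  """ Hypothetical sketch: writes ranking input grouped by query id. """
  order = sorted(range(len(qid)), key=lambda k: qid[k])  # group rows by query
  for pos, k in enumerate(order):
    label = y[k] if y is not None else 0
    feats = ' '.join('%d:%f' % (d + 1, v) for d, v in enumerate(X[k]))
    print >> outfile, '%s qid:%s %s' % (label, qid[k], feats)
  index = [0] * len(order)
  for pos, k in enumerate(order):
    index[k] = pos  # output line 'pos' holds the original k-th vote
  return index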
def main():
  """ Predicts votes by applying a LR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    # For standardized notation across algorithms, we consider alpha to be the
    # learning rate and beta, the regularization weight.
    model = Ridge(alpha=_BETA)
    model.fit(X_train, y_train)
    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))
    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
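# fit_scaler and scale_features are shared utilities defined elsewhere; the
# sketch below shows one plausible implementation of the 'minmax' case using
# scikit-learn, fitted on training features only and reused on val/test.
# The helper names are hypothetical.
from sklearn.preprocessing import MinMaxScaler

def _fit_scaler_sketch(scale_type, X):
  """ Hypothetical sketch: fits a scaler of the given type on X. """
  if scale_type == 'minmax':
    return MinMaxScaler().fit(X)
  raise ValueError('unsupported scale type: %s' % scale_type)

def _scale_features_sketch(scaler, X):
  """ Hypothetical sketch: applies a previously fitted scaler to X. """
  return scaler.transform(X)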
def main():
  """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
  load_args()
  for i in xrange(NUM_SETS):
    t = time()
    print 'Reading pickles'
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Formatting input time: %f' % (time() - t)
    for j in xrange(REP):
      print 'Fitting Model'
      t = time()
      model = LR_Model()
      model.fit(X_train, y_train, qid_train)
      print 'Learning time: %f' % (time() - t)
      print 'Coefficients:'
      print model.w
      print 'Calculating Predictions'
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print 'TRAINING ERROR'
      print '-- RMSE: %f' % calculate_rmse(pred, truth)
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      pred = model.predict(X_val)
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      print 'Outputting validation prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
          'w')
      for p in pred:
        print >> output, p
      output.close()
      t = time()
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      print 'Prediction time: %f' % (time() - t)
      print 'Outputting testing prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j),
          'w')
      for p in pred:
        print >> output, p
      output.close()
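# calculate_avg_ndcg comes from the evaluation utilities. The sketch below
# shows the standard nDCG@k computation it is assumed to perform; grouping
# votes by voter is an assumption made here only for illustration, and the
# helper name is hypothetical.
from math import log

def _avg_ndcg_sketch(votes, pred, truth, k):
  """ Hypothetical sketch: mean nDCG@k over per-voter rankings. """
  groups = {}
  for index, vote in enumerate(votes):
    groups.setdefault(vote['voter'], []).append(index)
  def dcg(scores):
    return sum((2 ** s - 1) / log(pos + 2, 2) for pos, s in enumerate(scores))
  ndcgs = []
  for indices in groups.itervalues():
    by_pred = sorted(indices, key=lambda x: pred[x], reverse=True)[:k]
    by_truth = sorted(indices, key=lambda x: truth[x], reverse=True)[:k]
    ideal = dcg([truth[x] for x in by_truth])
    if ideal > 0:
      ndcgs.append(dcg([truth[x] for x in by_pred]) / ideal)
  return sum(ndcgs) / len(ndcgs) if ndcgs else 0.0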
def predict():
  """ Predicts votes by applying a SVR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    t = time()
    print 'Reading data'
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Formatting input time: %f' % (time() - t)
    t = time()
    model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL)
    model.fit(X_train, y_train)
    print 'Learning time: %f' % (time() - t)
    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))
    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _C,
        _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
    t = time()
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    print 'Prediction time: %f' % (time() - t)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR,
        _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
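# BiasModel is implemented elsewhere in the repository. The sketch below only
# illustrates the bias-removal pattern used above (fit_transform subtracts a
# bias from the training votes, add_bias restores it on predictions); the
# specific choice of a per-review mean bias is an assumption for illustration,
# and the class name is hypothetical.
class _BiasModelSketch(object):
  """ Hypothetical per-review mean bias. """

  def fit_transform(self, votes, reviews):
    sums, counts = {}, {}
    for vote in votes:
      sums[vote['review']] = sums.get(vote['review'], 0.0) + vote['vote']
      counts[vote['review']] = counts.get(vote['review'], 0) + 1
    self.bias = dict((r, sums[r] / counts[r]) for r in sums)
    residual = []
    for vote in votes:
      new_vote = dict(vote)
      new_vote['vote'] = vote['vote'] - self.bias[vote['review']]
      residual.append(new_vote)
    return residual

  def add_bias(self, votes, reviews, pred):
    for index, vote in enumerate(votes):
      pred[index] += self.bias.get(vote['review'], 0.0)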
def predict():
  """ Predicts votes by applying RankSVM technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()
    print 'Fitting model'
    print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s '
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') % (_C, _ALGO,
        _KERNEL, _DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i))
    print getoutput(('lib/svm_rank/svm_rank_classify '
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_train-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i,
        _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in train_index]
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))
    print 'Predicting in validation'
    print getoutput(('lib/svm_rank/svm_rank_classify '
        '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_val-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i,
        _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i),
        'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in val_index]
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
    print 'Predicting in test'
    print getoutput(('lib/svm_rank/svm_rank_classify '
        '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_test-%s-%d-0.dat') % (_DATA_DIR, _CONF_STR, i,
        _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in test_index]
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
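# generate_input is a shared helper defined elsewhere; the sketch below shows
# the assumed contract: one feature vector, one target vote and one query id
# per vote, with mean imputation through the avg_* models. Treating the review
# id as the query key and concatenating the project's feature mappers are
# assumptions made here for illustration; the helper name is hypothetical.
def _generate_input_sketch(reviews, users, sim, conn, votes, avg_user,
    avg_sim, avg_conn):
  """ Hypothetical sketch of the (X, y, qid) construction. """
  X, y, qid = [], [], []
  for vote in votes:
    author = users.get(vote['author'], avg_user)
    voter = users.get(vote['voter'], avg_user)
    pair = (vote['author'], vote['voter'])
    features = list(map_review_features(reviews[vote['review']])) + \
        list(map_author_features(author, avg_user)) + \
        list(map_voter_features(voter, avg_user)) + \
        list(map_users_sim_features(sim.get(pair, avg_sim), avg_sim)) + \
        list(map_users_conn_features(conn.get(pair, avg_conn), avg_conn))
    X.append(features)
    y.append(vote['vote'])
    qid.append(vote['review'])
  return X, y, qid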
def main():
  """ Predicts votes by applying a GBRT regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
        avg_user, avg_sim, avg_conn)
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    for j in xrange(REP):
      model = GradientBoostingRegressor(loss=_LOSS, learning_rate=_ALPHA,
          n_estimators=_T, max_depth=_MAX_D, subsample=_SUBSAMPLE,
          max_features=_MAX_F,
          random_state=(int(time() * 1000000) % 1000000))
      model.fit(X_train, y_train)
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, train_truth, RANK_SIZE))
      pred = model.predict(X_val)
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat'
          % (_VAL_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F),
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat'
          % (_OUTPUT_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F),
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
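# load_args is defined per script and fills in the module-level parameters
# (_ALPHA, _T, _MAX_D, ...) from the command line. The sketch below only
# illustrates that pattern with argparse; the flag names and defaults are
# assumptions, not the project's actual interface.
def _load_args_sketch():
  """ Hypothetical sketch of command-line parameter loading. """
  import argparse
  global _ALPHA, _T, _MAX_D, _BIAS
  parser = argparse.ArgumentParser(description='GBRT vote prediction')
  parser.add_argument('--alpha', type=float, default=0.1,
      help='learning rate (shrinkage)')
  parser.add_argument('--trees', type=int, default=100,
      help='number of boosting iterations')
  parser.add_argument('--depth', type=int, default=3,
      help='maximum depth of each tree')
  parser.add_argument('--bias', action='store_true',
      help='remove and restore review bias around the regressor')
  args = parser.parse_args()
  _ALPHA, _T, _MAX_D, _BIAS = args.alpha, args.trees, args.depth, args.bias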