def main():
  """ Fits a mean-based predictor on each training split and predicts votes
      for the matching test split.

      For every split index in range(NUM_SETS) the train, test and reviews
      pickles are read from _PKL_DIR, a predictor is fitted on the training
      votes, the training RMSE and nDCG@RANK_SIZE are printed on stdout and
      one prediction per test vote is written, one per line, to
      '<_OUTPUT_DIR>/<_PRED>-<split>-0.dat'.

      Args:
        None.

      Returns:
        None.
  """
  def _read_pickle(name):
    # The context manager closes the handle even if unpickling raises; the
    # original left every pickle file open.
    # NOTE(review): text mode 'r' is kept from the original -- confirm whether
    # these pickles should be read with 'rb'.
    with open('%s/%s' % (_PKL_DIR, name), 'r') as pkl_file:
      return load(pkl_file)

  load_args()

  for i in xrange(NUM_SETS):
    train = _read_pickle('train-%d.pkl' % i)
    test = _read_pickle('test-%d.pkl' % i)
    reviews = _read_pickle('reviews-%d.pkl' % i)
    predictor = fit_predictor(train)
    pred = [predictor(v) for v in train]
    truth = [v['vote'] for v in train]
    print 'TRAINING ERROR'
    print '-- RMSE: %f' % calculate_rmse(pred, truth)
    print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
        pred, truth, RANK_SIZE))
    # 'with' guarantees the output file is flushed and closed even when the
    # predictor fails on some test vote.
    with open('%s/%s-%d-0.dat' % (_OUTPUT_DIR, _PRED, i), 'w') as output:
      for v in test:
        print >> output, predictor(v)
Beispiel #2
0
def main():
    """ Fits a mean-based predictor on each training split and predicts votes
      for the matching test split.

      For every split index in range(NUM_SETS) the train, test and reviews
      pickles are read from _PKL_DIR, a predictor is fitted on the training
      votes, the training RMSE and nDCG@RANK_SIZE are printed on stdout and
      one prediction per test vote is written, one per line, to
      '<_OUTPUT_DIR>/<_PRED>-<split>-0.dat'.

      Args:
        None.

      Returns:
        None.
    """
    def _read_pickle(name):
        # The context manager closes the handle even if unpickling raises;
        # the original left every pickle file open.
        # NOTE(review): text mode 'r' is kept from the original -- confirm
        # whether these pickles should be read with 'rb'.
        with open('%s/%s' % (_PKL_DIR, name), 'r') as pkl_file:
            return load(pkl_file)

    load_args()

    for i in xrange(NUM_SETS):
        train = _read_pickle('train-%d.pkl' % i)
        test = _read_pickle('test-%d.pkl' % i)
        reviews = _read_pickle('reviews-%d.pkl' % i)
        predictor = fit_predictor(train)
        pred = [predictor(v) for v in train]
        truth = [v['vote'] for v in train]
        print 'TRAINING ERROR'
        print '-- RMSE: %f' % calculate_rmse(pred, truth)
        print '-- nDCG@%d: %f' % (RANK_SIZE,
                                  calculate_avg_ndcg(train, reviews, pred,
                                                     truth, RANK_SIZE))
        # 'with' guarantees the output file is flushed and closed even when
        # the predictor fails on some test vote.
        with open('%s/%s-%d-0.dat' % (_OUTPUT_DIR, _PRED, i), 'w') as output:
            for v in test:
                print >> output, predictor(v)
def evaluate_regression(pred, votes, output):
  """ Scores predicted values against the true votes using RMSE, a regression
      metric, and reports the score.

      Args:
        pred: a list of floats with predicted values.
        votes: a list of votes, represented as dictionaries; the 'vote' entry
      of each one is taken as the true value.
        output: a file object to print the result on.

      Returns:
        The value computed by calculate_rmse. The same value is printed on
      the output file as 'RMSE: <value>'.
  """
  error = calculate_rmse(pred, [vote['vote'] for vote in votes])
  print >> output, "RMSE: %f" % error
  return error
Beispiel #4
0
def main():
    """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading pickles'
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        truth = [v['vote'] for v in train]

        for j in xrange(REP):
            print 'Fitting Model'
            model = MF_Model()
            model.fit(train)

            print 'Calculating Predictions'
            pred = model.predict(train)
            print 'TRAINING ERROR'
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE))

            pred = model.predict(val)
            print 'Outputting validation prediction'
            output = open('%s/mf-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
                          'w')
            for p in pred:
                print >> output, p
            output.close()

            pred = model.predict(test)
            print 'Outputting testing prediction'
            output = open(
                '%s/mf-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()
def main():
  """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading pickles'
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    truth = [v['vote'] for v in train]
    
    for j in xrange(REP):
      print 'Fitting Model'
      model = MF_Model()
      model.fit(train)

      print 'Calculating Predictions'
      pred = model.predict(train)
      print 'TRAINING ERROR'
      print '-- RMSE: %f' % calculate_rmse(pred, truth)
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      
      pred = model.predict(val) 
      print 'Outputting validation prediction'
      output = open('%s/mf-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      
      pred = model.predict(test) 
      print 'Outputting testing prediction'
      output = open('%s/mf-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
def main():
  """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    t = time()
    print 'Reading pickles'
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    truth = [v['vote'] for v in train]
    
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Formatting input time: %f' % (time() - t)

    for j in xrange(REP):
      print 'Fitting Model'
      t = time()
      model = LR_Model()
      model.fit(X_train, y_train, qid_train)
      print 'Learning time: %f' % (time() - t)
      print 'Coefficients:'
      print model.w

      print 'Calculating Predictions'
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)

      print 'TRAINING ERROR'
      print '-- RMSE: %f' % calculate_rmse(pred, truth)
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      
      pred = model.predict(X_val) 
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      print 'Outputting validation prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
     
      t = time()
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      print 'Prediction time: %f' % (time() - t)
      print 'Outputting testing prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 
          'w')
      for p in pred:
        print >> output, p
      output.close()
def main():
  """ Predicts votes by applying LambdaMART technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
   
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
 
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, 
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, 
        test, avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()

    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -train '
          '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat '
          '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d -shrinkage '
          '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j, _T,
          _L, _ALPHA)) 

      print 'Evaluating in train'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat '
          '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      raw_pred = []
      predfile = open('%s/rank_pred_train-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in train_index]
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, train_truth, RANK_SIZE))

      print 'Predicting in validation'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat '
          '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in val_index]
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
          'w')
      for p in pred:
        print >> output, p
      output.close()
      
      print 'Predicting in test'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat '
          '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_test-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in test_index]
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, 
          j), 'w')
      for p in pred:
        print >> output, p
      output.close()
def predict():
    """ Predicts votes by applying a SVR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        t = time()

        print 'Reading data'
        reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
        train_truth = [v['vote'] for v in train]

        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Formatting input time: %f' % (time() - t)

        t = time()
        model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL)
        model.fit(X_train, y_train)
        print 'Learning time: %f' % (time() - t)

        pred = model.predict(X_train)
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        pred = model.predict(X_val)
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open(
            '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' %
            (_VAL_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i,
             0), 'w')
        for p in pred:
            print >> output, p
        output.close()

        t = time()
        pred = model.predict(X_test)
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        print 'Prediction time: %f' % (time() - t)
        output = open(
            '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' %
            (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n',
             i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()
def main():
    """ Main method performing fitting, prediction and outputting to file.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        trusts = load(open('%s/trusts.pkl' % _PKL_DIR, 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
        f_train = map_features(train, reviews, users, sim, conn, trusts)
        f_val = map_features(val, reviews, users, sim, conn, trusts)
        f_test = map_features(test, reviews, users, sim, conn, trusts)
        scaler = fit_cap_scaler(f_train)
        f_train = scale_cap_features(scaler, f_train)
        f_val = scale_cap_features(scaler, f_val)
        f_test = scale_cap_features(scaler, f_test)
        for j in xrange(REP):
            print 'Creating variables'
            var_groups = create_variable_groups()
            populate_variables(var_groups, train, users, trusts, f_train)
            print 'Running EM'
            expectation_maximization(var_groups, train)
            print 'Calculating Predictions'
            pred = calculate_predictions(var_groups, train, users, trusts,
                                         f_train, sim, conn)
            print 'TRAINING ERROR'
            truth = [v['vote'] for v in train]
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE))
            print 'Outputting Validation Prediction'
            pred = calculate_predictions(var_groups, val, users, trusts, f_val,
                                         sim, conn)
            output = open('%s/cap-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
                          'w')
            for p in pred:
                print >> output, p
            output.close()
            truth = [v['vote'] for v in val]
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(val, reviews, pred, truth, RANK_SIZE))
            print 'Outputting Test Prediction'
            pred = calculate_predictions(var_groups, test, users, trusts,
                                         f_test, sim, conn)
            output = open(
                '%s/cap-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()
            truth = [v['vote'] for v in test]
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(test, reviews, pred, truth, RANK_SIZE))
def main():
    """ Main method performing fitting, prediction and outputting to file.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading pickles'
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_reviews_ids = set([vote['review'] for vote in train])
        train_reviews = {r_id: reviews[r_id] for r_id in train_reviews_ids}

        for j in xrange(REP):
            print 'Fitting Model'
            model = BETF_Model()
            for v in train:
                v['vote'] /= 5.0
            for r_id in train_reviews:
                train_reviews[r_id]['rating'] /= 5.0
            model.fit(train, train_reviews)

            print 'Calculating Predictions'
            pred = model.predict(train, reviews)
            for v in train:
                v['vote'] *= 5.0
            for r_id in train_reviews:
                train_reviews[r_id]['rating'] *= 5.0
            pred = [p * 5.0 for p in pred]

            truth = [v['vote'] for v in train]
            print 'TRAINING ERROR'
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE))

            pred = model.predict(val, reviews)
            pred = [p * 5.0 for p in pred]
            print 'Outputting Validation Prediction'
            output = open(
                '%s/betf-k:%d,l:%f,r:%f,e:%f,i:%d-%d-%d.dat' %
                (_VAL_DIR, _K, _ALPHA, _BETA, _TOL, _ITER, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()
            truth = [v['vote'] for v in val]
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(val, reviews, pred, truth, RANK_SIZE))

            pred = model.predict(test, reviews)
            pred = [p * 5.0 for p in pred]
            print 'Outputting Test Prediction'
            output = open('%s/betf-k:%d,l:%f,r:%f,e:%f,i:%d-%d-%d.dat' % \
                (_OUTPUT_DIR, _K, _ALPHA, _BETA, _TOL, _ITER, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()
            truth = [v['vote'] for v in test]
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(test, reviews, pred, truth, RANK_SIZE))
Beispiel #11
0
def main():
    """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
    load_args()

    for i in xrange(NUM_SETS):
        t = time()
        print 'Reading pickles'
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
        truth = [v['vote'] for v in train]

        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)
        print 'Formatting input time: %f' % (time() - t)

        for j in xrange(REP):
            print 'Fitting Model'
            t = time()
            model = LR_Model()
            model.fit(X_train, y_train, qid_train)
            print 'Learning time: %f' % (time() - t)
            print 'Coefficients:'
            print model.w

            print 'Calculating Predictions'
            pred = model.predict(X_train)
            if _BIAS:
                bias.add_bias(train, reviews, pred)

            print 'TRAINING ERROR'
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE))

            pred = model.predict(X_val)
            if _BIAS:
                bias.add_bias(val, reviews, pred)
            print 'Outputting validation prediction'
            output = open(
                '%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()

            t = time()
            pred = model.predict(X_test)
            if _BIAS:
                bias.add_bias(test, reviews, pred)
            print 'Prediction time: %f' % (time() - t)
            print 'Outputting testing prediction'
            output = open(
                '%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()
def predict():
    """ Predicts votes by applying RankSVM technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Outputting model'
        outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        train_index = output_model(X_train, y_train, qid_train, outfile)
        outfile.close()
        outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        val_index = output_model(X_val, None, qid_val, outfile)
        outfile.close()
        outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        test_index = output_model(X_test, None, qid_test, outfile)
        outfile.close()

        print 'Fitting model'
        print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s '
                         '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') %
                        (_C, _ALGO, _KERNEL, _DATA_DIR, _CONF_STR, i,
                         _MODEL_DIR, _CONF_STR, i))
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_train-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))

        raw_pred = []
        predfile = open(
            '%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in train_index]
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        print 'Predicting in validation'
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_val-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))
        predfile = open(
            '%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in val_index]
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w')
        for p in pred:
            print >> output, p
        output.close()

        print 'Predicting in test'
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_test-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))
        predfile = open(
            '%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in test_index]
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i),
                      'w')
        for p in pred:
            print >> output, p
        output.close()
def main():
  """ Main method, which performs prediction and outputs to file.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    
    print 'Modeling'
    X_train = model_dyad(train, sim, conn, avg_sim, avg_conn)
    X_val = model_dyad(val, sim, conn, avg_sim, avg_conn)
    X_test = model_dyad(test, sim, conn, avg_sim, avg_conn)
    train_reviews = set([v['review'] for v in train])
    test_reviews = set([v['review'] for v in val]).union(set([v['review'] for v
        in test]))
    X_item_train, item_train_key , X_item_test, item_test_key = \
        model_items(reviews, users, train_reviews, test_reviews, avg_user)
        # train, test: same file, different scaling
    train_users = set([v['voter'] for v in train])
    test_users = set([v['voter'] for v in val]).union(set([v['voter'] for v in
        test]))
    X_user_train, user_train_key, X_user_test, user_test_key = \
        model_users(users, train_users, test_users, avg_user)

    print 'Scaling'
    dyad_scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(dyad_scaler, X_train)
    X_val = scale_features(dyad_scaler, X_val)
    X_test = scale_features(dyad_scaler, X_test)
    item_scaler = fit_scaler('minmax', X_item_train)
    X_item_train = scale_features(item_scaler, X_item_train)
    X_item_test = scale_features(item_scaler, X_item_test)
    user_scaler = fit_scaler('minmax', X_user_train)
    X_user_train = scale_features(user_scaler, X_user_train)
    X_user_test = scale_features(user_scaler, X_user_test)
    X_item = vstack((X_item_train, X_item_test))
    item_key = item_train_key + item_test_key
    X_user = vstack((X_user_train, X_user_test))
    user_key = user_train_key + user_test_key

    print 'Outputting model'
    output_dyad('train', train, X_train, i)
    output_dyad('val', val, X_val, i)
    output_dyad('test', test, X_test, i)
    output_entity('item', X_item, item_key, i)
    output_entity('user', X_user, user_key, i)

    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d '
          '%s') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s train' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      predfile = open('%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i,
          j), 'r')
      pred = [float(p.strip()) for p in predfile]
      predfile.close()
      truth = [v['vote']  for v in train]
      print len(pred)
      print len(truth)
      print '~ Training error on set %d repetition %d' % (i, 0)
      print 'RMSE: %f' % calculate_rmse(pred, truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
          truth, RANK_SIZE))

      print 'Predicting in validation'
      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s val' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      print 'Predicting in test'
      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s test' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
Beispiel #14
0
def main():
    """ Predicts votes by applying a LR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        model = Ridge(alpha=_BETA)
        # for standardized notation across algorithms, we consider alpha to be
        # learning rate of and beta, regularization weight
        model.fit(X_train, y_train)

        pred = model.predict(X_train)
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        pred = model.predict(X_val)
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open(
            '%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' %
            (_VAL_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()

        pred = model.predict(X_test)
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        output = open(
            '%s/lr-r:%f,f:%s,b:%s,-%d-%d.dat' %
            (_OUTPUT_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()
def main():
  """ Predicts votes by applying a LR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn,
        test, avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    model = Ridge(alpha=_BETA) 
        # for standardized notation across algorithms, we consider alpha to be 
        # learning rate of and beta, regularization weight
    model.fit(X_train , y_train)
    
    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))

    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
    
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s,-%d-%d.dat' % (_OUTPUT_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
Beispiel #16
0
def main():
    """ Predicts votes by applying LambdaMART technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Outputting model'
        outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        train_index = output_model(X_train, y_train, qid_train, outfile)
        outfile.close()
        outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        val_index = output_model(X_val, None, qid_val, outfile)
        outfile.close()
        outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        test_index = output_model(X_test, None, qid_test, outfile)
        outfile.close()

        for j in xrange(REP):
            print 'Fitting model'
            print getoutput((
                'java -jar lib/ranklib/RankLib.jar -train '
                '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat '
                '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d -shrinkage '
                '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j,
                         _T, _L, _ALPHA))

            print 'Evaluating in train'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat '
                '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            raw_pred = []
            predfile = open(
                '%s/rank_pred_train-%s-%d-%d.dat' %
                (_DATA_DIR, _CONF_STR, i, j), 'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in train_index]
            if _BIAS:
                bias.add_bias(train, reviews, pred)
            print '~ Training error on set %d repetition %d' % (i, j)
            print 'RMSE: %f' % calculate_rmse(pred, train_truth)
            print 'nDCG@%d: %f' % (RANK_SIZE,
                                   calculate_avg_ndcg(train, reviews, pred,
                                                      train_truth, RANK_SIZE))

            print 'Predicting in validation'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat '
                '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            predfile = open(
                '%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j),
                'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in val_index]
            if _BIAS:
                bias.add_bias(val, reviews, pred)
            output = open(
                '%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()

            print 'Predicting in test'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat '
                '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            predfile = open(
                '%s/rank_pred_test-%s-%d-%d.dat' %
                (_DATA_DIR, _CONF_STR, i, j), 'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in test_index]
            if _BIAS:
                bias.add_bias(test, reviews, pred)
            output = open(
                '%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()
def predict():
  """ Predicts votes by applying RankSVM technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
   
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
 
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, 
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, 
        test, avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()

    print 'Fitting model'
    print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s '
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') % (_C, _ALGO,
        _KERNEL, _DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i))
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_train-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))

    raw_pred = []
    predfile = open('%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in train_index]
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
        pred, train_truth, RANK_SIZE))

    print 'Predicting in validation'
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_val-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in val_index]
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
    
    print 'Predicting in test'
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_test-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in test_index]
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
Beispiel #18
0
def main():
    """ Main method, which performs prediction and outputs to file.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)

        print 'Modeling'
        X_train = model_dyad(train, sim, conn, avg_sim, avg_conn)
        X_val = model_dyad(val, sim, conn, avg_sim, avg_conn)
        X_test = model_dyad(test, sim, conn, avg_sim, avg_conn)
        train_reviews = set([v['review'] for v in train])
        test_reviews = set([v['review'] for v in val
                            ]).union(set([v['review'] for v in test]))
        X_item_train, item_train_key , X_item_test, item_test_key = \
            model_items(reviews, users, train_reviews, test_reviews, avg_user)
        # train, test: same file, different scaling
        train_users = set([v['voter'] for v in train])
        test_users = set([v['voter']
                          for v in val]).union(set([v['voter'] for v in test]))
        X_user_train, user_train_key, X_user_test, user_test_key = \
            model_users(users, train_users, test_users, avg_user)

        print 'Scaling'
        dyad_scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(dyad_scaler, X_train)
        X_val = scale_features(dyad_scaler, X_val)
        X_test = scale_features(dyad_scaler, X_test)
        item_scaler = fit_scaler('minmax', X_item_train)
        X_item_train = scale_features(item_scaler, X_item_train)
        X_item_test = scale_features(item_scaler, X_item_test)
        user_scaler = fit_scaler('minmax', X_user_train)
        X_user_train = scale_features(user_scaler, X_user_train)
        X_user_test = scale_features(user_scaler, X_user_test)
        X_item = vstack((X_item_train, X_item_test))
        item_key = item_train_key + item_test_key
        X_user = vstack((X_user_train, X_user_test))
        user_key = user_train_key + user_test_key

        print 'Outputting model'
        output_dyad('train', train, X_train, i)
        output_dyad('val', val, X_val, i)
        output_dyad('test', test, X_test, i)
        output_entity('item', X_item, item_key, i)
        output_entity('user', X_user, user_key, i)

        for j in xrange(REP):
            print 'Fitting model'
            print getoutput(
                ('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d '
                 '%s') %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s train' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            predfile = open(
                '%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i, j), 'r')
            pred = [float(p.strip()) for p in predfile]
            predfile.close()
            truth = [v['vote'] for v in train]
            print len(pred)
            print len(truth)
            print '~ Training error on set %d repetition %d' % (i, 0)
            print 'RMSE: %f' % calculate_rmse(pred, truth)
            print 'nDCG@%d: %f' % (RANK_SIZE,
                                   calculate_avg_ndcg(train, reviews, pred,
                                                      truth, RANK_SIZE))

            print 'Predicting in validation'
            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s val' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            print 'Predicting in test'
            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s test' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
def predict():
  """ Predicts votes by applying a SVR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    t = time()

    print 'Reading data'
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train] 

    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Formatting input time: %f' % (time() - t)

    t = time()
    model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL)
    model.fit(X_train , y_train)
    print 'Learning time: %f' % (time() - t)

    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))

    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, 
        _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0),
        'w')
    for p in pred:
      print >> output, p
    output.close()
    
    t = time()
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    print 'Prediction time: %f' % (time() - t)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % 
        (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 
        'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
def main():
  """ Main method performing fitting, prediction and outputting to file.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    trusts = load(open('%s/trusts.pkl' % _PKL_DIR, 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    f_train = map_features(train, reviews, users, sim, conn, trusts)
    f_val = map_features(val, reviews, users, sim, conn, trusts)
    f_test = map_features(test, reviews, users, sim, conn, trusts)
    scaler = fit_cap_scaler(f_train)
    f_train = scale_cap_features(scaler, f_train)
    f_val = scale_cap_features(scaler, f_val)
    f_test = scale_cap_features(scaler, f_test)
    for j in xrange(REP):
      print 'Creating variables'
      var_groups = create_variable_groups()
      populate_variables(var_groups, train, users, trusts, f_train)
      print 'Running EM'
      expectation_maximization(var_groups, train)
      print 'Calculating Predictions'
      pred = calculate_predictions(var_groups, train, users, trusts, f_train,
          sim, conn)
      print 'TRAINING ERROR'
      truth = [v['vote'] for v in train]
      print '-- RMSE: %f' % calculate_rmse(pred, truth) 
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      print 'Outputting Validation Prediction'
      pred = calculate_predictions(var_groups, val, users, trusts, f_val, sim,
          conn)
      output = open('%s/cap-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      truth = [v['vote'] for v in val]
      print '-- RMSE: %f' % calculate_rmse(pred, truth) 
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(val, reviews,
          pred, truth, RANK_SIZE))
      print 'Outputting Test Prediction'
      pred = calculate_predictions(var_groups, test, users, trusts, f_test, sim,
          conn)
      output = open('%s/cap-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      truth = [v['vote'] for v in test]
      print '-- RMSE: %f' % calculate_rmse(pred, truth) 
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(test, reviews,
          pred, truth, RANK_SIZE))

if __name__ == '__main__':
  print 'Reading pickles'
  train = load(open('%s/train%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
  test = load(open('%s/test%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
  overall_mean = float(sum([float(v['vote']) for v in train])) / len(train)
  
  print 'Fitting Model'
  model = BiasModel()
  model.fit(train)

  print 'Calculating Predictions'
  pred = model.predict(train)
   
  print 'TRAINING ERROR'
  truth = [v['vote'] for v in train]
  rmse = calculate_rmse(pred, truth) 
  print 'RMSE: %s' % rmse
  for i in xrange(5, 21, 5):
    score = calculate_ndcg(pred, truth, i)
    print 'NDCG@%d: %f' % (i, score)
  
  print 'Outputing Prediction'
  pred = model.predict(test) 
  output = open('%s/bias%.2f.dat' % (_OUTPUT_DIR, _SAMPLE * 100), 'w')
  for p in pred:
    print >> output, overall_mean if isnan(p) else p
  output.close()

Beispiel #22
0
                pred[index] += self.product_bias[product]


if __name__ == '__main__':
    print 'Reading pickles'
    train = load(open('%s/train%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
    test = load(open('%s/test%.2f.pkl' % (_PKL_DIR, _SAMPLE * 100), 'r'))
    overall_mean = float(sum([float(v['vote']) for v in train])) / len(train)

    print 'Fitting Model'
    model = BiasModel()
    model.fit(train)

    print 'Calculating Predictions'
    pred = model.predict(train)

    print 'TRAINING ERROR'
    truth = [v['vote'] for v in train]
    rmse = calculate_rmse(pred, truth)
    print 'RMSE: %s' % rmse
    for i in xrange(5, 21, 5):
        score = calculate_ndcg(pred, truth, i)
        print 'NDCG@%d: %f' % (i, score)

    print 'Outputing Prediction'
    pred = model.predict(test)
    output = open('%s/bias%.2f.dat' % (_OUTPUT_DIR, _SAMPLE * 100), 'w')
    for p in pred:
        print >> output, overall_mean if isnan(p) else p
    output.close()
def main():
  """ Predicts votes by applying a GBRT regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
 
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    for j in xrange(REP):
      model = GradientBoostingRegressor(loss=_LOSS, learning_rate=_ALPHA,
          n_estimators=_T, max_depth=_MAX_D, subsample=_SUBSAMPLE, 
          max_features=_MAX_F, random_state=(int(time() * 1000000) % 1000000))
      model.fit(X_train, y_train)
      
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
          train_truth, RANK_SIZE))

      pred = model.predict(X_val)
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % 
          (_VAL_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F), 
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % 
          (_OUTPUT_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F),
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()