Example #1
def scale_cap_features(scaler, features):
    """ Scales features for CAP using previously fitted scaler.

      Args:
        scaler: dictionary of scalers, indexed by entity name.
        features: dictionary of list of feature arrays, indexed by entity name.

      Returns:
        A new dictionary of features, in the same format but with scaled values.
  """
    features['review'] = scale_features(scaler['review'], features['review'])
    features['author'] = scale_features(scaler['author'], features['author'])
    features['voter'] = scale_features(scaler['voter'], features['voter'])
    if features['sim']:
        features['sim'] = scale_features(scaler['sim'], features['sim'])
    if features['conn']:
        features['conn'] = scale_features(scaler['conn'], features['conn'])
    return features
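
All of the examples on this page call two helpers that are never shown, fit_scaler and scale_features. A minimal sketch of what they could look like, assuming dense numeric feature matrices and scikit-learn's MinMaxScaler (matching the 'minmax' argument used throughout); the project's actual helpers may differ, e.g. by supporting other scaler types:

import numpy as np
from sklearn.preprocessing import MinMaxScaler

def fit_scaler(scale_type, features):
    """ Fits a scaler on training features; only 'minmax' is handled
        in this sketch, since that is all the examples use. """
    assert scale_type == 'minmax'
    scaler = MinMaxScaler()
    scaler.fit(np.asarray(features))
    return scaler

def scale_features(scaler, features):
    """ Applies a previously fitted scaler to a feature matrix. """
    return scaler.transform(np.asarray(features))
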
Example #2
def scale_cap_features(scaler, features):
  """ Scales features for CAP using previously fitted scaler.

      Args:
        scaler: dictionary of scalers, indexed by entity name.
        features: dictionary of list of feature arrays, indexed by entity name.

      Returns:
        A new dictionary of features, in the same format but with scaled values.
  """
  features['review'] = scale_features(scaler['review'], features['review'])
  features['author'] = scale_features(scaler['author'], features['author'])
  features['voter'] = scale_features(scaler['voter'], features['voter'])
  if features['sim']:
    features['sim'] = scale_features(scaler['sim'], features['sim'])
  if features['conn']:
    features['conn'] = scale_features(scaler['conn'], features['conn'])
  return features
Example #3
def main():
    """ Predicts votes by applying LambdaMART technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Outputting model'
        outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        train_index = output_model(X_train, y_train, qid_train, outfile)
        outfile.close()
        outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        val_index = output_model(X_val, None, qid_val, outfile)
        outfile.close()
        outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        test_index = output_model(X_test, None, qid_test, outfile)
        outfile.close()

        for j in xrange(REP):
            print 'Fitting model'
            print getoutput((
                'java -jar lib/ranklib/RankLib.jar -train '
                '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat '
                '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d -shrinkage '
                '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j,
                         _T, _L, _ALPHA))

            print 'Evaluating in train'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat '
                '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            predfile = open(
                '%s/rank_pred_train-%s-%d-%d.dat' %
                (_DATA_DIR, _CONF_STR, i, j), 'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in train_index]
            if _BIAS:
                bias.add_bias(train, reviews, pred)
            print '~ Training error on set %d repetition %d' % (i, j)
            print 'RMSE: %f' % calculate_rmse(pred, train_truth)
            print 'nDCG@%d: %f' % (RANK_SIZE,
                                   calculate_avg_ndcg(train, reviews, pred,
                                                      train_truth, RANK_SIZE))

            print 'Predicting in validation'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat '
                '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            predfile = open(
                '%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR, i, j),
                'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in val_index]
            if _BIAS:
                bias.add_bias(val, reviews, pred)
            output = open(
                '%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()

            print 'Predicting in test'
            print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
                '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat '
                '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
                (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
                _CONF_STR, i, j))
            predfile = open(
                '%s/rank_pred_test-%s-%d-%d.dat' %
                (_DATA_DIR, _CONF_STR, i, j), 'r')
            raw_pred = [float(p.strip().split()[2]) for p in predfile]
            predfile.close()
            pred = [raw_pred[k] for k in test_index]
            if _BIAS:
                bias.add_bias(test, reviews, pred)
            output = open(
                '%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()
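
This example repeats one parsing idiom three times: RankLib's -score output has one line per ranked document, with the score in the third whitespace-separated field. A small helper capturing that pattern (a sketch; the three-column layout is inferred from the list comprehensions above, not from RankLib's documentation):

def read_ranklib_scores(path):
    """ Reads a RankLib -score file into a list of floats; assumes the
        score sits in the third whitespace-separated field, as in the
        parsing above. """
    with open(path) as predfile:
        return [float(line.strip().split()[2]) for line in predfile]

With it, each open/parse/close block above collapses to raw_pred = read_ranklib_scores(path).
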
Example #4
def main():
    """ Main method, which performs prediction and outputs to file.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)

        print 'Modeling'
        X_train = model_dyad(train, sim, conn, avg_sim, avg_conn)
        X_val = model_dyad(val, sim, conn, avg_sim, avg_conn)
        X_test = model_dyad(test, sim, conn, avg_sim, avg_conn)
        train_reviews = set([v['review'] for v in train])
        test_reviews = set([v['review'] for v in val
                            ]).union(set([v['review'] for v in test]))
        X_item_train, item_train_key, X_item_test, item_test_key = \
            model_items(reviews, users, train_reviews, test_reviews, avg_user)
        # train, test: same file, different scaling
        train_users = set([v['voter'] for v in train])
        test_users = set([v['voter']
                          for v in val]).union(set([v['voter'] for v in test]))
        X_user_train, user_train_key, X_user_test, user_test_key = \
            model_users(users, train_users, test_users, avg_user)

        print 'Scaling'
        dyad_scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(dyad_scaler, X_train)
        X_val = scale_features(dyad_scaler, X_val)
        X_test = scale_features(dyad_scaler, X_test)
        item_scaler = fit_scaler('minmax', X_item_train)
        X_item_train = scale_features(item_scaler, X_item_train)
        X_item_test = scale_features(item_scaler, X_item_test)
        user_scaler = fit_scaler('minmax', X_user_train)
        X_user_train = scale_features(user_scaler, X_user_train)
        X_user_test = scale_features(user_scaler, X_user_test)
        X_item = vstack((X_item_train, X_item_test))
        item_key = item_train_key + item_test_key
        X_user = vstack((X_user_train, X_user_test))
        user_key = user_train_key + user_test_key

        print 'Outputting model'
        output_dyad('train', train, X_train, i)
        output_dyad('val', val, X_val, i)
        output_dyad('test', test, X_test, i)
        output_entity('item', X_item, item_key, i)
        output_entity('user', X_user, user_key, i)

        for j in xrange(REP):
            print 'Fitting model'
            print getoutput(
                ('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d '
                 '%s') %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s train' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            predfile = open(
                '%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i, j), 'r')
            pred = [float(p.strip()) for p in predfile]
            predfile.close()
            truth = [v['vote'] for v in train]
            print len(pred)
            print len(truth)
            print '~ Training error on set %d repetition %d' % (i, j)
            print 'RMSE: %f' % calculate_rmse(pred, truth)
            print 'nDCG@%d: %f' % (RANK_SIZE,
                                   calculate_avg_ndcg(train, reviews, pred,
                                                      truth, RANK_SIZE))

            print 'Predicting in validation'
            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s val' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

            print 'Predicting in test'
            print getoutput(
                'Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
                '%s test' %
                (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
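
getoutput here is Python 2's commands.getoutput, which merges stderr into the returned string and discards the exit status. A sketch of the same R invocations through subprocess instead, assuming the positional-argument order visible above (K, iterations, samples, burn-in, feature flag, set index, repetition, data dir, plus an optional split name for the predict script):

import subprocess

def run_rlfm(script, k, n_iter, samples, burn_in, feat, i, j, data_dir,
             split=None):
    """ Runs an RLFM R script with the argument order used above;
        split is 'train', 'val' or 'test' for rlfm_predict.R and
        omitted for rlfm_fit.R. Raises on a nonzero exit status. """
    cmd = ['Rscript', script, str(k), str(n_iter), str(samples),
           str(burn_in), str(feat), str(i), str(j), data_dir]
    if split is not None:
        cmd.append(split)
    return subprocess.check_output(cmd)

Usage, e.g.: run_rlfm('lib/rlfm/rlfm_predict.R', _K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR, 'val').
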
Example #5
def main():
  """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    t = time()
    print 'Reading pickles'
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    truth = [v['vote'] for v in train]
    
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)
    print 'Formatting input time: %f' % (time() - t)

    for j in xrange(REP):
      print 'Fitting Model'
      t = time()
      model = LR_Model()
      model.fit(X_train, y_train, qid_train)
      print 'Learning time: %f' % (time() - t)
      print 'Coefficients:'
      print model.w

      print 'Calculating Predictions'
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)

      print 'TRAINING ERROR'
      print '-- RMSE: %f' % calculate_rmse(pred, truth)
      print '-- nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, truth, RANK_SIZE))
      
      pred = model.predict(X_val) 
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      print 'Outputting validation prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
     
      t = time()
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      print 'Prediction time: %f' % (time() - t)
      print 'Outputting testing prediction'
      output = open('%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j), 
          'w')
      for p in pred:
        print >> output, p
      output.close()
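
BiasModel appears in most examples with the same contract: fit_transform subtracts a learned bias from the training votes, and add_bias restores it to predictions in place. A deliberately minimal sketch that models only a global mean; the project's real BiasModel plausibly also carries per-review and per-voter offsets:

class BiasModel(object):
  """ Illustrative global-mean bias model, not the project's own. """

  def fit_transform(self, votes, reviews):
    # Learn the global mean vote and return votes with it removed.
    self.mean = sum(float(v['vote']) for v in votes) / len(votes)
    transformed = []
    for v in votes:
      w = dict(v)
      w['vote'] = v['vote'] - self.mean
      transformed.append(w)
    return transformed

  def add_bias(self, votes, reviews, pred):
    # Restore the learned bias onto predictions, in place.
    for k in range(len(pred)):
      pred[k] += self.mean
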
Example #6
def main():
    """ Predicts votes by applying a LR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        model = Ridge(alpha=_BETA)
        # For standardized notation across algorithms, alpha denotes the
        # learning rate and beta the regularization weight.
        model.fit(X_train, y_train)

        pred = model.predict(X_train)
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        pred = model.predict(X_val)
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open(
            '%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' %
            (_VAL_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()

        pred = model.predict(X_test)
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        output = open(
            '%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' %
            (_OUTPUT_DIR, _BETA, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()
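
generate_input is another helper the page never shows. Its apparent contract is one feature row, one target and one query id per vote, with the average entities used for mean imputation. A heavily simplified sketch under those assumptions; the 'features' field name and the exact feature composition (here just review plus voter features) are hypothetical:

def generate_input(reviews, users, sim, conn, votes, avg_user, avg_sim,
                   avg_conn):
    """ Sketch only: concatenates review and voter features per vote
        ('features' is a hypothetical field name), uses the vote as the
        target and the voter as the query id, and falls back to
        avg_user when a voter is unknown (mean imputation). The sim and
        conn features are omitted here. """
    X, y, qid = [], [], []
    for v in votes:
        voter = users.get(v['voter'], avg_user)
        review = reviews[v['review']]
        X.append(list(review['features']) + list(voter['features']))
        y.append(v['vote'])
        qid.append(v['voter'])
    return X, y, qid
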
Example #7
def predict():
  """ Predicts votes by applying a SVR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    t = time()

    print 'Reading data'
    reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    train_truth = [v['vote'] for v in train] 

    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Formatting input time: %f' % (time() - t)

    t = time()
    model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL)
    model.fit(X_train, y_train)
    print 'Learning time: %f' % (time() - t)

    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))

    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, 
        _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0),
        'w')
    for p in pred:
      print >> output, p
    output.close()
    
    t = time()
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    print 'Prediction time: %f' % (time() - t)
    output = open('%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' % 
        (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 
        'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
Example #8
def main():
  """ Predicts votes by applying LambdaMART technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
   
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
 
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, 
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, 
        test, avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()

    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -train '
          '%s/rank_train-%s-%d.dat -save %s/lambdamart_model-%s-%d-%d.dat '
          '-gmax 5 -ranker 6 -metric2t NDCG@5 -tree %d -leaf %d -shrinkage '
          '%f') % (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, j, _T,
          _L, _ALPHA)) 

      print 'Evaluating in train'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_train-%s-%d.dat '
          '-score %s/rank_pred_train-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_train-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in train_index]
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
          pred, train_truth, RANK_SIZE))

      print 'Predicting in validation'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_val-%s-%d.dat '
          '-score %s/rank_pred_val-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_val-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in val_index]
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j),
          'w')
      for p in pred:
        print >> output, p
      output.close()
      
      print 'Predicting in test'
      print getoutput(('java -jar lib/ranklib/RankLib.jar -load '
          '%s/lambdamart_model-%s-%d-%d.dat -rank %s/rank_test-%s-%d.dat '
          '-score %s/rank_pred_test-%s-%d-%d.dat -gmax 5 -metric2T NDCG@5') % \
          (_MODEL_DIR, _CONF_STR, i, j, _DATA_DIR, _CONF_STR, i, _DATA_DIR,
          _CONF_STR, i, j))
      predfile = open('%s/rank_pred_test-%s-%d-%d.dat' % (_DATA_DIR, _CONF_STR,
          i, j), 'r')
      raw_pred = [float(p.strip().split()[2]) for p in predfile]
      predfile.close()
      pred = [raw_pred[k] for k in test_index]
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/lambdamart-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, 
          j), 'w')
      for p in pred:
        print >> output, p
      output.close()
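
output_model, used by the ranking examples, writes features to a text file and returns an index that later maps raw file-order predictions back to input-row order (pred = [raw_pred[k] for k in train_index]). A sketch assuming SVM-light/RankLib line format and grouping by query id, both of which are inferences rather than confirmed details:

def output_model(X, y, qid, outfile):
  """ Sketch: writes '<target> qid:<q> 1:<f1> 2:<f2> ...' lines with
      rows grouped by query id, and returns index such that input row
      k lands on line index[k] of the file. Targets default to 0 when
      y is None, as for the validation and test splits above. """
  order = sorted(range(len(X)), key=lambda k: qid[k])
  index = [0] * len(X)
  for pos, k in enumerate(order):
    index[k] = pos
    target = y[k] if y is not None else 0
    feats = ' '.join('%d:%f' % (f + 1, value)
                     for f, value in enumerate(X[k]))
    print >> outfile, '%s qid:%s %s' % (target, qid[k], feats)
  return index
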
Example #9
def main():
  """ Predicts votes by applying a LR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn,
        test, avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    # For standardized notation across algorithms, alpha denotes the
    # learning rate and beta the regularization weight.
    model = Ridge(alpha=_BETA)
    model.fit(X_train, y_train)
    
    pred = model.predict(X_train)
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
        train_truth, RANK_SIZE))

    pred = model.predict(X_val)
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_VAL_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
    
    pred = model.predict(X_test)
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/lr-r:%f,f:%s,b:%s-%d-%d.dat' % (_OUTPUT_DIR, _BETA,
        _FEAT_TYPE, 'y' if _BIAS else 'n', i, 0), 'w')
    for p in pred:
      print >> output, p
    output.close()
Example #10
def predict():
  """ Predicts votes by applying RankSVM technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
   
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)
 
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, 
        avg_user, avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, 
        test, avg_user, avg_sim, avg_conn)
    
    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    print 'Outputting model'
    outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    train_index = output_model(X_train, y_train, qid_train, outfile)
    outfile.close()
    outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    val_index = output_model(X_val, None, qid_val, outfile)
    outfile.close()
    outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i), 'w')
    test_index = output_model(X_test, None, qid_test, outfile)
    outfile.close()

    print 'Fitting model'
    print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s '
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') % (_C, _ALGO,
        _KERNEL, _DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i))
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_train-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))

    predfile = open('%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in train_index]
    if _BIAS:
      bias.add_bias(train, reviews, pred)
    print '~ Training error on set %d repetition %d' % (i, 0)
    print 'RMSE: %f' % calculate_rmse(pred, train_truth)
    print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews,
        pred, train_truth, RANK_SIZE))

    print 'Predicting in validation'
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_val-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in val_index]
    if _BIAS:
      bias.add_bias(val, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
    
    print 'Predicting in test'
    print getoutput(('lib/svm_rank/svm_rank_classify ' 
        '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat '
        '%s/rank_pred_test-%s-%d-0.dat') % (_DATA_DIR,
        _CONF_STR, i, _MODEL_DIR, _CONF_STR, i, _DATA_DIR, _CONF_STR, i))
    predfile = open('%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR,
        i), 'r')
    raw_pred = [float(p.strip()) for p in predfile]
    predfile.close()
    pred = [raw_pred[j] for j in test_index]
    if _BIAS:
      bias.add_bias(test, reviews, pred)
    output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i), 'w')
    for p in pred:
      print >> output, p
    output.close()
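
Note the parsing difference from the LambdaMART examples: svm_rank_classify writes one bare score per line, so no third-field split is needed here. The analogous helper (a sketch):

def read_svmrank_scores(path):
  """ Reads an svm_rank prediction file: one float score per line. """
  with open(path) as predfile:
    return [float(line.strip()) for line in predfile]
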
Example #11
def main():
    """ Predicts helpfulness votes using MF.

      Args:
        None.

      Returns:
        None. Results are printed to files.
  """
    load_args()

    for i in xrange(NUM_SETS):
        t = time()
        print 'Reading pickles'
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
        truth = [v['vote'] for v in train]

        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)
        print 'Formatting input time: %f' % (time() - t)

        for j in xrange(REP):
            print 'Fitting Model'
            t = time()
            model = LR_Model()
            model.fit(X_train, y_train, qid_train)
            print 'Learning time: %f' % (time() - t)
            print 'Coefficients:'
            print model.w

            print 'Calculating Predictions'
            pred = model.predict(X_train)
            if _BIAS:
                bias.add_bias(train, reviews, pred)

            print 'TRAINING ERROR'
            print '-- RMSE: %f' % calculate_rmse(pred, truth)
            print '-- nDCG@%d: %f' % (
                RANK_SIZE,
                calculate_avg_ndcg(train, reviews, pred, truth, RANK_SIZE))

            pred = model.predict(X_val)
            if _BIAS:
                bias.add_bias(val, reviews, pred)
            print 'Outputting validation prediction'
            output = open(
                '%s/corasvr-%s-%d-%d.dat' % (_VAL_DIR, _CONF_STR, i, j), 'w')
            for p in pred:
                print >> output, p
            output.close()

            t = time()
            pred = model.predict(X_test)
            if _BIAS:
                bias.add_bias(test, reviews, pred)
            print 'Prediction time: %f' % (time() - t)
            print 'Outputting testing prediction'
            output = open(
                '%s/corasvr-%s-%d-%d.dat' % (_OUTPUT_DIR, _CONF_STR, i, j),
                'w')
            for p in pred:
                print >> output, p
            output.close()
Example #12
def predict():
    """ Predicts votes by applying RankSVM technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        print 'Reading data'
        reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))

        train_truth = [v['vote'] for v in train]
        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        print 'Creating average user (for mean imputation)'
        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Outputting model'
        outfile = open('%s/rank_train-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        train_index = output_model(X_train, y_train, qid_train, outfile)
        outfile.close()
        outfile = open('%s/rank_val-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        val_index = output_model(X_val, None, qid_val, outfile)
        outfile.close()
        outfile = open('%s/rank_test-%s-%d.dat' % (_DATA_DIR, _CONF_STR, i),
                       'w')
        test_index = output_model(X_test, None, qid_test, outfile)
        outfile.close()

        print 'Fitting model'
        print getoutput(('lib/svm_rank/svm_rank_learn -c %f -w %s -t %s '
                         '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat') %
                        (_C, _ALGO, _KERNEL, _DATA_DIR, _CONF_STR, i,
                         _MODEL_DIR, _CONF_STR, i))
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_train-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_train-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))

        predfile = open(
            '%s/rank_pred_train-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in train_index]
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        print 'Predicting in validation'
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_val-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_val-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))
        predfile = open(
            '%s/rank_pred_val-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in val_index]
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open('%s/svmrank-%s-%d-0.dat' % (_VAL_DIR, _CONF_STR, i), 'w')
        for p in pred:
            print >> output, p
        output.close()

        print 'Predicting in test'
        print getoutput(('lib/svm_rank/svm_rank_classify '
                         '%s/rank_test-%s-%d.dat %s/rank_model-%s-%d-0.dat '
                         '%s/rank_pred_test-%s-%d-0.dat') %
                        (_DATA_DIR, _CONF_STR, i, _MODEL_DIR, _CONF_STR, i,
                         _DATA_DIR, _CONF_STR, i))
        predfile = open(
            '%s/rank_pred_test-%s-%d-0.dat' % (_DATA_DIR, _CONF_STR, i), 'r')
        raw_pred = [float(p.strip()) for p in predfile]
        predfile.close()
        pred = [raw_pred[j] for j in test_index]
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        output = open('%s/svmrank-%s-%d-0.dat' % (_OUTPUT_DIR, _CONF_STR, i),
                      'w')
        for p in pred:
            print >> output, p
        output.close()
Example #13
def main():
  """ Main method, which performs prediction and outputs to file.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
    
    print 'Creating average user (for mean imputation)'
    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    
    print 'Modeling'
    X_train = model_dyad(train, sim, conn, avg_sim, avg_conn)
    X_val = model_dyad(val, sim, conn, avg_sim, avg_conn)
    X_test = model_dyad(test, sim, conn, avg_sim, avg_conn)
    train_reviews = set([v['review'] for v in train])
    test_reviews = set([v['review'] for v in val]).union(set([v['review'] for v
        in test]))
    X_item_train, item_train_key, X_item_test, item_test_key = \
        model_items(reviews, users, train_reviews, test_reviews, avg_user)
    # train, test: same file, different scaling
    train_users = set([v['voter'] for v in train])
    test_users = set([v['voter'] for v in val]).union(set([v['voter'] for v in
        test]))
    X_user_train, user_train_key, X_user_test, user_test_key = \
        model_users(users, train_users, test_users, avg_user)

    print 'Scaling'
    dyad_scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(dyad_scaler, X_train)
    X_val = scale_features(dyad_scaler, X_val)
    X_test = scale_features(dyad_scaler, X_test)
    item_scaler = fit_scaler('minmax', X_item_train)
    X_item_train = scale_features(item_scaler, X_item_train)
    X_item_test = scale_features(item_scaler, X_item_test)
    user_scaler = fit_scaler('minmax', X_user_train)
    X_user_train = scale_features(user_scaler, X_user_train)
    X_user_test = scale_features(user_scaler, X_user_test)
    X_item = vstack((X_item_train, X_item_test))
    item_key = item_train_key + item_test_key
    X_user = vstack((X_user_train, X_user_test))
    user_key = user_train_key + user_test_key

    print 'Outputting model'
    output_dyad('train', train, X_train, i)
    output_dyad('val', val, X_val, i)
    output_dyad('test', test, X_test, i)
    output_entity('item', X_item, item_key, i)
    output_entity('user', X_user, user_key, i)

    for j in xrange(REP):
      print 'Fitting model'
      print getoutput(('Rscript lib/rlfm/rlfm_fit.R %d %d %d %d %s %d %d '
          '%s') % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s train' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      predfile = open('%s/rlfm-%s-%d-%d.dat' % (_TRAIN_DIR, _CONF_STR, i,
          j), 'r')
      pred = [float(p.strip()) for p in predfile]
      predfile.close()
      truth = [v['vote'] for v in train]
      print len(pred)
      print len(truth)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
          truth, RANK_SIZE))

      print 'Predicting in validation'
      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s val' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))

      print 'Predicting in test'
      print getoutput('Rscript lib/rlfm/rlfm_predict.R %d %d %d %d %s %d %d '
          '%s test' % (_K, _ITER, _SAMPLES, _BURN_IN, _FEAT, i, j, _DATA_DIR))
Example #14
def predict():
    """ Predicts votes by applying a SVR regressor technique.

      Args:
        None.

      Returns:
        None.
  """
    load_args()

    for i in xrange(NUM_SETS):
        t = time()

        print 'Reading data'
        reviews = load(open('%s/new-reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
        users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
        train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
        test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
        val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
        sim = load(open('%s/new-sim-%d.pkl' % (_PKL_DIR, i), 'r'))
        conn = load(open('%s/new-conn-%d.pkl' % (_PKL_DIR, i), 'r'))
        train_truth = [v['vote'] for v in train]

        if _BIAS:
            bias = BiasModel()
            train = bias.fit_transform(train, reviews)

        avg_user = compute_avg_user(users)
        avg_sim = compute_avg_model(sim)
        avg_conn = compute_avg_model(conn)
        X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
                                                     train, avg_user, avg_sim,
                                                     avg_conn)
        X_val, _, qid_val = generate_input(reviews, users, sim, conn, val,
                                           avg_user, avg_sim, avg_conn)
        X_test, _, qid_test = generate_input(reviews, users, sim, conn, test,
                                             avg_user, avg_sim, avg_conn)

        scaler = fit_scaler('minmax', X_train)
        X_train = scale_features(scaler, X_train)
        X_val = scale_features(scaler, X_val)
        X_test = scale_features(scaler, X_test)

        print 'Formatting input time: %f' % (time() - t)

        t = time()
        model = SVR(C=_C, epsilon=_EPS, kernel=_KERNEL)
        model.fit(X_train, y_train)
        print 'Learning time: %f' % (time() - t)

        pred = model.predict(X_train)
        if _BIAS:
            bias.add_bias(train, reviews, pred)
        print '~ Training error on set %d repetition %d' % (i, 0)
        print 'RMSE: %f' % calculate_rmse(pred, train_truth)
        print 'nDCG@%d: %f' % (RANK_SIZE,
                               calculate_avg_ndcg(train, reviews, pred,
                                                  train_truth, RANK_SIZE))

        pred = model.predict(X_val)
        if _BIAS:
            bias.add_bias(val, reviews, pred)
        output = open(
            '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' %
            (_VAL_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n', i,
             0), 'w')
        for p in pred:
            print >> output, p
        output.close()

        t = time()
        pred = model.predict(X_test)
        if _BIAS:
            bias.add_bias(test, reviews, pred)
        print 'Prediction time: %f' % (time() - t)
        output = open(
            '%s/svr-c:%f,k:%s,e:%f,f:%s,b:%s-%d-%d.dat' %
            (_OUTPUT_DIR, _C, _KERNEL, _EPS, _FEAT_TYPE, 'y' if _BIAS else 'n',
             i, 0), 'w')
        for p in pred:
            print >> output, p
        output.close()
Example #15
def main():
  """ Predicts votes by applying a GBRT regressor technique.

      Args:
        None.

      Returns:
        None.
  """
  load_args()
  
  for i in xrange(NUM_SETS):
    print 'Reading data'
    reviews = load(open('%s/reviews-%d.pkl' % (_PKL_DIR, i), 'r'))
    users = load(open('%s/users-%d.pkl' % (_PKL_DIR, i), 'r'))
    train = load(open('%s/train-%d.pkl' % (_PKL_DIR, i), 'r'))
    test = load(open('%s/test-%d.pkl' % (_PKL_DIR, i), 'r'))
    val = load(open('%s/validation-%d.pkl' % (_PKL_DIR, i), 'r'))
    sim = load(open('%s/sim-%d.pkl' % (_PKL_DIR, i), 'r'))
    conn = load(open('%s/conn-%d.pkl' % (_PKL_DIR, i), 'r'))
 
    train_truth = [v['vote'] for v in train]
    if _BIAS:
      bias = BiasModel()
      train = bias.fit_transform(train, reviews)

    avg_user = compute_avg_user(users)
    avg_sim = compute_avg_model(sim)
    avg_conn = compute_avg_model(conn)
    X_train, y_train, qid_train = generate_input(reviews, users, sim, conn,
        train, avg_user, avg_sim, avg_conn)
    X_val, _, qid_val = generate_input(reviews, users, sim, conn, val, avg_user,
        avg_sim, avg_conn)
    X_test, _, qid_test = generate_input(reviews, users, sim, conn, test, 
        avg_user, avg_sim, avg_conn)

    scaler = fit_scaler('minmax', X_train)
    X_train = scale_features(scaler, X_train)
    X_val = scale_features(scaler, X_val)
    X_test = scale_features(scaler, X_test)

    for j in xrange(REP):
      model = GradientBoostingRegressor(loss=_LOSS, learning_rate=_ALPHA,
          n_estimators=_T, max_depth=_MAX_D, subsample=_SUBSAMPLE, 
          max_features=_MAX_F, random_state=(int(time() * 1000000) % 1000000))
      model.fit(X_train, y_train)
      
      pred = model.predict(X_train)
      if _BIAS:
        bias.add_bias(train, reviews, pred)
      print '~ Training error on set %d repetition %d' % (i, j)
      print 'RMSE: %f' % calculate_rmse(pred, train_truth)
      print 'nDCG@%d: %f' % (RANK_SIZE, calculate_avg_ndcg(train, reviews, pred,
          train_truth, RANK_SIZE))

      pred = model.predict(X_val)
      if _BIAS:
        bias.add_bias(val, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % 
          (_VAL_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F), 
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
      
      pred = model.predict(X_test)
      if _BIAS:
        bias.add_bias(test, reviews, pred)
      output = open('%s/gbrt-l:%f,t:%d,d:%d,e:%s,p:%f,m:%s,f:%s,b:%s-%d-%d.dat' % 
          (_OUTPUT_DIR, _ALPHA, _T, _MAX_D, _LOSS, _SUBSAMPLE, str(_MAX_F),
          _FEAT_TYPE, 'y' if _BIAS else 'n', i, j), 'w')
      for p in pred:
        print >> output, p
      output.close()
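
Each of the REP repetitions above derives random_state from the wall clock, so repetitions differ only in their random seed. If reproducible runs were preferred, seeding deterministically from the loop indices would be a drop-in change; a sketch (an alternative, not the example's behavior):

      # Deterministic alternative: derive the seed from the set and
      # repetition indices so reruns rebuild identical models
      # (assumption: reproducibility is desired here).
      model = GradientBoostingRegressor(loss=_LOSS, learning_rate=_ALPHA,
          n_estimators=_T, max_depth=_MAX_D, subsample=_SUBSAMPLE,
          max_features=_MAX_F, random_state=1000 * i + j)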