def main(kmax=1, ntree=100, nruns=1, nfolds=5, tag=TAG):
    """Train an XGBoost classifier on value-count features and submit.

    Loads train/test data, augments both with count features computed
    over the combined rows, cross-validates (saving out-of-fold
    predictions), then refits on all of train for the final submission.
    """
    # Load the raw frames.
    xtr, ytr, xte = util.load_data(as_pandas=True)

    # Counts are computed over train+test combined so both splits see
    # the same value frequencies.
    xall = pd.concat([xtr, xte])
    xtr = count_features.range_counts(xall, xtr, kmax).values
    xte = count_features.range_counts(xall, xte, kmax).values

    # Gradient-boosted tree model with fixed hyperparameters.
    model = XGBClassifier(n_estimators=ntree,
                          learning_rate=0.02,
                          gamma=1,
                          max_depth=20,
                          min_child_weight=0.1,
                          subsample=0.9,
                          colsample_bytree=0.5,
                          seed=1)

    # Cross-validated out-of-fold predictions, stored for later stacking.
    oof_preds = models.cv_loop(xtr, ytr, model, nfolds, nruns, SEED)
    util.save_cv_preds(oof_preds, tag)

    # Refit on the full training set; predict the test set and submit.
    final_preds = models.rerun(xtr, ytr, xte, model, nruns, SEED)
    util.write_submission(final_preds, tag)
Exemple #2
0
def predict(validation, prediction):
    """Fit a linear regression on *validation* and write predictions.

    Fits on the validation frame's features (all columns except the
    'SalePrice' target), predicts on *prediction*, and writes the result
    to result.csv.
    """
    lr = LinearRegression()
    v_label = validation['SalePrice']
    # drop() returns a new frame: the original code used
    # `del validation['SalePrice']`, which mutated the caller's
    # DataFrame as a side effect.
    features = validation.drop(columns='SalePrice')
    lr.fit(features, v_label)
    p = lr.predict(prediction)
    util.write_submission("result.csv", p)
def main():
    """Write a benchmark submission with each user's events in random order."""
    train, test = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test)

    # Shuffle each user's event list in place; iterating users in sorted
    # order keeps the RNG stream consumption deterministic per seed.
    for user in sorted(events_by_user):
        random.shuffle(events_by_user[user])

    u.write_submission("random_benchmark.csv", events_by_user)
Exemple #4
0
def run_full():
    """Train BigModel on all training tags and submit exp-scaled predictions."""
    train_X, train_Y, _ = get_db_data(tags=['train1', 'train2', 'train3'])
    test_X, _, IDs = get_db_data(tags=['test'])

    model = BigModel(columns, n_est=300)
    # Targets were modelled in log space; exponentiate before writing.
    preds = np.exp(model.train_test(train_X, train_Y, test_X))
    write_submission("bigmodelv6.csv", preds, IDs)
def run_full():
    """Fit BigModel on the full training set, undo the log transform, submit."""
    X, Y, _ = get_db_data(tags=['train1', 'train2', 'train3'])
    X_test, _, IDs = get_db_data(tags=['test'])

    big = BigModel(columns, n_est=300)
    raw = big.train_test(X, Y, X_test)
    # Predictions come back in log space; map back to prices.
    write_submission("bigmodelv6.csv", np.exp(raw), IDs)
def main():
    """Rank each test user's events by popularity (yes-RSVP count) and submit."""
    train, test = u.get_train_test_df()
    events_by_user = u.get_user_events_dict(test)
    attendees = u.get_event_attendees()
    yes_by_event = u.get_event_responses_dict(attendees["event"],
                                              attendees["yes"])

    # Most popular first: order each user's candidate events by how many
    # attendees answered "yes".
    for user, events in events_by_user.items():
        events_by_user[user] = sorted(events,
                                      key=lambda e: len(yes_by_event[e]),
                                      reverse=True)

    u.write_submission("event_popularity_benchmark.csv", events_by_user)
Exemple #7
0
def predict(test_fea, rfs, errors):
    # Ensemble prediction: weighted sum of the per-model predictions in
    # `rfs`, using values from `errors` as weights.
    # NOTE(review): Python 2 (print statements); does not run under Python 3.
    # Keep only entries whose error is below 0.5.
    size = len([e for e in errors if e<0.5])
    print 'only use recent years %d' % (size)
    # NOTE(review): errors are taken from the FRONT (errors[:size]) but
    # models from the BACK (rfs[-size:]), so error i is paired with a
    # different model unless the lists are deliberately aligned that way
    # — verify against the caller.
    errors, rfs = errors[:size], rfs[-size:]
    error_sum = np.array(errors).sum()
    p = None
    for i in range(0,len(errors)):
        # NOTE(review): weight is proportional to the error itself, so a
        # larger error yields a LARGER weight — confirm this is intended
        # (inverse-error weighting would be the usual choice).
        print 'weight %f' % (errors[i]/error_sum)
        if i==0:
            p = rfs[i].predict(test_fea)*(errors[i]/error_sum)
        else:
            p = p + rfs[i].predict(test_fea)*(errors[i]/error_sum)
    
    util.write_submission("result.csv", np.array(p))
Exemple #8
0
    def predict(self, test_data):
        """Run the model over *test_data* in eval mode and write predictions.

        Args:
            test_data: iterable of input batches to feed the model.

        Side effects:
            Writes the stacked predictions (as a numpy array) to
            self.csv via write_submission.
        """
        self.model.eval()
        outputs = []
        # no_grad: inference does not need the autograd graph; the
        # original accumulated graph state across the whole loop.
        with torch.no_grad():
            for batch in test_data:
                outputs.append(self.model(batch.to(self.device)))

        # One cat at the end is O(n) overall, versus O(n^2) for the
        # original's repeated torch.cat inside the loop.
        pred = torch.cat(outputs, 0).detach().cpu().numpy()
        write_submission(pred, self.csv)
Exemple #9
0
def avg_run_all(n_models, base_model, infile_base, passes, bits, submit_id):
    '''
  Runs a batch of linear models over the data, with the input files presented
  to each in a random order. Writes a submission based on the models'
  averaged predictions.

  Args:
    n_models - the number of models to produce
    base_model - a model that is cloned to produce the models
    infile_base - bare input data name without path or extension
    passes - number of passes over data in training
    bits - the feature space should be of dimension 2**bits
    submit_id - the result is written as submissions/submission_<submit_id>.csv

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv

  NOTE(review): Python 2 code (print statements). Under Python 3,
  random.shuffle(range(5)) would fail and zip() would return a one-shot
  iterator, leaving model_orders empty after the first pass.
  '''
    # Clone base_model n_models times; each clone gets its own shuffled
    # copy of the shard order 0..4.
    models = []
    orders = []
    l = range(5)
    for k in range(n_models):
        model_k = base_model.__class__()
        model_k.set_params(**base_model.get_params())
        models.append(model_k)
        random.shuffle(l)
        orders.append(l[:])  # snapshot — shuffle mutates l in place
    model_orders = zip(models, orders)

    # Incremental training: every pass shows each shard to each model in
    # that model's private order.
    for k in range(passes):
        print 'Pass %d' % k
        for (m, order) in model_orders:
            for file_num in order:
                train_set_name = '%s.%d' % (infile_base, file_num)
                print 'loading training file: ' + train_set_name
                x, y = util.load_sparse(train_set_name,
                                        n_features=2**bits,
                                        verbose=False)
                m.partial_fit(x, y, classes=[0., 1.])

    # Shard 5 is held out; average all models' decision values into one
    # submission column.
    test_set_name = infile_base + '.5'
    print 'loading test set...'
    x, y = util.load_sparse(test_set_name, n_features=2**bits, verbose=False)
    dvs = np.zeros((len(y), n_models))
    for (k, m) in enumerate(models):
        dvs[:, k] = m.decision_function(x)
    dv = dvs.mean(axis=1)
    util.write_submission(dv, submit_id)
def avg_run_all(n_models, base_model, infile_base, passes, bits, submit_id):
  '''
  Runs a batch of linear models over the data, with the input files presented
  to each in a random order. Writes a submission based on the models'
  averaged predictions.

  Args:
    n_models - the number of models to produce
    base_model - a model that is cloned to produce the models
    infile_base - bare input data name without path or extension
    passes - number of passes over data in training
    bits - the feature space should be of dimension 2**bits
    submit_id - the result is written as submissions/submission_<submit_id>.csv

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv

  NOTE(review): Python 2 code — under Python 3, random.shuffle(range(5))
  fails and zip() is a one-shot iterator, so model_orders would be
  exhausted after the first pass.
  '''
  # Build n_models clones of base_model, each with its own shuffled copy
  # of the shard order 0..4.
  models = []
  orders = []
  l = range(5)
  for k in range(n_models):
    model_k = base_model.__class__()
    model_k.set_params(**base_model.get_params())
    models.append(model_k)
    random.shuffle(l)
    orders.append(l[:])  # snapshot — shuffle mutates l in place
  model_orders = zip(models, orders)
    
  # Online training: each pass feeds every shard to every model in that
  # model's private order.
  for k in range(passes):
    print 'Pass %d' % k
    for (m, order) in model_orders:
      for file_num in order:
        train_set_name = '%s.%d' % (infile_base, file_num)
        print 'loading training file: ' + train_set_name
        x, y = util.load_sparse(train_set_name, n_features=2**bits, verbose=False)
        m.partial_fit(x, y, classes=[0., 1.])
        
  # Shard 5 is the held-out test set; average the models' decision values.
  test_set_name = infile_base + '.5'
  print 'loading test set...'
  x, y = util.load_sparse(test_set_name, n_features=2**bits, verbose=False)
  dvs = np.zeros((len(y), n_models))
  for (k, m) in enumerate(models):
    dvs[:, k] = m.decision_function(x)
  dv = dvs.mean(axis=1)
  util.write_submission(dv, submit_id)
Exemple #11
0
def run_all(model, infile_base, passes, bits, submit_id):
    '''
    Trains `model` on shards 0.zip..4.zip of `infile_base`, predicts on
    shard 5.zip, and writes the predictions as a valid submission.

    Args:
      model - the model to train and test.
      infile_base - bare input data name without path or extension
      passes - number of passes over data in training
      bits - the feature space should be of dimension 2**bits
      submit_id - the result is written as submissions/submission_<submit_id>.csv

    Writes:
      A submission at paths.SUBMIT/submission_<submit_id>.csv
    '''
    train(model, infile_base, passes, bits)
    predictions = test(model, infile_base, bits)
    util.write_submission(predictions, submit_id)
def run_all(model, infile_base, passes, bits, submit_id):
  '''
  Fit `model` on input shards 0.zip..4.zip, score shard 5.zip, and write
  the resulting predictions as the submission identified by submit_id.

  Args:
    model - the model to train and test.
    infile_base - bare input data name without path or extension
    passes - number of passes over data in training
    bits - the feature space should be of dimension 2**bits
    submit_id - the result is written as submissions/submission_<submit_id>.csv

  Writes:
    A submission at paths.SUBMIT/submission_<submit_id>.csv
  '''
  train(model, infile_base, passes, bits)
  util.write_submission(test(model, infile_base, bits), submit_id)
Exemple #13
0
def main(k=2, C=0.7, nruns=1, nfolds=5, tag=TAG):
    """Logistic regression over one-hot encoded 1..k-way feature interactions.

    Cross-validates (saving out-of-fold predictions for later stacking),
    then refits on the full training set and writes the test submission.
    """
    xtr, ytr, xte = util.load_data()

    # One-hot encode every feature combination up to order k.
    xtr, xte = features.range_combos(xtr, xte, k)

    clf = LogisticRegression(C=C)

    # Out-of-fold CV predictions, kept for the stacking stage.
    oof = models.cv_loop(xtr, ytr, clf, nfolds, nruns, SEED)
    util.save_cv_preds(oof, tag)

    # Final fit on all training rows; predict the test set and submit.
    final = models.rerun(xtr, ytr, xte, clf, nruns, SEED)
    util.write_submission(final, tag)
Exemple #14
0
def main(ntree=100, nfolds=5, nruns=1, tag=TAG):
    """Second-level stacker: extra-trees over base models' saved predictions."""
    # Targets come from the raw data; features are the first-level model
    # outputs (CV predictions for train, submissions for test).
    _, ytr, _ = util.load_data()
    xtr = util.reload_cv_predictions(COLS)
    xte = util.reload_submissions(COLS)

    stacker = ExtraTreesClassifier(n_estimators=ntree,
                                   criterion='entropy',
                                   max_depth=9,
                                   max_features=6,
                                   n_jobs=3,
                                   random_state=1)

    # Cross-validate the stacker and keep its out-of-fold predictions.
    oof = models.cv_loop(xtr, ytr, stacker, nfolds, nruns, SEED)
    util.save_cv_preds(oof, tag)

    # Refit on everything and write the final submission.
    final = models.rerun(xtr, ytr, xte, stacker, nruns, SEED)
    util.write_submission(final, tag)
# Random-forest benchmark: encode features, fit, report importances, submit.
train, test = util.get_train_test_df()

# Feature columns: everything except the row id, the target, and the raw
# sale date (which is expanded into derived columns below).
columns = set(train.columns)
for dropped in ("SalesID", "SalePrice", "saledate"):
    columns.remove(dropped)

# Start the feature frames from date-derived columns.
train_fea = get_date_dataframe(train["saledate"])
test_fea = get_date_dataframe(test["saledate"])

# Join each remaining column onto the date features.  String-valued
# columns are label-encoded with codes learned from the training column;
# values unseen in training map to -1 via the defaultdict.
for col in columns:
    col_types = set(type(v) for v in train[col])
    if str in col_types:
        levels = set(train[col])
        str_to_categorical = defaultdict(
            lambda: -1, ((label, code) for code, label in enumerate(levels)))
        train_fea = train_fea.join(pd.DataFrame(
            {col: [str_to_categorical[v] for v in train[col]]},
            index=train.index))
        test_fea = test_fea.join(pd.DataFrame(
            {col: [str_to_categorical[v] for v in test[col]]},
            index=test.index))
    else:
        train_fea = train_fea.join(train[col])
        test_fea = test_fea.join(test[col])

# Fit the forest and predict the test set.
rf = RandomForestRegressor(n_estimators=50, n_jobs=1, compute_importances=True)
rf.fit(train_fea, train["SalePrice"])
predictions = rf.predict(test_fea)

# Print feature importances, most important first.
imp = sorted(zip(train_fea.columns, rf.feature_importances_),
             key=lambda pair: pair[1], reverse=True)
for fea in imp:
    print(fea)

util.write_submission("random_forest_benchmark.csv", predictions)
def main():
    """Submit each test user's events in their given (original) order."""
    train, test = u.get_train_test_df()
    u.write_submission("given_order.csv", u.get_user_events_dict(test))
def main():
    """Baseline: write each user's events exactly as provided."""
    train, test = u.get_train_test_df()
    events_per_user = u.get_user_events_dict(test)
    u.write_submission("given_order.csv", events_per_user)
Exemple #18
0
    # NOTE(review): fragment — the enclosing function's `def` line is not
    # visible in this chunk; Python 2 print statements throughout.
    print "Learn Gradient Boosting" # Slower
    gbm = GradientBoostingRegressor(max_depth=10, subsample = .80, min_samples_split = 12, min_samples_leaf = 5, n_estimators = 200)  
    #n_estimators = 100 by default # Cannot parallelize
    # if testing, this is part of training set.
    gbm.fit(train_fea, train_Y)
    print "Fitting"
    predictions = gbm.predict(test_fea)

    # Training-set RMSE: an optimistic estimate, since it scores the same
    # rows the model was fitted on.
    train_predict = gbm.predict(train_fea)
    rmse = np.sqrt(mean_squared_error(train_Y, train_predict))
    logger.write("GBM Oob RMSE:" + str(rmse)+ "\n")
    print "Train Set RMSE:", rmse
    logger.write("Train Set RMSE:" + str(rmse)+ "\n")
    # Persist the fitted model (despite the .csv name this is a pickle).
    cPickle.dump(gbm,open( 'gbm_obj.csv','w'))

# Not testing: submit exp-transformed predictions (targets are in log space).
if testing == 0: util.write_submission("submit_gbm_" + comment + ".csv", np.exp(predictions))

# Testing: dump predicted vs. actual values side by side.
if testing == 1:
    csv_w_both = csv.writer(open('predictions.csv','wb'))
    for x in xrange(len(predictions)):
        csv_w_both.writerow([np.exp(predictions[x]), np.exp(test_Y[x])])

# Feature importances: written to CSV, and echoed when above 1%.
imp = sorted(zip(train_fea.columns, gbm.feature_importances_), key=lambda tup: tup[1], reverse=True)
csv_w = csv.writer(open('out/rf_features_gbm_' + comment + '.csv','wb'))
for fea in imp:
    csv_w.writerow([fea[0],fea[1]])

print "# of features", len(imp)
for fea in imp:
    if fea[1] > 0.01:
        print fea[0], "|", fea[1]
Exemple #19
0
        }
    }

    # NOTE(review): fragment — the braces above close a dict literal whose
    # opening is not visible in this chunk, and the loops below sit inside
    # an unseen enclosing scope (Python 2 print statement at the inner loop).
    # For each data variant and group, fit a random forest with a
    # pre-tuned max_features value and write its test predictions.
    for data_type in data_types:

        for i, group in enumerate(groups):
            tr = util.get_data(fname='tr-' + group + data_type + '.csv')
            ts = util.get_data(fname='ts-' + group + data_type + '.csv')

            for feat_type in feat_types:
                print data_type, group, feat_type

                train, test, y_tr, y_ts = format_data(group, tr, ts, feat_type)
                rf = RandomForestRegressor(
                    n_estimators=800,
                    n_jobs=4,
                    min_samples_split=25,
                    max_features=max_features[data_type][feat_type][i],
                    compute_importances=True)
                rf.fit(train, y_tr)
                p = rf.predict(test)
                # Show feature importances, most important first.
                imp = sorted(zip(train.columns, rf.feature_importances_),
                             key=lambda tup: tup[1],
                             reverse=True)
                for fea in imp:
                    print(fea)

                util.write_submission(
                    "rf" + data_type + '-' + group + '-' + feat_type + ".csv",
                    p.tolist())
# Date-derived feature frames for train and test.
train_fea = get_date_dataframe(train["saledate"])
test_fea = get_date_dataframe(test["saledate"])

# Append every feature column.  Columns containing strings are converted
# to integer category codes fitted on the training column; categories
# absent from training fall back to -1.
for col in columns:
    col_types = {type(x) for x in train[col]}
    if str in col_types:
        categories = set(train[col])
        code_of = defaultdict(
            lambda: -1,
            ((cat, code) for code, cat in enumerate(categories)))
        train_fea = train_fea.join(pd.DataFrame(
            {col: [code_of[x] for x in train[col]]}, index=train.index))
        test_fea = test_fea.join(pd.DataFrame(
            {col: [code_of[x] for x in test[col]]}, index=test.index))
    else:
        train_fea = train_fea.join(train[col])
        test_fea = test_fea.join(test[col])

# Random-forest benchmark: fit, predict, show importances, submit.
rf = RandomForestRegressor(n_estimators=50, n_jobs=1, compute_importances=True)
rf.fit(train_fea, train["SalePrice"])
predictions = rf.predict(test_fea)

imp = sorted(zip(train_fea.columns, rf.feature_importances_),
             key=lambda t: t[1], reverse=True)
for fea in imp:
    print(fea)

util.write_submission("random_forest_benchmark.csv", predictions)
        else:
            # NOTE(review): fragment — the start of the enclosing
            # format_data definition is not visible in this chunk.
            # Non-string columns are joined through unchanged.
            tr_feats = tr_feats.join(tr[col])
            ts_feats = ts_feats.join(ts[col])

    return tr_feats, ts_feats, tr["SalePrice"], ts["SalePrice"]

# Script entry point: fit one random forest per (data variant, group,
# feature set) combination and write a submission for each.
# NOTE(review): Python 2 (print statement in the inner loop).
if __name__ == '__main__':        
    data_types = ['-orig6', '-rank6']
    feat_types = ['base', 'many']
    groups = ['TTT', 'WL', 'TEX', 'BL', 'MG', 'SSL']
    # Pre-tuned max_features, indexed [data_type][feat_type][group index].
    max_features = {'-orig6': {'base': [7, 5, 6, 6, 10, 6], 'many': [8, 7, 5, 8, 11, 6]}, '-rank6': {'base': [8, 6, 5, 5, 12, 6], 'many': [8, 5, 5, 5, 10, 6]}}

    for data_type in data_types:

        for i, group in enumerate(groups):
            tr = util.get_data(fname='tr-' + group + data_type +  '.csv')
            ts = util.get_data(fname='ts-' + group + data_type +  '.csv')
        
            for feat_type in feat_types:
                print data_type, group, feat_type
                    
                train, test, y_tr, y_ts = format_data(group, tr, ts, feat_type)
                rf = RandomForestRegressor(n_estimators=800, n_jobs=4, min_samples_split=25, max_features=max_features[data_type][feat_type][i], compute_importances=True)
                rf.fit(train, y_tr)  
                p = rf.predict(test)
                # Echo feature importances, most important first.
                imp = sorted(zip(train.columns, rf.feature_importances_), key=lambda tup: tup[1], reverse=True)
                for fea in imp:
                    print(fea)
                    
                util.write_submission("rf" + data_type + '-' + group + '-' + feat_type + ".csv", p.tolist())
                
Exemple #22
0
import numpy as np
import pandas as pd
import util

# Baseline: predict the training-set median sale price for every test row.
train, test = util.get_train_test_df()
median_price = np.median(train["SalePrice"])
print("The median price is %0.2f" % median_price)

util.write_submission("median_benchmark.csv", [median_price] * len(test))
import numpy as np
import util

# Baseline: every test row gets the mean training-set sale price.
train, test = util.get_train_test_df()
mean_price = np.mean(train["SalePrice"])
print("The mean price is %0.2f" % mean_price)

util.write_submission("mean_benchmark.csv", len(test) * [mean_price])