Example #1
 def test_rmsle(self):
     self.assertAlmostEqual(metrics.rmsle(np.exp(2) - 1, np.exp(1) - 1), 1)
     self.assertAlmostEqual(
         metrics.rmsle([0, .5, 1, 1.5, 2], [0, .5, 1, 1.5, 2]), 0)
     self.assertAlmostEqual(
         metrics.rmsle([1, 2, 3, np.exp(1) - 1],
                       [1, 2, 3, np.exp(2) - 1]), 0.5)
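The first assertion is easier to follow once the values are traced through the usual RMSLE definition: log1p(exp(2) - 1) is exactly 2 and log1p(exp(1) - 1) is exactly 1, so the squared log error is 1 and so is its root. A minimal sketch of that check, assuming the standard formula rather than the internals of metrics.rmsle:

import numpy as np

actual = np.exp(2) - 1
predicted = np.exp(1) - 1
#log1p(exp(2) - 1) == 2 and log1p(exp(1) - 1) == 1, so the squared log error is 1
rmsle_by_hand = np.sqrt(np.mean((np.log1p([actual]) - np.log1p([predicted])) ** 2))
print(rmsle_by_hand)  #1.0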
Example #2
 def score_rmsle(self, df, df_true):
     """
     Calculate the CV score of the predictions in the given dataframe using the RMSLE metric.  Scores each
     target individually, then all targets combined.  df_true must be loaded before calling.
     """
     all_true = []
     all_preds = []
     target_scores = []
     #Transform predictions back to normal space for scoring
     self.transform_targets_exp()
     for target in self.targets:
         all_true.append(df_true[target].tolist())
         all_preds.append(df[target].tolist())
         target_score = ml_metrics.rmsle(df_true[target], df[target])
         target_scores.append(target_score)
         utils.info('RMSLE score for %s: %f' % (target,target_score))
     utils.info('Total RMSLE score: %f' % (ml_metrics.rmsle(all_true, all_preds)))
     #Transform predictions to log space again for averaging
     self.transform_targets_log()
Example #3
def rmsle2(y, ypred):
    """
    Calculate Root Mean Squared Logarithmic Error.
    Note: uses the external library ml_metrics, which is more stable than the version above.

    :param y: list (int, float)
    :param ypred: list (int, float)
    :return: RMSLE score
    """
    return mlmetrics.rmsle(y, ypred)
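For reference, the quantity ml_metrics.rmsle computes can be written out directly. A minimal NumPy sketch under the standard RMSLE definition; the function name is illustrative and it is not meant as a drop-in replacement for the library call:

import numpy as np

def rmsle_sketch(y, ypred):
    """Root Mean Squared Logarithmic Error written out explicitly."""
    y = np.asarray(y, dtype=float)
    ypred = np.asarray(ypred, dtype=float)
    #log1p(x) computes log(1 + x) with better numerical behaviour near zero
    return np.sqrt(np.mean((np.log1p(y) - np.log1p(ypred)) ** 2))

print(rmsle_sketch([1, 2, 3, np.exp(1) - 1], [1, 2, 3, np.exp(2) - 1]))  #0.5, matching the unit test in Example #1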
Example #4
def rmsle2(y, ypred):
    """
    Calculate Root Mean Squared Logarithmic Error.
    Note: uses the external library ml_metrics, which is more stable than the version above.

    :param y: list (int, float)
    :param ypred: list (int, float)
    :return: RMSLE score
    """
    return mlmetrics.rmsle(y, ypred)
Example #5
 def score_rmsle(self, df, df_true):
     """
     Calculate the CV score of the predictions in the given dataframe using the RMSLE metric.  Scores each
     target individually, then all targets combined.  df_true must be loaded before calling.
     """
     all_true = []
     all_preds = []
     target_scores = []
     #Transform predictions back to normal space for scoring
     self.transform_targets_exp()
     for target in self.targets:
         all_true.append(df_true[target].tolist())
         all_preds.append(df[target].tolist())
         target_score = ml_metrics.rmsle(df_true[target], df[target])
         target_scores.append(target_score)
         utils.info('RMSLE score for %s: %f' % (target, target_score))
     utils.info('Total RMSLE score: %f' %
                (ml_metrics.rmsle(all_true, all_preds)))
     #Transform predictions to log space again for averaging
     self.transform_targets_log()
Example #6
 def training_run(self, field_vals, model_class):
     cls = self.get_sklearn_like_model(model_class)
     train_data, test_data, score_data, non_score_data = self.read_dataset(
         field_vals)
     logging.info('Training')
     x, y, _ = make_xy(non_score_data)
     cls.fit(x, y)
     logging.info('Testing')
     tr_x, tr_y, _ = make_xy(test_data)
     # print(pandas.Series(cls.feature_importances_, index=tr_x.columns).sort_values())
     pred = cls.predict(tr_x)
     return rmsle(pred, tr_y), cls
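training_run passes the predictions first and the true values second to rmsle. Under the standard definition the metric only depends on the squared difference of the logs, so it is symmetric in its two arguments and the order does not affect the score. A quick sketch of that property with illustrative values:

import numpy as np

a = np.array([1.0, 2.0, 3.0])
b = np.array([1.5, 1.0, 4.0])
#swapping the arguments flips the sign inside the square, which cannot change the result
lhs = np.sqrt(np.mean((np.log1p(a) - np.log1p(b)) ** 2))
rhs = np.sqrt(np.mean((np.log1p(b) - np.log1p(a)) ** 2))
assert np.isclose(lhs, rhs)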
Example #7
def cross_validate_temporal(mtxTrn,mtxTest,mtxTrnTarget,mtxTestTarget,model):
    start_time = datetime.now()
    log.info('Temporal CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    train_cv = mtxTrn
    test_cv = mtxTest
    y_target = mtxTrnTarget
    y_true = mtxTestTarget
    #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
    y_true = [np.exp(x)-1 for x in y_true]
    #--------Hyperparameter optimization---------#
    #Make predictions
    try:
        model.estimator.fit(train_cv, y_target)
        preds = model.estimator.predict(test_cv)
    except TypeError:
        model.estimator.fit(train_cv.todense(), y_target)
        preds = model.estimator.predict(test_cv.todense())
    #----------Post processing rules----------#
    #If target variable has been transformed, transform predictions back to original state
    preds = [np.exp(x)-1 for x in preds]
    #Apply scalar
    if model.postprocess_scalar != 1:
        preds = [x*model.postprocess_scalar for x in preds]
    #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
    if model.target == 'num_votes':
        preds = [1 if x < 1 else x for x in preds]
    else:
        preds = [0 if x < 0 else x for x in preds]
    ##score the prediction by measuring the error using the chosen error metric
    score = ml_metrics.rmsle(y_true, preds)
    finish_time = datetime.now()
    log.info('Error Measure: %f' % score)
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
             (np.mean(preds), np.std(preds), np.min(preds), np.max(preds)))
    utils.line_break()
    log.info('Temporal CV completed at: %s.  Total runtime: %s' \
          % (datetime.now().strftime('%m-%d-%y %H:%M'),str(finish_time-start_time)))
    utils.line_break()
    return preds
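cross_validate_temporal assumes the targets were trained in log space and maps both y_true and the predictions back with np.exp(x) - 1, the inverse of log(1 + y). A small sketch of that round trip; the helper names are illustrative, and np.log1p/np.expm1 are simply the numerically safer spellings of the same pair of transforms:

import numpy as np

def to_log_space(y):
    #forward transform applied to the targets before training: log(1 + y)
    return np.log1p(y)

def from_log_space(y_log):
    #inverse transform applied to predictions before scoring: exp(y_log) - 1
    return np.expm1(y_log)

y = np.array([0.0, 4.5, 120.0])
assert np.allclose(from_log_space(to_log_space(y)), y)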
Example #8
def main():
    logging.info('Getting sample data')
    unique_items = train.get_dataset_unique_items()
    data = train.read_dataset_sample(1, unique_items)
    logging.info('Calculating total dataset size')
    total_size = calc_dataset_size()
    logging.info('Building model')
    mdl, input_tensors, output_rvs = model.build_model(data, unique_items)

    minibatches = train.make_training_minibatch_iterator(unique_items)

    with mdl:
        logging.info('Doing ADVI batches...')
        v_params = pymc3.variational.advi_minibatch(
            n=100,
            minibatch_tensors=input_tensors,
            minibatch_RVs=output_rvs,
            minibatches=minibatches,
            total_size=total_size,
            n_mcsamples=10,
            verbose=True
        )
        trace = pymc3.variational.sample_vp(v_params, draws=500)
        #print(pymc3.summary(trace))
        plt.plot(v_params.elbo_vals)
        plt.savefig('./elbo.png')
        plt.show()

    test_frame = read_test_dataset(unique_items)
    with mdl:
        for i in range(0, test_frame.shape[0], 10000):
            samp = test_frame.ix[i:i+10000]
            frame_parts = frame_vector_split(samp)
            for t, v in zip(input_tensors, frame_parts):
                t.set_value(v)
            samples = pymc3.sample_ppc(trace, samples=500)
            print(ml_metrics.rmsle(samp.adjusted_demand, samples['adjusted_demand'].mean(axis=0)))
Example #9
def cross_validate_using_benchmark(benchmark_name, dfTrn, mtxTrn,mtxTarget,model,folds=5,SEED=42,test_size=.15):
    fold_scores = []
    SEED = SEED *  time.localtime().tm_sec
    start_time = datetime.now()
    log.info('Benchmark CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    for i in range(folds):
        #For each fold, create a test set (test_holdout) by randomly holding out X% of the data as CV set, where X is test_size (default .15)
        train_cv, test_cv, y_target, y_true = cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=SEED*i+10)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #Calc benchmarks and use them to make a prediction
        benchmark_preds = 0
        if benchmark_name == 'global_mean':
            #predict the hard-coded global mean of the target for every record
            benchmark_preds = [13.899 for x in test_cv]
        if benchmark_name == 'all_ones':
            #predict a constant 1 for every record
            benchmark_preds = [1 for x in test_cv]
        if benchmark_name == '9999':
            #predict a constant 9999 for every record
            benchmark_preds = [9999 for x in test_cv]
        log.info('Using benchmark %s:' % (benchmark_name))
        #For this CV fold, measure the error
        score = ml_metrics.rmsle(y_true, benchmark_preds)
        #print score
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))

    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
            (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('CV completed at: %s.  Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'),
                                                       str(finish_time-start_time)))
    utils.line_break()
Example #10
def cross_validate_kfold(mtxTrn,mtxTarget,model,folds=5,SEED=42,test_size=.15,pred_fg='false'):
    fold_scores = []
    SEED = SEED *  time.localtime().tm_sec
    start_time = datetime.now()
    log.info('K-Fold CV started at: %s' % (datetime.now().strftime('%m-%d-%y %H:%M')))
    utils.line_break()
    #If predictions are wanted, initialize the dict so that its length will match all records in the training set,
    #even if not all records are predicted during the CV (randomness is a bitch)
    if pred_fg == 'true':
        cv_preds = {key[0]:[] for key in mtxTrn.getcol(0).toarray()}
    for i in range(folds):
        ##For each fold, create a test set (test_cv) by randomly holding out test_size% of the data as CV set
        train_cv, test_cv, y_target, y_true = \
           cross_validation.train_test_split(mtxTrn, mtxTarget, test_size=test_size, random_state=i*SEED+1)
        #If target variable has been transformed, transform y_true back to normal state for comparison to predictions
        y_true = [np.exp(x)-1 for x in y_true]
        #if predictions are wanted, split off the first column from the train and test cv sets; that column holds the record ID
        if pred_fg == 'true':
            #TODO: create dense matrix copies for the clf's that only use dense matrices
            train_cv = sparse.csr_matrix(train_cv)[:,1:]
            test_cv2 = sparse.csr_matrix(test_cv)  #keep the ID column so predictions can be matched back to records
            test_cv = sparse.csr_matrix(test_cv)[:,1:]
        #----------Hyperparameter optimization------#
        try:
            model.estimator.fit(train_cv, y_target)
            preds = model.estimator.predict(test_cv)
        except TypeError:
            #some estimators only accept dense matrices
            model.estimator.fit(train_cv.todense(), y_target)
            preds = model.estimator.predict(test_cv.todense())
        #----------Post processing rules----------#
        #If target variable has been transformed, transform predictions back to original state
        preds = [np.exp(x)-1 for x in preds]
        #Apply scalar
        if model.postprocess_scalar != 1:
            preds = [x*model.postprocess_scalar for x in preds]
        #set <0 predictions to 0 if views or comments, set <1 predictions to 1 if votes
        if model.target == 'num_votes':
            preds = [1 if x < 1 else x for x in preds]
        else:
            preds = [0 if x < 0 else x for x in preds]
        ##For each fold, score the prediction by measuring the error using the chosen error metric
        score = ml_metrics.rmsle(y_true, preds)
        fold_scores += [score]
        log.info('RMSLE (fold %d/%d): %f' % (i + 1, folds, score))
        ##If we want to record predictions, then for each fold add the predictions to the cv_preds dict for later output
        if pred_fg == 'true':
            ids = test_cv2.getcol(0).toarray()
            for j in range(0, test_cv2.shape[0]):
                if ids[j][0] in cv_preds.keys():
                    cv_preds[ids[j][0]] += [preds[j]]
                else:
                    cv_preds[ids[j][0]] = [preds[j]]
    ##Now that folds are complete, calculate and print the results
    finish_time = datetime.now()
    log.info('Prediction metrics: mean=%f, std dev=%f, min/max= %f/%f' %
            (np.mean(fold_scores), np.std(fold_scores), np.min(fold_scores), np.max(fold_scores)))
    utils.line_break()
    log.info('K-Fold CV completed at: %s.  Total runtime: %s' % (datetime.now().strftime('%m-%d-%y %H:%M'),
                                                              str(finish_time-start_time)))
    utils.line_break()
    if pred_fg == 'true':
        return cv_preds
Example #11
print ('Division_Set_Shapes:', X.shape, y.shape)
print ('Validation_Set_Shapes:', X_train.shape, X_test.shape)

params = {}
params['objective'] = "reg:linear"
params['eta'] = 0.025
params['max_depth'] = 5
params['subsample'] = 0.8
params['colsample_bytree'] = 0.6
params['silent'] = True

print ('')

test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params, xg_train, num_rounds, watchlist, feval=evalerror, early_stopping_rounds=20, verbose_eval=10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print ('RMSLE Score:', rmsle(y_test, preds))

fxg_test = xgb.DMatrix(test)
fold_preds = np.around(xgclassifier.predict(fxg_test, ntree_limit=xgclassifier.best_iteration), decimals = 1)
test_preds += fold_preds

submission = pd.DataFrame({'id':ids, 'Demanda_uni_equil': test_preds})
submission.to_csv('submission.csv', index=False)
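The call to xgb.train passes feval=evalerror, but evalerror itself is not part of this excerpt. A plausible sketch of such a custom evaluation function, assuming it reports RMSLE; the clipping choice and the implementation details are assumptions, not taken from the original source:

import numpy as np

def evalerror(preds, dtrain):
    #custom xgboost metric: receives the raw predictions and the training DMatrix, returns (name, value)
    labels = dtrain.get_label()
    preds = np.clip(preds, 0, None)  #keep log1p well defined if the booster emits negative values
    return 'rmsle', np.sqrt(np.mean((np.log1p(labels) - np.log1p(preds)) ** 2))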
Example #12
def score(actual, prediction):
	return rmsle(actual, prediction.clip(0))
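The clip(0) here is what keeps the metric well defined: np.log1p(x) is undefined for x <= -1, so flooring the predictions at zero guarantees the log term is valid and stops negative regressor outputs from distorting the score. A small usage sketch with illustrative values:

import numpy as np
from ml_metrics import rmsle

actual = np.array([3.0, 0.0, 12.0])
prediction = np.array([2.5, -0.4, 11.0])  #regressors can emit small negative values
print(rmsle(actual, prediction.clip(0)))  #clipping floors the negative prediction at 0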
Example #13
y_test_s=y_train[upto:upto2]

X_train=[]
y_train=[]
#char_model.fit(X_train_s,y_train_s)
print "training done"
print countvect_char.fit_transform(X_train_s)
#print countvect_char

X_train=[]
y_train=[]

X_test=[]
with open("data/test_review.csv") as fi:
	fir=csv.reader(fi)
	fir.next()
	for i in fir:
		X_test.append(i[4])
print "test data read"

preds = char_model.predict(X_test_s)
print "prediction done"
X_test=[]

#with open("predictions/tfidf_cngram.csv","wb") as fo:
#	fow=csv.writer(fo)
	#fow.writerows(preds)
print "all done"

print ml_metrics.rmsle(preds,y_test_s)
Example #14
test_preds = np.zeros(test.shape[0])
xg_train = xgb.DMatrix(X_train, label=y_train)
xg_test = xgb.DMatrix(X_test)

watchlist = [(xg_train, 'train')]
num_rounds = 100

xgclassifier = xgb.train(params,
                         xg_train,
                         num_rounds,
                         watchlist,
                         feval=evalerror,
                         early_stopping_rounds=20,
                         verbose_eval=10)
preds = xgclassifier.predict(xg_test, ntree_limit=xgclassifier.best_iteration)

print('RMSLE Score:', rmsle(y_test, preds))

fxg_test = xgb.DMatrix(test)
fold_preds = np.around(xgclassifier.predict(
    fxg_test, ntree_limit=xgclassifier.best_iteration),
                       decimals=1)
test_preds += fold_preds

submission = pd.DataFrame({'id': ids, 'Demanda_uni_equil': test_preds})

submission[["id", "Demanda_uni_equil"]].to_csv(
    '../submissions/' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + '.csv',
    index=False)

print('done')
Example #15
for i in tqdm(range(N)):
    logging.info('Starting batch {}'.format(i))
    data = features.make_train_batch(i)
    logging.info('Got data')
    X = data.drop(dropped_cols, 1)
    y = data.adjusted_demand
    logging.info('Training...')
    cls.fit(X, y)
    logging.info('Trained!')

ys = []
y_preds = []
for i in tqdm(range(N)):
    data = features.make_test_batch(i)
    X = data.drop(dropped_cols, 1)
    ys.append(data.adjusted_demand)
    y_pred = np.maximum(cls.predict(X), 1)
    y_preds.append(y_pred)

y = np.concatenate(ys)
y_pred = np.concatenate(y_preds)
del ys, y_preds

print(y_pred.shape)
print(y.shape)
print(y_pred[:10])
print(y[:10])
print(ml_metrics.rmse(y, y_pred))
print(ml_metrics.rmsle(y, y_pred))
print(pandas.Series(cls.coef_, index=X.columns).sort_values())
Example #16
 def test_rmsle(self):
     self.assertAlmostEqual(metrics.rmsle(np.exp(2)-1,np.exp(1)-1), 1)
     self.assertAlmostEqual(metrics.rmsle([0,.5,1,1.5,2], [0,.5,1,1.5,2]), 0)
     self.assertAlmostEqual(metrics.rmsle([1,2,3,np.exp(1)-1], [1,2,3,np.exp(2)-1]), 0.5)