def svm_regressor(features,target,test_size_percent=0.2,cv_split=5):
    
    scale=preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)  
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    svr = SVR(kernel='rbf',C=10,gamma=1)
    svr.fit(X_train,y_train.ravel())
    test_prediction = svr.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)
    
    # Note: cv=tscv.n_splits passes only an integer, so cross_val_score/cross_val_predict
    # run standard (unshuffled) K-fold CV here rather than the TimeSeriesSplit folds.
    training_score = cross_val_score(svr,X_train,y_train,cv=tscv.n_splits)
    testing_score = cross_val_score(svr,X_test,y_test,cv=tscv.n_splits)
    print("Cross-val Training score:", training_score.mean())
#    print("Cross-val Testing score:", testing_score.mean())
    training_predictions = cross_val_predict(svr,X_train,y_train,cv=tscv.n_splits)
    testing_predictions = cross_val_predict(svr,X_test,y_test,cv=tscv.n_splits)
    
    training_accuracy = metrics.r2_score(y_train,training_predictions) 
#    test_accuracy_model = metrics.r2_score(y_test,test_prediction_model)
    test_accuracy = metrics.r2_score(y_test,testing_predictions)
    
#    print"Cross-val predicted accuracy:", training_accuracy
    print"Test-predictions accuracy:",test_accuracy
    return svr
def Random_forest(features,target,test_size_percent=0.2,cv_split=3):
    X_array = features.values
    y_array = target.values
    model_rdf = RandomForestRegressor()
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    model_rdf.fit(X_train,y_train)
    test_prediction = model_rdf.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)
    
    training_score = cross_val_score(model_rdf,X_train,y_train,cv=tscv.n_splits) 
    testing_score = cross_val_score(model_rdf,X_test,y_test,cv=tscv.n_splits)
    print"Cross-val Training score:", training_score.mean()
#    print"Cross-val Testing score:", testing_score.mean()
    training_predictions = cross_val_predict(model_rdf,X_train,y_train,cv=tscv.n_splits)
    testing_predictions = cross_val_predict(model_rdf,X_test,y_test,cv=tscv.n_splits)
    
    training_accuracy = metrics.r2_score(y_train,training_predictions) 
#    test_accuracy_model = metrics.r2_score(y_test,test_prediction_model)
    test_accuracy = metrics.r2_score(y_test,testing_predictions)
    
#    print"Cross-val predicted accuracy:", training_accuracy
    print"Test-predictions accuracy:",test_accuracy

    plot_model(target,y_train,y_test,training_predictions,testing_predictions)
    return model_rdf
def neural_net(features,target,test_size_percent=0.2,cv_split=3,n_iter=100,learning_rate=0.01):
    '''Features -> Pandas DataFrame with attributes as columns
        target -> Pandas DataFrame with the target column for prediction
        test_size_percent -> Percentage of data points to be used for testing'''
    scale=preprocessing.MinMaxScaler()
    X_array = scale.fit_transform(features)
    y_array = scale.fit_transform(target)
    mlp = Regressor(layers=[Layer("Rectifier", units=5),  # Hidden Layer1
                            Layer("Rectifier", units=3),  # Hidden Layer2
                            Layer("Linear")],             # Output Layer
                    n_iter=n_iter, learning_rate=learning_rate)  # use the learning_rate argument
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
    mlp.fit(X_train,y_train)
    test_prediction = mlp.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)
    
    training_score = cross_val_score(mlp,X_train,y_train,cv=tscv.n_splits) 
    testing_score = cross_val_score(mlp,X_test,y_test,cv=tscv.n_splits)
    print"Cross-val Training score:", training_score.mean()
#    print"Cross-val Testing score:", testing_score.mean()
    training_predictions = cross_val_predict(mlp,X_train,y_train,cv=tscv.n_splits)
    testing_predictions = cross_val_predict(mlp,X_test,y_test,cv=tscv.n_splits)
    
    training_accuracy = metrics.r2_score(y_train,training_predictions) 
#    test_accuracy_model = metrics.r2_score(y_test,test_prediction_model)
    test_accuracy = metrics.r2_score(y_test,testing_predictions)
    
#    print"Cross-val predicted accuracy:", training_accuracy
    print"Test-predictions accuracy:",test_accuracy

    plot_model(target,y_train,y_test,training_predictions,testing_predictions)
    return mlp
def fit_on_increasing_size(model):
    n_samples = 100
    n_features_ = np.arange(10, 800, 20)
    r2_train, r2_test, snr = [], [], []
    for n_features in n_features_:
        # Simulate a dataset with twice n_samples (half for training, half for testing)
        n_features_info = int(n_features/10)
        np.random.seed(42)  # Make reproducible
        X = np.random.randn(n_samples * 2, n_features)
        beta = np.zeros(n_features)
        beta[:n_features_info] = 1
        Xbeta = np.dot(X, beta)
        eps = np.random.randn(n_samples * 2)
        y =  Xbeta + eps
        # Split the dataset into train and test sample
        Xtrain, Xtest = X[:n_samples, :], X[n_samples:, :]
        ytrain, ytest = y[:n_samples], y[n_samples:]
        # fit/predict
        lr = model.fit(Xtrain, ytrain)
        y_pred_train = lr.predict(Xtrain)
        y_pred_test = lr.predict(Xtest)
        snr.append(Xbeta.std() / eps.std())
        r2_train.append(metrics.r2_score(ytrain, y_pred_train))
        r2_test.append(metrics.r2_score(ytest, y_pred_test))
    return n_features_, np.array(r2_train), np.array(r2_test), np.array(snr)
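# Usage sketch (added, not from the original source): sweep the feature count with a
# ridge model and compare train vs. test R^2. The Ridge choice and the plotting calls
# are illustrative assumptions; `np` and `metrics` are assumed to be imported above.
from sklearn import linear_model
import matplotlib.pyplot as plt

n_feats, r2_tr, r2_te, snr_vals = fit_on_increasing_size(linear_model.Ridge(alpha=1.0))
plt.plot(n_feats, r2_tr, label='train $R^2$')   # fit quality on the training half
plt.plot(n_feats, r2_te, label='test $R^2$')    # generalisation on the held-out half
plt.xlabel('number of features')
plt.ylabel('$R^2$')
plt.legend()
plt.show()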
Exemple #5
def decision_tree(train_features, train_labels, test_features, test_labels, feature_names):
    regressor = tree.DecisionTreeRegressor()
    regressor.fit(train_features, train_labels)

    test_results = cap_results(regressor.predict(test_features))
    train_results = cap_results(regressor.predict(train_features))

    print "test result", metrics.mean_squared_error(test_labels, test_results)
    print "test r2", metrics.r2_score(test_labels, test_results)
    print "train result", metrics.mean_squared_error(train_labels, train_results)
    print "train r2", metrics.r2_score(train_labels, train_results)

    # print "importances"
    # temp = []
    # for index, val in enumerate(regressor.feature_importances_):
    #     if val > 0.001:
    #         temp.append((index, val))
    # print sorted(temp, key=lambda x: x[1])

    '''graph stuff'''
    dot_data = StringIO()
    # class_names applies only to classifiers, so it is omitted for a DecisionTreeRegressor
    tree.export_graphviz(regressor, out_file=dot_data,
                        special_characters=True,
                        impurity=False,
                        feature_names=feature_names)

    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("tree.pdf") 

    return (test_results, train_results)
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]

                print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(axis=1)
            oof_score = r2_score(y, S_train[:, i])
            print('Final Out-of-Fold Score %f' % oof_score)
        return S_train, S_test
Exemple #7
def metrics(y_test, clf_pred):
    # note: this function shadows the sklearn.metrics module if both are imported under the same name
    print('R^2 Score')
    print(r2_score(y_test, clf_pred))
    print('Mean Squared Error')
    print(mean_squared_error(y_test, clf_pred))
    print('Root Mean Squared Error')
    print(np.sqrt(mean_squared_error(y_test, clf_pred)))
def make_huber_train():
    x, y = regression.get_data(
        filenames=['/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_4hr.csv'])
    alpha = 8
    model = mycvx.kfolds_convex(x, y, alpha, random_seed=seed)
    y_test = model['data']['y_test']
    y_pred = model['data']['y_pred']
    y_train = model['data']['y_train']
    y_train_pred = model['data']['y_train_pred']

    r2 = np.round(r2_score(y_test, y_pred), 3)
    r2train = np.round(r2_score(y_train, y_train_pred), 3)
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(y_train_pred, y_train, '.b')
    ax.plot(y_pred, y_test, '.r')
    ax.plot(np.arange(0, 1.2 * np.max(y_test), .1), np.arange(0, 1.2 * np.max(y_test), .1), '-k')
    fig.suptitle('Reconstruction Ability of Robust Regression Model')
    ax.set_xlabel('Remotely Sensed SPM (mg/L)')
    ax.set_ylabel('In situ measure SPM (mg/L)')

    # print (max(np.max(y_pred), np.max(y_test))- np.min(np.min(y_pred), 0))*5./6. - np.min(np.min(y_pred), 0)
    ax.text((max(np.max(y_pred), np.max(y_test)) - min(np.min(y_pred), 0)) * 5. / 6. - min(np.min(y_pred), 0),
            np.max(y_test) / 7., r'$R^2=%s$' % (r2train), fontsize=15)
    plt.savefig('../figures/huber_training')
    # plt.show()
    print('r2: ', r2train)
def linear_regression(features,target,test_size_percent=0.2,cv_split=5):
    ''' Features -> Pandas DataFrame with attributes as columns
        target -> Pandas DataFrame with the target column for prediction
        test_size_percent -> Percentage of data points to be used for testing'''
    X_array = features.values
    y_array = target.values
    ols = linear_model.LinearRegression()
    X_train, X_test, y_train, y_test = train_test_split(X_array, y_array.T.squeeze(), test_size=test_size_percent, random_state=4)
#    model = ols.fit(X_train, y_train)
    ols.fit(X_train, y_train)
#    test_prediction_model = ols.predict(X_test)
    tscv = TimeSeriesSplit(cv_split)
    
    training_score = cross_val_score(ols,X_train,y_train,cv=tscv.n_splits) 
    testing_score = cross_val_score(ols,X_test,y_test,cv=tscv.n_splits)
    print"Cross-val Training score:", training_score.mean()
#    print"Cross-val Testing score:", testing_score.mean()
    training_predictions = cross_val_predict(ols,X_train,y_train,cv=tscv.n_splits)
    testing_predictions = cross_val_predict(ols,X_test,y_test,cv=tscv.n_splits)
    
    training_accuracy = metrics.r2_score(y_train,training_predictions) 
#    test_accuracy_model = metrics.r2_score(y_test,test_prediction_model)
    test_accuracy = metrics.r2_score(y_test,testing_predictions)
    
#    print"Cross-val predicted accuracy:", training_accuracy
    print"Test-predictions accuracy:",test_accuracy

    plot_model(target,y_train,y_test,training_predictions,testing_predictions)
    return ols
def r_square_score(target,fitted):
    #return np.corrcoef(target,fitted)[0,1]
    #===========================================================================
    # target1 = target[0:370]
    # fitted1 = fitted[0:370]
    # target2 = target[415:500]
    # fitted2 = fitted[415:500]
    # target3 = target[500:650]
    # fitted3 = fitted[500:650]
    # score1 = r2_score(target1, fitted1) * 175 / 620
    # score2 = r2_score(target2, fitted2) * 300 / 620
    # score3 = r2_score(target3, fitted3) * 145 / 620
    #===========================================================================
    target1 = target[0:370]
    fitted1 = fitted[0:370]
    target2 = target[415:450]
    fitted2 = fitted[415:450]
    target3 = target[450:520]
    fitted3 = fitted[450:520]
    target4 = target[520:550]
    fitted4 = fitted[520:550]
    target5 = target[550:650]
    fitted5 = fitted[550:650]
    score1 = r2_score(target1, fitted1) * 25 / 80
    score2 = r2_score(target2, fitted2) * 25 / 80
    score3 = r2_score(target3, fitted3) * 12 / 80
    score4 = r2_score(target4, fitted4) * 12 / 80
    score5 = r2_score(target5, fitted5) * 6 / 80
    return score1 + score2 + score3 + score4 + score5
def r2_excoeff_vs_time_cutoff(times):
    r2s = np.zeros_like(times, dtype='float64')
    num_data = np.zeros_like(times, dtype='int32')

    for index, time in enumerate(times):
        # get appropriate features
        x, y = regression.get_data(filenames=[
            '/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_excoeff_{}hr.csv'.format(
                time)])
        x = regression.Kau_MB_BR_features(x)

        # create the huber fit model
        alpha = 8
        model = mycvx.kfolds_convex(x, y, alpha, random_seed=seed)
        y_test = model['data']['y_test']
        y_pred = model['data']['y_pred']
        y_train = model['data']['y_train']
        y_train_pred = model['data']['y_train_pred']

        r2_test = np.round(r2_score(y_test, y_pred), 3)
        r2_train = np.round(r2_score(y_train, y_train_pred), 3)

        r2s[index] = r2_train
        num_data[index] = x.shape[0]

        print(r2s, num_data)
    return r2s, num_data
Exemple #12
def fit_all_commodities(df, commodities_list, model_name):
    """
    INPUT: df (dataframe), \
    commodities_list (list of the respective commodities \
    for which one wishes to build regression models)
    OUTPUT: print result; write pickled models to file path
    PURPOSE: fit all models at once
    """
    sklearn_models = {
        'RandomForestRegressor', 'ExtraTreesRegressor',
        'GradientBoostingRegressor'
    }
    sm_models = {"Linear Regression"}
    for commodity in commodities_list:
        if model_name in sklearn_models:
            model, X_train, X_test, y_train, y_test = \
                fit_model_sklearn(df, commodity, model_name)
            predict = model.predict(X_test)
            print "***********************"
            print "{}'s adjusted r^2 score with {} is:".format(
                    commodity, model_name
                )
            print r2_score(y_test, predict)
        elif model_name in sm_models:
            model, results, X_train, X_test, y_train, y_test = \
                fit_model_sm(df, commodity, model_name)
        # pickle model:
        joblib.dump(model, '{}_with_{}.pkl'.format(
            commodity, model_name)
        )
def main():
    random.seed(SEED)
    np.random.seed(SEED)

    trainable_model = get_trainable_model()
    groundtruth_model = get_groundtruth_model()
    my_loss = loss.HeteroscedasticNormalLossFunction()

    train_X, train_y = get_data(NUM_TRAIN, NUM_FEATURES, groundtruth_model)
    test_X, test_y = get_data(NUM_TEST, NUM_FEATURES, groundtruth_model)
    trainable_model.fit(train_X, train_y)

    print('train results')
    expected_mu, expected_std = groundtruth_model.predict(train_X)
    for i, pred in enumerate(trainable_model.staged_predict(train_X)):
        if i % 10 != 0:
            continue
        print('stage %d: NLL = %.3f, R2 on mu(X) = %.3f, R2 on std(X) = %.3f'
              % (i, my_loss(train_y, pred),
                 r2_score(expected_mu, pred[:, 0]),
                 r2_score(expected_std, pred[:, 1])))

    print('test results')
    expected_mu, expected_std = groundtruth_model.predict(test_X)
    for i, pred in enumerate(trainable_model.staged_predict(test_X)):
        if i % 10 != 0:
            continue
        print('stage %d: NLL = %.3f, R2 on mu(X) = %.3f, R2 on std(X) = %.3f'
              % (i, my_loss(test_y, pred),
                 r2_score(expected_mu, pred[:, 0]),
                 r2_score(expected_std, pred[:, 1])))
Exemple #14
def create_model():
    print('Training robust regression')
    x, y = regression.get_data(
        filenames=['/Users/Nathan/Dropbox/SedimentLearning/data/landsat_polaris_filtered/filtered_2hr.csv'],
        spm_cutoff=None)  # 2hr data

    # Get top 5 correlated band ratios and add to feature array
    x = regression.Kau_MB_BR_features(x)
    # Shape of x is (75,11)

    # log spm regression
    logy = np.log(y)
    alpha = 8
    seed = 4
    model = mycvx.kfolds_convex(x, logy, alpha, random_seed=seed)
    theta = model['theta']

    y_test = model['data']['y_test']
    y_pred = model['data']['y_pred']
    y_train = model['data']['y_train']
    y_train_pred = model['data']['y_train_pred']

    r2_test = np.round(r2_score(np.exp(y_test), np.exp(y_pred)), 3)
    r2_train = np.round(r2_score(np.exp(y_train), np.exp(y_train_pred)), 3)

    print('Done training robust regression. '
          'R2 of actual spm vs predicted spm on training set = {}.\n'.format(r2_train))

    return theta
def meanDecreaseAccuracyOnWeibo():
	X, y, names = loadData()
	X = [dict(enumerate(sample)) for sample in X]
	vect = feature_extraction.DictVectorizer(sparse=False)
	X = vect.fit_transform(X)

	rf = RandomForestClassifier(n_estimators=500)
	scores = defaultdict(list)
	iter = 0
	while(iter <= 100):
		iter += 1
		X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)
		rf = rf.fit(X_train, Y_train)
		acc = r2_score(Y_test, rf.predict(X_test))
		for i in range(X.shape[1]):
			X_t = X_test.copy()
			np.random.shuffle(X_t[:, i])
			shuff_acc = r2_score(Y_test, rf.predict(X_t))
			scores[names[i]].append((acc-shuff_acc)/acc)

	print "Features sorted by their score:"
	result = sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()], reverse=True)
	importance = []
	featurename = []
	for score, name in result:
		importance.append(score)
		featurename.append(name)
	featureRanking(importance, featurename)
Exemple #16
 def hyperopt_obj(self,param,train_X,train_y):
     # 5-fold crossvalidation error
     #ret = xgb.cv(param,dtrain,num_boost_round=param['num_round'])
     kf = KFold(n_splits = 3)
     errors = []
     r2 = []
     int_params = ['max_depth','num_round']
     for item in int_params:
         param[item] = int(param[item])
     for train_ind,test_ind in kf.split(train_X):
         train_valid_x,train_valid_y = train_X[train_ind],train_y[train_ind]
         test_valid_x,test_valid_y = train_X[test_ind],train_y[test_ind]
         dtrain = xgb.DMatrix(train_valid_x,label = train_valid_y)
         dtest = xgb.DMatrix(test_valid_x)
         pred_model = xgb.train(param,dtrain,num_boost_round=int(param['num_round']))
         pred_test = pred_model.predict(dtest)
         errors.append(mean_squared_error(test_valid_y,pred_test))
         r2.append(r2_score(test_valid_y,pred_test))
     all_dtrain = xgb.DMatrix(train_X,label = train_y)
     print('training score:')
     pred_model = xgb.train(param,all_dtrain,num_boost_round= int(param['num_round']))
     all_dtest = xgb.DMatrix(train_X)
     pred_train = pred_model.predict(all_dtest)
     print(str(r2_score(train_y,pred_train)))
     print(np.mean(r2))
     print('\n')
     return {'loss':np.mean(errors),'status': STATUS_OK}
def multi_regression():
    '''
    Multiple (multivariate) linear regression
    :return:
    '''
    from sklearn.model_selection import train_test_split  # sklearn.cross_validation has been removed
    X = df.iloc[:, :-1].values
    y = df['MEDV'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
    slr = LinearRegression()
    slr.fit(X_train, y_train)
    y_train_pred = slr.predict(X_train)
    y_test_pred = slr.predict(X_test)
    # Compute the Mean Squared Error (MSE)
    print('MSE train: %.3f, test: %.3f' % (
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred)))
    # MSE train: 19.958, test: 27.196 => overfitting

    # Compute R^2
    # If R^2 = 1, the model fits the data perfectly, with a corresponding MSE = 0.
    print('R^2 train: %.3f, test: %.3f' % (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

    # plot
    plt.scatter(y_train_pred, y_train_pred - y_train, c='blue', marker='o', label='Training data')
    plt.scatter(y_test_pred, y_test_pred - y_test, c='lightgreen', marker='s', label='Test data')
    plt.xlabel('Predicted values')
    plt.ylabel('Residuals')
    plt.legend(loc='upper left')
    plt.hlines(y=0, xmin=-10, xmax=50, lw=2, color='red')
    plt.xlim([-10, 50])
    plt.show()
def housing_polynomial_regression():
    '''
    Single-feature polynomial regression (several degrees) on the housing data
    :return:
    '''
    X = df[['LSTAT']].values
    y = df['MEDV'].values
    regr = LinearRegression()
    # create polynomial features
    quadratic = PolynomialFeatures(degree=2)
    cubic = PolynomialFeatures(degree=3)
    X_quad = quadratic.fit_transform(X)
    X_cubic = cubic.fit_transform(X)
    # linear fit
    X_fit = np.arange(X.min(), X.max(), 1)[:, np.newaxis]
    regr = regr.fit(X, y)
    y_lin_fit = regr.predict(X_fit)
    linear_r2 = r2_score(y, regr.predict(X))
    # quadratic fit
    regr = regr.fit(X_quad, y)
    y_quad_fit = regr.predict(quadratic.fit_transform(X_fit))
    quadratic_r2 = r2_score(y, regr.predict(X_quad))
    # cubic fit
    regr = regr.fit(X_cubic, y)
    y_cubic_fit = regr.predict(cubic.fit_transform(X_fit))
    cubic_r2 = r2_score(y, regr.predict(X_cubic))
    # plot results
    plt.scatter(X, y, label='training points', color='lightgray')
    plt.plot(X_fit, y_lin_fit, label='linear (d=1), $R^2=%.2f$' % linear_r2, color='blue', lw=2, linestyle=':')
    plt.plot(X_fit, y_quad_fit, label='quadratic (d=2), $R^2=%.2f$' % quadratic_r2, color='red', lw=2, linestyle='-')
    plt.plot(X_fit, y_cubic_fit, label='cubic (d=3), $R^2=%.2f$' % cubic_r2, color='green', lw=2, linestyle='--')
    plt.xlabel('% lower status of the population [LSTAT]')
    plt.ylabel('Price in $1000\'s [MEDV]')
    plt.legend(loc='upper right')
    plt.show()
def get_cv_r2(labels, features, model):
    """r2scores = get_cv_r2(labels, features, model)

       Calculate cross-validated R2 score for a model. 

       Inputs:    
           labels = Labels for the data set
           features = Features for the data set
           model = the model

       Outputs:
           r2scores = R2 scores for training and cross-validation data set.

    """

    # Get training and cross-validation metrics for each k-fold
    # (uses sklearn.model_selection, which replaced the removed sklearn.cross_validation module)
    Nfolds = 5
    kf = model_selection.KFold(n_splits=Nfolds, shuffle=True, random_state=47)

    r2scores = np.zeros((Nfolds, 2), dtype=np.float64)
    ik = 0
    for itrain, icross in kf.split(features):
        ftrain = features[itrain, :]
        ltrain = labels[itrain]
        fcross = features[icross, :]
        lcross = labels[icross]
        model.fit(ftrain, ltrain)
        r2scores[ik, 0] = metrics.r2_score(ltrain, model.predict(ftrain))
        r2scores[ik, 1] = metrics.r2_score(lcross, model.predict(fcross))
        ik = ik + 1

    # Return the train / cross-validation R2 scores
    return r2scores
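# Usage sketch (added, not from the original source): score a plain linear regression
# with get_cv_r2 on synthetic data. The data generation and model choice are
# illustrative assumptions; `np` and the module-level sklearn imports used by
# get_cv_r2 are assumed to be available as above.
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
demo_features = rng.randn(200, 4)
demo_labels = demo_features.dot([1.0, 0.5, -2.0, 0.0]) + 0.1 * rng.randn(200)
demo_scores = get_cv_r2(demo_labels, demo_features, LinearRegression())
print(demo_scores.mean(axis=0))  # average train / cross-validation R^2 over the folds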
def model_years(df, model, start, end, categoricals=None):
    '''
    Run model over years from start to end

    IN
        df: dataframe with features and label
        model: initialized sklearn model
        start: int, start year
        end: int, end year
    '''
    trained = {}
    for year in range(start, end + 1):
        dfin = df.copy()[df.date < dt.date(year + 1, 1, 1)]
        print "Training... ", year
        data = BorderData(dfin, categoricals=categoricals)

        params = {}
        grid = GridSearchCV(model, params, cv=data.cv_train)
        grid.fit(data.X_train, data.y_train)

        data.predict(grid)
        data.predict_ensemble()
        print "Baseline : ", r2_score(data.y_test, data.baseline)
        print 'Model    : ', r2_score(data.y_test, data.yhat)
        print "Ensemble : ", r2_score(data.y_test, data.ensemble)

        trained[year] = (data, grid)

    return trained
Exemple #21
def otherOutcomeModel(clean_data_path,X_train,X_cross,X_test,X_predict,X_eval,outcome,name='default'):
    #Fit logit models for now, might want to do others later
    
    print "Running intermediate model on " + outcome 
    
    Y_predict = getOutcome(clean_data_path, 'prediction train',outcome)
    Y_train = getOutcome(clean_data_path,'two year train',outcome)
    Y_cross = getOutcome(clean_data_path,'cross validation data',outcome)
    
    Y_cross.fillna(0, inplace=True)
    Y_predict.fillna(0, inplace=True)
    Y_train.fillna(0,inplace=True)
    
    standardize = preprocessing.StandardScaler()
    X_train_predict = standardize.fit_transform(X_train)
    X_cross_predict = standardize.transform(X_cross)
    X_test_predict = standardize.transform(X_test)    
    standardize_predict = preprocessing.StandardScaler()
    X_predict_predict = standardize_predict.fit_transform(X_predict)
    X_eval_predict = standardize_predict.transform(X_eval)
    
    if ((Y_train == 1) | (Y_train == 0)).all():
        #Binary variable
        logit = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True, C=.0004325, intercept_scaling=1, class_weight='auto', random_state=423)
        logit.fit(X_train_predict,Y_train)
        logit2 = LogisticRegression(penalty='l2',dual=False,tol=1,fit_intercept=True, C=.0004325, intercept_scaling=1, class_weight='auto', random_state=423)
        logit2.fit(X_predict_predict,Y_predict)
        
    
        inScore = roc_auc_score(Y_cross,logit.predict_proba(X_cross_predict)[:,1])
        print "Cross Logistic: Area under auc curve is %f" % (inScore)
            
        
        X_train[name] = logit.predict_proba(X_train_predict)[:,1]
        X_cross[name] = logit.predict_proba(X_cross_predict)[:,1]
        X_test[name] = logit.predict_proba(X_test_predict)[:,1]
        X_predict[name] = logit2.predict_proba(X_predict_predict)[:,1]
        X_eval[name] = logit2.predict_proba(X_eval_predict)[:,1]
    else:
        #Continuous variable
        ridge = Ridge(alpha=.001)
        ridge.fit(X_train_predict,Y_train)
        ridge2 = Ridge(alpha=0.001)
        ridge2.fit(X_predict_predict, Y_predict)
        
        inScore = r2_score(Y_train,ridge.predict(X_train_predict))
        print "Train Ridge: r2 score is %f" % (inScore)
        
        inScore = r2_score(Y_cross,ridge.predict(X_cross_predict))
        print "Cross Ridge: r2 score is %f" % (inScore)
        
        X_train[name] = ridge.predict(X_train_predict)
        X_cross[name] = ridge.predict(X_cross_predict)
        X_test[name] = ridge.predict(X_test_predict)
        X_predict[name] = ridge2.predict(X_predict_predict)
        X_eval[name] = ridge2.predict(X_eval_predict)
    
    
    return X_train,X_cross,X_test,X_predict,X_eval
    def fit_predict(self, X, y, T):
        X = np.array(X)
        y = np.array(y)
        T = np.array(T)

        folds = list(KFold(n_splits=self.n_splits, shuffle=True, random_state=2016).split(X, y))

        S_train = np.zeros((X.shape[0], len(self.base_models)))
        S_test = np.zeros((T.shape[0], len(self.base_models)))
        for i, clf in enumerate(self.base_models):

            S_test_i = np.zeros((T.shape[0], self.n_splits))

            for j, (train_idx, test_idx) in enumerate(folds):
                X_train = X[train_idx]
                y_train = y[train_idx]
                X_holdout = X[test_idx]
                y_holdout = y[test_idx]

                clf.fit(X_train, y_train)
                y_pred = clf.predict(X_holdout)[:]

                print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))

                S_train[test_idx, i] = y_pred
                S_test_i[:, j] = clf.predict(T)[:]
            S_test[:, i] = S_test_i.mean(axis=1)
            oof_score = r2_score(y, S_train[:, i])
            print('Final Out-of-Fold Score %f' % oof_score)

        # results = cross_val_score(self.stacker, S_train, y, cv=5, scoring='r2')
        # print("Stacker score: %.4f (%.4f)" % (results.mean(), results.std()))
        # exit()

        f_train = np.zeros((X.shape[0], 1))
        f_test = np.zeros((T.shape[0], 1))
        f_test_i = np.zeros((T.shape[0], self.n_splits))
        i = 0
        total_train = np.hstack((X, S_train))
        total_test = np.hstack((T, S_test))

        for j, (train_idx, test_idx) in enumerate(folds):
                X_train = total_train[train_idx]
                y_train = y[train_idx]
                X_holdout = total_train[test_idx]
                y_holdout = y[test_idx]

                self.stacker.fit(X_train, y_train)
                y_pred = self.stacker.predict(X_holdout)[:]

                print ("Model %d fold %d score %f" % (i, j, r2_score(y_holdout, y_pred)))
                f_train[test_idx, i] = y_pred
                f_test_i[:, j] = self.stacker.predict(total_test)[:]
        f_test[:, i] = f_test_i.mean(axis=1)
        oof_score = r2_score(y, f_train[:, i])
        print('Final Out-of-Fold Score %f' % oof_score)

        return f_test
Exemple #23
 def permutationImportance(X,y,rf):
     # Get feature importances
     acc = r2_score(y, rf.predict(X))
     scores= defaultdict(list)
     for i in range(X.shape[1]):
         X_t = X.copy()
         np.random.shuffle(X_t[:, i])
         shuff_acc = r2_score(y, rf.predict(X_t))
         scores[i].append((acc-shuff_acc)/acc)
     return np.array([ np.mean(scores[i]) for i in range(X.shape[1]) ])
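# Usage sketch (added, not from the original source): permutation importances for a
# fitted random forest on synthetic data; the data and forest settings are
# illustrative assumptions only, and `np` is assumed to be imported above.
from sklearn.ensemble import RandomForestRegressor

rng = np.random.RandomState(0)
X_demo = rng.randn(300, 5)
y_demo = 3 * X_demo[:, 0] - 2 * X_demo[:, 2] + 0.1 * rng.randn(300)
rf_demo = RandomForestRegressor(n_estimators=100, random_state=0).fit(X_demo, y_demo)
print(permutationImportance(X_demo, y_demo, rf_demo))  # one mean importance per feature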
def square_score_func1(x,y,a,c,d,weight_list):
    y_predict = []
    for x_value in x:
        y_predict.append(func1(x_value, a, c, d))
    #print weight_if
    if weight_if == True:
        square_score_fitted = r2_score(y,y_predict, sample_weight = weight_list)
    else :
        square_score_fitted = r2_score(y,y_predict)
    return square_score_fitted
Exemple #25
 def score(self, x, y):
     yhat = self.predict(x)
     if self.loss_func == "mse":
         if self.output_dim == 1:
             return r2_score(y, yhat[:, 0])
         else:
             return np.mean([r2_score(y[:, i], yhat[:, i]) for i in
                             range(self.output_dim)])
     else:
         return accuracy_score(y, yhat)
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
        # Throws a deprecation warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(record=True):
        # Throws a deprecation warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true),
                        0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
Exemple #27
def bagofwords(X_train, X_cross, X_test, X_predict, X_eval, Y_train, Y_cross, Y_predict,variable = 'test', typeModel='binary',name='test'):

    X_train_text = X_train[variable]
    X_cross_text = X_cross[variable]
    X_test_text = X_test[variable]
    X_predict_text = X_predict[variable]
    X_eval_text = X_eval[variable]
    
    train_vec = getFeatures(X_train_text)
    X_train_text = train_vec.transform(X_train_text)
    X_cross_text = train_vec.transform(X_cross_text)
    X_test_text = train_vec.transform(X_test_text)
    
    predict_vec = getFeatures(X_predict_text)
    X_predict_text = predict_vec.transform(X_predict_text)
    X_eval_text = predict_vec.transform(X_eval_text)
   
    if typeModel == 'continuous':
        bowModel = Ridge(alpha = 0.001)
        bowModel2 = Ridge(alpha = 0.001)
        bowModel.fit(X_train_text,Y_train)
        bowModel2.fit(X_predict_text,Y_predict)
        
        inScore = r2_score(Y_train,bowModel.predict(X_train_text))
        print "Train Ridge: r2 score is %f" % (inScore)
        
        inScore = r2_score(Y_cross,bowModel.predict(X_cross_text))
        print "Cross Ridge: r2 score is %f" % (inScore)
        
        X_train[name] = bowModel.predict(X_train_text)
        X_cross[name] = bowModel.predict(X_cross_text)
        X_test[name] = bowModel.predict(X_test_text)
        X_predict[name] = bowModel2.predict(X_predict_text)  # use the prediction-set features (mirrors the binary branch below)
        X_eval[name] = bowModel2.predict(X_eval_text)
        
    else:
        bowModel = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=0.0005, intercept_scaling=1, class_weight=None, random_state=423) 
        bowModel2 = LogisticRegression(penalty='l2',dual=False,tol=0.0001,fit_intercept=True, C=.0005, intercept_scaling=1, class_weight=None, random_state=423) 
        bowModel.fit(X_train_text,Y_train)
        bowModel2.fit(X_predict_text,Y_predict)
        
        inScore = roc_auc_score(Y_train,bowModel.predict_proba(X_train_text)[:,1])
        print "Train Logistic: Area under auc curve is %f" % (inScore)
        
        inScore = roc_auc_score(Y_cross,bowModel.predict_proba(X_cross_text)[:,1])
        print "Cross Logistic: Area under auc curve is %f" % (inScore)
        
        X_train[name] = bowModel.predict_proba(X_train_text)[:,1]
        X_cross[name] = bowModel.predict_proba(X_cross_text)[:,1]
        X_test[name] = bowModel.predict_proba(X_test_text)[:,1]
        X_predict[name] = bowModel2.predict_proba(X_predict_text)[:,1]
        X_eval[name] = bowModel2.predict_proba(X_eval_text)[:,1]
        
    return X_train, X_cross, X_test, X_predict, X_eval
Exemple #28
def calc_new_model(hf, pos):
    # combine the masks into a single boolean index to avoid pandas' reindexing warning
    ranks = hf[(hf['Points'] > 0) & (hf['Pos'] == pos)]['Avg Rank']
    scores = hf[(hf['Points'] > 0) & (hf['Pos'] == pos)]['Points']
    if pos in PROJECTION_TYPE[FP_QB] or pos in PROJECTION_TYPE[FP_DST]:
        crazy_fit = np.poly1d(np.polyfit(ranks, scores, 5))
        print "r2 score is %f" % (r2_score(scores, map(crazy_fit, ranks)))
        return crazy_fit
    elif pos in PROJECTION_TYPE[FP_FLEX]:
        crazy_fit = np.poly1d(np.polyfit(ranks, scores, 5))
        print "r2 score is %f" % (r2_score(scores, map(crazy_fit, ranks)))
        return crazy_fit
def run_methods(train_points, train_targets, test_points, test_targets,
                model_parameters, m_list, file_name, title, show=False, full=True, vi=True):

    method = 'means'
    optimizer = 'L-BFGS-B'
    max_iter = 50
    options = {'maxiter': max_iter, 'disp': False, 'mydisp': True}

    means_r2 = []
    vi_r2 = []

    for m in m_list:
        print('m:', m)
        print('Finding means...')
        means = KMeans(n_clusters=m, n_init=1, max_iter=20)
        means.fit(train_points.T)
        inputs = means.cluster_centers_.T
        print('...found')

        model_covariance_obj = SquaredExponential(np.copy(model_parameters))
        new_gp = GPR(model_covariance_obj, method='means', optimizer=optimizer)
        res = new_gp.fit(train_points, train_targets, num_inputs=m, optimizer_options=options, inputs=inputs)
        predicted_y_test, _, _ = new_gp.predict(test_points)
        means_r2.append(r2_score(test_targets, predicted_y_test))

        if vi:
            model_covariance_obj = SquaredExponential(np.copy(model_parameters))
            new_gp = GPR(model_covariance_obj, method='vi', optimizer=optimizer)
            res = new_gp.fit(train_points, train_targets, num_inputs=m, optimizer_options=options, inputs=inputs)
            predicted_y_test, _, _ = new_gp.predict(test_points)
            vi_r2.append(r2_score(test_targets, predicted_y_test))

    if full:
        model_covariance_obj = SquaredExponential(np.copy(model_parameters))
        new_gp = GPR(model_covariance_obj, method='brute')
        res = new_gp.fit(train_points, train_targets, max_iter=max_iter)
        predicted_y_test, _, _ = new_gp.predict(test_points, train_points, train_targets)
        brute_r2 = r2_score(test_targets, predicted_y_test)

    plt.plot(range(len(m_list)), means_r2, '-kx', label='vi-means')
    if vi:
        plt.plot(range(len(m_list)), vi_r2, '-rx', label='vi')
    if full:
        plt.plot(range(len(m_list)), len(m_list) * [brute_r2], '--g', label='full GP')

    plt.xticks(range(len(m_list)), m_list)
    plt.xlabel('m')
    plt.ylabel('$R^2$-score on test data')
    # plt.ylim(0.5, 1)
    plt.legend(loc=4)
    plt.title(title)
    plt.savefig('../Plots/inducing_inputs/'+file_name + '.pgf')
    if show:
        plt.show()
def estimator_metrics(true_values, estimates):

    print "---------------------------------------"
    print "MSE: "
    print mean_squared_error(true_values, estimates)
    print "MAE: "
    print median_absolute_error(true_values, estimates)
    print "R-squared: "
    print r2_score(true_values, estimates)
    print "---------------------------------------"

    return
Exemple #31
x = veriler.iloc[:, 1:2]
y = veriler.iloc[:, 2:]
X = x.values
Y = y.values

#linear regression
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X, Y)

plt.scatter(X, Y, color='red')
plt.plot(x, lin_reg.predict(X), color='blue')
plt.show()

print("Linear R2  degeri:")
print(r2_score(Y, lin_reg.predict(X)))
#polynomial regression
from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=2)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
lin_reg2 = LinearRegression()
lin_reg2.fit(x_poly, y)
plt.scatter(X, Y, color='red')
plt.plot(X, lin_reg2.predict(poly_reg.fit_transform(X)), color='blue')
plt.show()

from sklearn.preprocessing import PolynomialFeatures
poly_reg = PolynomialFeatures(degree=4)
x_poly = poly_reg.fit_transform(X)
print(x_poly)
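# Added sketch: fit and score the degree-4 polynomial as well, mirroring the degree-2
# case above (assumes the same imports and variables as the surrounding snippet).
lin_reg4 = LinearRegression()
lin_reg4.fit(x_poly, y)
print("Polynomial (degree=4) R2 value:")
print(r2_score(Y, lin_reg4.predict(poly_reg.fit_transform(X))))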
Exemple #32
	for (i,train_day,test_day) in [(i, dp.split(train,nsplits=7)[i], dp.split(test,nsplits=7)[i]) for i in dp.split(train,nsplits=7)]: # for each day
		test_day_pred=ets_v(train_day,test_day,hor=1,batch=batch,freq=freq) # predict for all hours of the respective day
		test_pred.iloc[i::7]=test_day_pred # fill corresponding rows with out of sample predictions
	return test_pred

np.random.seed(0) # fix seed for reproducibility
path='C:/Users/SABA/Google Drive/mtsg/data/household_power_consumption.csv' # data path
load_raw=dp.load(path) # load data
load_raw=dp.cut(load_raw) # remove leading & trailing Nans
targets=load_raw.apply(axis=1,func=(lambda x: np.nan if (x.isnull().sum()>0) else x.mean())).unstack() # custom aggregation: NaN if any value in the row is NaN, otherwise the row mean
targets.fillna(method='bfill',inplace=True)
train,test=dp.split_train_test(data=targets, test_size=0.25, base=7)

# vertical
test_pred=ets_v(train,test,batch=7,freq=7)
print(r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average'))
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_v.csv')
# vertical week
test_pred=ets_vw(train,test,batch=7,freq=52)
print(r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average'))
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_vw.csv')
# horizontal
test_pred=ets(train,test,hor=24,batch=7,freq=24)
print(r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average'))
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_h.csv')
# horizontal week
test_pred=ets_hw(train,test,batch=7,freq=52)
print(r2_score(y_true=test,y_pred=test_pred,multioutput='uniform_average'))
dp.save(data=test_pred,path='C:/Users/SABA/Google Drive/mtsg/data/ets_hw.csv')

def r2d2(y_train,train_yhat):
    r2_lm1 = r2_score(y_train,train_yhat)
    return r2_lm1
Exemple #34
df_ml = df_ml.fillna(0)

X = df_ml.drop(['like'], axis = 1).values
Y = df_ml['like'].values

X = StandardScaler().fit_transform(X)

X_Train, X_Test, Y_Train, Y_Test = train_test_split(X, Y, test_size = 0.30, random_state = 101)

randomforest = RandomForestRegressor(n_estimators=500,min_samples_split=10)
randomforest.fit(X_Train,Y_Train)

p_train = randomforest.predict(X_Train)
p_test = randomforest.predict(X_Test)

train_acc = r2_score(Y_Train, p_train)
test_acc = r2_score(Y_Test, p_test)

app.layout = html.Div([html.H1("Facebook Data Analysis", style={"textAlign": "center"}), dcc.Markdown('''
Welcome to my Plotly (Dash) Data Science interactive dashboard. Two different datasets were used to create this dashboard. The first one is the [Huge Stock Market Dataset by Boris Marjanovic](https://www.kaggle.com/borismarjanovic/price-volume-data-for-all-us-stocks-etfs)
and the second one is the [Facebook metrics Data Set by Moro, S., Rita, P., & Vala, B](https://archive.ics.uci.edu/ml/datasets/Facebook+metrics). This dashboard is divided into 3 main tabs. In the first one you can choose which other companies to compare Facebook Stock Prices with, in order to analyse the main trends.
Using the second tab, you can analyse the distributions of each of the Facebook Metrics Data Set features. Of particular interest is how paying to advertise posts can boost their visibility. Finally, the third tab offers a Machine Learning analysis of the considered datasets.
All the data displayed in this dashboard is fetched, processed and updated using Python (e.g. the ML models are trained in real time!).
''')  ,
    dcc.Tabs(id="tabs", children=[
        dcc.Tab(label='Stock Prices', children=[
html.Div([html.H1("Dataset Introduction", style={'textAlign': 'center'}),
dash_table.DataTable(
    id='table',
    columns=[{"name": i, "id": i} for i in df.columns],
    data=df.iloc[0:5,:].to_dict("rows"),
def reg_s_lightGBM(merge_data,outnameimp,outname):
    from sklearn.model_selection import StratifiedShuffleSplit

    # Separate the target variable
    X = merge_data.drop("target",axis=1).values
    y = merge_data["target"].values
    columns_name = merge_data.drop("target",axis=1).columns

    # Define a helper function for the stratified split
    sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2) 
    def data_split(X,y):
        for train_index, test_index in sss.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

        X_train = pd.DataFrame(X_train, columns=columns_name)
        X_test = pd.DataFrame(X_test, columns=columns_name)

        return X_train, y_train, X_test, y_test

    # Split into train, test, and validation sets
    X_train, y_train, X_test, y_test = data_split(X, y)
    X_train, y_train, X_val, y_val = data_split(X_train.values, y_train)

    # Check the shapes
    print("train shape", X_train.shape)
    print("test shape", X_test.shape)
    print("validation shape", X_val.shape)
    # Check the shapes
    print("y_train shape", y_train.shape)
    print("y_test shape", y_test.shape)
    print("y_validation shape", y_val.shape)
    y_test_df = pd.DataFrame(y_test)
    print("y_test describe",y_test_df.describe())
    print("not_ y_test describe",(~y_test_df.duplicated()).sum())
    #y_test_df.value_counts().plot(kind="bar")
    print("y_test_df.duplicated().sum()",y_test_df.duplicated().sum())
    #print(y_test_df[y_test_df.duplicated()])
    # Check the class proportions
    plt.figure(figsize=(20,5))
    plt.subplot(1,3,1)
    plt.hist(y_train)

    plt.subplot(1,3,2)
    plt.hist(y_test)

    plt.subplot(1,3,3)
    plt.hist(y_val)

    import lightgbm as lgb

    # Create the LightGBM datasets
    train = lgb.Dataset(X_train, label=y_train)
    valid = lgb.Dataset(X_val, label=y_val)

    # Set the model parameters
    params = {'task': 'train',                # training task (as opposed to prediction)
              'boosting_type': 'gbdt',        # gradient boosted decision trees
              'objective': 'regression',      # objective function: regression
              'metric': 'rmse',               # metric used to evaluate the regression model
              'learning_rate': 0.1 }          # learning rate (default 0.1)
    # Train the model
    model = lgb.train(params,
                      train,
                      valid_sets=valid,
                      num_boost_round=5000,
                      early_stopping_rounds=500)

    # Predict
    y_pred = model.predict(X_test, num_iteration=model.best_iteration)
    

    from sklearn.metrics import mean_squared_error # for model evaluation (mean squared error)
    from sklearn.metrics import r2_score # for model evaluation (coefficient of determination)
    # Show the true and predicted values
    df_pred = pd.DataFrame({'regression_y_test':y_test,'regression_y_pred':y_pred})
    display(df_pred)

    # Scatter plot (true vs. predicted values)
    plt.plot(y_test, y_test, color = 'red', label = 'x=y') # line y = x (points lie on it when prediction equals the true value)
    plt.scatter(y_test, y_pred) # scatter plot
    plt.xlabel('y') # x-axis label
    plt.ylabel('y_test') # y-axis label
    plt.title('y vs y_pred') # plot title

    # Model evaluation
    # rmse: root mean squared error
    mse = mean_squared_error(y_test, y_pred) # compute the MSE (mean squared error)
    rmse = np.sqrt(mse) # RMSE = sqrt(MSE)
    print('RMSE :',rmse)
    # r2: coefficient of determination
    r2 = r2_score(y_test,y_pred)
    print('R2 :',r2)
    df_Df = pd.DataFrame({'regression_y_test':y_test,'regression_y_pred':y_pred,'RMSE':rmse,'R2':r2})
    df_Df.to_csv(r""+"./output/"+'DPC_g/'+outname+'.csv', encoding = 'shift-jis')
    
    importance = pd.DataFrame(model.feature_importance(), columns=['importance'])
    display(importance)
    C_you=merge_data.drop(["target"], axis=1)
    importance["columns"] =list(C_you.columns)
    importance.to_csv(r""+"./output/"+'DPC_g/'+outnameimp+'.csv', encoding = 'shift-jis')
    
    # Convert to integer classes and analyse again
    y_pred2 = y_pred.copy()  # copy so the original float predictions are not overwritten
    y_test2 = y_test.copy()
    # use a for loop
    for i in range(len(y_pred2)):
        if y_pred2[i] >= 1.51:
            y_pred2[i] = 2
        elif y_pred2[i] >= 0.51:
            y_pred2[i] = 1
        else:
            y_pred2[i] = 0
    print(y_pred2)

    for i in range(len(y_test2)):
        if y_test2[i]>=1.51:
            y_test2[i]=2
        elif y_test2[i]>=0.51:
            y_test2[i]=1
        else:
            y_test2[i]=0
    print(y_test2)

    # Analyse again with the data mapped to the classes 0, 1 and 2

    df_pred2 = pd.DataFrame({'regression_y_test2':y_test2,'regression_y_pred2':y_pred2})
    display(df_pred2)

    # Model evaluation
    # rmse: root mean squared error
    mse = mean_squared_error(y_test2, y_pred2) # compute the MSE (mean squared error)
    rmse = np.sqrt(mse) # RMSE = sqrt(MSE)
    print('RMSE :',rmse)

    # r2: coefficient of determination
    r2 = r2_score(y_test2,y_pred2)
    print('R2 :',r2)
    df_Df = pd.DataFrame({'regression_y_test2':y_test2,'regression_y_pred2':y_pred2,'RMSE':rmse,'R2':r2})
    df_Df.to_csv(r""+"./output/"+'DPC_g/'+"int"+outname+'.csv', encoding = 'shift-jis')
 model.add(Embedding(vocab_size+1, vector_length,weights=[embedding_matrix], input_length=embedding_vecor_length,trainable=False))
 # input length is the length of words in review
 model.add(Conv1D(filters=50, kernel_size=5, padding="valid", activation="relu", strides=1))
 model.add(MaxPooling1D(pool_size=1))
 model.add(Flatten())
 model.add(Dense(100,activation='relu'))
 model.add(Dense(10,activation='relu'))
 model.add(Dense(1))
 # compile the model
 # use mse (mean squared error) for the loss and also in the metrics
 # summarize the model
 model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mse'])
 print(model.summary())
 history1 = model.fit(padded_docs[train], labels[train],validation_data=(padded_docs[test], labels[test]),epochs=number_of_epoch,batch_size=100,verbose=verbose_value,callbacks=[history])
 prediction1 = model.predict(padded_docs[test])
 print(r2_score(labels[test],prediction1))
 a=history1.history['mean_squared_error']
 va=history1.history['val_mean_squared_error']
 l = history1.history['loss']
 vl=history1.history['val_loss']
 #for i in range(0,number_of_epoch):
 #    accuracy_single_list=accuracy_single_list.append(a[i])
 #    loss_single_list=loss_single_list.append(l[i])
 #    val_accuracy_single_list=val_accuracy_single_list.append(va[i])
 #    val_loss_single_list=val_loss_single_list.append(vl[i])
 accuracy.append(a)
 loss.append(l)
 val_accuracy.append(va)
 val_loss.append(vl)
 #plot_model(model, to_file='model_plot.png',show_shapes=True, show_layer_names=True)
 #predict =np.asarray( model.predict(X_test))
Exemple #37
def plot_results(trainData, testData, target_term, show): 

    target_term = target_term.replace(" ", "_").replace("/", "")
    preds = trainData["Preds"]
    target = trainData["Target"] 
    
    print("Datapoints in the training set =", len(preds))
    #plt.show()
    plt.rcParams["figure.figsize"] = (10,8)
    plt.scatter(target,preds)

    y_train = target.to_numpy()
    y_pred = preds.to_numpy()
    plt.xlim()
    limits = [(min(np.min(y_train), np.min(y_pred))) - 0.2, 0.2 + max(0, (np.max(y_train)), (np.max(y_pred)))]
    plt.xlim(limits)
    plt.ylim(limits)
    infotext = "MAE = {:.3f}\n".format(mean_absolute_error(y_train, y_pred)) + r"$r^2$ = {:.3f}".format(r2_score(y_train, y_pred))
    plt.text(limits[0], limits[1], infotext, bbox={"facecolor": "lightblue", "pad": 5})

    # for test 
    
    preds = testData["Preds"]
    target = testData["Target"] 

    print("Datapoints in the validation set =", len(preds))
    plt.rcParams["figure.figsize"] = (10,8)
    plt.scatter(target,preds)    
    
    y_test = target.to_numpy()
    y_pred = preds.to_numpy()    
    plt.suptitle(target_term, fontsize=30)
    plt.xlabel("%s_DFT"%target_term, fontsize=18)
    plt.ylabel("%s_GNN"%target_term, fontsize=18)        
    infotext2 = "MAE = {:.3f}\n".format(mean_absolute_error(y_test, y_pred)) + r"$r^2$ = {:.3f}".format(r2_score(y_test, y_pred))
    #plt.text(-6, -6, infotext2)
    plt.text(limits[0], 0.8*limits[1], infotext2, bbox={"facecolor": "orange", "pad": 5})

    plt.savefig('../plots/python/%s.png'%target_term)
    if (0 == show):
        plt.close()
    #plt.show()
    #plt.close()
    return(trainData, testData)
print(" beta_1 = %f, beta_2 = %f" % (popt[0], popt[1]))

x = np.linspace(1960, 2015, 55)
x = x/max(x)
plt.figure(figsize=(8,5))
y = sigmoid(x, *popt)
plt.plot(xdata, ydata, 'ro', label='data')
plt.plot(x,y, linewidth=3.0, label='fit')
plt.legend(loc='best')
plt.ylabel('GDP')
plt.xlabel('Year')
plt.show()

# split data into train/test
msk = np.random.rand(len(df)) < 0.8
train_x = xdata[msk]
test_x = xdata[~msk]
train_y = ydata[msk]
test_y = ydata[~msk]

# build the model using train set
popt, pcov = curve_fit(sigmoid, train_x, train_y)

# predict using test set
y_hat = sigmoid(test_x, *popt)

# evaluation
print("Mean absolute error: %.2f" % np.mean(np.absolute(y_hat - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((y_hat - test_y) ** 2))
print("R2-score: %.2f" % r2_score(y_hat , test_y) )
from sklearn import linear_model
regr = linear_model.LinearRegression()
train_x = np.asanyarray(train[['ENGINESIZE']])
train_y = np.asanyarray(train[['CO2EMISSIONS']])
regr.fit(train_x, train_y)
# The coefficients
print('Coefficients: ', regr.coef_)
print('Intercept: ', regr.intercept_)

# In[28]:

plt.scatter(train.ENGINESIZE, train.CO2EMISSIONS, color='blue')
plt.plot(train_x, regr.coef_[0][0] * train_x + regr.intercept_[0], '-r')
plt.xlabel("Engine size")
plt.ylabel("Emission")
plt.show()

# In[29]:

from sklearn.metrics import r2_score

test_x = np.asanyarray(test[['ENGINESIZE']])
test_y = np.asanyarray(test[['CO2EMISSIONS']])
test_y_ = regr.predict(test_x)

print("Mean absolute error: %.2f" % np.mean(np.absolute(test_y_ - test_y)))
print("Residual sum of squares (MSE): %.2f" % np.mean((test_y_ - test_y)**2))
print("R2-score: %.2f" % r2_score(test_y_, test_y))

# In[ ]:
data.drop_duplicates(inplace=True)
onehot = LabelEncoder()
for i in data.columns:
    if data[i].dtype == 'object':
        data[i] = onehot.fit_transform(data[i])
x = data.iloc[:,:-1]
y = data.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(x,y,train_size=0.6,random_state=1)
clf1 = LinearRegression()
clf1.fit(X_train,y_train)
'''R^2 measures how much better the predictions are than simply using the mean; it usually lies in (0, 1).
0 means the model is no better than just predicting the mean, while 1 means the predictions match the true values perfectly.
In other words, it is the degree of improvement over the mean on a [0, 1] scale: 0 is no better than the mean, 1 is a perfect prediction.
'''
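# Worked illustration of the note above (added sketch; assumes numpy as np and the
# sklearn metrics module are imported as in the surrounding snippet):
# R^2 = 1 - SS_res / SS_tot, so always predicting the mean scores 0 and a perfect
# prediction scores 1.
_y_demo = np.array([1.0, 2.0, 3.0, 4.0])
print(metrics.r2_score(_y_demo, np.full_like(_y_demo, _y_demo.mean())))  # 0.0
print(metrics.r2_score(_y_demo, _y_demo))                                # 1.0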
print('Mean squared error', metrics.mean_squared_error(y_test,clf1.predict(X_test)))
print('R^2 ',int(round(metrics.r2_score(y_test,clf1.predict(X_test)),2)*100),'%')
# Ridge regression
ridge = linear_model.Ridge(alpha=10)
ridge.fit(X_train,y_train)
print('Mean squared error', metrics.mean_squared_error(y_test,ridge.predict(X_test)))
print('R^2 ',int(round(metrics.r2_score(y_test,ridge.predict(X_test)),2)*100),'%')
# Lasso regression
lasso = linear_model.Lasso(alpha=0.1)
lasso.fit(X_train,y_train)
print('Mean squared error', metrics.mean_squared_error(y_test,lasso.predict(X_test)))
print('R^2 ',int(round(metrics.r2_score(y_test,lasso.predict(X_test)),2)*100),'%')
# Grid search
print('-------------Ridge regression--------------------')
gs1 = GridSearchCV(ridge,param_grid={'alpha':[0.01,0.1,1,10]},scoring='r2')
gs1.fit(X_train,y_train)
print(gs1.best_score_)
# train model with your data
model.fit(feature_train, target_train)

# Score your model
score = model.score(feature_train, target_train)
print("Score:\n", score)

# predict data using your model
target_prediction = model.predict(feature_test)
print("Prediction:\n", target_prediction)
plt.plot(target_prediction, 'rx')  # plot the predictions by index (the original passed a scalar x, which matplotlib rejects)

# calculate the amount of error in your prediction
MSE = mean_squared_error(target_test, target_prediction)
R2 = r2_score(target_test, target_prediction)
print("MSE:\n", MSE)
print("R2:\n", R2)

# print statistics for your model
intercept = model.intercept_
coeff = model.coef_
print("Intercept:\n", intercept)
print("Coeff:\n", coeff)

# plot the graph based on the intercept and coefficient
points = [intercept + coeff[0] * eachitem[0] for eachitem in feature_train]
print(points)
plt.plot(points, 'r--')

# Save model
Exemple #42
                              include_estimators=[
                                  "gaussian_process",
                              ],
                              exclude_estimators=None,
                              resampling_strategy_arguments={'folds': 5})
start = time.time()

#X_train = X_train.astype('float') # when?
automl.fit(X_train, y_train,
           dataset_name='boston_housing')  #change dataset name accordingly
automl.refit(X_train.copy(), y_train.copy())
print(
    '[INFO] Elapsed time finding best model: {} seconds.'.format(time.time() -
                                                                 start))

predictions = automl.predict(X_test)
#print('--- CLASSIFICATION REPORT: ---')        #not for regression
#print(classification_report(y_test, predictions, digits=5))
print('\n\n--- MODELS: ---')
print(automl.show_models())
print('\n\n--- STATISTICS: ---')
print(automl.sprint_statistics())

#-----CLASSIFIER-----
#print('\n\n--- SCORE: ---')
#print("Balanced error score", 1 - balanced_accuracy_score(y_test, predictions))

#-----REGRESSION-----
print('\n\n--- SCORE: ---')
print("R2 score", r2_score(y_test, predictions))
           validation_split=0.2, verbose=1)

#4. Evaluate and predict
loss, mae = model.evaluate(x_test, y_test, batch_size=10)
print('loss, mae : ', loss, mae)
y_predict=model.predict(x_test)

#RMSE
from sklearn.metrics import mean_squared_error
def RMSE(y_test, y_predict) :
    return np.sqrt(mean_squared_error(y_test, y_predict))
print("RMSE : ", RMSE(y_test, y_predict))

#R2
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_predict)
print('R2 :', r2)

'''
The parameters are the same as before; dropout is not a (weighted) layer, so it adds none.
In practice dropout is applied only during training;
at test time all the layers are used as they are.
Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
input_1 (InputLayer)         [(None, 10)]              0
_________________________________________________________________
dense (Dense)                (None, 15)                165       
_________________________________________________________________
dropout (Dropout)            (None, 15)                0
Exemple #44
# 3. Training
model.compile(loss='mse', optimizer='adam', metrics=['mse'])
model.fit(x_train,
          y_train,
          epochs=100,
          batch_size=1,
          validation_split=0.25,
          verbose=1)

# 4. Evaluate, predict
loss, mse = model.evaluate(x_test, y_test, batch_size=1)
print(f"loss : {loss}, mse : {mse}")

y_pred = model.predict(x_test)
print(f"y_predict : {y_pred}")

# Compute RMSE
from sklearn.metrics import mean_squared_error


def RMSE(y_true, y_predict):
    return np.sqrt(mean_squared_error(y_true, y_predict))


print(f"RMSE : {RMSE(y_test, y_pred)}")

# Compute R^2
from sklearn.metrics import r2_score
r2_y_pred = r2_score(y_test, y_pred)
print(f"R2: {r2_y_pred}")
def calc_ToP_result(alg_hyp_set_combo,
                    scaler_class,
                    dependent_variable,
                    first_train_idx,
                    test_idx,
                    random_state=None):
    if not isinstance(alg_hyp_set_combo, BaseEstimator):
        raise TypeError(
            "alg_hyp_set_combo must be an instance of BaseEstimator, but was a {type(alg_hyp_set_combo).__name__}"
        )
    if scaler_class not in [StandardScaler, RobustScaler, MinMaxScaler]:
        raise ValueError(
            "scaler_class must be either the StandardScaler, RobustScaler, or MinMaxScaler classes, but was {scaler_class}"
        )
    if not isinstance(dependent_variable, str):
        raise TypeError(
            f"dependent_variable must be a str, but was a {type(dependent_variable).__name__}"
        )
    elif dependent_variable not in ["Log(Rmax)", "Log(Efficiency)"]:
        raise ValueError(
            f"dependent_variable must be either 'Log(Rmax)' or 'Log(Efficiency)', but was '{dependent_variable}'"
        )
    if not isinstance(first_train_idx, int):
        raise TypeError(
            f"first_train_idx must be an int, but was a {type(first_train_idx).__name__}"
        )
    elif first_train_idx not in range(1, 18):
        raise ValueError(
            f"first_train_idx must be between 1 and 17 inclusive, but was {first_train_idx}"
        )
    if not isinstance(test_idx, int):
        raise TypeError(
            f"test_idx must be an int, but was a {type(test_idx).__name__}")
    elif test_idx not in range(2, 19):
        raise ValueError(
            f"test_idx must be between 2 and 18 inclusive, but was {test_idx}")
    if test_idx <= first_train_idx:
        raise ValueError(
            f"test_idx must be greater than first_train_idx, but test_idx was {test_idx} and first_train_idx was {first_train_idx}"
        )

    if random_state is not None:
        random.seed(random_state)
        np.random.seed(random_state)
        tensorflow.random.set_seed(random_state)

    clean_train = None
    clean_test = None

    if dependent_variable == "Log(Rmax)":
        clean_train_dep_var_rmax, clean_test_dep_var_rmax = get_clean_datasets_rmax(
            all_datasets, test_idx, range(first_train_idx, test_idx))
        clean_test_dep_var_rmax_no_dupes = no_dupes(clean_train_dep_var_rmax,
                                                    clean_test_dep_var_rmax)
        clean_train = clean_train_dep_var_rmax
        clean_test = clean_test_dep_var_rmax_no_dupes
    else:
        clean_train_dep_var_efficiency, clean_test_dep_var_efficiency = get_clean_datasets_efficiency(
            all_datasets, test_idx, range(first_train_idx, test_idx))
        clean_test_dep_var_efficiency_no_dupes = no_dupes(
            clean_train_dep_var_efficiency, clean_test_dep_var_efficiency)
        clean_train = clean_train_dep_var_efficiency
        clean_test = clean_test_dep_var_efficiency_no_dupes

    random.seed(10)

    train_x, train_y, test_x, test_y = normalize_and_split(
        clean_train, clean_test, normalizer=scaler_class)

    model = clone(
        alg_hyp_set_combo)  #clone() in case the model was already trained
    model.fit(train_x, train_y)
    pred_y = model.predict(test_x)
    return r2_score(test_y, pred_y)
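# Hypothetical usage of calc_ToP_result, assuming the module-level helpers it relies on
# (all_datasets, get_clean_datasets_rmax, no_dupes, normalize_and_split) are available:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler

r2 = calc_ToP_result(Ridge(alpha=1.0),   # any scikit-learn BaseEstimator instance
                     StandardScaler,     # the scaler class itself, not an instance
                     "Log(Rmax)",
                     first_train_idx=1,
                     test_idx=5,
                     random_state=42)
print(r2)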
Example #46
hLayer = Dense(hLayer.relu(), 128, initialize, 'w1')
hLayer = hLayer.dropout(0.5)
hLayer = Dense(hLayer.relu(), 128, initialize, 'w2')
hLayer = hLayer.dropout(0.5)
hLayer = Dense(hLayer.relu(), 256, initialize, 'w3')
hLayer = hLayer.dropout(0.5)
hLayer = Dense(hLayer.relu(), output_dim, initialize, 'w_out')
hLayer = hLayer.relu()
hypothesis = hLayer.Y

cost = tf.reduce_mean(tf.square(hypothesis - Y))
train = tf.train.GradientDescentOptimizer(learning_rate=1e-5).minimize(cost)

# Launch the graph in a Session
from sklearn.metrics import r2_score, mean_squared_error

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())

    for step in range(5001):
        cost_val, _ = sess.run([cost, train],
                               feed_dict={
                                   X: x_train,
                                   Y: y_train
                               })
        if step % 20 == 0: print('>>', step, 'cost:', cost_val)

    h = sess.run(hypothesis, feed_dict={X: x_test})
    r2Score = r2_score(y_test, h)  # higher is better
    rmseScore = np.sqrt(mean_squared_error(y_test, h))  # lower is better
    print('R2:', r2Score, 'RMSE:', rmseScore)
Example #47
from sklearn.metrics import r2_score as sklearn_r2_score

def r2_score(y_true, y_pred):
    """Implements the r2_score metric from sklearn (one value per output)."""
    return sklearn_r2_score(y_true, y_pred, multioutput="raw_values")  # aliased call avoids infinite recursion
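# Hypothetical usage of the wrapper above: per-output R2 for a two-output target.
import numpy as np

y_true = np.array([[3.0, 1.0], [2.5, 2.0], [4.0, 3.0]])
y_pred = np.array([[2.8, 1.2], [2.6, 1.8], [3.9, 3.1]])
print(r2_score(y_true, y_pred))  # one R2 value per output column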
    # Optimize the number of components by cross-validation
    components = []  # empty list to collect the candidate numbers of components
    r2_in_cv_all = []  # empty list to collect the cross-validated r2 for each number of components
    for component in range(
            1,
            min(np.linalg.matrix_rank(autoscaled_x),
                max_number_of_principal_components) + 1):
        # PLS
        model = PLSRegression(n_components=component)  # declare the PLS model
        estimated_y_in_cv = pd.DataFrame(
            cross_val_predict(
                model, autoscaled_x, autoscaled_y,
                cv=fold_number))  # cross-validated predictions, converted to a DataFrame
        estimated_y_in_cv = estimated_y_in_cv * y.std() + y.mean()  # rescale back to the original units
        r2_in_cv = metrics.r2_score(y, estimated_y_in_cv)  # compute r2
        print(component, r2_in_cv)  # print the number of components and r2
        r2_in_cv_all.append(r2_in_cv)  # store r2
        components.append(component)  # store the number of components
    optimal_component_number = components[r2_in_cv_all.index(
        max(r2_in_cv_all))]
    print('\nOptimal number of components selected by CV :', optimal_component_number)
    # PLS
    model = PLSRegression(n_components=optimal_component_number)  # declare the model
elif method_name == 'svr':
    # Optimize gamma by maximizing the variance of the Gram matrix
    variance_of_gram_matrix = list()
    for index, ocsvm_gamma in enumerate(svr_gammas):
        print(index + 1, '/', len(svr_gammas))
        gram_matrix = np.exp(
            -ocsvm_gamma *
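# The excerpt above is truncated; a minimal sketch of the same heuristic (pick the
# gamma that maximizes the variance of the RBF Gram matrix), assuming autoscaled_x
# is a 2-D array and svr_gammas is the list of candidate values:
from scipy.spatial.distance import cdist

sq_dists = cdist(autoscaled_x, autoscaled_x, metric='sqeuclidean')
variances = []
for ocsvm_gamma in svr_gammas:
    gram_matrix = np.exp(-ocsvm_gamma * sq_dists)  # RBF kernel (Gram) matrix
    variances.append(np.var(gram_matrix.ravel()))
optimal_svr_gamma = svr_gammas[int(np.argmax(variances))]  # hypothetical variable name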
Example #49
plt.title('Price ~ RM')
plt.xlabel('RM')
plt.ylabel('Price')
plt.show()

# Compute MSE (Mean Squared Error: the mean of the squared errors)
# error = y - y_hat, error^2 = (y - y_hat)^2
# MSE = sum(error^2) / number of samples
mse = mean_squared_error(y_test, y_pred_rm)
# RMSE(Squared-Root MSE)
rmse = np.sqrt(mse)
print('Price ~ RM: RMSE =', rmse)

# Compute the R2 score (coefficient of determination)
r2_1 = lin_reg.score(X_test_rm, y_test)
r2_2 = r2_score(y_test, y_pred_rm)
print(f'Price ~ RM: R^2 = {r2_1}, {r2_2}')

# Price ~ LSTAT linear regression: price = b0 + b1 * lstat
X_train_lstat = X_train[:, np.newaxis, 12]  # training set
X_test_lstat = X_test[:, np.newaxis, 12]  # test set

lin_reg.fit(X_train_lstat, y_train)  # fit the model on the training set
print(f'Price ~ LSTAT: intercept: {lin_reg.intercept_}, coefficients: {lin_reg.coef_}')

y_pred_lstat = lin_reg.predict(X_test_lstat)  # predict on the test set

plt.scatter(X_test_lstat, y_test)  # scatter plot of the actual values
plt.plot(X_test_lstat, y_pred_lstat, 'r')  # line plot of the predicted values
plt.title('Price ~ LSTAT')
plt.xlabel('LSTAT')
def train(df,
          target_column="price",
          initial_features=None,
          method='rf',
          test_size=0.3,
          random_state_split=66,
          **kwargs):
    """ train model return model object and print model R square
    Args:
        df: dataframe including additional features
        target_column: target column name (dependent variable)
        initial_features: features for training model
        method: Type of model to train (e.g. 'rf' for random forest)
        test_size: Test set size for training model
        random_state_split: Seed for splitting train and test
         **kwargs: Keyword arguments for sklearn.ensemble.RandomForestRegressor. Please see sklearn documentation
            for all possible options:
            https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
    
    Returns:
        model: model object 
        df_metric: metric saved in a dataframe
    """

    # Check that an input dataframe and a feature list were provided
    if df is None:
        logger.warning("Input dataframe is empty. Empty frame returned")
        return None, None

    if initial_features is None:
        logger.warning("No features to train the model. Empty frame returned")
        return None, None

    if not isinstance(df, pd.DataFrame):
        logger.error("Parameter %s is not a DataFrame object.", df)
        return None, None

    logger.info('Training a %s model', method)

    # Generate features
    X = df[initial_features]
    Y = df[target_column]

    try:
        # Split data into test and train
        X_train, X_test, Y_train, Y_test = train_test_split(
            X, Y, test_size=test_size, random_state=random_state_split)

        # Instantiate model
        model = methods[method](**kwargs)

        # Train the model on training data
        model.fit(X_train, Y_train)

    except Exception as err:
        logger.error("Error occurred while training the model: %s", err)
        return None, None

    # Use the forest's predict method on the test data
    y_hat = model.predict(X_test)

    df_metric = pd.DataFrame(data={'R square': [r2_score(Y_test, y_hat)]})

    return model, df_metric
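# Hypothetical usage of train(), with made-up column names, assuming `df` is a pandas
# DataFrame containing them and that methods['rf'] maps to sklearn's RandomForestRegressor:
model, df_metric = train(df,
                         target_column="price",
                         initial_features=["bedrooms", "bathrooms", "sqft_living"],
                         method='rf',
                         test_size=0.3,
                         n_estimators=200,   # forwarded to RandomForestRegressor via **kwargs
                         random_state=42)    # also forwarded via **kwargs
print(df_metric)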
diabetes_X_test = diabetes_X[-20:]

# Split the targets into training/testing sets
diabetes_y_train = diabetes.target[:-20]
diabetes_y_test = diabetes.target[-20:]

# Create linear regression object
regr = linear_model.LinearRegression()

# Train the model using the training sets
regr.fit(diabetes_X_train, diabetes_y_train)

# Make predictions using the testing set
diabetes_y_pred = regr.predict(diabetes_X_test)

# The coefficients
print('Coefficients: \n', regr.coef_)
# The mean squared error
print("Mean squared error: %.2f" %
      mean_squared_error(diabetes_y_test, diabetes_y_pred))
# The coefficient of determination (R^2): 1 is perfect prediction
print('R^2 score: %.2f' % r2_score(diabetes_y_test, diabetes_y_pred))

# Plot outputs
plt.scatter(diabetes_X_test, diabetes_y_test, color='black')
plt.plot(diabetes_X_test, diabetes_y_pred, color='blue', linewidth=3)

plt.xticks(())
plt.yticks(())

plt.show()
Example #52
def validation(fit, outcome , time, continuous=False):

    plt.rcParams['figure.dpi']= 300
    plt.rcParams['figure.figsize'] = (16, 9)
    plt.rcParams.update({'font.size': 16})
    
    fitP=pd.DataFrame(data=fit)
    outcomeP=pd.DataFrame(data=outcome)
    timeP=pd.DataFrame(data=time)
    
    if isinstance(fit, pd.Series):
        fit=fit.values
    if isinstance(outcome, pd.Series):
        outcome=outcome.values
    if isinstance(time, pd.Series):
        time=time.values
    
    data_in = pd.concat([fitP, outcomeP, timeP], axis=1)
    data_in.columns = ['fit', 'outcome', 'time']
    means = data_in.groupby('time')[['fit', 'outcome']].mean().reset_index(drop=False)
  
    data_in['outcomeD']=data_in.loc[:,'outcome']    
    if continuous:
        data_in.loc[data_in['outcome'] >= data_in.outcome.mean(), 'outcomeD'] = 1
        data_in.loc[data_in['outcome'] <  data_in.outcome.mean(), 'outcomeD'] = 0
    
    outcomeD=data_in.loc[:,'outcomeD'].values

    lr_log_loss = np.nan
    roc_auc = np.nan
    brier = np.nan
    binom_p = np.nan
    Jeffreys_p =  np.nan
    
    max_outcome_fit=np.maximum(max(outcome), max(fit))
    min_outcome_fit=np.minimum(min(outcome), min(fit)) 
    if min_outcome_fit>=0 and max_outcome_fit<=1:
        lr_log_loss = log_loss(outcomeD, fit).round(4)
        roc_auc = roc_auc_score(outcomeD, fit).round(4)
        binom_p = binom_test(sum(outcomeD), n=len(outcomeD), p= np.mean(fit), alternative='greater').round(decimals=4)
        Jeffreys_p =  beta.cdf(np.mean(fit), sum(outcomeD)+0.5, len(outcomeD)-sum(outcomeD)+0.5).round(decimals=4)

            
    corr,_=pearsonr(fit,outcome)
    r2_OLS=corr**2
    
    the_table = [['Counts', len(outcome)],
                      ['Mean outcome', (sum(outcome)/len(outcome)).round(4)],
                      ['Mean fit', np.mean(fit).round(4)],
                      ['AUC ', roc_auc],
                      ['R-squared (OLS)', round(r2_OLS,4)],
                      ['R-squared', r2_score(outcome, fit).round(decimals=4)],
                      ['RMSE/ SQR(Brier score)', round(np.sqrt(((outcome-fit).dot(outcome-fit))/len(outcome)),4)],
                      ['Log loss', lr_log_loss], 
                      ['Binomial p-value', binom_p],
                      ['Jeffreys p-value', Jeffreys_p]]
    the_table=pd.DataFrame(data=the_table)
    the_table.columns = ['Metric', 'Value']
    
    plt.subplots_adjust(hspace=0.4, wspace=0.4)
 
    plt.subplot(221)
    plt.title('Summary')
    plt.axis('off')
    plt.axis('tight')
    test=plt.table(cellText=the_table.values, colLabels=the_table.columns, loc='center', cellLoc='center', colWidths=[0.34, 0.2])
    test.auto_set_font_size(False)
    test.set_fontsize(16) 
    test.scale(2, 1.5)
    
    plt.subplot(222)
    plt.title('Time-Series Real-Fit')
    plt.plot(means['time'],means['outcome'])
    plt.plot(means['time'],means['fit'], color='red', ls='dashed')
    plt.xlabel('Time', fontsize=15)
    plt.ylabel('Mean', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    plt.legend(('Outcome','Fit'), loc='best', fontsize=15)
    
    plt.subplot(223)
    plt.title('Fit Histogram')
    plt.hist(fit, bins=20, histtype='bar', density=True)
    plt.xlabel('Fit', fontsize=15)
    plt.ylabel('Frequency', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    
    data_in['cat'] = pd.qcut(data_in.fit, 10, labels=False, duplicates='drop')
    real_fit = data_in.groupby('cat')[['fit', 'outcome']].mean()
    mpv=real_fit.fit.values
    fop=real_fit.outcome.values
    
    maximum=np.maximum(max(fop), max(mpv))       
    maximum=np.ceil(maximum*100)/100
    minimum=np.minimum(min(fop), min(mpv))
    minimum=np.floor(minimum*100)/100
    
    plt.subplot(224)
    plt.title('Calibration Curve')
    plt.plot(mpv, fop, marker='.', linestyle='', markersize=18)
    plt.plot([minimum,maximum],[minimum,maximum], linestyle='--', color='gray')
    plt.xlim((minimum,maximum))
    plt.ylim((minimum,maximum))
    plt.xlabel('Mean fit', fontsize=15)
    plt.ylabel('Mean outcome', fontsize=15)
    plt.tick_params(axis='both', labelsize=13)
    plt.show()    
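# Hypothetical smoke test for validation(), using synthetic data in [0, 1] so the
# probability-based metrics (log loss, AUC, binomial/Jeffreys p-values) are computed:
rng = np.random.default_rng(0)
n = 500
time_idx = np.repeat(np.arange(10), n // 10)             # 10 equal time buckets
fit_vals = rng.uniform(0.05, 0.95, size=n)               # fitted probabilities
outcome_vals = rng.binomial(1, fit_vals).astype(float)   # binary outcomes drawn from them
validation(fit_vals, outcome_vals, time_idx, continuous=False)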
# activation: {'identity', 'logistic', 'tanh', 'relu'}, default 'relu'

# other regressors
#reg = RandomForestRegressor(max_depth=2, random_state=9)

# print the model with all of its parameters
print(reg)

# fit model with train data
model = reg.fit(Xtrain, ytrain)

# prediction for test set
preds = model.predict(Xtest)

# sklearn regression scores
print('r2 (coefficient of determination) score: ', r2_score(ytest, preds))
print('explained_variance_score: ', explained_variance_score(ytest, preds))
print('mean_absolute_error ', mean_absolute_error(ytest, preds))
print('mean_squared_error ', mean_squared_error(ytest, preds))
#print('mean_squared_log_error ', mean_squared_log_error(ytest,preds))
print('median_absolute_error ', median_absolute_error(ytest, preds))

# visualization of predictions vs target
# ---
hm = len(ytest)
ytestsortind = sorted(range(len(ytest)), key=lambda x: ytest[x])
ytestsort = ytest[ytestsortind[:hm]]
predssort = preds[ytestsortind[:hm]]

plt.title('Ordered test set target vs regression')
plt.xlabel('label')
Example #54
            c='limegreen',
            marker='s',
            edgecolor='white',
            label='Test Data')
plt.xlabel('predicted value')
plt.ylabel('residual')
plt.legend(loc='upper left')
plt.hlines(y=0, xmin=-3, xmax=3, color='black', lw=2)
plt.xlim([-3, 3])
plt.show()
print('\n')
print('The MSE of prediction is:')
print(mean_squared_error(y_test, y_pred))
print('\n')
print('The R^2 score is:')
print(r2_score(y_test, y_pred))

for i in [0.5, 1, 5, 10]:
    ridge = Ridge(alpha=i)
    ridge.fit(X_train, y_train)
    print('\nfor alpha =', i, '\n')
    print('Slope:')
    print(ridge.coef_)
    print('Intercept:')
    print(ridge.intercept_)
    y_r_pred = ridge.predict(X_test)
    y_r_tpred = ridge.predict(X_train)
    plt.scatter(y_r_tpred,
                y_r_tpred - y_train,
                c='steelblue',
                marker='o',
Example #55
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Code starts here

#Instantiate linear regression model
regressor = LinearRegression()

# fit the model
regressor.fit(X_train, y_train)

# predict the result
y_pred = regressor.predict(X_test)

# Calculate r2_score
r2 = r2_score(y_test, y_pred)

#print r2
print(r2)

# Code ends here

# --------------
from sklearn.linear_model import Lasso

# Code starts here

# instantiate lasso model
lasso = Lasso()

# fit and predict
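# The example is cut off above; a minimal sketch of the intended fit/predict step,
# mirroring the linear-regression block earlier and reusing its X/y split names:
lasso.fit(X_train, y_train)
lasso_pred = lasso.predict(X_test)          # hypothetical variable name
print(r2_score(y_test, lasso_pred))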
    if iter % 100 == 0:
        print("iteration: %s, loss: %s" % (iter, loss.item()))

# Save model
save_filename = 'checkpoints/LSTM_FC.pth'
torch.save(model, save_filename)
print('Saved as %s' % save_filename)

# Start evaluating model
model.eval()

y_pred_dep_ = model(X_test_dep_std).detach().numpy()
y_pred_dep = ss_y_dep.inverse_transform(y_pred_dep_[0, 144:])

print('the value of R-squared of Evaporation is ',
      r2_score(Outputs[144:], y_pred_dep))
print('the value of Root mean squared error of Evaporation is ',
      rmse(Outputs[144:], y_pred_dep))

f, ax1 = plt.subplots(1, 1, sharex=True, figsize=(6, 4))

ax1.plot(Outputs[144:],
         color="blue",
         linestyle="-",
         linewidth=1.5,
         label="Measurements")
ax1.plot(y_pred_dep,
         color="green",
         linestyle="--",
         linewidth=1.5,
         label="Proposed model")
Example #57
def polynomial_regression():
    # Check if user is loggedin
    if ('loggedin' in session):

        # Init variables
        (data_to_html, feature, graph_title, 
         msg_suc, msg_err, msg_warn) = (None,) * 6

        # Init list
        columns, res_list, score_list = [], [], []  # independent lists ((list(),) * 3 would alias one list three times)

        # Get session details
        username = session['username']
        lang     = session['lang']

        # Define tag category + model
        cat_tag = 'REG'
        mod_tag = 'PR'

        # Connect to database
        cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)

        # Get categories of navbar
        navbar_cat      = datas_cat_nav(cursor, lang)
        navbar_cat_name = navbar_cat[0]
        navbar_cat_tag  = navbar_cat[1]
        navbar_cat_icon = navbar_cat[2]
        navbar_cat_link = navbar_cat[3]

        # Get models of navbar
        navbar_models = datas_mod_nav(cursor, lang, navbar_cat_tag)

        # Get settings of navbar
        navbar_settings = datas_set_nav(cursor, lang)
        navbar_set_name = navbar_settings[0]
        navbar_set_icon = navbar_settings[1]
        navbar_set_link = navbar_settings[2]

        # Get category details for breadcrumb
        cat_details   = cards_categories(cursor, lang, cat_tag)
        cat_name      = cat_details[0]
        cat_link      = cat_details[3]

        # Get model details for breadcrumb
        model_details = datas_model(cursor, lang, mod_tag)
        model_name    = model_details[0]
        model_link    = model_details[1]

        # Break connection
        cursor.close()

        if (request.method == 'POST'):

            # Upload file
            if (request.form['submit_btn'] == 'Upload Now'
                or request.form['submit_btn'] == 'Envoyer maintenant'):

                # All fields were completed
                if (bool(request.files['file']) == 1 
                    and bool(request.form['sep_select']) == 1
                ):
                    get_upload_datas = upload_file(lang, False)
                    msg_err          = get_upload_datas[0]
                    msg_suc          = get_upload_datas[1]
                    msg_warn         = get_upload_datas[2]

                    global new_tmp_path
                    new_tmp_path = get_upload_datas[3]

                    global colname_list
                    colname_list = get_upload_datas[4]
                    columns      = colname_list

                    data_to_html = get_upload_datas[5]

                    global df
                    df = get_upload_datas[6]

                else:
                    if (lang == 'en'):
                        # Submit without upload file
                        msg_err = (
                            'Please upload your data and select a separator.'
                        )
                    
                    else:
                        msg_err = (
                            'Veuillez télécharger vos données et '
                            'choisir un séparateur.'
                        )

            # Model compute
            if (request.form['submit_btn'] == 'Launch the model'
                or request.form['submit_btn'] == 'Lancer le modèle'):
                feature = request.form['feature']

                # Get colname list
                columns = list(colname_list)  # copy, so removing the feature below does not mutate the global list

                # Show the uploaded data as an HTML table
                data_to_html = df_html_show(df)

                # Delete feature from columns
                columns.remove(feature)

                for i in columns:
                    x_feat = df[feature].values.reshape(-1, 1)
                    y_targ = df[i].values.reshape(-1, 1)

                    # Train Test
                    X_train, X_test, y_train, y_test = train_test_split(
                        x_feat,
                        y_targ,
                        test_size=0.33,
                        random_state=42
                    )

                    score_rmse = list()
                    min_rmse, min_deg = (math.inf,) * 2

                    for deg in range(1, 11):

                        # Train features
                        poly_features = PolynomialFeatures(degree=deg, include_bias=False)
                        x_poly_train  = poly_features.fit_transform(X_train)

                        # Linear regression
                        poly_reg = LinearRegression().fit(x_poly_train, y_train)

                        # Compare with test data (transform only; the features were fitted on the training split)
                        x_poly_test  = poly_features.transform(X_test)
                        poly_predict = poly_reg.predict(x_poly_test)

                        poly_rmse = np.sqrt(mean_squared_error(y_test, poly_predict))

                        score_rmse.append(poly_rmse)

                        # Keep the degree with the lowest test RMSE
                        if (min_rmse > poly_rmse):
                            min_rmse = poly_rmse
                            min_deg  = deg

                    # Create Polynomial model
                    polynomial = PolynomialFeatures(degree=min_deg)

                    # Fit polynomial model
                    X_train = polynomial.fit_transform(X_train)
                    X_test  = polynomial.transform(X_test)

                    # Create linear model and fit
                    regressor = linear_model.LinearRegression().fit(X_train, y_train)

                    # Predicting test set results
                    y_test_pred = regressor.predict(X_test)

                    # Prediction
                    y_pred = regressor.predict(X_train)
                    y_pred = y_pred.tolist()

                    # Accuracy
                    r2_test  = r2_score(y_test , y_test_pred) * 100
                    r2_train = r2_score(y_train, y_pred) * 100

                    res = [i, round(statistics.mean([r2_test, r2_train]), 2)]
                    res_list.append(res)

                # Save scoring
                score_list = [score[1] for score in res_list]

                if (lang == 'en'):
                    # Add graph title
                    graph_title = (
                        'Comparison of the correlation between ' + feature + 
                        ' and the columns :'
                    )

                    # Success
                    msg_suc = (
                        'The model was successfully calculated. '
                        'Your data was automatically deleted.'
                    )

                else:
                    graph_title = (
                        'Comparaison de la corrélation entre ' + feature + 
                        ' et les colonnes :'
                    )

                    msg_suc = (
                        'Le modèle a été calculé avec succès.  '
                        'Vos données ont été automatiquement supprimées.'
                    )

                # Delete file
                file_remove(new_tmp_path)

        return render_template(
            'regression/pol_reg.html',
            title        = model_name,
            username     = username,
            lang         = lang,
            nav_cat_name = navbar_cat_name,
            nav_cat_tag  = navbar_cat_tag,
            nav_cat_icon = navbar_cat_icon,
            nav_cat_lnk  = navbar_cat_link,
            nav_models   = navbar_models,
            nav_set_name = navbar_set_name,
            nav_set_icon = navbar_set_icon,
            nav_set_lnk  = navbar_set_link,
            cat_name     = cat_name,
            cat_tag      = cat_tag,
            cat_link     = cat_link,
            model_name   = model_name,
            model_link   = model_link,
            msg_err      = msg_err,
            msg_suc      = msg_suc,
            msg_warn     = msg_warn,
            data_show    = data_to_html,
            df_columns   = columns,
            feature      = feature,
            score_list   = score_list,
            graph_title  = graph_title
        )

    else:
        return redirect('404')
Example #58
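# The snippet below is the body of a feature-selection loop whose header the excerpt
# omits; a minimal sketch of the usual surrounding code, assuming a fitted XGBRegressor
# named `model` and the train/test arrays used below:
import numpy as np
from sklearn.feature_selection import SelectFromModel

thresholds = np.sort(model.feature_importances_)   # candidate importance cut-offs
for thresh in thresholds:
    selection = SelectFromModel(model, threshold=thresh, prefit=True)
    # ... the transform / fit / evaluate code below runs inside this loop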
    select_x_train = selection.transform(x_train)
    select_x_test = selection.transform(x_test)

    selection_model = XGBRegressor(n_jobs=-1)

    selection_model.fit(select_x_train,
                        y_train,
                        eval_metric=["logloss", "rmse", "mae"],
                        eval_set=[(select_x_train, y_train),
                                  (select_x_test, y_test)],
                        early_stopping_rounds=20,
                        verbose=0)

    y_pred = selection_model.predict(select_x_test)

    score = r2_score(y_test, y_pred)
    # print("R2 : ", r2_score)

    results = selection_model.evals_result()

    print("Thresh=%.3f, n=%d, R2: %.2f%%" %
          (thresh, select_x_train.shape[1], score * 100.0))
'''
Thresh=0.002, n=13, R2: 85.98%
Thresh=0.004, n=12, R2: 85.98%
Thresh=0.008, n=11, R2: 85.99%
Thresh=0.009, n=10, R2: 85.67%
Thresh=0.009, n=9, R2: 85.79%
Thresh=0.013, n=8, R2: 85.70%
Thresh=0.016, n=7, R2: 86.26%
Thresh=0.032, n=6, R2: 82.53%
Example #59
def multiple_linear_regression():
    # Check if user is loggedin
    if ('loggedin' in session):

        # Init variables
        (data_to_html, X_col, Y_col, graph_title, 
         msg_suc, msg_err, msg_warn) = (None,) * 7

        # Init list
        columns, score_list = [], []  # independent lists ((list(),) * 2 would alias one list twice)

        # Get session details
        username = session['username']
        lang     = session['lang']

        # Define tag category + model
        cat_tag = 'REG'
        mod_tag = 'MLR'

        # Connect to database
        cursor = mysql.connection.cursor(MySQLdb.cursors.DictCursor)

        # Get categories of navbar
        navbar_cat      = datas_cat_nav(cursor, lang)
        navbar_cat_name = navbar_cat[0]
        navbar_cat_tag  = navbar_cat[1]
        navbar_cat_icon = navbar_cat[2]
        navbar_cat_link = navbar_cat[3]

        # Get models of navbar
        navbar_models = datas_mod_nav(cursor, lang, navbar_cat_tag)

        # Get settings of navbar
        navbar_settings = datas_set_nav(cursor, lang)
        navbar_set_name = navbar_settings[0]
        navbar_set_icon = navbar_settings[1]
        navbar_set_link = navbar_settings[2]

        # Get category details for breadcrumb
        cat_details   = cards_categories(cursor, lang, cat_tag)
        cat_name      = cat_details[0]
        cat_link      = cat_details[3]

        # Get model details for breadcrumb
        model_details = datas_model(cursor, lang, mod_tag)
        model_name    = model_details[0]
        model_link    = model_details[1]

        # Break connection
        cursor.close()

        if (request.method == 'POST'):

            # Upload file
            if (request.form['submit_btn'] == 'Upload Now'
                or request.form['submit_btn'] == 'Envoyer maintenant'):

                # All fields were completed
                if (bool(request.files['file']) == 1 
                    and bool(request.form['sep_select']) == 1
                ):
                    get_upload_datas = upload_file(lang, False)
                    msg_err          = get_upload_datas[0]
                    msg_suc          = get_upload_datas[1]
                    msg_warn         = get_upload_datas[2]

                    global new_tmp_path
                    new_tmp_path = get_upload_datas[3]

                    global colname_list
                    colname_list = get_upload_datas[4]
                    columns      = colname_list

                    data_to_html = get_upload_datas[5]

                    global df
                    df = get_upload_datas[6]

                else:
                    if (lang == 'en'):
                        # Submit without upload file
                        msg_err = (
                            'Please upload your data and select a separator.'
                        )
                    
                    else:
                        msg_err = (
                            'Veuillez télécharger vos données et '
                            'choisir un séparateur.'
                        )

            # Model compute
            if (request.form['submit_btn'] == 'Launch the model'
                or request.form['submit_btn'] == 'Lancer le modèle'):
                X_col = request.form['X_col']
                Y_col = request.form['Y_col']

                # Show the uploaded data as an HTML table
                data_to_html = df_html_show(df)

                # Get colname list
                columns = list(colname_list)  # copy, so the removals below do not mutate the global list

                # Delete feature from columns
                columns.remove(X_col)
                columns.remove(Y_col)

                for i in columns:
                    x_feat = df[[X_col, i]].values
                    y_targ = df[Y_col].values

                    # Train Test
                    X_train, X_test, y_train, y_test = train_test_split(
                        x_feat,
                        y_targ,
                        test_size=0.33,
                        random_state=42
                    )

                    # Create model and fit
                    regressor = LinearRegression().fit(X_train, y_train)

                    # Predicting test set results
                    y_test_pred = regressor.predict(X_test)

                    # Prediction
                    y_pred = regressor.predict(X_train)
                    y_pred = y_pred.tolist()

                    # Accuracy
                    r2_test  = r2_score(y_test , y_test_pred) * 100
                    r2_train = r2_score(y_train, y_pred) * 100

                    score_list.append(round(statistics.mean([r2_test, r2_train]), 2))

                columns = [X_col + ' + ' + c for c in columns]

                if (lang == 'en'):
                    # Add graph title
                    graph_title = (
                        'Comparison of the correlation between ' + Y_col + 
                        ' and the columns :'
                    )

                    # Success
                    msg_suc = (
                        'The model was successfully calculated. '
                        'Your data was automatically deleted.'
                    )

                else:
                    graph_title = (
                        'Comparaison de la corrélation entre ' + Y_col + 
                        ' et les colonnes :'
                    )

                    msg_suc = (
                        'Le modèle a été calculé avec succès.  '
                        'Vos données ont été automatiquement supprimées.'
                    )

                # Delete file
                file_remove(new_tmp_path)

        return render_template(
            'regression/mul_lin_reg.html',
            title        = model_name,
            username     = username,
            lang         = lang,
            nav_cat_name = navbar_cat_name,
            nav_cat_tag  = navbar_cat_tag,
            nav_cat_icon = navbar_cat_icon,
            nav_cat_lnk  = navbar_cat_link,
            nav_models   = navbar_models,
            nav_set_name = navbar_set_name,
            nav_set_icon = navbar_set_icon,
            nav_set_lnk  = navbar_set_link,
            cat_name     = cat_name,
            cat_tag      = cat_tag,
            cat_link     = cat_link,
            model_name   = model_name,
            model_link   = model_link,
            msg_err      = msg_err,
            msg_suc      = msg_suc,
            msg_warn     = msg_warn,
            data_show    = data_to_html,
            df_columns   = columns,
            X_col        = X_col,
            Y_col        = Y_col,
            score_list   = score_list,
            graph_title  = graph_title
        )

    else:
        return redirect('404')
Example #60
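# The notebook cells below use x_poly without showing its construction; a minimal
# sketch, assuming x is a 1-D feature array and a polynomial degree of 2 (both assumptions):
import math
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

x_poly = PolynomialFeatures(degree=2).fit_transform(np.asarray(x).reshape(-1, 1))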
model = LinearRegression()
model.fit(x_poly, y)
y_poly_pred = model.predict(x_poly)

# In[74]:

y_poly_pred[:20]

# In[75]:

math.sqrt(mean_squared_error(y, y_poly_pred))

# In[76]:

r2_score(y, y_poly_pred)

# In[49]:

##visualize(degree = 2)
plt.scatter(x, y)
plt.plot(x, y_poly_pred, color='m')
plt.show()

# In[77]:

##visualize(degree = 1)
plt.scatter(x, y)
plt.plot(x, y_poly_pred, color='m')
plt.show()