def train_BayesianRegressionModel(
    X,
    y,
    n_iter=300,
    tol=0.001,
    alpha_1=1e-06,
    alpha_2=1e-06,
    lambda_1=1e-06,
    lambda_2=1e-06,
    compute_score=False,
    fit_intercept=True,
    normalize=False,
    copy_X=True,
    verbose=False,
):
    """
    Train a Bayesian regression model
    """
    model = BayesianRidge(
        n_iter=n_iter,
        tol=tol,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_score,
        fit_intercept=fit_intercept,
        normalize=normalize,
        copy_X=copy_X,
        verbose=verbose,
    )
    model = model.fit(X, y)
    return model
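A minimal usage sketch (the synthetic data, shapes, and values below are assumptions for illustration; note that the normalize argument only exists in scikit-learn versions before 1.2):

import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 5)                       # 200 samples, 5 features
true_w = np.array([1.5, -2.0, 0.0, 0.5, 3.0])
y_demo = X_demo @ true_w + 0.1 * rng.randn(200)

demo_model = train_BayesianRegressionModel(X_demo, y_demo, compute_score=True)
print(demo_model.coef_)    # posterior mean of the weights
print(demo_model.alpha_)   # estimated noise precision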
def bayesian_ridge_regression(feature_array, label_array):
    clf = BayesianRidge(compute_score=True)
    clf.fit(feature_array, label_array)

    ols = LinearRegression()
    ols.fit(feature_array, label_array)


    n_features = 9

    plt.figure(figsize=(6, 5))
    plt.title("Weights of the model")
    plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
    # Note: label_array holds the targets, not the true weights; it is
    # plotted only as a rough reference on these axes.
    plt.plot(label_array, 'g-', label="Targets (reference)")
    plt.plot(ols.coef_, 'r--', label="OLS estimate")
    plt.xlabel("Features")
    plt.ylabel("Values of the weights")
    plt.legend(loc="best", prop=dict(size=12))

    plt.figure(figsize=(6, 5))
    plt.title("Histogram of the weights")
    plt.hist(clf.coef_, bins=n_features, log=True)
    # plt.plot(clf.coef_[feature_array], 5 * np.ones(len(feature_array)),
    #          'ro', label="Relevant features")
    plt.ylabel("Features")
    plt.xlabel("Values of the weights")
    plt.legend(loc="lower left")

    plt.figure(figsize=(6, 5))
    plt.title("Marginal log-likelihood")
    plt.plot(clf.scores_)
    plt.ylabel("Score")
    plt.xlabel("Iterations")
    plt.show()
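A quick way to exercise bayesian_ridge_regression is with synthetic data in the style of the scikit-learn Bayesian ridge example; the sample size and noise level below are assumptions.

import numpy as np

rng = np.random.RandomState(42)
n_samples, n_feats = 100, 9            # 9 features, matching the histogram bins above
true_w = rng.randn(n_feats)
X_demo = rng.randn(n_samples, n_feats)
y_demo = X_demo @ true_w + 0.5 * rng.randn(n_samples)

bayesian_ridge_regression(X_demo, y_demo)   # draws the three figures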
Example #3
 def bayes_ridge_reg(self):
     br = BayesianRidge()
     br.fit(self.x_data, self.y_data)
     adjusted_result = br.predict(self.x_data)
     print("bayes ridge params", br.coef_, br.intercept_)
     print("bayes ridge accuracy", get_accuracy(adjusted_result, self.y_data))
     return [int(v) for v in adjusted_result]
def ridreg(df, test):
    clf = BayesianRidge()

    target = df['count']
    train  = df[['time', 'temp']]
    test   = test[['time', 'temp']]

    clf.fit(train, target)

    # Convert each test row to a list of floats (one entry per row).
    final = []
    print(test.head(3))
    for i, row in enumerate(test.values):
        y = []
        for x in row:
            y.append(float(x))
        final.append(y)
    predicted_probs = clf.predict(final)
    # print(predicted_probs.shape)
    # predicted_probs = pd.Series(predicted_probs)
    # predicted_probs = predicted_probs.map(lambda x: int(x))

    # Test datetimes, intended as the submission index (not joined here).
    keep = pd.read_csv('data/test.csv')
    keep = keep['datetime']
    # save to file
    predicted_probs = pd.DataFrame(predicted_probs)
    print(predicted_probs.head(3))
    predicted_probs.to_csv('data/submission3.csv', index=False)
Example #5
def bayesRegr(source, target):
    # Fit a Bayesian ridge regression on all columns but the last (the target)
    clf = BayesianRidge()
    features = source.columns[:-1]
    klass = source[source.columns[-1]]
    clf.fit(source[features], klass)
    preds = clf.predict(target[target.columns[:-1]])
    return preds
Example #6
def br_modeling(data, y_name, candidates_location):
    from sklearn.linear_model import BayesianRidge
    temp = data.copy()
    candidates = get_variables("./%s" % candidates_location)
    temp = rf_trim(temp, y_name, candidates)
    model = BayesianRidge()
    res = model.fit(temp[candidates], temp[y_name])
    joblib.dump(res, "./%sbr_model%s.pkl" % (y_name, datetime.datetime.today()))
    return res
Example #7
    def fit_model_10(self,toWrite=False):
        model = BayesianRidge(n_iter=5000)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 10 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            # Open in binary mode for pickle; saves the model fitted on the last CV fold.
            f2 = open('model10/model.pkl', 'wb')
            pickle.dump(model, f2)
            f2.close()
def br_modeling(data,y_name,candidates_location):
 from sklearn.linear_model import BayesianRidge
 temp=data.copy()
 print("made temp copy")
 candidates=get_variables("./%s"%candidates_location)
 print("got candidates for regressors")
 temp=rf_trim(temp,y_name,candidates)
 print("trimmed dataset")
 model=BayesianRidge()
 print("assigned model")
 res=model.fit(temp[candidates],temp[y_name])
 print("fit model")
 joblib.dump(res,"./%sbr_model%s.pkl"%(y_name,datetime.datetime.today()))
 print("saved model")
 return res
Example #9
def fit_polynomial_bayesian_skl(X, Y, degree,
                                lambda_shape=1.e-6, lambda_invscale=1.e-6,
                                padding=10, n=100,
                                X_unknown=None):
    X_v = pol.polyvander(X, degree)

    clf = BayesianRidge(lambda_1=lambda_shape, lambda_2=lambda_invscale)
    clf.fit(X_v, Y)

    coeff = np.copy(clf.coef_)

    # BayesianRidge fits its own intercept, but the Vandermonde matrix
    # already contains a constant column of ones, so fold the intercept
    # into the first (constant) coefficient.
    coeff[0] += clf.intercept_

    ret_ = [coeff]

    # generate the line
    x = np.linspace(X.min()-padding, X.max()+padding, n)
    x_v = pol.polyvander(x, degree)

    # using the provided predict method
    y_1 = clf.predict(x_v)

    # using np.dot() with coeff
    y_2 = np.dot(x_v, coeff)

    ret_.append(((x, y_1), (x, y_2)))

    if X_unknown is not None:
        xu_v = pol.polyvander(X_unknown, degree)

        # using the predict method
        yu_1 = clf.predict(xu_v)

        # using np.dot() with coeff
        yu_2 = np.dot(xu_v, coeff)

        ret_.append(((X_unknown, yu_1), (X_unknown, yu_2)))

    return ret_
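A sketch of calling fit_polynomial_bayesian_skl on noisy cubic data (the data below is an assumption for illustration):

import numpy as np

rng = np.random.RandomState(0)
X_demo = np.sort(rng.uniform(-5, 5, 40))
Y_demo = 0.5 * X_demo**3 - X_demo + rng.normal(0, 3, size=X_demo.shape)

result = fit_polynomial_bayesian_skl(X_demo, Y_demo, degree=3, padding=2, n=200)
coeff = result[0]                      # intercept folded into the constant term
(x_line, y_line), _ = result[1]        # predicted curve over a padded grid
print("estimated polynomial coefficients:", coeff)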
Example #10
def train_classiifer(X_train, y_train, to_tune, classifier):
    # Initialize the regressor. The SVR assignment below overrides the
    # BayesianRidge default; uncomment an alternative to swap it in.
    clf = BayesianRidge()
    clf = SVR(kernel='rbf', C=1e3, gamma=0.1)
    #clf = RandomForestRegressor()
    if classifier:
        clf = classifier
        to_tune = False
    if to_tune:
        # Randomized search: find good regressor parameters.
        # (The BayesianRidge grid is immediately shadowed by the SVR grid.)
        param_grid = {'alpha_1': sp_rand(), 'alpha_2': sp_rand()}
        param_grid = {'C': sp_rand(), 'gamma': sp_rand()}
        rsearch = RandomizedSearchCV(estimator=clf, 
                                     param_distributions=param_grid, n_iter=5000)
        rsearch.fit(X_train, y_train)
        # Use tuned classifier.
        clf = rsearch.best_estimator_
          
    # Train the regressor on the full training set.
    clf.fit(X_train, y_train)
    return clf
Example #11
def build_bayesian_rr(x_train, y_train, x_test, y_test, n_features):
    """
    Constructing a Bayesian ridge regression model from input dataframe
    :param x_train: features dataframe for model training
    :param y_train: target dataframe for model training
    :param x_test: features dataframe for model testing
    :param y_test: target dataframe for model testing
    :return: None
    """
    clf = BayesianRidge()
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    # Mean absolute error regression loss
    mean_abs = sklearn.metrics.mean_absolute_error(y_test, y_pred)
    # Mean squared error regression loss
    mean_sq = sklearn.metrics.mean_squared_error(y_test, y_pred)
    # Median absolute error regression loss
    median_abs = sklearn.metrics.median_absolute_error(y_test, y_pred)
    # R^2 (coefficient of determination) regression score function
    r2 = sklearn.metrics.r2_score(y_test, y_pred)
    # Explained variance regression score function
    exp_var_score = sklearn.metrics.explained_variance_score(y_test, y_pred)
    # Estimated noise precision (alpha_) of the fitted Bayesian ridge model
    ridge_alpha = clf.alpha_

    with open('../trained_networks/brr_%d_data.pkl' % n_features, 'wb') as results:
        pickle.dump(clf, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(mean_sq, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(median_abs, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(r2, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(exp_var_score, results, pickle.HIGHEST_PROTOCOL)
        pickle.dump(y_pred, results, pickle.HIGHEST_PROTOCOL)

    return
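Because build_bayesian_rr pickles several objects into one file sequentially, they must be read back in the same order they were dumped. A hedged loading sketch (the n_features value is an assumption):

import pickle

n_features = 10   # assumption: whatever value was used when the file was written
with open('../trained_networks/brr_%d_data.pkl' % n_features, 'rb') as results:
    clf = pickle.load(results)
    mean_abs = pickle.load(results)
    mean_sq = pickle.load(results)
    median_abs = pickle.load(results)
    r2 = pickle.load(results)
    exp_var_score = pickle.load(results)
    y_pred = pickle.load(results)
print('R^2:', r2)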
Example #12
# Lasso
lassoCV_model = LassoCV(alphas = alphas)
lassoCV_model.fit(X_train, y_train);

We can then see which values of `alpha` performed best with the following.

print('Ridge alpha:', ridgeCV.alpha_)
print('Lasso alpha:', lassoCV_model.alpha_)

## Bayesian Regression

We can also fit Bayesian regression using `scikit-learn` (though another popular package is `pymc3`). A very straightforward implementation is provided below. 

from sklearn.linear_model import BayesianRidge
bayes_model = BayesianRidge()
bayes_model.fit(X_train, y_train);

This is not, however, identical to our construction in the previous section since it infers the $\sigma^2$ and $\tau$ parameters, rather than taking those as fixed inputs. More information can be found [here](https://scikit-learn.org/stable/modules/linear_model.html#bayesian-regression). The hidden chunk below demonstrates a hacky solution for running Bayesian regression in `scikit-learn` using known values for $\sigma^2$ and $\tau$, though it is hard to imagine a practical reason to do so.

````{toggle}
By default, Bayesian regression in `scikit-learn` treats $\alpha = \frac{1}{\sigma^2}$ and $\lambda = \frac{1}{\tau}$ as random variables and assigns them the following prior distributions

$$
\begin{aligned}
\alpha &\sim \text{Gamma}(\alpha_1, \alpha_2) 
\\
\lambda &\sim \text{Gamma}(\lambda_1, \lambda_2).
\end{aligned}
$$
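One hedged way to approximately impose fixed values for $\sigma^2$ and $\tau$ (an illustrative sketch, not the book's hidden chunk) is to make these Gamma priors nearly degenerate: a $\text{Gamma}(c\,\alpha_0,\, c)$ prior has mean $\alpha_0$ and variance $\alpha_0/c$, so a large $c$ pins $\alpha$ close to $\alpha_0$, and likewise for $\lambda$.

# Approximately fix alpha = 1/sigma^2 and lambda = 1/tau by concentrating the priors.
# (Sketch only; the sigma_sq and tau values here are assumptions.)
from sklearn.linear_model import BayesianRidge

sigma_sq, tau = 0.5, 2.0                 # "known" values we want to impose
alpha_0, lambda_0 = 1 / sigma_sq, 1 / tau
c = 1e8                                  # large concentration constant

fixed_bayes_model = BayesianRidge(
    alpha_1=c * alpha_0, alpha_2=c,      # Gamma(shape, rate) prior on alpha
    lambda_1=c * lambda_0, lambda_2=c,   # Gamma(shape, rate) prior on lambda
)
fixed_bayes_model.fit(X_train, y_train)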
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score

N_SPLITS = 5

rng = np.random.RandomState(0)

X_full, y_full = fetch_california_housing(return_X_y=True)
# ~2k samples is enough for the purpose of the example.
# Remove the following two lines for a slower run with different error bars.
X_full = X_full[::10]
y_full = y_full[::10]
n_samples, n_features = X_full.shape

# Estimate the score on the entire dataset, with no missing values
br_estimator = BayesianRidge()
score_full_data = pd.DataFrame(cross_val_score(
    br_estimator,
    X_full,
    y_full,
    scoring='neg_mean_squared_error',
    cv=N_SPLITS),
                               columns=['Full Data'])

# Add a single missing value to each row
X_missing = X_full.copy()
y_missing = y_full
missing_samples = np.arange(n_samples)
missing_features = rng.choice(n_features, n_samples, replace=True)
X_missing[missing_samples, missing_features] = np.nan
Example #14
# Create weights with a precision lambda_ of 4.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot true weights, estimated weights and histogram of the weights
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
plt.plot(w, 'g-', label="Ground truth")
plt.plot(ols.coef_, 'r--', label="OLS estimate")
plt.xlabel("Features")
plt.ylabel("Values of the weights")
plt.legend(loc="best", prop=dict(size=12))
plt.title('Polynomial Regression: Predicted Coronavirus Death Cases Over Time in India', size=30)
plt.xlabel('Days Since 1/22/2020', size=20)
plt.ylabel('No. of Cases (in Lakhs)', size=20)
plt.legend(['Death Cases', 'Polynomial Regression Predictions'])
plt.xticks(size=15)
plt.show()

pol_ind_deaths_days = pol_ind_deaths_days.reshape(1,-1)[0]
df_ind_deaths_poly_predict = pd.DataFrame({'Date': prediction_dates[-(days_in_future):], 'Polynomial Regression Predicted # of Death Cases India': np.round(pol_ind_deaths_days[-(days_in_future):])})
df_ind_deaths_poly_predict

#Bayesian Ridge

#To get Best Parameters

reg_world_deaths=BayesianRidge()
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
print(reg_world_deaths.get_params())

#World

reg_world_deaths=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, alpha_init=None,
              compute_score=False, copy_X=True, fit_intercept=True,
              lambda_1=1e-06, lambda_2=1e-06, lambda_init=None, n_iter=300,
              normalize=False, tol=0.001, verbose=False)
reg_world_deaths.fit(xtrain_world_deaths,ytrain_world_deaths)
reg_world_deaths_test = reg_world_deaths.predict(xtest_world_deaths)
reg_world_deaths_predict_days = reg_world_deaths.predict(prediction_days)
print('MAE:', metrics.mean_absolute_error(ytest_world_deaths, reg_world_deaths_test))
print('MSE:', metrics.mean_squared_error(ytest_world_deaths, reg_world_deaths_test))
print('R2 :', metrics.r2_score(ytest_world_deaths, reg_world_deaths_test))
Example #16
def main():
    usage = 'usage: %prog [options] <model_file>'
    parser = OptionParser(usage)
    parser.add_option('-c', dest='center_dist', default=10, type='int', help='Distance between the motifs and sequence center [Default: %default]')
    parser.add_option('-d', dest='model_hdf5_file', default=None, help='Pre-computed model output as HDF5 [Default: %default]')
    parser.add_option('-g', dest='cuda', default=False, action='store_true', help='Run on the GPGPU [Default: %default]')
    parser.add_option('-l', dest='seq_length', default=600, type='int', help='Sequence length [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='heat', help='Output directory [Default: %default]')
    parser.add_option('-t', dest='targets', default='0', help='Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]')
    (options,args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide Basset model file')
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(',')]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ''
    if options.cuda:
        cuda_str = '-cuda'

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    num_filters = len(filter_consensus)
    # num_filters = 40
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length // 2 - options.center_dist - filter_len
    right_i = options.seq_length // 2 + options.center_dist

    ns_1hot = np.zeros((4,options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:,left_i:left_i+filter_len] = filter_consensus[i]
            motifs_seq[:,right_i:right_i+filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0],4,1,options.seq_length))


    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = '%s/motif_seqs.h5' % options.out_dir
    h5f = h5py.File(seqs_file, 'w')
    h5f.create_dataset('test_in', data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = '%s/motif_seqs_scores.h5' % options.out_dir
    torch_cmd = 'th basset_place2_predict.lua %s %s %s %s' % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, 'r')
    motif_seq_scores = np.array(hdf5_in['scores'])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0],2*num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi,i] += 1
                X[xi,num_filters+j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:,ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print(model.score(X, motif_seq_scores[:, ti]))

        # print filter coefficients
        coef_out = open('%s/coefs_t%d.txt' % (options.out_dir,ti), 'w')
        for i in range(num_filters):
            print('%3d  %6.2f' % (i, model.coef_[i]), file=coef_out)
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters,num_filters))
        table_out = open('%s/table_t%d.txt' % (options.out_dir,ti), 'w')

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i,j] = motif_seq_scores[si,ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si,ti], motif_seq_preds[si], filter_interaction[i,j])
                print('%3d  %3d  %6.3f  %6.3f  %6.3f' % cols, file=table_out)
                si += 1

        table_out.close()

        scores_abs = abs(filter_interaction.flatten())
        # `stats` is assumed to provide a quantile() helper (e.g. the repo's own stats module).
        max_score = stats.quantile(scores_abs, .999)
        print('Limiting scores to +-%f' % max_score)
        filter_interaction_max = np.zeros((num_filters, num_filters))
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction_max[i,j] = np.min([filter_interaction[i,j], max_score])
                filter_interaction_max[i,j] = np.max([filter_interaction_max[i,j], -max_score])

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction_max, xticklabels=False, yticklabels=False)
        plt.savefig('%s/heat_t%d.pdf' % (options.out_dir,ti))
c = clean - np.min(clean)
c /= c.max()
c = c.astype(bool)
io.imsave("/Users/qcaudron/Desktop/charo/2_smoothed.jpg", ski.img_as_uint(surf))

# <codecell>

z1 = np.mean(surf, axis=0)
z2 = np.mean(surf, axis=1)

#for i in range(surf.shape[1]) : 
#    plt.plot(surf[:, i], "k")
#plt.plot(z2)
r = [BayesianRidge().fit(np.vander(np.arange(surf.shape[i]), 2), np.mean(surf, axis = 1-i)) for i in [0, 1]]
r1 = BayesianRidge().fit(np.arange(len(z1)).reshape(len(z1),1), z1)
r2 = BayesianRidge().fit(np.arange(len(z2[500:-500])).reshape(len(z2[500:-500]),1), z2[500:-500])

#plt.plot(r1.predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=5)
plt.plot(r2.predict(np.arange(len(z2)).reshape(len(z2),1)), linewidth=5)
plt.plot(z2, linewidth=5)
#plt.axhline(b[np.argmax(h)], c="r", linewidth=3)
#plt.plot(r[0].predict(np.vander(np.arange(surf.shape[0]), 2)), linewidth=3)

#plt.plot(r[0].predict(np.arange(len(z1)).reshape(len(z1),1)), linewidth=3)
#plt.plot(r[0].predict(np.expand_dims(np.arange(surf.shape[0]), axis=1)), linewidth=5)
#plt.axhline(np.mean(z1 / r1.predict(np.arange(len(z1)).reshape(len(z1),1))))

# <codecell>

lz = np.log(z2)
r3 = BayesianRidge().fit(np.arange(len(lz[500:-500])).reshape(len(lz[500:-500]),1), lz[500:-500])
xt = x

#print len(_x), len(x), len(y)

# Linear Regression
print('linear')
lr = LinearRegression()
#lr.fit(x[:, np.newaxis], y)
#lr_sts_scores = lr.predict(xt[:, np.newaxis])
lr.fit(x, y)
lr_sts_scores = lr.predict(xt)


# Bayesian Ridge Regression
print('bayesian ridge')
br = BayesianRidge(compute_score=True)
#br.fit(x[:, np.newaxis], y)
#br_sts_scores = br.predict(xt[:, np.newaxis])
br.fit(x, y)
br_sts_scores = br.predict(xt)


# Elastic Net
print('elastic net')
enr = ElasticNet()
#enr.fit(x[:, np.newaxis], y)
#enr_sts_scores = enr.predict(xt[:, np.newaxis])
enr.fit(x, y)
enr_sts_scores = enr.predict(xt)

Example #19
    print('\nmethod = ', method)

    if (method == 1):
        print('Multilayer perceptron (MLP) neural network 01')
        str_method = 'MLP model01'
        r = MLPRegressor(hidden_layer_sizes=(4, ), max_iter=40)

    if (method == 2):
        print('Multilayer perceptron (MLP) neural network 02')
        str_method = 'MLP model02'
        r = MLPRegressor(hidden_layer_sizes=(5, ), max_iter=30)

    if (method == 3):
        print('Bayesian Ridge')
        str_method = 'BayesianRidge'
        r = BayesianRidge(compute_score=True)

    # class sklearn.ensemble.BaggingRegressor(base_estimator=None, n_estimators=10,
    #    max_samples=1.0, max_features=1.0, bootstrap=True, bootstrap_features=False,
    # oob_score=False, warm_start=False, n_jobs=1, random_state=None, verbose=0)

    if (method == 4):
        print('Bagging Regressor 01')
        str_method = 'BaggingRegressor01'
        r = BaggingRegressor(
            DecisionTreeRegressor(max_depth=6, max_features=0.75))

    if (method == 5):
        print('GradientBoosting 01')
        str_method = 'GradientBoosting01'
        r = GradientBoostingRegressor(n_estimators=95,
Example #20
                        zeroThreshold=1e-5)
    else:
        pipeline.verify(auto_X.sample(frac=0.05, random_state=13))
    customize(regressor, **kwargs)
    store_pkl(pipeline, name + ".pkl")
    mpg = DataFrame(pipeline.predict(auto_X), columns=["mpg"])
    store_csv(mpg, name + ".csv")


build_auto(
    AdaBoostRegressor(DecisionTreeRegressor(random_state=13,
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=17), "AdaBoostAuto")
build_auto(ARDRegression(normalize=True), "BayesianARDAuto")
build_auto(BayesianRidge(normalize=True), "BayesianRidgeAuto")
build_auto(DecisionTreeRegressor(random_state=13, min_samples_leaf=2),
           "DecisionTreeAuto",
           compact=False)
build_auto(
    BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                           min_samples_leaf=5),
                     random_state=13,
                     n_estimators=3,
                     max_features=0.5), "DecisionTreeEnsembleAuto")
build_auto(DummyRegressor(strategy="median"), "DummyAuto")
build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
           "ExtraTreesAuto")
build_auto(GradientBoostingRegressor(random_state=13, init=None),
           "GradientBoostingAuto")
Example #21
def main(train, test):
    test_ori = test
    test.drop(['B3', 'B13', 'A13', 'A18'], axis=1, inplace=True)
    good_cols = list(train.columns)
    for col in train.columns:
        rate = train[col].value_counts(normalize=True, dropna=False).values[0]
        if col not in ['A23']:
            if rate > 0.9:
                good_cols.remove(col)
                print(col, rate)
    good_cols.append('A1')
    good_cols.append('A3')
    good_cols.append('A4')
    train = train[train['收率'] > 0.87]
    train = train[train['收率'] <= 1]
    train = train[train['B14'] >= 350]
    train = train[train['B14'] <= 460]
    train = train[good_cols]
    good_cols.remove('收率')
    test = test[good_cols]
    target = train['收率']
    del train['收率']
    data = pd.concat([train, test], axis=0, ignore_index=True)
    data.loc[data['A25'] == '1900/3/10 0:00', 'A25'] = 70
    for f in data.columns:
        if f != '样本id':
            if f in [
                    'A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5',
                    'B7', 'A20', 'A28', 'B4', 'B9', 'B10', 'B11'
            ]:
                data[f] = data[f].fillna(0)
            else:
                counts = stats.mode(data[f].astype(float))[0][0]
                data[f] = data[f].fillna(counts)

    for f in ['A5', 'A7', 'A9', 'A11', 'A14', 'A16', 'A24', 'A26', 'B5', 'B7']:
        try:
            data[f] = data[f].apply(timeTranSecond)
        except:
            print(f, 'should already have been dropped earlier!')

    for f in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:
        data[f] = data.apply(lambda df: getDuration(df[f]), axis=1)
    for f in ['A20', 'A28', 'B4', 'B9', 'B10', 'B11']:
        data.loc[data[f] == 0, f] = stats.mode(data[f].astype(float))[0][0]
    # data['样本id'] = data['样本id'].apply(lambda x: int(x.split('_')[1]))
    data.drop(['样本id'], axis=1, inplace=True)
    categorical_columns = [f for f in data.columns if f not in ['样本id']]
    numerical_columns = [
        f for f in data.columns if f not in categorical_columns
    ]
    data['A25'] = pd.DataFrame(data['A25'], dtype=float)
    data['b14/a1_a3_a4_a19_b1_b12'] = data['B14'] / (data['A1'] + data['A3'] +
                                                     data['A4'] + data['A19'] +
                                                     data['B1'] + data['B12'])
    data['A1_A3_A4/a1_a3_a4_a19_b1_b12'] = (
        data['A1'] + data['A3'] + data['A4']) / (data['A1'] + data['A3'] +
                                                 data['A4'] + data['A19'] +
                                                 data['B1'] + data['B12'])
    data['B10_B11'] = (data['B12']) / (data['B10'] + data['B11'])
    data['A11_A5'] = data['A11'] - data['A5']
    for f in range(len(data['A11_A5'])):
        if data['A11_A5'][f] < 0:
            data['A11_A5'][f] = data['A11_A5'][f] + 24
    data['A16_A11'] = data['A16'] - data['A11']
    for f in range(len(data['A16_A11'])):
        if data['A16_A11'][f] < 0:
            data['A16_A11'][f] = data['A16_A11'][f] + 24
    data['A26_A24'] = (data['A26'] - data['A24'])
    for f in range(len(data['A26_A24'])):
        if data['A26_A24'][f] < 0:
            data['A26_A24'][f] = data['A26_A24'][f] + 24
    data['A26_A24_A28'] = data['A26_A24'] / data['A28']
    data['A21_A22_shijian'] = (data['A21'] + data['A22']) / data['A26_A24']
    data['B7_B5'] = (data['B7'] - data['B5'])
    for f in range(len(data['B7_B5'])):
        if data['B7_B5'][f] < 0:
            data['B7_B5'][f] = data['B7_B5'][f] + 24
    data['B14/B7_B5'] = data['B14'] / data['B7_B5']
    # data['B11*B14'] = data['B11'] * data['B14']
    # numerical_columns.append('B11*B14')
    numerical_columns.append('b14/a1_a3_a4_a19_b1_b12')
    numerical_columns.append('A21_A22_shijian')
    for l in [
            'A1', 'A3', 'A4', 'A7', 'A5', 'A11', 'A9', 'A14', 'A16', 'A21',
            'A22', 'A20', 'A26', 'A24', 'A28', 'A23', 'B8', 'B6', 'A17'
    ]:
        data.drop([l], axis=1, inplace=True)
    categorical_columns.append('B14/B7_B5')
    categorical_columns.append('A26_A24_A28')
    categorical_columns.append('A16_A11')
    categorical_columns.append('B10_B11')
    categorical_columns.append('A11_A5')
    for l in [
            'A1', 'A3', 'A4', 'A7', 'A5', 'A11', 'A9', 'A14', 'A16', 'A21',
            'A22', 'A20', 'A26', 'A24', 'A28', 'A23', 'B8', 'B6', 'A17'
    ]:
        categorical_columns.remove(l)
    for f in categorical_columns:
        data[f] = data[f].map(
            dict(zip(data[f].unique(), range(0, data[f].nunique()))))
    train = data[:train.shape[0]]
    test = data[train.shape[0]:]
    print(train.shape)
    print(test.shape)
    train['target'] = target
    train['intTarget'] = pd.cut(train['target'], 5, labels=False)
    train = pd.get_dummies(train, columns=['intTarget'])
    li = [
        'intTarget_0.0', 'intTarget_1.0', 'intTarget_2.0', 'intTarget_3.0',
        'intTarget_4.0'
    ]
    mean_columns = []
    for f1 in categorical_columns:
        cate_rate = train[f1].value_counts(normalize=True,
                                           dropna=False).values[0]
        if cate_rate < 0.90:
            for f2 in li:
                col_name = 'B14_to_' + f1 + "_" + f2 + '_mean'
                mean_columns.append(col_name)
                order_label = train.groupby([f1])[f2].mean()
                train[col_name] = train['B14'].map(order_label)
                miss_rate = train[col_name].isnull().sum(
                ) * 100 / train[col_name].shape[0]
                if miss_rate > 0:
                    train = train.drop([col_name], axis=1)
                    mean_columns.remove(col_name)
                else:
                    test[col_name] = test['B14'].map(order_label)
    train.drop(li + ['target'], axis=1, inplace=True)
    print(train.shape)
    print(test.shape)
    X_train = train[mean_columns + numerical_columns].values
    X_test = test[mean_columns + numerical_columns].values
    enc = OneHotEncoder()
    for f in categorical_columns:
        enc.fit(data[f].values.reshape(-1, 1))
        X_train = sparse.hstack(
            (X_train, enc.transform(train[f].values.reshape(-1, 1))), 'csr')
        X_test = sparse.hstack(
            (X_test, enc.transform(test[f].values.reshape(-1, 1))), 'csr')
    print(X_train.shape)
    print(X_test.shape)
    y_train = target.values
    param = {
        'num_leaves': 120,
        'min_data_in_leaf': 30,
        'objective': 'regression',
        'max_depth': -1,
        'learning_rate': 0.05,
        "min_child_samples": 30,
        "boosting": "gbdt",
        "feature_fraction": 0.9,
        "bagging_freq": 1,
        "bagging_fraction": 0.9,
        "bagging_seed": 11,
        "metric": 'mse',
        "lambda_l1": 0.1,
        "verbosity": -1
    }
    folds = KFold(n_splits=8, shuffle=True, random_state=2018)
    oof_lgb = np.zeros(len(train))
    predictions_lgb = np.zeros(len(test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
        val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])
        num_round = 10000
        clf = lgb.train(param,
                        trn_data,
                        num_round,
                        valid_sets=[trn_data, val_data],
                        verbose_eval=200,
                        early_stopping_rounds=100)
        oof_lgb[val_idx] = clf.predict(X_train[val_idx],
                                       num_iteration=clf.best_iteration)

        predictions_lgb += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_lgb, target)))
    xgb_params = {
        'eta': 0.005,
        'max_depth': 10,
        'subsample': 0.8,
        'colsample_bytree': 0.8,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': True,
        'nthread': 4
    }

    folds = KFold(n_splits=5, shuffle=True, random_state=2018)
    oof_xgb = np.zeros(len(train))
    predictions_xgb = np.zeros(len(test))

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print("fold n°{}".format(fold_ + 1))
        trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
        val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

        watchlist = [(trn_data, 'train'), (val_data, 'valid_data')]
        clf = xgb.train(dtrain=trn_data,
                        num_boost_round=20000,
                        evals=watchlist,
                        early_stopping_rounds=200,
                        verbose_eval=100,
                        params=xgb_params)
        oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]),
                                       ntree_limit=clf.best_ntree_limit)
        predictions_xgb += clf.predict(
            xgb.DMatrix(X_test),
            ntree_limit=clf.best_ntree_limit) / folds.n_splits

    print("CV score: {:<8.8f}".format(mean_squared_error(oof_xgb, target)))
    # # stacking
    train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
    test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

    folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
    oof_stack = np.zeros(train_stack.shape[0])
    predictions = np.zeros(test_stack.shape[0])

    for fold_, (trn_idx,
                val_idx) in enumerate(folds_stack.split(train_stack, y_train)):
        print("fold {}".format(fold_))
        trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
        val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

        clf_3 = BayesianRidge()
        clf_3.fit(trn_data, trn_y)

        oof_stack[val_idx] = clf_3.predict(val_data)
        predictions += clf_3.predict(test_stack) / 10
    print("CV score: {:<8.8f}".format(
        mean_squared_error(target.values, oof_stack)))
    # print("CV score: {:<8.8f}".format(mean_squared_error(ansc['收率'], predictions_xgb)))
    # print(predictions_xgb)
    sub_df = pd.DataFrame({'a': test_ori['样本id'], 'b': predictions})
    # sub_df['b']=  sub_df['b'].apply(lambda x: round(x, 3))
    return sub_df
Example #22
    def stack_model(self, prediction_list_name,
                    method = "BayesianRidge",
                    split_method = "kFold",
                    n_splits = 5,
                    random_state = 4520):
        target, test_df= self.get_train_target()

        if len(prediction_list_name) == 0:
            print("no prediction result ...")
            return
        else:
            oof_list = []
            prediction_list = []

            for name in prediction_list_name:

                pred_path = os.path.join(self.submission_dir, name)
                oof_path = os.path.join(self.submission_dir+'/oof', 'oof_'+name)
                if not os.path.isfile(pred_path):
                    print("{} is not a prediction result path".format(pred_path))
                elif not os.path.isfile(oof_path):
                    print("{} is not a oof result path".format(oof_path))
                else:
                    oof = pd.read_csv(oof_path)
                    prediction = pd.read_csv(pred_path)
                    prediction_list.append(prediction['target'].values)
                    oof_list.append(oof['target'].values)

            train_stack = np.vstack(oof_list).transpose()
            test_stack = np.vstack(prediction_list).transpose()

            if split_method == 'kFold':
                # shuffle is required when a random_state is given (newer scikit-learn)
                kfold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                iterator = enumerate(kfold.split(train_stack))

            elif split_method == 'StratifiedKFold':
                kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
                iterator = enumerate(kfold.split(train_stack, target.values))

            oof_stack = np.zeros(train_stack.shape[0])
            predictions_stack = np.zeros(test_stack.shape[0])

            for fold_, (trn_idx, val_idx) in iterator:
                print("fold n°{}".format(fold_))
                trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
                val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

                print("-" * 10 + "Stacking " + str(fold_) + "-" * 10)
                #     cb_model = CatBoostRegressor(iterations=3000, learning_rate=0.1, depth=8, l2_leaf_reg=20, bootstrap_type='Bernoulli',  eval_metric='RMSE', metric_period=50, od_type='Iter', od_wait=45, random_seed=17, allow_writing_files=False)
                #     cb_model.fit(trn_data, trn_y, eval_set=(val_data, val_y), cat_features=[], use_best_model=True, verbose=True)
                if method == 'BayesianRidge':
                    clf = BayesianRidge()
                    clf.fit(trn_data, trn_y)

                oof_stack[val_idx] = clf.predict(val_data)
                predictions_stack += clf.predict(test_stack) / n_splits

            print("cv score : ",np.sqrt(mean_squared_error(target.values, oof_stack)))
            print('save stacked oof file and prediction file ...')
            oof_file_name = '_'.join(prediction_list_name).strip()
            oof_file_name = 'oof_merge_'+oof_file_name
            pred_file_name = '_'.join(prediction_list_name).strip()
            pred_file_name = 'merge_'+pred_file_name

            stack_result = pd.DataFrame({'card_id':test_df['card_id']})
            stack_result['target'] = predictions_stack
            stack_result.to_csv(os.path.join(self.submission_dir,pred_file_name), index=False)

            oof_stack = pd.DataFrame({'target': oof_stack})
            oof_stack.to_csv(os.path.join(self.submission_dir + '/oof', 'oof_' + oof_file_name), index=False)
            print('stacked oof and prediction file save successfully ...')
Example #23
def test3():
    name = request.form["name"]
    target = request.form["target"]
    test_size = request.form["test_size"]
    dataset = request.files["dataset"]
    df = pd.read_csv(dataset)

    #directory making
    rootdirectory = name
    parent_dir = "/home/sanfer/Documents/ml-examples-vuejs-flask/web-app/src/assets/"
    path = os.path.join(parent_dir, rootdirectory)
    working = path  #working path
    os.mkdir(path)

    plotdirectory = "plots"
    plot_parent_dir = parent_dir + rootdirectory + '/'
    path = os.path.join(plot_parent_dir, plotdirectory)
    plots_dir = path  #plot path
    os.mkdir(path)

    modeldirectory = "models"
    model_parent_dir = parent_dir + rootdirectory + "/"
    path = os.path.join(model_parent_dir, modeldirectory)
    model_dir = path  #model path
    os.mkdir(path)

    # pre-processing plots

    snsdist = sns.distplot(df[target])
    snsdist = snsdist.get_figure()
    snsdist.savefig(plots_dir + "/dist.png")
    snsdist.clf()

    features = {}

    dataTypes = df.dtypes
    for items in dataTypes.items():
        # print(items)
        # print((items[1].name))
        if (items[1].name != 'float64' and items[1].name != 'int64'):
            df.drop(labels=items[0], axis=1, inplace=True)
        else:
            features.update({items[0]: items[1].name})

    del features[target]
    features = json.dumps(features)
    y = df[target]
    df.drop(labels=target, axis=1, inplace=True)
    df = df.replace(0, np.NaN)
    df.fillna(df.mean(), inplace=True)
    X = df[list(df.columns)]

    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=float(test_size), random_state=101)
    from sklearn.linear_model import BayesianRidge
    lm = BayesianRidge()
    lm.fit(X_train, y_train)

    print("Linear model intercept")
    print(lm.intercept_)
    coeff_df = pd.DataFrame(lm.coef_, X.columns, columns=['Coefficient'])
    print(coeff_df)

    predictions = lm.predict(X_test)
    # plt.figure()
    plt.scatter(y_test, predictions)
    plt.savefig(plots_dir + "/scatter.png")
    plt.clf()

    sn = sns.distplot((y_test - predictions), bins=50)
    sn = sn.get_figure()
    sn.savefig(plots_dir + "/residual.png")
    sn.clf()

    # plt.show()
    # cv2.waitKey(0)
    # sns.distplot((y_test-predictions),bins=50);
    from sklearn import metrics
    print('MAE:', metrics.mean_absolute_error(y_test, predictions))
    print('MSE:', metrics.mean_squared_error(y_test, predictions))
    print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, predictions)))

    print(test_size)
    print(name)
    print(features)
    pkl_filename = model_dir + "/" + name + ".pkl"
    with open(pkl_filename, 'wb') as file:
        pickle.dump(lm, file)

    #metrics to return
    r_square = lm.score(X, y)
    MAE = metrics.mean_absolute_error(y_test, predictions)
    MSE = metrics.mean_squared_error(y_test, predictions)
    RMSE = np.sqrt(MSE)

    #plot paths to return
    scatterplotpath = name + "/plots/scatter.png"
    distpath = name + "/plots/dist.png"
    residualpath = name + "/plots/residual.png"

    #path to model
    modelpath = name + "/models/" + name + ".pkl"

    return jsonify({
        "status": "success LinearReg",
        "metrics": {
            "mae": MAE,
            "mse": MSE,
            "rmse": RMSE,
            "r_square": r_square
        },
        "ploturl": {
            "scatterplotpath": scatterplotpath,
            "distpath": distpath,
            "residualpath": residualpath
        },
        "feature_names": features,
        "model_path": modelpath
    }), 201
Example #24
y_train = y_train[['Target']]

print(y_train)
y_test = y_test[['Target']]

# corr = data.corr()

# param_grid = {'C': [4.7, 4.8, 4.9, 5.0], 'gamma': [ 0.000009, 0.000010, 0.000011, 0.000012]}

print(X_train)
print(y_train)

# regressor = LinearRegression()
# regressor = SVR(C=5, gamma=0.00001)
regressor = BayesianRidge(normalize=True,
                          n_iter=5,
                          tol=0.01,
                          fit_intercept=True)
# regressor = ARDRegression(normalize=True, n_iter=5, tol=0.01)
# regressor = SGDRegressor()
# regressor = MLPRegressor(hidden_layer_sizes=(200, 50, 10))
# regressor = RANSACRegressor(min_samples=80, max_trials=1000)
# regressor = Lasso()

regressor.fit(X_train, y_train.squeeze().tolist())

print(regressor.score(X_train, y_train.squeeze().tolist()))
print(regressor.score(X_test, y_test.squeeze().tolist()))

print(regressor.get_params())
y_predict = regressor.predict(X_test)
print(y_predict)
Example #25
trainingcounts = counts[100:]
testcounts = counts[:100]

trainingrates = countrates[100:]
testrates = countrates[:100]

trainingtimes = times[100:]
testtimes = times[:100]

# Using the training counts and histories, one could fit a log-linear (Poisson) GLM,
# as in the commented-out statsmodels code below.
#poisson_model = sm.GLM(trainingrates,
#						sm.tools.tools.add_constant(traininghists),
#						family =sm.families.Poisson(sm.genmod.families.links.log))
#results = poisson_model.fit()
#print(results.summary())

#x = results.predict(sm.tools.tools.add_constant(testhists))


clf = BayesianRidge(compute_score=True)
clf.fit(traininghists,trainingrates)
x = clf.predict(testhists)  

answer = testrates

plt.plot(bins,x)
plt.plot(bins,answer)
plt.show()


Example #26
#
# LinearRegression
# Ridge
# Lasso
# Random Forest
# Gradient Boosting Tree
# Support Vector Regression
# Linear Support Vector Regression
# ElasticNet
# Stochastic Gradient Descent
# BayesianRidge
# KernelRidge
# ExtraTreesRegressor
# XgBoost
models = [LinearRegression(),Ridge(),Lasso(alpha=0.01,max_iter=10000),RandomForestRegressor(),GradientBoostingRegressor(),SVR(),LinearSVR(),
          ElasticNet(alpha=0.001,max_iter=10000),SGDRegressor(max_iter=1000,tol=1e-3),BayesianRidge(),KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5),
          ExtraTreesRegressor(),XGBRegressor()]

names = ["LR", "Ridge", "Lasso", "RF", "GBR", "SVR", "LinSVR", "Ela","SGD","Bay","Ker","Extra","Xgb"]
# for name, model in zip(names, models):
#     score = rmse_cv(model, X_scaled, y_log)
#     print("{}: {:.6f}, {:.4f}".format(name,score.mean(),score.std()))

# grid helper class: search for optimal parameters with GridSearchCV
class grid():
    def __init__(self, model):
        self.model = model

    def grid_get(self, X, y, param_grid):
        grid_search = GridSearchCV(self.model, param_grid, cv=5, scoring="neg_mean_squared_error")
        grid_search.fit(X, y)
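A hedged usage sketch of the grid helper (the Lasso alpha grid below is an assumption; X_scaled and y_log are the names used in the commented loop above). Note that grid_get as written only fits the search and does not report the best parameters.

grid(Lasso(max_iter=10000)).grid_get(
    X_scaled, y_log,
    {'alpha': [0.0004, 0.0006, 0.0008]})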
    return X_train, X_test, y, y_test, snr, noise, w, size


###############################################################################
# Create data
size = 12
n_samples = 400
X_train, X_test, y_train, y_test, snr, noise, coefs, size =\
        create_simulation_data(snr=10, n_samples=n_samples, size=size)


###############################################################################
# Compute the results for supervised clustering
A = grid_to_graph(n_x=size, n_y=size, n_z=size)
clf = BayesianRidge(fit_intercept=True, normalize=True, tol=1.e-3)
sc = supervised_clustering.SupervisedClusteringRegressor(estimator=clf,
        connectivity=A, n_iterations=30, cv=25, verbose=1, n_jobs=8)
#sc = supervised_clustering.SupervisedClusteringRegressor(clf, connectivity=A,
#        n_iterations=30, verbose=1, n_jobs=8,
#        cv=ShuffleSplit(X_train.shape[0], n_splits=10, test_fraction=0.6,
#            random_state=0))
t1 = time()
sc.fit(X_train, y_train)
sc_time = time() -t1
computed_coefs = sc.inverse_transform()
computed_coefs = np.reshape(computed_coefs, [size, size, size])
score = sc.score(X_test, y_test)


###############################################################################
Example #28
# total1.shape

# Finally, with all preprocessing done, split the data back into train/test.
train = total1[total1['source'] == 'train']
test = total1[total1['source'] == 'test']
train.drop(['source'], axis=1, inplace=True)
test.drop(['source'], axis=1, inplace=True)
# train.shape, test.shape

# Model prediction
# First pick a few representative baseline models for single-model runs: linear models, random forest, GBR, and the currently popular XGBoost.
# The evaluation metric is the one specified by Kaggle: root mean squared error.
lass = Lasso(alpha=0.1)
bayes = BayesianRidge(n_iter=300,
                      tol=0.001,
                      alpha_1=1e-06,
                      alpha_2=1e-06,
                      lambda_1=1e-06)
regr = RandomForestRegressor(max_depth=2)
gbr = GradientBoostingRegressor(
    loss='ls',
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_depth=3,
    alpha=0.9,
)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603,
Example #29
frames = [train, test]
df = pd.concat(frames, axis=0, ignore_index=True)

### Imputing DYAR
train = df[(df.DYAR.isnull() ==False) & (df.pct_team_tgts.isnull() == False)]
train.reset_index(inplace=True, drop=True)
test = df[(df.DYAR.isnull() == True) & (df.pct_team_tgts.isnull() == False)]
test.reset_index(inplace= True, drop=True)

features = ['targets', 'receptions', 'rec_tds', 'start_ratio', 'pct_team_tgts', 'pct_team_receptions', 'pct_team_touchdowns',
            'rec_yards', 'dpi_yards', 'fumbles', 'first_down_ctchs', 'pct_of_team_passyards']
X = scale(train[features])
y = train.DYAR

# Our best model for predicting DYAR was a Bayesian Ridge Regressor
br = BayesianRidge()
br.fit(X,y)
dyar_predictions = pd.DataFrame(br.predict(scale(test[features])), columns = ['DYAR_predicts'])

test = test.join(dyar_predictions)
test['DYAR'] = test['DYAR_predicts']
test.drop('DYAR_predicts', inplace=True, axis=1)

frames = [train,test]
df = pd.concat(frames, axis=0, ignore_index=True)

### Imputing EYds
train = df[(df.EYds.isnull() ==False) & (df.pct_team_tgts.isnull() == False)]
train.reset_index(inplace=True, drop=True)
test = df[(df.EYds.isnull() == True) & (df.pct_team_tgts.isnull() == False)]
test.reset_index(inplace= True, drop=True)
Example #30
# Create weights with a precision lambda_ of 4.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

# #############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

# #############################################################################
# Plot true weights, estimated weights, histogram of the weights, and
# predictions with standard deviations
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
         label="Bayesian Ridge estimate")
plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
def do_validation(data_path, steps=10):
    allfiles = initialize(data_path)
    gbm = GradientBoostingRegressor(n_estimators=100, learning_rate=0.05, max_depth=6, min_samples_leaf=5, subsample=0.5)
    ada = AdaBoostRegressor(n_estimators=200, learning_rate=1)
    etree = ExtraTreesRegressor(n_estimators=200, n_jobs=-1, min_samples_leaf=5)
    rf = RandomForestRegressor(n_estimators=200, max_features=4, min_samples_leaf=5)
    kn = KNeighborsRegressor(n_neighbors=25)
    logit = LogisticRegression(tol=0.05)
    enet = ElasticNetCV(l1_ratio=0.75, max_iter=1000, tol=0.05)
    svr = SVR(kernel="linear")  # `probability` is not an SVR argument in current scikit-learn
    ridge = Ridge(alpha=18)
    bridge = BayesianRidge(n_iter=500)

    gbm_metrics = 0.0
    ada_metrics = 0.0
    etree_metrics = 0.0
    rf_metrics = 0.0
    kn_metrics = 0.0
    logit_metrics = 0.0
    svr_metrics = 0.0
    ridge_metrics = 0.0
    bridge_metrics = 0.0
    enet_metrics = 0.0
    nnet_metrics = 0.0

    logistic = LogisticRegression()
    rbm = BernoulliRBM(random_state=0, verbose=True)
    classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

    for i in range(steps):
        driver = allfiles[i]
        df, Y = create_merged_dataset(driver)
        df['label'] = Y        
        # Shuffle DF.
        df = df.reindex(np.random.permutation(df.index))

        train = df[:100]
        label = train['label']
        del train['label']

        test = df[100:400]
        Y = test['label']
        del test['label']

        #to_drop = ['driver', 'trip', 'speed1', 'speed2', 'speed3', 'speed4', 'speed5', 'speed6', 'speed7', 'speed8', 'speed9', 
        #        'speed10', 'speed11', 'speed12', 'speed13', 'speed14', 'speed15', 'speed16', 'speed17', 'speed18', 'speed19', 
        #        'speed20', 'speed21', 'speed22', 'speed23', 'speed24', 'speed25', 'speed26', 'speed27', 'speed28', 'speed29', 
        #        'speed30', 'speed31', 'speed32', 'speed33', 'speed34', 'speed35', 'speed36', 'speed37', 'speed38', 'speed39', 
        #        'speed40', 'speed41', 'speed42', 'speed43', 'speed44', 'speed45', 'speed46', 'speed47', 'speed48', 'speed49', 
        #        'speed50', 'speed51', 'speed52', 'speed53', 'speed54', 'speed55', 'speed56', 'speed57', 'speed58', 'speed59', 
        #        'speed60', 'speed61', 'speed62', 'speed63', 'speed64', 'speed65', 'speed66', 'speed67', 'speed68', 'speed69', 
        #        'speed70', 'speed71', 'speed72', 'speed73', 'speed74', 'speed75', 'speed76', 'speed77', 'speed78', 'speed79', 'speed80']
        to_drop = ['driver', 'trip']

        X_train = train.drop(to_drop, axis=1)
        X_test = test.drop(to_drop, axis=1)
        
        gbm.fit(X_train, label)
        Y_hat = gbm.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        gbm_metrics += metrics.auc(fpr, tpr) 
        
        ada.fit(X_train, label)
        Y_hat = ada.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ada_metrics += metrics.auc(fpr, tpr)
    
        etree.fit(X_train, label)
        Y_hat = etree.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        etree_metrics += metrics.auc(fpr, tpr)
        
        rf.fit(X_train, label)
        Y_hat = rf.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        rf_metrics += metrics.auc(fpr, tpr)
        
        kn.fit(X_train, label)
        Y_hat = kn.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        kn_metrics += metrics.auc(fpr, tpr)

        # Linear models.
        to_drop = ['driver', 'trip', 'distance', 'sd_acceleration', 'final_angle', 'mean_acceleration', 'mean_avg_speed', 'sd_inst_speed',
                'sd_avg_speed', 'mean_inst_speed', 'points']

        X_train = train.drop(to_drop, axis=1)
        X_test = test.drop(to_drop, axis=1)
        
        logit.fit(X_train, label)
        Y_hat = [i[1] for i in logit.predict_proba(X_test)]
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        logit_metrics += metrics.auc(fpr, tpr)

        svr.fit(X_train, label)
        Y_hat = svr.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        svr_metrics += metrics.auc(fpr, tpr)
        
        ridge.fit(X_train, label)
        Y_hat = ridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        ridge_metrics += metrics.auc(fpr, tpr)

        bridge.fit(X_train, label)
        Y_hat = bridge.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        bridge_metrics += metrics.auc(fpr, tpr)

        enet.fit(X_train, label)
        Y_hat = enet.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        enet_metrics += metrics.auc(fpr, tpr)

        classifier.fit(X_train, label)
        Y_hat = classifier.predict(X_test)
        fpr, tpr, thresholds = metrics.roc_curve(Y, Y_hat)
        nnet_metrics += metrics.auc(fpr, tpr)

    print ""
    print "GBM:", gbm_metrics/steps
    print "AdaBoost:", ada_metrics/steps
    print "Extra Trees:", etree_metrics/steps
    print "RF:", rf_metrics/steps
    print "KN:", kn_metrics/steps
    print ""
    print "Logit:", logit_metrics/steps
    print "SVR:", svr_metrics/steps
    print "Ridge:", ridge_metrics/steps
    print "BayesianRidge:", bridge_metrics/steps
    print "Elastic Net:", enet_metrics/steps
    print "Neural Networks:", nnet_metrics/steps
    print ""
Example #32
X = np.random.randn(n_samples, size**2)
for x in X:  # smooth data
    x[:] = ndimage.gaussian_filter(x.reshape(size, size), sigma=1.0).ravel()
X -= X.mean(axis=0)
X /= X.std(axis=0)

y = np.dot(X, coef.ravel())
noise = np.random.randn(y.shape[0])
noise_coef = (linalg.norm(y, 2) / np.exp(snr / 20.)) / linalg.norm(noise, 2)
y += noise_coef * noise  # add noise

###############################################################################
# Compute the coefs of a Bayesian Ridge with GridSearch
# (note: this snippet targets an older scikit-learn API, e.g. KFold(len(y), 2),
#  WardAgglomeration, Memory(cachedir=...), and clf.best_estimator)
cv = KFold(len(y), 2)  # cross-validation generator for model selection
ridge = BayesianRidge()
mem = Memory(cachedir='.', verbose=1)

# Ward agglomeration followed by BayesianRidge
A = grid_to_graph(n_x=size, n_y=size)
ward = WardAgglomeration(n_clusters=10,
                         connectivity=A,
                         memory=mem,
                         n_components=1)
clf = Pipeline([('ward', ward), ('ridge', ridge)])
# Select the optimal number of parcels with grid search
clf = GridSearchCV(clf, {'ward__n_clusters': [10, 20, 30]}, n_jobs=1, cv=cv)
clf.fit(X, y)  # set the best parameters
coef_ = clf.best_estimator.steps[-1][1].coef_
coef_ = clf.best_estimator.steps[0][1].inverse_transform(coef_)
coef_agglomeration_ = coef_.reshape(size, size)

def sale(data):
	data = int(data) + 1
	return log(data)


dataset = pandas.read_csv("input/train2_.csv")
testset = pandas.read_csv("input/test2_.csv")

dataset['Sale'] = dataset['Sales'].apply(sale)

labelData = dataset['Sale'].values
myId = testset['Id'].values

testset.drop(['Id'], inplace=True, axis=1)
testData = testset.iloc[:, :].values
dataset.drop(['Sales', 'Sale'], inplace=True, axis=1)
dataData = dataset.iloc[:, :].values

BRModel = BayesianRidge(compute_score=True)
BRModel.fit(dataset.iloc[:, :].values, labelData)
preds = numpy.column_stack((myId, BRModel.predict(testData))).tolist()
preds = [[int(i[0])] + [exp(float(i[1])) - 1] for i in preds]

print(BRModel.scores_)
with open("result/sub_BayesRidge.csv", "w") as output:
	writer = csv.writer(output, lineterminator='\n')
	writer.writerow(["Id", "Sales"])
	writer.writerows(preds)
Example #34
    def calculate_score(deg, X, y):
        pipe = make_pipeline(StandardScaler(), PolynomialFeatures(deg),
                             BayesianRidge(normalize=False))  # type: Pipeline

        pipe.fit(X, y)
        return pipe.score(X, y)
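    # Hedged usage sketch for calculate_score: score a few polynomial degrees
    # and keep the best one (the degree range and the X, y names are assumptions
    # about the enclosing function).
    degrees = range(1, 6)
    best_deg = max(degrees, key=lambda d: calculate_score(d, X, y))
    print("best polynomial degree:", best_deg)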
    ordered_idx = [i.feat_idx for i in imputer.imputation_sequence_]
    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == 2 * (d - 1)


@pytest.mark.parametrize(
    "predictor",
    [DummyRegressor(), BayesianRidge(),
     ARDRegression()])
def test_chained_imputer_predictors(predictor):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = ChainedImputer(missing_values=0,
                             n_imputations=1,
                             n_burn_in=1,
                             predictor=predictor,
                             random_state=rng)
    imputer.fit_transform(X)
Beispiel #36
from preprocess import preprocess
from kaggle_util import make_submission_with_model, get_kaggle_scores, submit_to_kaggle


def house_prices_preprocess(root):
    target_column = 'SalePrice'
    columns_to_drop = ['Id']
    forced_categorical = []
    forced_numeric = []
    columns_to_normalize = [target_column]
    use_labeler = []

    def manual_processing(features, complete_features):
        return features

    preprocess(root, target_column, columns_to_drop, forced_categorical,
               forced_numeric, columns_to_normalize, use_labeler,
               manual_processing)


if __name__ == '__main__':
    competition = 'house-prices-advanced-regression-techniques'
    root = f'C:/data/{competition}/'

    print(get_kaggle_scores(competition))

    from sklearn.linear_model import BayesianRidge
    model = BayesianRidge()
    path = make_submission_with_model(model, root)
    #submit_to_kaggle(path, competition)
def Ridge_Regression():
    model = BayesianRidge(compute_score=True)
    return model
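
# A minimal usage sketch for the helper above (X_train, y_train and X_test are
# assumed to be an existing feature matrix, target vector and test matrix):
model = Ridge_Regression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)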
Beispiel #38
# Stack the out-of-fold predictions of lgb and xgb
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=4590)
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx,
            val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / 10  # average over 10 fits (5 splits x 2 repeats)

mean_squared_error(target.values, oof_stack)

sub_df = pd.read_csv('datalab/7955/jinnan_round1_submit_20181227.csv',
                     header=None)
sub_df[1] = predictions
sub_df[1] = sub_df[1].apply(lambda x: round(x, 3))


def modeling_cross_validation(params, X, y, nr_folds=5):
    oof_preds = np.zeros(X.shape[0])
def model_BayesianRidge(train_X, test_X):
    "Bayesian ridge regression"
    # training labels come from the 'class' column
    train_y = train_X['class'].values
    # training features: drop the 'class' label so it is not used as an input
    train_x = train_X.drop(['class'], axis=1).values
    # test features: drop the 'class' label as well
    test_x = test_X.drop(['class'], axis=1).values
    "Model 1"
    # Bayesian ridge regression with tight Gamma priors (the sklearn defaults)
    clf1 = BayesianRidge(alpha_1=1e-06,
                         alpha_2=1e-06,
                         compute_score=False,
                         copy_X=True,
                         fit_intercept=True,
                         lambda_1=1e-06,
                         lambda_2=1e-06,
                         n_iter=300,
                         normalize=False,
                         tol=0.001,
                         verbose=False)
    clf1 = clf1.fit(train_x, train_y)
    # collect the predictions
    test1 = clf1.predict(test_x)
    train1 = clf1.predict(train_x)
    # convert to DataFrame
    test1 = pd.DataFrame(test1)
    train1 = pd.DataFrame(train1)
    "Model 2"
    # Bayesian ridge regression with looser priors and more iterations
    clf2 = BayesianRidge(alpha_1=1e-05,
                         alpha_2=1e-05,
                         compute_score=False,
                         copy_X=True,
                         fit_intercept=True,
                         lambda_1=1e-05,
                         lambda_2=1e-05,
                         n_iter=400,
                         normalize=False,
                         tol=0.001,
                         verbose=False)
    clf2 = clf2.fit(train_x, train_y)
    # collect the predictions
    test2 = clf2.predict(test_x)
    train2 = clf2.predict(train_x)
    # convert to DataFrame
    test2 = pd.DataFrame(test2)
    train2 = pd.DataFrame(train2)
    # concatenate the two models' predictions
    test = pd.concat([test1, test2], axis=1)
    train = pd.concat([train1, train2], axis=1)

    print('------ Bayesian ridge single-model results -------')
    print("Mean squared error: %.2f" %
          mean_squared_error(test_X['class'], test1))
    print('Variance score: %.2f' % r2_score(test_X['class'], test1))
    print("Mean squared error: %.2f" %
          mean_squared_error(test_X['class'], test2))
    print('Variance score: %.2f' % r2_score(test_X['class'], test2))
    print('\n')
    return test, train
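
# A minimal usage sketch for model_BayesianRidge on toy data (the column names
# f1/f2 are hypothetical; the real features come from the competition data):
toy_train = pd.DataFrame({'f1': [1., 2., 3., 4.], 'f2': [0., 1., 0., 1.],
                          'class': [1.0, 2.1, 2.9, 4.2]})
toy_test = pd.DataFrame({'f1': [5., 6.], 'f2': [1., 0.],
                         'class': [5.1, 5.9]})
stacked_test, stacked_train = model_BayesianRidge(toy_train, toy_test)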
Beispiel #40
for index, row in tt.iterrows():
    if pd.isnull(row['Age']):
        for key in avg_age.keys():
            if key in row['Name']:
                tt.loc[index, "Age"] = avg_age[key]
#--------------------------------------------------------------------------------
#

X =  td.loc[:,['Sex','Age', 'Fare','SibSp','Parch','Pclass']].values
X = np.where(np.isnan(X), -1, X)
X_ = tt.loc[:,['Sex','Age', 'Fare','SibSp','Parch', 'Pclass']].values
X_ = np.where(np.isnan(X_), -1, X_)

Y = td['Survived'].values

clf = BayesianRidge(lambda_1=10**-4, lambda_2=10**-4, alpha_1=10**2.75,alpha_2=10**3.3, compute_score=True) #0.78947
model = clf.fit(X, Y)

#Result
predict_result = model.predict(X_).round(0).astype(int)

result = pd.DataFrame.from_items([('PassengerId',tt['PassengerId']), ('Survived',predict_result)])
result.to_csv('result/bayes_result.csv', index=False)

t1 = pd.read_csv("result/elastic_result_077512.csv")
t2 = pd.read_csv("result/bayes_result.csv")
t3 = t1 == t2
i = 0
for index, row in t3.iterrows():
    if row['Survived'] == False:
        i += 1  # count predictions on which the two submissions disagree
plt.rcParams['font.sans-serif'] = ['SimHei']  # so CJK labels display correctly
plt.rcParams['axes.unicode_minus'] = False  # so minus signs display correctly

# When standardizing sparse data, do not center it, otherwise the sparse
# structure is destroyed -- this applies, e.g., to the boston data used in this file.

size = 500  # train size
df = pd.DataFrame(pd.read_excel('boston_data.xlsx', header=0))
# np.random.shuffle()
training_data_input = df.values[:, 0:13][:size]  # 500*13
training_data_output = df.values[:, 13:14][:size].ravel()  # .dtype shows the dtype; .ravel() returns a flattened 1-D array
test_data_input = df.values[:, 0:13][size:]
test_data_output = df.values[:, 13:14][size:].ravel()  # float64

n_folds = 6  # number of cross-validation folds
model_br = BayesianRidge()  # Bayesian ridge regression model
model_lr = LinearRegression()  # ordinary least squares model
model_etc = ElasticNet()  # elastic net model
model_svr = SVR()  # support vector regression model
model_gbr = GradientBoostingRegressor()  # gradient boosting regression model
model_mlp = MLPRegressor(solver='lbfgs', hidden_layer_sizes=(20, 20, 20), random_state=1)
model_names = ['BayesianRidge', 'LinearRegression', 'ElasticNet', 'SVR', 'GBR', 'MLP']  # model names
model_dic = [model_br, model_lr, model_etc, model_svr, model_gbr, model_mlp]  # the corresponding model objects

cv_score_list = []  # cross-validation scores for each model
pre_y_list = []  # predicted y values from each model
for model in model_dic:  # iterate over the regression models
    # cross-validate each model on the training data
    scores = cross_val_score(model, training_data_input, training_data_output, cv=n_folds)   # default scoring is R^2
    cv_score_list.append(scores)
    pre_y_list.append(model.fit(training_data_input, training_data_output).predict(training_data_input))  # store each model's in-sample predictions
Beispiel #42
    def get_algorithm(self):
        '''
        Inputs:
            self.algorithmName (string)  - Name of the regressor to run.  Follows Sklearn naming conventions.
                                    Available keys: ARDRegression | AdaBoostRegressor | BaggingRegressor | BayesianRidge | CCA
                                                    DecisionTreeRegressor | ElasticNet | ExtraTreeRegressor
                                                    ExtraTreesRegressor | GaussianProcessRegressor | GradientBoostingRegressor
                                                    HuberRegressor | KNeighborsRegressor | KernelRidge | Lars | Lasso
                                                    LassoLars | LinearRegression | LinearSVR | MLPRegressor | NuSVR | 
                                                    OrthogonalMatchingPursuit | PLSCanonical | PLSRegression | 
                                                    PassiveAggressiveRegressor | RANSACRegressor | RandomForestRegressor | 
                                                    Ridge | SGDRegressor | SVR | TheilSenRegressor | TransformedTargetRegressor

                                    Currently not supporting: ElasticNetCV | LarsCV | LassoCV | LassoLarsCV | LassoLarsIC | 
                                                    MultiTaskElasticNet | MultiTaskElasticNetCV | MultiTaskLasso | MultiTaskLassoCV |
                                                    OrthogonalMatchingPursuitCV | RidgeCV | RadiusNeighborsRegressor
        Outputs:
            algorithm - an unfitted sklearn estimator instance, or None if the name is not recognized.
        Notes:
            Scoring Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
        '''
        if (self.algorithmName == "ARDRegression"): algorithm = ARDRegression()
        elif (self.algorithmName == "AdaBoostRegressor"):
            algorithm = AdaBoostRegressor()
        elif (self.algorithmName == "BaggingRegressor"):
            algorithm = BaggingRegressor()
        elif (self.algorithmName == "BayesianRidge"):
            algorithm = BayesianRidge()
        elif (self.algorithmName == "CCA"):
            algorithm = CCA()
        elif (self.algorithmName == "DecisionTreeRegressor"):
            algorithm = DecisionTreeRegressor()
        elif (self.algorithmName == "ElasticNet"):
            algorithm = ElasticNet()
        elif (self.algorithmName == "ExtraTreeRegressor"):
            algorithm = ExtraTreeRegressor()
        elif (self.algorithmName == "ExtraTreesRegressor"):
            algorithm = ExtraTreesRegressor()
        elif (self.algorithmName == "GaussianProcessRegressor"):
            algorithm = GaussianProcessRegressor()
        elif (self.algorithmName == "GradientBoostingRegressor"):
            algorithm = GradientBoostingRegressor()
        elif (self.algorithmName == "HuberRegressor"):
            algorithm = HuberRegressor()
        elif (self.algorithmName == "KNeighborsRegressor"):
            algorithm = KNeighborsRegressor()
        elif (self.algorithmName == "KernelRidge"):
            algorithm = KernelRidge()
        elif (self.algorithmName == "Lars"):
            algorithm = Lars()
        elif (self.algorithmName == "Lasso"):
            algorithm = Lasso()
        elif (self.algorithmName == "LassoLars"):
            algorithm = LassoLars()
        elif (self.algorithmName == "LinearRegression"):
            algorithm = LinearRegression()
        elif (self.algorithmName == "LinearSVR"):
            algorithm = LinearSVR()
        elif (self.algorithmName == "MLPRegressor"):
            algorithm = MLPRegressor()
        elif (self.algorithmName == "NuSVR"):
            algorithm = NuSVR()
        elif (self.algorithmName == "OrthogonalMatchingPursuit"):
            algorithm = OrthogonalMatchingPursuit()
        elif (self.algorithmName == "PLSCanonical"):
            algorithm = PLSCanonical()
        elif (self.algorithmName == "PLSRegression"):
            algorithm = PLSRegression()
        elif (self.algorithmName == "PassiveAggressiveRegressor"):
            algorithm = PassiveAggressiveRegressor()
        elif (self.algorithmName == "RANSACRegressor"):
            algorithm = RANSACRegressor()
        elif (self.algorithmName == "RandomForestRegressor"):
            algorithm = RandomForestRegressor()
        elif (self.algorithmName == "Ridge"):
            algorithm = Ridge()
        elif (self.algorithmName == "SGDRegressor"):
            algorithm = SGDRegressor()
        elif (self.algorithmName == "SVR"):
            algorithm = SVR()
        elif (self.algorithmName == "TheilSenRegressor"):
            algorithm = TheilSenRegressor()
        elif (self.algorithmName == "TransformedTargetRegressor"):
            algorithm = TransformedTargetRegressor()
        else:
            return None

        return algorithm
Beispiel #43
    if imputation_order == 'roman':
        assert np.all(ordered_idx[:d - 1] == np.arange(1, d))
    elif imputation_order == 'arabic':
        assert np.all(ordered_idx[:d - 1] == np.arange(d - 1, 0, -1))
    elif imputation_order == 'random':
        ordered_idx_round_1 = ordered_idx[:d - 1]
        ordered_idx_round_2 = ordered_idx[d - 1:]
        assert ordered_idx_round_1 != ordered_idx_round_2
    elif 'ending' in imputation_order:
        assert len(ordered_idx) == max_iter * (d - 1)


@pytest.mark.parametrize(
    "estimator",
    [None, DummyRegressor(),
     BayesianRidge(),
     ARDRegression(),
     RidgeCV()])
def test_iterative_imputer_estimators(estimator):
    rng = np.random.RandomState(0)

    n = 100
    d = 10
    X = sparse_random_matrix(n, d, density=0.10, random_state=rng).toarray()

    imputer = IterativeImputer(missing_values=0,
                               max_iter=1,
                               estimator=estimator,
                               random_state=rng)
    imputer.fit_transform(X)
Beispiel #44
# separate the data into training and testing
if TIME_SERIES:
    test_idx = X.index.values[-int(X.shape[0] / 5):]
else:
    np.random.seed(1)
    test_idx = np.random.choice(a=X.index.values,
                                size=int(X.shape[0] / 5),
                                replace=False)
train_idx = np.array(list(set(X.index.values) - set(test_idx)))

# set up the model
if classifier:
    model = MultiOutputClassifier(GaussianNB())
else:
    model = MultiOutputRegressor(BayesianRidge(n_iter=300))

# train the model
model.fit(X.iloc[train_idx, :], Y.iloc[train_idx, :])

# In[2]: Collect the predictions

# predict training and testing data
train_predict = pd.DataFrame(model.predict(X.iloc[train_idx, :]),
                             columns=Y.columns)
test_predict = pd.DataFrame(model.predict(X.iloc[test_idx, :]),
                            columns=Y.columns)

# reshape all of the predictions into a single table
predictions = pd.DataFrame()
for j in range(outputs):
Beispiel #45
def main():
    usage = "usage: %prog [options] <model_file>"
    parser = OptionParser(usage)
    parser.add_option(
        "-c",
        dest="center_dist",
        default=10,
        type="int",
        help="Distance between the motifs and sequence center [Default: %default]",
    )
    parser.add_option(
        "-d", dest="model_hdf5_file", default=None, help="Pre-computed model output as HDF5 [Default: %default]"
    )
    parser.add_option(
        "-g", dest="cuda", default=False, action="store_true", help="Run on the GPGPU [Default: %default]"
    )
    parser.add_option("-l", dest="seq_length", default=600, type="int", help="Sequence length [Default: %default]")
    parser.add_option("-o", dest="out_dir", default="heat", help="Output directory [Default: %default]")
    parser.add_option(
        "-t",
        dest="targets",
        default="0",
        help="Comma-separated list of target indexes to plot (or -1 for all) [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide Basset model file")
    else:
        model_file = args[0]

    out_targets = [int(ti) for ti in options.targets.split(",")]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    # torch options
    cuda_str = ""
    if options.cuda:
        cuda_str = "-cuda"

    #################################################################
    # place filter consensus motifs
    #################################################################
    # determine filter consensus motifs
    filter_consensus = get_filter_consensus(model_file, options.out_dir, cuda_str)

    seqs_1hot = []
    # num_filters = len(filter_consensus)
    num_filters = 20
    filter_len = filter_consensus[0].shape[1]

    # position the motifs
    left_i = options.seq_length / 2 - options.center_dist - filter_len
    right_i = options.seq_length / 2 + options.center_dist

    ns_1hot = np.zeros((4, options.seq_length)) + 0.25
    # ns_1hot = np.zeros((4,options.seq_length))
    # for i in range(options.seq_length):
    #     nt_i = random.randint(0,3)
    #     ns_1hot[nt_i,i] = 1

    for i in range(num_filters):
        for j in range(num_filters):
            # copy the sequence of N's
            motifs_seq = np.copy(ns_1hot)

            # write them into the one hot coding
            motifs_seq[:, left_i : left_i + filter_len] = filter_consensus[i]
            motifs_seq[:, right_i : right_i + filter_len] = filter_consensus[j]

            # save
            seqs_1hot.append(motifs_seq)

    # make a full array
    seqs_1hot = np.array(seqs_1hot)

    # reshape for spatial
    seqs_1hot = seqs_1hot.reshape((seqs_1hot.shape[0], 4, 1, options.seq_length))

    #################################################################
    # place filter consensus motifs
    #################################################################
    # save to HDF5
    seqs_file = "%s/motif_seqs.h5" % options.out_dir
    h5f = h5py.File(seqs_file, "w")
    h5f.create_dataset("test_in", data=seqs_1hot)
    h5f.close()

    # predict scores
    scores_file = "%s/motif_seqs_scores.h5" % options.out_dir
    torch_cmd = "th basset_place2_predict.lua %s %s %s %s" % (cuda_str, model_file, seqs_file, scores_file)
    subprocess.call(torch_cmd, shell=True)

    # load in scores
    hdf5_in = h5py.File(scores_file, "r")
    motif_seq_scores = np.array(hdf5_in["scores"])
    hdf5_in.close()

    #################################################################
    # analyze
    #################################################################
    for ti in out_targets:
        #################################################################
        # compute pairwise expectations
        #################################################################
        # X = np.zeros((motif_seq_scores.shape[0],num_filters))
        # xi = 0
        # for i in range(num_filters):
        #     for j in range(num_filters):
        #         X[xi,i] += 1
        #         X[xi,j] += 1
        #         xi += 1

        X = np.zeros((motif_seq_scores.shape[0], 2 * num_filters))
        xi = 0
        for i in range(num_filters):
            for j in range(num_filters):
                X[xi, i] += 1
                X[xi, num_filters + j] += 1
                xi += 1

        # fit model
        model = BayesianRidge()
        model.fit(X, motif_seq_scores[:, ti])

        # predict pairwise expectations
        motif_seq_preds = model.predict(X)
        print model.score(X, motif_seq_scores[:, ti])

        # print filter coefficients
        coef_out = open("%s/coefs_t%d.txt" % (options.out_dir, ti), "w")
        for i in range(num_filters):
            print >> coef_out, "%3d  %6.2f" % (i, model.coef_[i])
        coef_out.close()

        #################################################################
        # normalize pairwise predictions
        #################################################################
        filter_interaction = np.zeros((num_filters, num_filters))
        table_out = open("%s/table_t%d.txt" % (options.out_dir, ti), "w")

        si = 0
        for i in range(num_filters):
            for j in range(num_filters):
                filter_interaction[i, j] = motif_seq_scores[si, ti] - motif_seq_preds[si]
                cols = (i, j, motif_seq_scores[si, ti], motif_seq_preds[si], filter_interaction[i, j])
                print >> table_out, "%3d  %3d  %6.3f  %6.3f  %6.3f" % cols
                si += 1

        table_out.close()

        # plot heat map
        plt.figure()
        sns.heatmap(filter_interaction)
        plt.savefig("%s/heat_t%d.pdf" % (options.out_dir, ti))
Beispiel #46
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import numpy as np
from dataprepHousing import getData

import os, sys


full_path = os.path.realpath(__file__)
file = os.path.dirname(full_path) + "\\data\\housingSample.csv"
(X,Y,records)=getData(file)
X_train, X_test, price_train, price_test = train_test_split(X, Y, test_size = 0.1, random_state = 42)
model=BayesianRidge()
model.fit(X_train, price_train.ravel())
predPrices=model.predict(X_train)
print(model)
# Summarize the fit of the model

#print(model.intercept_, model.coef_, mse)
print(model.score(X_train, price_train))

predPrices=model.predict(X_train)
mse=mean_squared_error(price_train, predPrices)
rs=r2_score(price_train, predPrices)

print("training mse:",mse)
print("training score:",rs)
Beispiel #47
def nickmain1():

	train_all = pd.read_csv(trainloc)
	target_all = pd.read_csv(trainloc)
	test_all = pd.read_csv(testloc)
	targets = ['Ca','P','pH','SOC','Sand']
	train_cols_to_remove = ['PIDN']+targets
	train_all["Depth"] = train_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	test_all["Depth"] = test_all["Depth"].replace(["Topsoil", "Subsoil"],[10,-10])
	common_features = ['BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI']
	feats_list = {}
	colnames_nums = []
	colnames = train_all.ix[:,'m7497.96':'m599.76'].columns.values
	for x in colnames:
		match = re.search(r'(?<=m)[0-9]*',x)
		if match: 
			colnames_nums.append(int(match.group()))
	
	print len(colnames)
	print len(colnames_nums)
	print len(train_all.ix[0,'m7497.96':'m599.76'].values)


	

	for target in targets:
		selector = SelectKBest(f_regression, k=200)
		selector.fit_transform(train_all.ix[:,'m7497.96':'m599.76'], train_all[target])
		selected = selector.get_support()
		feats = [col for (col,sel) in zip(list(train_all.ix[:,'m7497.96':'m599.76'].columns.values), selected) if sel]
		feats_list[target] = feats+common_features

		


	#pickTest = ['PIDN', 'BSAN','BSAS','BSAV','CTI','ELEV','EVI','LSTD','LSTN','REF1','REF2','REF3','REF7','RELI','TMAP','TMFI','Depth']#ORIGINAL10
	ids = np.genfromtxt(testloc, dtype=str, skip_header=1, delimiter=',', usecols=0)
	df = pd.DataFrame({"PIDN": ids, "Ca": test_all['PIDN'], "P": test_all['PIDN'], "pH": test_all['PIDN'], "SOC": test_all['PIDN'], "Sand": test_all['PIDN']})
	
	cv = cross_validation.KFold(len(train_all), n_folds=10, indices=False)
	subresults = {}
	results = []

	if issub == False:
		for train_sub, test_sub in cv:
			for target in targets:
				#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
				#clf = RandomForestRegressor(n_estimators = 40)
				#clf = linear_model.Lasso(alpha=0.08)
				#clf = svm.SVC()
				#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
				#clf = Ridge(alpha=1.0)
				#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
				clf = BayesianRidge(compute_score=True)
				clf.fit(np.array(train_all[feats_list[target]])[train_sub], np.array(train_all[target])[train_sub])
				pred = clf.predict(np.array(train_all[feats_list[target]])[test_sub])
				subresults[target] = ev.rmse(np.array(train_all[target])[test_sub],np.array(pred))
				#df[target] = pred
			subtotal = 0 
			for x in subresults:
				subtotal = subtotal + subresults[x]
			print ("average for the run is ", subtotal/len(targets))
			results.append(subtotal/len(targets))
		print "Results: " + str( np.array(results).mean() )

	else:
		for target in targets:
			#clf = ensemble.GradientBoostingRegressor(n_estimators=6)
			#clf = RandomForestRegressor(n_estimators = 20)
			#clf = linear_model.Lasso(alpha=0.08)
			#clf = svm.SVC()
			#clf = tree.DecisionTreeRegressor(min_samples_leaf=20)
			#clf = Ridge(alpha=1.0)
			#clf = ElasticNet(alpha=0.1, l1_ratio=0.7)
			clf = BayesianRidge(compute_score=True)
			clf.fit(np.array(train_all[feats_list[target]]), np.array(train_all[target]))
			pred = clf.predict(np.array(test_all[feats_list[target]]))
			df[target] = pred
			df.to_csv(predloc, index=False, cols=["PIDN","Ca","P","pH","SOC","Sand"])
Beispiel #48
        cross_validation.train_test_split(X_bns, Y,
                                          test_size=test_size, random_state=0)
    

#    k = int(0.5 * n_features)
#    print("-----------------------------------------------")
#    print("Perform chi2 feature selection k=", k)   
#    print("-----------------------------------------------")
#    X_train, X_test = selectFeatures(X_train, X_test, y_train, k)

    print("-----------------------------------------------")
    print("SVM Classification of training set")   
    print("-----------------------------------------------")
    class_weight = {0:5}
    print("Class weight=", class_weight)
    clf = BayesianRidge(compute_score=True).fit(X_train, y_train)
    print("Test svm.SVC score=", clf.score(X_test, y_test))
    print("Train svm.SVC score=", clf.score(X_train, y_train))
    
    print("-----------------------------------------------")
    print("Metrics on TEST SET")   
    print("-----------------------------------------------")    
    y_pred = clf.predict(X_test)
    
    print(metrics.classification_report(y_test, y_pred, target_names=label_names))
    print(metrics.confusion_matrix(y_test, y_pred))       
    
    print("-----------------------------------------------")
    print("Metrics on TRAIN SET")   
    print("-----------------------------------------------")    
    y_predTrain = clf.predict(X_train)
Beispiel #49
def bayes_ridge_reg(x_data,y_data):
    br = BayesianRidge()
    br.fit(x_data,y_data)
    print 'br params',br.coef_,br.intercept_
    adjusted_result = br.predict(x_data)
    return map(int,list(adjusted_result))
def prediction_BayesianRidge (X_train, Y_train, X_test, Y_test,normalize):

    # Print shapes of the training and testing data sets
    #print ("Shapes of the training and testing data sets")
    #print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)
    #Create our regression object

    lreg = BayesianRidge(normalize=normalize)

    #do a linear regression, except only on the training
    lreg.fit(X_train,Y_train)

    #print("The estimated intercept coefficient is %.2f " %lreg.intercept_)
    #print("The number of coefficients used was %d " % len(lreg.coef_))



    # Set a DataFrame from the Facts
    coeff_df = DataFrame(X_train.columns)
    coeff_df.columns = ["Fact"]


    # Set a new column lining up the coefficients from the linear regression
    coeff_df["Coefficient"] = pd.Series(lreg.coef_)


    # Show
    #coeff_df

    #highest correlation between a fact and fraction votes
    #print ("Highest correlation fact: %s is %.9f" % (cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"], coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]) )

    #sns_plot = sns.jointplot(coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"Fraction Votes",pd.merge(X_test,pd.DataFrame(Y_test), right_index=True, left_index=True),kind="scatter")


    #Predictions on training and testing sets
    pred_train = lreg.predict(X_train)
    pred_test = lreg.predict(X_test)

    # The mean square error
    #print("MSE with X_train and Y_train: %.6f"  % np.mean((Y_train - pred_train) ** 2))
    #print("MSE with X_test and Y_test: %.6f"  %np.mean((Y_test - pred_test) ** 2))

    #Explained variance score: 1 is perfect prediction
    #print("Variance score: %.2f" % lreg.score(X_test, Y_test))

    result={}
    result["method"]="BayesianRidge"
    if normalize :
        result["normalize"]="Y"
    else:
        result["normalize"]="N"
    result["X_train_shape"]=X_train.shape
    result["Y_train_shape"]=Y_train.shape
    result["X_test_shape"]=X_test.shape
    result["Y_test_shape"]=Y_test.shape
    result["intercept"]=lreg.intercept_
    result["num_coef"]=len(lreg.coef_)
    result["max_fact"]=cf_dict.loc[coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Fact"],"description"]
    result["max_fact_value"]=coeff_df.iloc[coeff_df["Coefficient"].idxmax()]["Coefficient"]
    result["MSE_train"]=np.mean((Y_train - pred_train) ** 2)
    result["MSE_test"]=np.mean((Y_test - pred_test) ** 2)
    result["variance"]=lreg.score(X_test, Y_test)
    return pred_test,coeff_df,pred_train,result
# Create weights with a precision lambda_ of 4.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot true weights, estimated weights, histogram of the weights, and
# predictions with standard deviations
lw = 2
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='lightgreen', linewidth=lw,
         label="Bayesian Ridge estimate")
plt.plot(w, color='gold', linewidth=lw, label="Ground truth")
plt.plot(ols.coef_, color='navy', linestyle='--', label="OLS estimate")
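
# The comment above mentions predictions with standard deviations; BayesianRidge
# exposes them via predict(..., return_std=True). A minimal sketch:
y_mean, y_std = clf.predict(X, return_std=True)
plt.figure(figsize=(6, 5))
plt.errorbar(y, y_mean, yerr=y_std, fmt='o', alpha=0.3,
             label="Predicted mean +/- std")
plt.xlabel("Target")
plt.ylabel("Prediction")
plt.legend(loc="best")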
Beispiel #52
from sklearn.linear_model import BayesianRidge, LinearRegression

from sklearn import svm
from sklearn import metrics

from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from bokeh.plotting import figure
from bokeh.io import show as bokehShow

# Split features and labels in a single call so the train/test rows stay aligned
train, test, spamtrain, spamtest = train_test_split(importation.t,
                                                    importation.valspam,
                                                    test_size=0.2)


clf = BayesianRidge(compute_score=True)
ols = LinearRegression()

clf.fit(train, spamtrain)
ols.fit(train, spamtrain)



expected = spamtest

predicted = clf.predict(test)
predicted1 = ols.predict(test)


#print(spamtrain)
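
# A minimal evaluation sketch for the two fits above, using the imported metrics module:
print("BayesianRidge MSE:", metrics.mean_squared_error(expected, predicted))
print("OLS MSE:", metrics.mean_squared_error(expected, predicted1))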
Beispiel #53
def main():
    usage = 'usage: %prog [options] <repr_hdf5> <data_hdf5> <target_index>'
    parser = OptionParser(usage)
    parser.add_option('-a', dest='add_only', default=False, action='store_true', help='Use additional features only; no sequence features')
    parser.add_option('-b', dest='balance', default=False, action='store_true', help='Downsample the negative set to balance [Default: %default]')
    parser.add_option('-o', dest='out_dir', default='postmodel', help='Output directory [Default: %default]')
    parser.add_option('-r', dest='regression', default=False, action='store_true', help='Regression mode [Default: %default]')
    parser.add_option('-s', dest='seq_only', default=False, action='store_true', help='Use sequence features only; no additional features [Default: %default]')
    parser.add_option('--sample', dest='sample', default=None, type='int', help='Sample from the training set [Default: %default]')
    parser.add_option('-t', dest='target_hdf5', default=None, help='Extract targets from this HDF5 rather than data_hdf5 argument')
    parser.add_option('-x', dest='regex_add', default=None, help='Filter additional features using a comma-separated list of regular expressions')
    (options,args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide full data HDF5, representation HDF5, and target index or filename')
    else:
        repr_hdf5_file = args[0]
        data_hdf5_file = args[1]
        target_i = args[2]

    if not os.path.isdir(options.out_dir):
        os.mkdir(options.out_dir)

    random.seed(1)

    #######################################################
    # preprocessing
    #######################################################

    # load training targets
    data_hdf5_in = h5py.File(data_hdf5_file, 'r')
    if options.target_hdf5:
        target_hdf5_in = h5py.File(options.target_hdf5, 'r')
    else:
        target_hdf5_in = data_hdf5_in
    train_y = np.array(target_hdf5_in['train_out'])[:,target_i]
    test_y = np.array(target_hdf5_in['test_out'])[:,target_i]

    # load training representations
    if not options.add_only:
        repr_hdf5_in = h5py.File(repr_hdf5_file, 'r')
        train_x = np.array(repr_hdf5_in['train_repr'])
        test_x = np.array(repr_hdf5_in['test_repr'])
        repr_hdf5_in.close()

    if options.seq_only:
        add_labels = []

    else:
        # load additional features
        train_a = np.array(data_hdf5_in['train_add'])
        test_a = np.array(data_hdf5_in['test_add'])
        add_labels = np.array(data_hdf5_in['add_labels'])

        if options.regex_add:
            fi = filter_regex(options.regex_add, add_labels)
            train_a, test_a, add_labels = train_a[:,fi], test_a[:,fi], add_labels[fi]

        # append additional features
        if options.add_only:
            add_i = 0
            train_x, test_x = train_a, test_a
        else:
            add_i = train_x.shape[1]
            train_x = np.concatenate((train_x,train_a), axis=1)
            test_x = np.concatenate((test_x,test_a), axis=1)

    data_hdf5_in.close()
    if options.target_hdf5:
        target_hdf5_in.close()

    # balance
    if options.balance:
        train_x, train_y = balance(train_x, train_y)

    # sample
    if options.sample is not None and options.sample < train_x.shape[0]:
        sample_indexes = random.sample(range(train_x.shape[0]), options.sample)
        train_x = train_x[sample_indexes]
        train_y = train_y[sample_indexes]


    #######################################################
    # model
    #######################################################
    if options.regression:
        # fit
        model = BayesianRidge(fit_intercept=True)
        model.fit(train_x, train_y)

        # accuracy
        acc_out = open('%s/r2.txt' % options.out_dir, 'w')
        print >> acc_out, model.score(test_x, test_y)
        acc_out.close()

        test_preds = model.predict(test_x)

        # plot a sample of predictions versus actual
        plt.figure()
        sns.jointplot(test_preds[:5000], test_y[:5000], joint_kws={'alpha':0.3})
        plt.savefig('%s/scatter.pdf' % options.out_dir)
        plt.close()

        # plot the distribution of residuals
        plt.figure()
        sns.distplot(test_y-test_preds)
        plt.savefig('%s/residuals.pdf' % options.out_dir)
        plt.close()

    else:
        # fit
        model = LogisticRegression(penalty='l2', C=1000)
        model.fit(train_x, train_y)

        # accuracy
        test_preds = model.predict_proba(test_x)[:,1].flatten()
        acc_out = open('%s/auc.txt' % options.out_dir, 'w')
        print >> acc_out, roc_auc_score(test_y, test_preds)
        acc_out.close()

        # compute and print ROC curve
        fpr, tpr, thresholds = roc_curve(test_y, test_preds)

        roc_out = open('%s/roc.txt' % options.out_dir, 'w')
        for i in range(len(fpr)):
            print >> roc_out, '%f\t%f\t%f' % (fpr[i], tpr[i], thresholds[i])
        roc_out.close()

        # compute and print precision-recall curve
        precision, recall, thresholds = precision_recall_curve(test_y, test_preds)

        prc_out = open('%s/prc.txt' % options.out_dir, 'w')
        for i in range(len(precision)):
            print >> prc_out, '%f\t%f' % (precision[i], recall[i])
        prc_out.close()

    # save model
    joblib.dump(model, '%s/model.pkl' % options.out_dir)

    #######################################################
    # analyze
    #######################################################
    # print coefficients table
    coef_out = open('%s/add_coefs.txt' % options.out_dir, 'w')
    for ai in range(len(add_labels)):
        if options.regression:
            coefi = model.coef_[add_i+ai]
        else:
            coefi = model.coef_[0,add_i+ai]
        print >> coef_out, add_labels[ai], coefi
    coef_out.close()
Beispiel #54
def _fill_iterative(
        df: pd.DataFrame,
        seed: int = 1,
        max_iter: int = 10,
        estimator: Any = BayesianRidge(),
):
    """ Gets a single imputation using IterativeImputer from sklearn.

    Uses BayesianRidge() from sklearn.

    Changed default of sample_posterior to True as we're doing
    multiple imputation.

    Clips imputed values to min-max of observed values to avoid
    brokenly large values. When imputation model doesn't converge
    nicely we otherwise end up with extreme values that are out of
    range of the float32 type used by model training, causing crashes.
    Consider this clipping a workaround until a more robust imputation
    strategy is in place.

    """

    log.info("Started imputing "
             f"df with shape {df.shape} "
             f"missing share {df.isnull().mean().mean()}"
             f"with estimator {estimator.__class__.__name__}")
    # Only impute numberic cols
    cols_numeric = list(df.select_dtypes(include=[np.number]).columns.values)
    cols_not_numeric = [col for col in df.columns if col not in cols_numeric]

    log.info(f"imputing {len(cols_numeric)} numeric cols, "
             f"ignoring {len(cols_not_numeric)} non-numeric cols")

    for col in cols_numeric:
        log.debug(
            f"Missing share before impute {col} : {df[col].isnull().mean()}")

    # Get bounds so we can clip imputed values to not be outside
    # observed values
    observed_min = df[cols_numeric].min()
    observed_max = df[cols_numeric].max()

    df_imputed = df.loc[:, []].copy()
    for col in df:
        df_imputed[col] = np.nan

    df_imputed[cols_numeric] = IterativeImputer(
        random_state=seed, max_iter=max_iter,
        estimator=estimator).fit_transform(df[cols_numeric])
    df_imputed[cols_not_numeric] = df[cols_not_numeric]

    # Clip imputed values to observed min-max range
    df_imputed[cols_numeric] = df_imputed[cols_numeric].clip(observed_min,
                                                             observed_max,
                                                             axis=1)

    log.info("Finished _fill_iterative()"
             f"Imputed df mising share numeric cols "
             f"{df[cols_numeric].isnull().mean().mean()}")

    for col in cols_numeric:
        log.debug("Missing share after impute "
                  f"{col} : {df_imputed[col].isnull().mean()}")

    return df_imputed
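
# A minimal usage sketch for _fill_iterative on a toy DataFrame with missing
# values (pandas/numpy are assumed imported in the surrounding module):
toy = pd.DataFrame({"a": [1.0, np.nan, 3.0, 4.0],
                    "b": [np.nan, 2.0, 2.5, 3.5],
                    "label": ["x", "y", "x", "y"]})
toy_imputed = _fill_iterative(toy, seed=1, max_iter=5)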
Beispiel #55
from sklearn.linear_model import BayesianRidge
import matplotlib.pyplot as plt
import numpy as np

X = [[6], [8], [10], [14], [18]]
y = [7, 9, 13, 17.5, 18]  # 1-D target (BayesianRidge warns on column-vector targets)

bayesModel = BayesianRidge(
    n_iter=300,  # maximum number of iterations
    tol=1.e-3,  # stop training once the improvement falls below this tolerance
    alpha_1=0.5,  # shape parameter of the Gamma prior over alpha (noise precision)
    alpha_2=0.5,  # rate parameter of the Gamma prior over alpha
    lambda_1=0.6,  # shape parameter of the Gamma prior over lambda (weight precision)
    lambda_2=0.6,  # rate parameter of the Gamma prior over lambda
    compute_score=False,  # if True, compute the objective function at each step
    fit_intercept=True,  # whether to fit an intercept
    normalize=False,  # whether to normalize the features
    copy_X=True,  # if True, X is copied rather than overwritten
    verbose=False  # verbose mode
)

bayesModel.fit(X, y)

xx = np.linspace(5, 20, 100)
xx = xx.reshape(xx.shape[0], 1)
yy = bayesModel.predict(xx)
plt.plot(X, y, 'k.')
plt.plot(xx, yy, 'r-')
plt.show()
Beispiel #56
y_test_predictions = lr.predict(X_test)
print  (y_test[y_test==1] == y_test_predictions[y_test==1]).sum().astype(float) / y_test[y_test==1].shape[0]
#0.875
#But, at what expense do we do this? To find out, use the following command:
print  (y_test_predictions == y_test).sum().astype(float) / y_test.shape[0]
#0.967999



# Directly applying Bayesian ridge regression

from sklearn.datasets import make_regression
X, y = make_regression(1000, 10, n_informative=2, noise=20)
#We can just "throw" ridge regression at the problem with a few simple steps:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()
br.fit(X, y)
print  br.coef_
#array([0.3000136 , -0.33023408, 68.166673, -0.63228159, 0.07350987,
#-0.90736606, 0.38851709, -0.8085291 , 0.97259451, 68.73538646])

br_alphas = BayesianRidge(alpha_1=10, lambda_1=10)
br_alphas.fit(X, y)
print  br_alphas.coef_
#array([0.30054387, -0.33130025, 68.10432626, -0.63056712,
#0.07751436, -0.90919326, 0.39020878, -0.80822013,
#0.97497567, 68.67409658])


# Using boosting to learn from errors
Beispiel #57
def get_model_from_name(model_name, training_params=None, is_hp_search=False):
    global keras_imported

    # For Keras
    epochs = 1000
    # if os.environ.get('is_test_suite', 0) == 'True' and model_name[:12] == 'DeepLearning':
    #     print('Heard that this is the test suite. Limiting number of epochs, which will increase training speed dramatically at the expense of model accuracy')
    #     epochs = 100

    all_model_params = {
        'LogisticRegression': {},
        'RandomForestClassifier': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'ExtraTreesClassifier': {
            'n_jobs': -1
        },
        'AdaBoostClassifier': {},
        'SGDClassifier': {
            'n_jobs': -1
        },
        'Perceptron': {
            'n_jobs': -1
        },
        'LinearSVC': {
            'dual': False
        },
        'LinearRegression': {
            'n_jobs': -2
        },
        'RandomForestRegressor': {
            'n_jobs': -2,
            'n_estimators': 30
        },
        'LinearSVR': {
            'dual': False,
            'loss': 'squared_epsilon_insensitive'
        },
        'ExtraTreesRegressor': {
            'n_jobs': -1
        },
        'MiniBatchKMeans': {
            'n_clusters': 8
        },
        'GradientBoostingRegressor': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'GradientBoostingClassifier': {
            'learning_rate': 0.1,
            'warm_start': True
        },
        'SGDRegressor': {
            'shuffle': False
        },
        'PassiveAggressiveRegressor': {
            'shuffle': False
        },
        'AdaBoostRegressor': {},
        'LGBMRegressor': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'LGBMClassifier': {
            'n_estimators': 2000,
            'learning_rate': 0.15,
            'num_leaves': 8,
            'lambda_l2': 0.001,
            'histogram_pool_size': 16384
        },
        'DeepLearningRegressor': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'DeepLearningClassifier': {
            'epochs': epochs,
            'batch_size': 50,
            'verbose': 2
        },
        'CatBoostRegressor': {},
        'CatBoostClassifier': {}
    }

    # if os.environ.get('is_test_suite', 0) == 'True':
    #     all_model_params

    model_params = all_model_params.get(model_name, None)
    if model_params is None:
        model_params = {}

    if is_hp_search == True:
        if model_name[:12] == 'DeepLearning':
            model_params['epochs'] = 50
        if model_name[:4] == 'LGBM':
            model_params['n_estimators'] = 500

    if training_params is not None:
        print('Now using the model training_params that you passed in:')
        print(training_params)
        # Overwrite our stock params with what the user passes in (i.e., if the user wants 10,000 trees, we will let them do it)
        model_params.update(training_params)
        print(
            'After overwriting our defaults with your values, here are the final params that will be used to initialize the model:'
        )
        print(model_params)

    model_map = {
        # Classifiers
        'LogisticRegression': LogisticRegression(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RidgeClassifier': RidgeClassifier(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'LinearSVC': LinearSVC(),

        # Regressors
        'LinearRegression': LinearRegression(),
        'RandomForestRegressor': RandomForestRegressor(),
        'Ridge': Ridge(),
        'LinearSVR': LinearSVR(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'RANSACRegressor': RANSACRegressor(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'Lasso': Lasso(),
        'ElasticNet': ElasticNet(),
        'LassoLars': LassoLars(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'BayesianRidge': BayesianRidge(),
        'ARDRegression': ARDRegression(),

        # Clustering
        'MiniBatchKMeans': MiniBatchKMeans(),
    }

    try:
        model_map['SGDClassifier'] = SGDClassifier(max_iter=1000, tol=0.001)
        model_map['Perceptron'] = Perceptron(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
            max_iter=1000, tol=0.001)
        model_map['SGDRegressor'] = SGDRegressor(max_iter=1000, tol=0.001)
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor(
            max_iter=1000, tol=0.001)
    except TypeError:
        model_map['SGDClassifier'] = SGDClassifier()
        model_map['Perceptron'] = Perceptron()
        model_map['PassiveAggressiveClassifier'] = PassiveAggressiveClassifier(
        )
        model_map['SGDRegressor'] = SGDRegressor()
        model_map['PassiveAggressiveRegressor'] = PassiveAggressiveRegressor()

    if xgb_installed:
        model_map['XGBClassifier'] = XGBClassifier()
        model_map['XGBRegressor'] = XGBRegressor()

    if lgb_installed:
        model_map['LGBMRegressor'] = LGBMRegressor()
        model_map['LGBMClassifier'] = LGBMClassifier()

    if catboost_installed:
        model_map['CatBoostRegressor'] = CatBoostRegressor(
            calc_feature_importance=True)
        model_map['CatBoostClassifier'] = CatBoostClassifier(
            calc_feature_importance=True)

    if model_name[:12] == 'DeepLearning':
        if keras_imported == False:
            # Suppress some level of logs if TF is installed (but allow it to not be installed, and use Theano instead)
            try:
                os.environ['TF_CPP_MIN_VLOG_LEVEL'] = '3'
                os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
                from tensorflow import logging
                logging.set_verbosity(logging.INFO)
            except:
                pass

            global maxnorm
            global Dense, Dropout
            global LeakyReLU, PReLU, ThresholdedReLU, ELU
            global Sequential
            global keras_load_model
            global regularizers, optimizers
            global Activation
            global KerasRegressor, KerasClassifier

            from keras.constraints import maxnorm
            from keras.layers import Activation, Dense, Dropout
            from keras.layers.advanced_activations import LeakyReLU, PReLU, ThresholdedReLU, ELU
            from keras.models import Sequential
            from keras.models import load_model as keras_load_model
            from keras import regularizers, optimizers
            from keras.wrappers.scikit_learn import KerasRegressor, KerasClassifier
            keras_imported = True

        model_map['DeepLearningClassifier'] = KerasClassifier(
            build_fn=make_deep_learning_classifier)
        model_map['DeepLearningRegressor'] = KerasRegressor(
            build_fn=make_deep_learning_model)

    try:
        model_without_params = model_map[model_name]
    except KeyError as e:
        print(
            'It appears you are trying to use a library that is not available when we try to import it, or using a value for model_names that we do not recognize'
        )
        raise (e)

    if os.environ.get('is_test_suite', False) == 'True':
        if 'n_jobs' in model_params:
            model_params['n_jobs'] = 1
    model_with_params = model_without_params.set_params(**model_params)

    return model_with_params
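
# A minimal usage sketch for get_model_from_name (assumes the module-level flags
# it reads, e.g. xgb_installed, are defined as in the original module; the
# training_params override in the second call is illustrative only):
br_model = get_model_from_name('BayesianRidge')
rf_model = get_model_from_name('RandomForestRegressor',
                               training_params={'n_estimators': 100})
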
def main():
    parser = argparse.ArgumentParser(description="""Creates embeddings predictions.""")
    parser.add_argument('--train')
    parser.add_argument('--test')
    parser.add_argument('--embeddings')
    parser.add_argument('--cv',default=False)


    args = parser.parse_args()

    stoplist = stopwords.words("english")
    stoplist.extend("it's 've 's i'm he's she's you're we're they're i'll you'll he'll ".split(" "))


    embeddings={}
    for line in codecs.open(args.embeddings,encoding="utf-8").readlines():
        line = line.strip()
        if line:
            a= line.split(" ")
            embeddings[a[0]] = np.array([float(v) for v in a[1:]]) #cast to float, otherwise we cannot operate

    train_indices = []
    test_indices = []
    train_scores = []
    train_features = []
    test_features = []


    # if args.learner == "logisticregression":
    #     learner= LogisticRegression()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeclassification":
    #     learner = tree.DecisionTreeClassifier()
    #     learner_type = "classification"
    # elif args.learner == "decisiontreeregression":
    #     learner = tree.DecisionTreeRegressor()
    #     learner_type = "regression"
    # elif args.learner == "bayesianridge":
    #     learner = BayesianRidge()
    #     learner_type = "regression"
    # else:
    learner = BayesianRidge()
    learner_type = "regression"

    le = preprocessing.LabelEncoder()


    for line in open(args.train).readlines():
        (index, score, tweet) = line.strip().split("\t")
        train_indices.append(index)
        train_scores.append(float(score))
        tweet = tweet.split(" ")
        train_features.append(embedfeats(tweet,embeddings,stoplist))


    train_indices = np.array(train_indices)
    train_scores = np.array(train_scores)
    train_features = np.array(train_features)

    train_scores_int = [roundup(v) for v in train_scores]
    le.fit(train_scores_int)

    train_scores_int_transformed = le.transform(train_scores_int)


    if args.cv:
        train_cv={}
        cross=cross_validation.KFold(len(train_scores),n_folds=10)
        acc=[]
        for train_index, test_index in cross:
            #if args.debug:
            #    print("TRAIN:", len(train_index), "TEST:", len(test_index))
            X=train_features
            y=train_scores
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]


            learner.fit(X_train,y_train)

            y_pred= learner.predict(X_test)
            assert(len(y_pred)==len(test_index))
            tids=train_indices[test_index]
            for twid,pred in zip(tids,y_pred):
                train_cv[twid] =  pred

            acc.append(cosine_similarity(y_test,y_pred)[0][0])

        print >>sys.stderr, "Cosine of 10-folds:", acc
        print >>sys.stderr, "Macro average:", np.mean(np.array(acc)), np.std(np.array(acc))

        for twid in train_indices:
            print "{}\t{}".format(twid,train_cv[twid])
    else:

        for line in open(args.test).readlines():
            (index, score, tweet) = line.strip().split("\t")
            test_indices.append(index)
            #scores.append(score)
            tweet = tweet.split(" ")
            test_features.append(embedfeats(tweet,embeddings,stoplist))


        #print  np.array(train_features).shape
        # when features are generated, train and test

        if learner_type == "regression":
            learner.fit(train_features,train_scores)
        else:
            learner.fit(train_features,train_scores_int_transformed)

        predicted_scores= learner.predict(test_features)
        if learner_type != "regression":
            predicted_scores = le.inverse_transform(predicted_scores)
        for index, score in zip(test_indices,predicted_scores):
            print index+"\t"+str(score)
# Create weights with a precision lambda_ of 4.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the Bayesian Ridge Regression and an OLS for comparison
clf = BayesianRidge(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot true weights, estimated weights and histogram of the weights
pl.figure(figsize=(6, 5))
pl.title("Weights of the model")
pl.plot(clf.coef_, 'b-', label="Bayesian Ridge estimate")
pl.plot(w, 'g-', label="Ground truth")
pl.plot(ols.coef_, 'r--', label="OLS estimate")
pl.xlabel("Features")
pl.ylabel("Values of the weights")
pl.legend(loc="best", prop=dict(size=12))
Beispiel #60
def get_models():
    models = dict()

    # Neural Networks
    models['nnet'] = MLPRegressor(activation='relu',
                                  hidden_layer_sizes=(50, 50, 50),
                                  learning_rate='adaptive',
                                  learning_rate_init=0.1,
                                  max_iter=2000,
                                  solver='sgd',
                                  alpha=0.01,
                                  random_state=0,
                                  verbose=True)

    # Linear Regression
    tuned_parameters_lr = [{'normalize': ['True', 'False']}]
    clf_lr = GridSearchCV(LinearRegression(),
                          tuned_parameters_lr,
                          scoring='neg_mean_absolute_error')
    models['lr'] = clf_lr

    # Decision Tree
    tuned_parameters_dtr = [{
        'min_samples_leaf': [5, 10, 50, 100],
        'criterion': ['mse', 'friedman_mse', 'mae', 'poisson'],
        'splitter': ['best', 'random'],
        'random_state': [0]
    }]
    clf_dtr = GridSearchCV(DecisionTreeRegressor(),
                           tuned_parameters_dtr,
                           scoring='neg_mean_absolute_error')
    models['dtr'] = clf_dtr

    # Random Forest
    tuned_parameters_rf = [{
        'min_samples_leaf': [5, 10, 50, 100],
        'n_estimators': [5, 10, 50, 100],
        'criterion': ['mse', 'mae'],
        'random_state': [0]
    }]
    clf_rf = GridSearchCV(RandomForestRegressor(),
                          tuned_parameters_rf,
                          scoring='neg_mean_absolute_error')
    models['rf'] = clf_rf

    # SVR
    tuned_parameters_svm = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-4],
        'C': [1, 10, 100, 1000]
    }]
    clf_svm = GridSearchCV(SVR(),
                           tuned_parameters_svm,
                           scoring='neg_mean_absolute_error')
    models['svm'] = clf_svm

    # Bayesian Ridge
    tuned_parameters_bayes = [{'n_iter': [100, 200, 300, 400, 500]}]
    clf_bayes = GridSearchCV(BayesianRidge(),
                             tuned_parameters_bayes,
                             scoring='neg_mean_absolute_error')
    models['bayes'] = clf_bayes

    # kNNeighbours
    tuned_parameters_knn = [{
        'n_neighbors': [1, 5, 10, 15, 20, 50],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
    }]
    clf_knn = GridSearchCV(KNeighborsRegressor(),
                           tuned_parameters_knn,
                           scoring='neg_mean_absolute_error')
    models['knn'] = clf_knn

    # Gaussian Process
    tuned_parameters_gp = [{
        'kernel': [WhiteKernel() + RBF() + DotProduct(),
                   RBF() + DotProduct()],
        'random_state': [0]
    }]
    clf_gp = GridSearchCV(GaussianProcessRegressor(),
                          tuned_parameters_gp,
                          scoring='neg_mean_absolute_error')
    models['gp'] = clf_gp

    return models
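
# A minimal usage sketch for get_models(): fit each grid-searched model on a
# training split and report its mean absolute error on a held-out split
# (X_train, X_test, y_train, y_test are assumed to exist).
from sklearn.metrics import mean_absolute_error

for name, est in get_models().items():
    est.fit(X_train, y_train)
    print(name, mean_absolute_error(y_test, est.predict(X_test)))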