Example #1
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

def elasticNet(X, y):

    print("\n### ~~~~~~~~~~~~~~~~~~~~ ###")
    print("Lasso Regression")

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    myDegree = 40
    polynomialFeatures = PolynomialFeatures(degree=myDegree, include_bias=False)
    Xp = polynomialFeatures.fit_transform(X)

    myScaler = StandardScaler()
    scaled_Xp = myScaler.fit_transform(Xp)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    elasticNet = ElasticNet(alpha=1e-7,l1_ratio=0.5)
    elasticNet.fit(scaled_Xp,y)

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    dummyX = np.arange(0,2,0.01)
    dummyX = dummyX.reshape((dummyX.shape[0],1))
    dummyXp = polynomialFeatures.transform(dummyX)  # reuse the fitted transformer instead of refitting on dummy data
    scaled_dummyXp = myScaler.transform(dummyXp)
    dummyY = elasticNet.predict(scaled_dummyXp)

    outputFILE = 'plot-elasticNet.png'
    fig, ax = plt.subplots()
    fig.set_size_inches(h = 6.0, w = 10.0)
    ax.axis([0,2,0,15])
    ax.scatter(X,y,color="black",s=10.0)
    ax.plot(dummyX, dummyY, color='red', linewidth=1.5)
    plt.savefig(outputFILE, bbox_inches='tight', pad_inches=0.2, dpi=600)  # savefig takes the file name (fname) as its first argument

    ### ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
    return None
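A hedged usage sketch (not part of the original snippet): generate noisy one-dimensional data matching the plot limits above and call the function.

if __name__ == "__main__":
    rng = np.random.RandomState(0)
    X = 2 * rng.rand(100, 1)                     # inputs in [0, 2], matching ax.axis
    y = (3 + 4 * X).ravel() + rng.randn(100)     # noisy linear target, roughly within [0, 15]
    elasticNet(X, y)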
Example #2
def enet(a):
    print("Doing elastic net")
    clf3 = ElasticNet(alpha=a)
    clf3.fit(base_X, base_Y)   # base_X, base_Y, X_test and write_to_file come from the enclosing script
    print("Score = %f" % clf3.score(base_X, base_Y))
    clf3_pred = clf3.predict(X_test)
    write_to_file("elastic.csv", clf3_pred)
def check_ElasticNet(X, y, pred, tol, reg_alpha, reg_lambda, weights):
    enet = ElasticNet(alpha=reg_alpha + reg_lambda,
                      l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
    enet.fit(X, y)
    enet_pred = enet.predict(X)
    assert np.isclose(weights, enet.coef_, rtol=tol, atol=tol).all()
    assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all()
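The alpha/l1_ratio construction above translates xgboost's (reg_alpha, reg_lambda) penalties into scikit-learn's parameterization. A short sketch (values invented) verifying the algebra: with alpha = reg_alpha + reg_lambda and l1_ratio = reg_alpha / (reg_alpha + reg_lambda), sklearn's penalty alpha * l1_ratio * ||w||_1 + 0.5 * alpha * (1 - l1_ratio) * ||w||_2^2 reduces to reg_alpha * ||w||_1 + 0.5 * reg_lambda * ||w||_2^2.

import numpy as np

reg_alpha, reg_lambda = 0.3, 0.7            # hypothetical penalty weights
w = np.array([0.5, -1.2, 0.0, 2.0])

alpha = reg_alpha + reg_lambda
l1_ratio = reg_alpha / (reg_alpha + reg_lambda)

sk_penalty = (alpha * l1_ratio * np.abs(w).sum()
              + 0.5 * alpha * (1 - l1_ratio) * (w ** 2).sum())
xgb_penalty = reg_alpha * np.abs(w).sum() + 0.5 * reg_lambda * (w ** 2).sum()
assert np.isclose(sk_penalty, xgb_penalty)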
Example #4
def report_ff_en():
    # Fastfood approximation of Gaussian kernel
    # (n, d, sgm, trainData, validationData, trainLabels come from the enclosing script)
    para = FastfoodPara(n, d)
    st = time()
    PHI_train, _ = FastfoodForKernel(trainData, para, sgm)
    elapsed_ff_kern_train = time() - st
    st = time()
    PHI_valid, _ = FastfoodForKernel(validationData, para, sgm)
    elapsed_ff_kern_valid = time() - st

    # Train elastic net on projected training data
    en = ElasticNet()
    st = time()
    en.fit(PHI_train.T, trainLabels)
    elapsed_en_fit = time() - st

    # Predict labels for projected validation data
    st = time()
    y_pred = en.predict(PHI_valid.T)
    elapsed_en_pred = time() - st

    # Report performance
    mse_proj = metrics.mean_squared_error(validationLabels, y_pred)
    # print("For projected data, MSE = {:0.4g}.".format(mse_proj))

    return mse_proj, elapsed_en_fit, elapsed_ff_kern_train
def enet_granger_causality_test(X_t, y_t, top_df, max_iter=10000000):
    """
    Return the cv-parameters tested across the whole data
    :param X_t:
    :param y_t:
    :param top_df:
    :return: res_df, test_betas
    """

    test_errs = np.zeros(len(top_df))
    scores = np.zeros(len(top_df))
    dfs = np.zeros(len(top_df))

    test_coefs = np.zeros((len(top_df), X_t.shape[1]))
    for i in range(len(top_df)):
        alpha = top_df.iloc[i]["alpha"]
        lambda_min = top_df.iloc[i]["lambda.min"]
        enet = ElasticNet(l1_ratio=alpha, alpha=lambda_min, max_iter=max_iter)
        enet.fit(X_t, y_t)
        y_pred = enet.predict(X_t)
        test_errs[i] = np.average((y_t - y_pred)**2)
        scores[i] = enet.score(X_t, y_t)
        test_coefs[i] = enet.coef_

        dfs[i] = len(np.where(enet.coef_)[0])  # df = number of non-zero coefficients

    top_df["test_err"] = test_errs
    top_df["score"] = scores
    top_df["df"] = dfs


    return top_df, test_coefs
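A hypothetical usage sketch (the column names follow the glmnet convention noted in the docstring; none of this data is from the original source):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
X_t = rng.randn(100, 5)
y_t = X_t @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + 0.1 * rng.randn(100)
top_df = pd.DataFrame({"alpha": [0.5, 0.9], "lambda.min": [0.01, 0.1]})

res_df, test_coefs = enet_granger_causality_test(X_t, y_t, top_df)
print(res_df[["alpha", "lambda.min", "test_err", "score", "df"]])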
Example #6
    def elastic_net(self):
        # features = ['season', 'holiday', 'workingday', 'weather', 'humidity', 'temp', 'windspeed', 'hour', 'month', 'year', 'day_of_week']
        features = ['season', 'workingday', 'weather', 'humidity', 'windspeed', 'hour', 'month', 'year', 'day_of_week']
        enet = ElasticNetCV()  # the initial plain ElasticNet() was dead code; the CV variant picks alpha automatically
        enet.fit(self.train[features], self.train['log-count'])

        return self.predict(enet, "Elastic Net", features)
def train_model(features_filename):
    training_data = np.loadtxt(features_filename, delimiter=",")

    X = training_data[:, :-1]
    y = training_data[:, -1]

    model = ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=True)
    # note: the original also passed precompute='auto' and rho=None, both of
    # which were removed from ElasticNet in later scikit-learn releases
    model.fit(X, y)

    return model
    def __init__(self, Dict_TrainingData, Flt_Lambda, Flt_L1):
        # Two-class case only.
        # Dict_TrainingData:
        #     Key : 0, 1
        #     Row : data
        self.Data1 = Dict_TrainingData[0] # N by 256 matrix
        self.Data2 = Dict_TrainingData[1] # V by 256 matrix
        self.Dim = len(self.Data1[0]) # 256

        self.X = np.concatenate((self.Data1, self.Data2), axis=0) # N / V augmented matrix
        self.X = self.X - np.mean(self.X,axis=0)

        self.NumClass1 = len(self.Data1) # N
        self.NumClass2 = len(self.Data2) # V
        self.TotalNum = self.NumClass1 + self.NumClass2

        self.Y = self.Construct_Y()
        self.D = np.dot(np.transpose(self.Y), self.Y) / float(self.TotalNum) # P
        self.Q = np.ones((2,1))

        InitialTheta = np.array([2,5])
        I = np.eye(2)
        Theta = np.dot(I - np.dot(np.dot(self.Q, np.transpose(self.Q)), self.D ), InitialTheta)
        Theta /= np.sqrt(np.dot(np.dot(np.transpose(Theta), self.D), Theta))

        MaxIter = 10000
        PrevTheta = InitialTheta
        PrevB = np.ones(self.Dim)
        for idx in range(MaxIter):
            NewResp = np.dot(self.Y, Theta)
            elas = ElasticNet(alpha=Flt_Lambda, l1_ratio=Flt_L1)
            #
            # # Compute Coefficient
            # B = lasso.fit(X=self.X, y= NewResp).coef_
            B = elas.fit(X=self.X, y= NewResp).coef_
            # print B
            #
            # New OptScore
            Part1 = I - np.dot(np.dot(self.Q, np.transpose(self.Q)),self.D)
            Part2 = np.dot(Part1, np.linalg.inv(self.D))
            Part3 = np.dot(Part2, np.transpose(self.Y))
            WaveTheta = np.dot(np.dot(Part3, self.X), B)
            # print WaveTheta
            Theta = WaveTheta / np.sqrt(np.dot(np.dot(np.transpose(WaveTheta),self.D),WaveTheta))

            if np.sum(np.abs(B - PrevB)) < 1e-6:
                break
            else:
                PrevB = B

        # print B
        self.B = B 
Example #9
    def fit_model_12(self,toWrite=False):
        model = ElasticNet(alpha=1.0)

        for data in self.cv_data:
            X_train, X_test, Y_train, Y_test = data
            model.fit(X_train,Y_train)
            pred = model.predict(X_test)
            print("Model 12 score %f" % (logloss(Y_test,pred),))

        if toWrite:
            f2 = open('model12/model.pkl','wb')  # pickle needs a binary-mode file handle
            pickle.dump(model,f2)
            f2.close()
Example #10
 def predict_linear(self, enet=True):
     """How well can we do on this SRFF with a linear regression
     (with optional elastic-net regularisation)?"""
     if enet:
         clf = ElasticNet()
     else:
         clf = LinearRegression()
     # we have to transpose X here because sklearn uses the
     # opposite order (rows v columns). maybe this is a sign that
     # I'm using the wrong order.
     clf.fit(self.train_X.T, self.train_y)
     yhat = clf.predict(self.test_X.T)
     err = self.defn(self.test_y, yhat)
     return clf.intercept_, clf.coef_, err
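For context, a standalone sketch (not from the original) of the shape convention the comment above wrestles with: scikit-learn estimators expect X as (n_samples, n_features), so feature-major storage must be transposed before fitting.

import numpy as np
from sklearn.linear_model import ElasticNet

n_features, n_samples = 3, 50
train_X = np.random.randn(n_features, n_samples)   # stored feature-major, as in the class above
train_y = np.random.randn(n_samples)

clf = ElasticNet()
clf.fit(train_X.T, train_y)     # transpose into sklearn's (n_samples, n_features) layout
print(clf.coef_.shape)          # -> (3,)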
Example #11
def sklearn_linear_model_elastic_net():
    en = ElasticNet(fit_intercept=True, alpha=0.5)
    boston = load_boston()   # removed in scikit-learn 1.2; this snippet targets older releases
    x = boston.data
    y = boston.target

    kf = KFold(len(x), n_folds=10)   # old sklearn.cross_validation API
    err = 0
    for train, test in kf:
        en.fit(x[train], y[train])
        p = en.predict(x[test])      # predict the whole fold at once rather than map()
        e = p - y[test]
        err += np.sum(e * e)
    rmse_10cv = np.sqrt(err / len(x))
    print("RMSE on 10-fold CV: {}".format(rmse_10cv))
Example #12
def report_orig_en():
    # Train elastic net on original training data
    en = ElasticNet()
    st = time()
    en.fit(trainData.T, trainLabels)
    elapsed_en_fit = time() - st

    # Predict labels for original validation data
    st = time()
    y_pred = en.predict(validationData.T)
    elapsed_en_pred = time() - st

    # Report performance
    mse_orig = metrics.mean_squared_error(validationLabels, y_pred)
    return mse_orig, elapsed_en_fit, 0.
Example #13
def fit_enet(train_X, train_y, test_X):
    """
    Use linear regression to predict. Elastic net is LR with L1 and L2
    regularisation.
    
    :param train_X:
    :param train_y:
    :param test_X:
    :return:
    """
    enet = ElasticNet()
    enet.fit(train_X, train_y)
    model = "ElasticNet int %.2f coefs %s" % (enet.intercept_, pprint(enet.coef_))
    yhat_train = enet.predict(train_X)
    yhat_test = enet.predict(test_X)
    
    return model, yhat_train, yhat_test
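A hypothetical call (data and shapes invented for illustration):

import numpy as np

rng = np.random.RandomState(0)
train_X, test_X = rng.randn(80, 5), rng.randn(20, 5)
train_y = train_X @ np.array([1.0, 0.0, -2.0, 0.0, 0.5]) + 0.1 * rng.randn(80)

model, yhat_train, yhat_test = fit_enet(train_X, train_y, test_X)
print(model)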
Example #14
def create_ml_classifier(df):
    import operator
    X = np.array(df.drop('base_ip_release',1))
    y = np.array(df['base_ip_release'])
    #clf = LinearRegression()
    clf = ElasticNet(alpha=1,l1_ratio=0.5)
    #clf = Ridge(alpha=2)
    # train_X,test_X,train_y,test_y = cross_validation.train_test_split(X,y,train_size=0.9)
    #
    #
    # sc = StandardScaler()
    # sc.fit(train_X)
    # X_train_std = sc.transform(train_X)
    # X_test_std = sc.transform(test_X)
    #
    # clf.fit(X_train_std,train_y)
    # print clf.predict(X_test_std)
    # print accuracy_score(test_y,clf.predict(X_test_std))


    kf = k(len(y), n_folds=10)   # `k` aliases KFold in the original imports; the dead np.zeros line is removed
    c = 0
    min_dict = {}
    get_error = []
    for train,test in kf:
        get_clif = clf.fit(X[train],y[train])
        p = clf.predict(X[test])
        #print p
        e = (p - y[test])
        #print e, len(e)
        t =  np.dot(e,e)
        # print t
        c += t
        # print c
        #print p, y[test]
        min_dict[t] = get_clif
        get_error.append(t)
    #print min_dict
    min_error = min(get_error)
    print(sorted(min_dict.items(), key=operator.itemgetter(0)))
    print(min_dict[min_error])
    print(c)
    print(np.sqrt(c / len(X)))
    return min_dict[min_error]
Example #15
def ElasticNetRegression(input_dict):
    # from sklearn.datasets import load_iris
    # from sklearn import tree
    # iris = load_iris()
    # clf = tree.DecisionTreeClassifier()
    # clf = clf.fit(iris.data, iris.target)
    from sklearn.datasets import load_diabetes
    dta = load_diabetes()
    n_sample = dta.data      # despite the name, this is the feature matrix
    n_feature = dta.target   # and this is the target vector
    print("*******SAMPLES********")
    print(n_sample)
    print("******FEATURES*******")
    print(n_feature)
    from sklearn.linear_model import ElasticNet
    rgs = ElasticNet().fit(n_sample, n_feature)
    print(rgs)
    print(rgs.predict(n_sample))
Example #16
def main():
    seq = [[(i * .1, k * .1) for i in range(1, 3)] for k in range(1, 3)]
    seq = list(itertools.chain.from_iterable(seq))

    counter = 1
    boston = datasets.load_boston()
    X = boston.data
    y = boston.target

    kfolds = KFold(X.shape[0], n_folds=4)
    for traini, testi in kfolds:
        alpha, l1 = seq[counter]   # note: counter is never updated, so every fold uses the same pair
        print(seq[counter])
        print(alpha, l1)
        enet = ElasticNet(alpha=alpha, l1_ratio=l1)
        y_pred = enet.fit(X[traini], y[traini]).predict(X[testi])
        score = r2_score(y[testi], y_pred)
        print(score)
def testLasso():
    # The objective penalizes w, scaled by the number of samples.
    # Linear fits work well here because the underlying model is sparse.
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.metrics import r2_score
    # First generate some sparse data by hand
    np.random.seed(42)
    n_samples, n_features = 50, 200
    X = np.random.randn(n_samples, n_features)
    coef = 3 * np.random.randn(n_features)  # the true coefficients
    inds = np.arange(n_features)
    np.random.shuffle(inds)  # shuffle the indices
    coef[inds[10:]] = 0  # zero out all but 10 coefficients to make the data sparse
    y = np.dot(X, coef)  # dot product of features and true coefficients
    # Add a little noise
    y += 0.01 * np.random.normal(size=n_samples)

    X_train, y_train = X[:n_samples // 2], y[:n_samples // 2]
    X_test, y_test = X[n_samples // 2:], y[n_samples // 2:]

    from sklearn.linear_model import Lasso
    alpha = 0.1
    lasso = Lasso(alpha=alpha)

    y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
    r2_score_lasso = r2_score(y_test, y_pred_lasso)  # about 0.38 here
    print(lasso)
    print("r2_score's result is %f" % r2_score_lasso)

    from sklearn.linear_model import ElasticNet
    enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
    y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
    r2_score_enet = r2_score(y_test, y_pred_enet)  # about 0.24, worse than the lasso
    print(enet)
    print("enet's result is %f" % r2_score_enet)

    plt.plot(enet.coef_, label='Elastic net coefficients')
    plt.plot(lasso.coef_, label='Lasso coefficients')
    plt.plot(coef, '--', label='original coefficients')
    plt.legend(loc="best")
    plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
              % (r2_score_lasso, r2_score_enet))
    plt.show()
Example #18
def assert_regression_result(results, tol):
    regression_results = [r for r in results if
                          r["param"]["objective"] == "reg:linear"]
    for res in regression_results:
        X = scale(res["dataset"].X,
                  with_mean=isinstance(res["dataset"].X, np.ndarray))
        y = res["dataset"].y
        reg_alpha = res["param"]["alpha"]
        reg_lambda = res["param"]["lambda"]
        pred = res["bst"].predict(xgb.DMatrix(X))
        weights = xgb_get_weights(res["bst"])[1:]
        enet = ElasticNet(alpha=reg_alpha + reg_lambda,
                          l1_ratio=reg_alpha / (reg_alpha + reg_lambda))
        enet.fit(X, y)
        enet_pred = enet.predict(X)
        assert np.isclose(weights, enet.coef_, rtol=tol,
                          atol=tol).all(), (weights, enet.coef_)
        assert np.isclose(enet_pred, pred, rtol=tol, atol=tol).all(), (
            res["dataset"].name, enet_pred[:5], pred[:5])
def imputer_train(col_ind):
    import pandas as pd
    # pd.DataFrame.from_csv is deprecated in modern pandas; pd.read_csv is its replacement
    data = pd.DataFrame.from_csv("/Users/DboyLiao/Documents/kaggle/data/Display_Advertising_Challenge/complete_train.csv")
    print("[" + str(col_ind) + "th column] " + "Loading data.")
    data = data.set_index("Id")
    data = data.drop("Label", 1)
    col_name = data.columns[col_ind]
    col_classes = ["numeric" if ind <= 12 else "categorical" for ind in range(39)]
    col_class = col_classes[col_ind]
    print("[" + str(col_ind) + "th column] " + "Processing.")
    Y = data[col_name]
    X = data.drop(col_name, 1)
    if col_class == 'categorical':
        svc = SVC(C = 10)
        imputer = svc.fit(X, Y)
    elif col_class == 'numeric':
        EN = ElasticNet()
        imputer = EN.fit(X, Y)
    else:
        pass
    return imputer
 def __init__(self, model_name, model_type, n_clusters=None,
              n_components=None, n_lag=None, regularisation=None):
     self.n_lag = n_lag
     self.model_name = model_name
     self.model_type = model_type
     # self.n_clusters = n_clusters
     # self.clustering = NeuronClustering(self.n_clusters, signal_correlation)
     
     if model_name == 'cca':
         self.n_components = n_components
         self.model = CCA(n_components=self.n_components)
     elif model_name == 'linear-regression':
         if regularisation is None:
             self.model = LinearRegression()
         elif regularisation == 'l1':
             self.model = Lasso()
         elif regularisation == 'l2':
             self.model = Ridge()
         elif regularisation == 'l1l2':
             self.model = ElasticNet()
         else:
             raise NotImplementedError
Example #21
def Lasso():
    from sklearn.linear_model import Lasso
    from sklearn.metrics import r2_score
    alpha = 0.1
    lasso = Lasso(alpha=alpha)
    trainDat = shortData
    trainLab = shortLabels
    
    
    lassoPred = lasso.fit(trainDat, trainLab)
    labPredict = lassoPred.predict(testDat)   # testDat / testLab come from the enclosing script
    r2val = r2_score(testLab, labPredict)
    print(lasso)
    print("r^2 for lasso testing is: ", r2val)

    from sklearn.linear_model import ElasticNet
    enet = ElasticNet(alpha=alpha, l1_ratio=0.7)
    enetPred = enet.fit(trainDat, trainLab)
    labPredict_enet = enet.predict(testDat)
    r2val_enet = r2_score(testLab, labPredict_enet)
    print(enet)
    print("r^2 for enet testing is: ", r2val_enet)
def lasso(filename, x_train_orig, x_devel_orig, x_test_orig, lab_train_orig, lab_devel_orig, lab_test_orig):

    # Normalize the data
    scaler_data = preprocessing.StandardScaler().fit(x_train_orig.toarray())
    x_train = scaler_data.transform(x_train_orig.toarray())
    x_devel = scaler_data.transform(x_devel_orig.toarray())
    x_test = scaler_data.transform(x_test_orig.toarray())

    scaler_lab = preprocessing.StandardScaler().fit(lab_train_orig)
    lab_train = scaler_lab.transform(lab_train_orig)
    lab_devel = scaler_lab.transform(lab_devel_orig)
    lab_test = scaler_lab.transform(lab_test_orig)

    # Elastic Net

    clf = ElasticNet(alpha=0.025, l1_ratio=0.7)
    clf.fit(x_train, lab_train)
    nz = (clf.coef_ != 0)

    # Save the resulting parameter files
    dump_svmlight_file(x_train_orig[:, nz], lab_train_orig, filename+"_elasso.train.libsvm", zero_based=False, comment=None, query_id=None)
    dump_svmlight_file(x_devel_orig[:, nz], lab_devel_orig, filename+"_elasso.devel.libsvm", zero_based=False, comment=None, query_id=None)
    dump_svmlight_file(x_test_orig[:, nz], lab_test_orig, filename+"_elasso.test.libsvm", zero_based=False, comment=None, query_id=None)
# Unlike the Lasso, ridge regression shrinks coefficients but leaves beta non-zero for most features
print("Ridge regression")

for alpha in range(1, 5):
    ridge = Ridge(alpha)
    ridge_scores = cross_val_score(ridge, x, y, cv=5)
    print("alpha={a}".format(a=alpha))
    print(ridge_scores.mean())
    print(ridge_scores)

# combination of ridge and Lasso
print("Elastic net regularization")

for alpha in range(1, 5):
    elastic_net = ElasticNet(alpha)
    elastic_net_scores = cross_val_score(elastic_net, x, y, cv=5)
    print("alpha={a}".format(a=alpha))
    print(elastic_net_scores.mean())
    print(elastic_net_scores)

# best performing regressor for this data set was Elastic net with alpha=1
# with score = 0.472705248975
# draw scatter plot for values predicted with this regressor

print("Showing scatter plot for elastic net with alpha = 1")

elastic_net = ElasticNet(1)
elastic_net.fit(x, y)
predicted_y = elastic_net.predict(x)
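The snippet breaks off before the plot itself; a minimal completion of the advertised scatter plot, assuming matplotlib.pyplot is imported as plt:

plt.scatter(y, predicted_y, s=10)
plt.xlabel("actual y")
plt.ylabel("predicted y")
plt.title("Elastic net (alpha = 1): predicted vs actual")
plt.show()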
Example #24
pc_file = "/home/pokoro/data/mesa_models/"+pop.lower()+"/"+pop.upper()+"_3_PCs.txt"
gene_annotation_file = "/home/pokoro/data/mesa_models/gencode.v18.annotation.parsed.txt"
snp_annotation_file = "/home/pokoro/data/mesa_models/"+pop.lower()+"/"+pop.upper()+"_"+chrom+"_annot.txt"


# parse the files

snpannot = get_filtered_snp_annot(snp_annotation_file)
geneannot = get_gene_annotation(gene_annotation_file, chrom)
cov = get_covariates(pc_file)
expr_df = get_gene_expression(gene_expression_file, geneannot)
genes = list(expr_df.columns)
gt_df = get_maf_filtered_genotype(snp_dosage_file)


en = ElasticNet(max_iter=10000, random_state=1234)


#where to write out result
open("/home/pokoro/data/mesa_models/en_R_Python_compare/"+pop+"_en_py_chr"+chrom+
     ".txt", "w").write("gene_id"+"\t"+"gene_name"+"\t"+"chr"+"\t"+"cvr2")

#Go through all protein coding genes

for gene in genes:
    coords = get_gene_coords(geneannot, gene)
    gene_name = get_gene_name(geneannot, gene)
    expr_vec = expr_df[gene]
    
    adj_exp = adjust_for_covariates(list(expr_vec), cov)
    cis_gt = get_cis_genotype(gt_df, snpannot, coords)
    reduce_cmd = "%s -v --reduce %s/config.json" % (exec_path, WD)
    os.system(map_cmd)
    os.system(reduce_cmd)

    ###########################################################################
    ## Do it without mapreduce
    res = list()
    for i, (tr, te) in enumerate(cv):
        # key = params[0]
        y_true = list()
        y_pred = list()
        for key in params:
            # tr, te = cv[0]
            Xtrain = X[tr, :]
            Xtest = X[te, :]
            ytrain = y[tr, :].ravel()
            ytest = y[te, :].ravel()
            mod = ElasticNet(alpha=key[0], l1_ratio=key[1])
            y_pred.append(mod.fit(Xtrain, ytrain).predict(Xtest))
            y_true.append(ytest)
        y_true = np.hstack(y_true)
        y_pred = np.hstack(y_pred)
        res.append([i, r2_score(y_true, y_pred)])
    true = pd.DataFrame(res, columns=["resample_key", "r2"])
    mr = pd.read_csv(os.path.join(WD, 'results.csv'))
    # Check same keys
    assert np.all(np.sort(true.resample_key) == np.sort(mr.resample_key))
    m = pd.merge(true, mr, on="resample_key", suffixes=["_true", "_mr"])
    # Check same scores
    assert np.allclose(m.r2_true, m.r2_mr)
Example #26
    #     cv_mse = numpy.append(cv_mse, [mse_10cv])
    #     print('{:.3f}\t {:.4f}\t\t {:.4f}'.format(a,mse_train,mse_10cv))

    # pl.plot(alpha, t_mse, label='MSE_train')
    # pl.plot(alpha, cv_mse, label='MSE_CrossVal')
    # pl.legend( ('MSE_train', 'MSE_CrossVal') )
    # pl.ylabel('MSE')
    # pl.xlabel('alpha')
    # pl.show()

    a = 0.5
    for name, met in [
        ('linear_regression', LinearRegression()),
        ('lasso', Lasso(fit_intercept=True, alpha=a, normalize=True)),
        ('ridge', Ridge(fit_intercept=True, alpha=a, normalize=True)),
        ('elastic_net', ElasticNet(fit_intercept=True, alpha=a,
                                   normalize=True))
    ]:
        met.fit(x, y)

        with open('../data/' + name + str(i) + '.txt', 'wb') as model_file:
            pickle.dump(met, model_file)

        # p = np.array([met.predict(xi) for xi in x])
        p = met.predict(x)

        e = bound(p) - y
        total_error = numpy.dot(e, e)
        mse_train = total_error / len(p)

        # kf = KFold(10)
        # err = 0
Example #27
def get_gridsearch(
    frequency,
    horizon=10,
    n_splits=5,
    between_split_lag=None,
    scoring="neg_mean_absolute_error",
    country_code_column=None,
    country_code=None,
    sklearn_models=False,
    sklearn_models_optimize_for_horizon=False,
    autosarimax_models=False,
    autoarima_dict=None,
    prophet_models=True,
    tbats_models=False,
    exp_smooth_models=False,
    average_ensembles=False,
    stacking_ensembles=False,
    stacking_ensembles_train_horizon=10,
    stacking_ensembles_train_n_splits=20,
    clip_predictions_lower=None,
    clip_predictions_upper=None,
    exog_cols=None,
):
    """Get grid search object based on selection criteria.

    Parameters
    ----------
    frequency : str
        Frequency of timeseries. Pandas compatible frequencies

    horizon : int
        How many units of frequency (e.g. 4 quarters) should be used to find the best models

    n_splits : int
        How many cross-validation folds should be used in model selection

    between_split_lag : int
        How large a lag of observations the cv_splits should have.
        If kept as None, horizon is used, resulting in non-overlapping cv_splits

    scoring : str, callable
        String of sklearn regression metric name, or hcrystalball compatible scorer. For creation
        of hcrystalball compatible scorer use `make_ts_scorer` function.

    country_code_column : str
        Column in data, that contains country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    country_code : str
        Country code in str (e.g. 'DE'). Used in holiday transformer.
        Only one of `country_code_column` or `country_code` can be set.

    sklearn_models : bool
        Whether to consider sklearn models

    sklearn_models_optimize_for_horizon : bool
        Whether to add to default sklearn behavior also models that optimize predictions for each horizon

    autosarimax_models : bool
        Whether to consider auto sarimax models

    autoarima_dict : dict
        Specification of pmdautoarima search space

    prophet_models : bool
        Whether to consider FB prophet models

    tbats_models : bool
        Whether to consider TBATS models

    exp_smooth_models : bool
        Whether to consider exponential smoothing models

    average_ensembles : bool
        Whether to consider average ensemble models

    stacking_ensembles : bool
        Whether to consider stacking ensemble models

    stacking_ensembles_train_horizon : int
        Which horizon should be used in meta model in stacking ensembles

    stacking_ensembles_train_n_splits : int
        Number of splits used in meta model in stacking ensembles

    clip_predictions_lower : float, int
        Minimal number allowed in the predictions

    clip_predictions_upper : float, int
        Maximal number allowed in the predictions

    exog_cols : list
        List of columns to be used as exogenous variables

    Returns
    -------
    sklearn.model_selection.GridSearchCV
        CV / Model selection configuration
    """
    exog_cols = exog_cols if exog_cols is not None else []
    # ensures only exogenous columns and country code column will be passed to model if provided
    # and columns names will be stored in TSColumnTransformer
    if exog_cols:
        cols = exog_cols + [country_code_column
                            ] if country_code_column else exog_cols
        exog_passthrough = TSColumnTransformer(transformers=[("raw_cols",
                                                              "passthrough",
                                                              cols)])
    else:
        exog_passthrough = "passthrough"
    # ensures holiday transformer is added to the pipeline if requested
    if country_code or country_code_column:
        from hcrystalball.feature_extraction import HolidayTransformer

        holiday = HolidayTransformer(country_code=country_code,
                                     country_code_column=country_code_column)
    else:
        holiday = "passthrough"

    estimator = Pipeline([("exog_passthrough", exog_passthrough),
                          ("holiday", holiday), ("model", "passthrough")])

    scoring = get_scorer(scoring)
    cv = FinerTimeSplit(n_splits=n_splits,
                        horizon=horizon,
                        between_split_lag=between_split_lag)

    grid_search = GridSearchCV(
        estimator=estimator,
        param_grid=[],
        scoring=scoring,
        cv=cv,
        refit=False,
        error_score=np.nan,
    )

    if autosarimax_models:
        # adding autosarimax to param_grid might cause differently found models
        # for different splits and raise inconsistency based errors.
        # sarimax pipeline is added to new grid_search's attribute (`grid_search.autosarimax`)
        # and handled in `hcrystalball.model_selection.select_model` function in following way
        # 1. get best model for the data part on last split
        # 2. append this best model to original `param_grid`
        # 3. run full grid search with `param_grid` containing
        #    sarimax model selected from autosarimax in point 1
        from hcrystalball.wrappers import SarimaxWrapper

        if autoarima_dict is None:
            autoarima_dict = {}
        if "error_action" not in autoarima_dict:
            autoarima_dict.update({"error_action": "raise"})

        grid_search.autosarimax = Pipeline(estimator.steps[:-1])
        grid_search.autosarimax.steps.append((
            "model",
            SarimaxWrapper(
                init_with_autoarima=True,
                autoarima_dict=autoarima_dict,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            ),
        ))

    if stacking_ensembles or average_ensembles or sklearn_models:
        from sklearn.linear_model import ElasticNet
        from sklearn.ensemble import RandomForestRegressor
        # TODO when scoring time is fixed, add HistGradientBoostingRegressor
        # from sklearn.experimental import enable_hist_gradient_boosting
        # from sklearn.ensemble import HistGradientBoostingRegressor
        from hcrystalball.wrappers import get_sklearn_wrapper
        from hcrystalball.feature_extraction import SeasonalityTransformer

        sklearn_model = get_sklearn_wrapper(
            RandomForestRegressor,
            clip_predictions_lower=clip_predictions_lower,
            clip_predictions_upper=clip_predictions_upper,
        )

        sklearn_model_pipeline = Pipeline([
            ("seasonality", SeasonalityTransformer(auto=True, freq=frequency)),
            ("model", sklearn_model)
        ])
        # TODO make sure naming here works as expected
        sklearn_model_pipeline.name = f"seasonality_{sklearn_model.name}"

    if sklearn_models:
        classes = [ElasticNet, RandomForestRegressor]
        models = {
            model_class.__name__: get_sklearn_wrapper(
                model_class,
                clip_predictions_lower=clip_predictions_lower,
                clip_predictions_upper=clip_predictions_upper,
            )
            for model_class in classes
        }

        optimize_for_horizon = [
            False, True
        ] if sklearn_models_optimize_for_horizon else [False]

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model":
            list(models.values()),
            # TODO change add once HistGradientBoostingRegressor is back
            # "model__model": list(models.values()) + [sklearn_model]
            "model__model__optimize_for_horizon":
            optimize_for_horizon,
            "model__model__lags": [3, 7, 10, 14],
        })

        grid_search.param_grid.append({
            "model": [sklearn_model_pipeline],
            "model__seasonality__weekly": [True, False],
            "model__model__optimize_for_horizon":
            optimize_for_horizon,
            "model__model": [sklearn_model],
            "model__model__max_depth": [6],
        })

    if prophet_models:
        from hcrystalball.wrappers import ProphetWrapper

        extra_regressors = [None] if exog_cols is None else [None, exog_cols]

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__seasonality_mode": ["multiplicative", "additive"],
            "model__extra_regressors":
            extra_regressors,
        })

        grid_search.param_grid.append({
            "model": [
                ProphetWrapper(
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__extra_seasonalities": [[{
                "name": "quarterly",
                "period": 90.0625,
                "fourier_order": 5,
                "prior_scale": 15.0,
                "mode": None,
            }]],
            "model__extra_regressors":
            extra_regressors,
        })

    if exp_smooth_models:
        from hcrystalball.wrappers import ExponentialSmoothingWrapper
        from hcrystalball.wrappers import HoltSmoothingWrapper
        from hcrystalball.wrappers import SimpleSmoothingWrapper

        # commented options show non deterministic behavior
        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": [None, "add"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": True,
                    "use_basinhopping": False
                },
                # {'use_boxcox':True, 'use_basinhopping':True},
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": ["add"],
            "model__seasonal": ["mul"],
            "model__damped": [True, False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                ExponentialSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__trend": [None],
            "model__seasonal": [None, "add", "mul"],
            "model__damped": [False],
            "model__fit_params": [
                {
                    "use_boxcox": False,
                    "use_basinhopping": False
                },
                # {'use_boxcox':False, 'use_basinhopping':True}
            ],
        })

        grid_search.param_grid.append({
            "model": [
                SimpleSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
                HoltSmoothingWrapper(
                    freq=frequency,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                ),
            ]
        })

    if tbats_models:
        from hcrystalball.wrappers import TBATSWrapper

        grid_search.param_grid.append({
            "model": [
                TBATSWrapper(
                    use_arma_errors=False,
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ]
        })

    if stacking_ensembles:
        from hcrystalball.ensemble import StackingEnsemble
        from hcrystalball.wrappers import ProphetWrapper
        from sklearn.ensemble import RandomForestRegressor

        grid_search.param_grid.append({
            "model": [
                StackingEnsemble(
                    train_n_splits=stacking_ensembles_train_n_splits,
                    train_horizon=stacking_ensembles_train_horizon,
                    meta_model=ElasticNet(),
                    horizons_as_features=True,
                    weekdays_as_features=True,
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__meta_model": [ElasticNet(),
                                  RandomForestRegressor()],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                ],
            ],
        })
    if average_ensembles:
        from hcrystalball.ensemble import SimpleEnsemble
        from hcrystalball.wrappers import ProphetWrapper

        grid_search.param_grid.append({
            "model": [
                SimpleEnsemble(
                    base_learners=[],
                    clip_predictions_lower=clip_predictions_lower,
                    clip_predictions_upper=clip_predictions_upper,
                )
            ],
            "model__base_learners": [
                [
                    ProphetWrapper(
                        clip_predictions_lower=clip_predictions_lower,
                        clip_predictions_upper=clip_predictions_upper,
                    ),
                    sklearn_model_pipeline,
                ],
            ],
        })

    return grid_search
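A minimal usage sketch (hypothetical; argument values invented) showing how the returned object is typically assembled and inspected before fitting:

if __name__ == "__main__":
    grid_search = get_gridsearch(
        frequency="D",
        horizon=10,
        n_splits=3,
        sklearn_models=True,
        prophet_models=False,
    )
    print(len(grid_search.param_grid), "parameter grids configured")
    # grid_search.fit(X, y) would then run hcrystalball's model selection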
Example #28
np.sqrt(np.mean((pred_train_ridge - y_train)**2))  # 7582
np.sqrt(np.mean((pred_test_ridge - y_test)**2))  # 13435

# Important Coefficient Plot
important_coff = pd.Series(RidgeM1.coef_, index=X.columns)
important_coff.plot(kind='barh', color='g')

##################################### - Elastic Net Regression - ##########################################

### Run an Elastic Net regressor over a range of alpha values and observe how
### R-squared, train_rmse and test_rmse change with alpha
train_rmse = []
test_rmse = []
R_sqrd = []
alphas = np.arange(0, 1, 0.01)
for i in alphas:
    EN = ElasticNet(alpha=i, normalize=True, max_iter=500)  # alpha=0 reduces to OLS and triggers a sklearn warning
    EN.fit(X_train, y_train)
    R_sqrd.append(EN.score(X_train, y_train))
    train_rmse.append(np.sqrt(np.mean((EN.predict(X_train) - y_train)**2)))
    test_rmse.append(np.sqrt(np.mean((EN.predict(X_test) - y_test)**2)))

# Plot alpha vs R-squared, then alpha vs train and test RMSE.
plt.scatter(x=alphas, y=R_sqrd)
plt.xlabel("alpha")
plt.ylabel("R_Squared")
plt.scatter(x=alphas, y=train_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
plt.scatter(x=alphas, y=test_rmse)
plt.xlabel("alpha")
plt.ylabel("RMSE")
Example #29
def get_linear_model():
    elastic_net = ElasticNet()
    return [elastic_net], ['Elastic Net']
Example #30
def run_stack(SEED, col):

    dset = "4"  

    trainBaseTarget = pd.read_csv('../preprocess/pre_shuffled_target_' + col + '.csv')
    trainBase = pd.read_csv('../models/Lasso' + dset + '_train_' + col + '.csv')
    #trainBase = pd.read_csv('../preprocess/pre_shuffled_train' + dset + '.csv')
    trainBase.drop(['PIDN'], axis=1, inplace=True)


    #test = pd.read_csv('../data/pre_shuffled_test.csv')

    columns = trainBase.columns    
    columnsHighScore = trainBase.columns 


    print(trainBase.columns)
    
    trainBase = np.nan_to_num(np.array(trainBase))
    targetBase = np.nan_to_num(np.array(trainBaseTarget))
    #test = np.nan_to_num(np.array(test))
    
    gc.collect()

    avg = 1.0
    avgLast = avg
    bestAvg = avg
    bestAlpha = 0
    NumFolds = 5

    print("Data size: " + str(len(trainBase)))
    print("Begin Training")

    lenTrainBase = len(trainBase)
   

    gc.collect()
    
    # best alpha is 0.00040
    for a in np.logspace(-8, -.5, 50): # best values seem to be slightly greater than 0.
        
        
        
        clf = ElasticNet(alpha=a)
        #print(clf)
        avg = 0
    
        coef_dataset = np.zeros((len(columns),NumFolds))
   
        foldCount = 0

        Folds = cross_validation.KFold(lenTrainBase, n_folds=NumFolds, indices=True)
            
        for train_index, test_index in Folds:
    
            #print()
            #print ("Iteration: " + str(foldCount))
            
            
            #now = datetime.datetime.now()
            #print(now.strftime("%Y/%m/%d %H:%M:%S"))    
    
    
            target = [targetBase[i] for i in train_index]
            train = [trainBase[i] for i in train_index]

            
            targetTest = [targetBase[i] for i in test_index]    
            trainTest = [trainBase[i] for i in test_index]    

            

            #print "LEN: ", len(train), len(target)
            
            
            target = np.array(np.reshape(target, (-1, 1)) )           
            #train = np.array(np.reshape(train, (-1, 1))  ) 
            
    
            targetTest = np.array(np.reshape(targetTest, (-1, 1)) )  
            #trainTest = np.array(np.reshape(trainTest, (-1, 1)) )  
             
            

            #clf.fit(train, target, sample_weight = weight
            clf.fit(train, target)
            predicted = clf.predict(trainTest) 
 
 
            #print(target.shape) 
            #print(predicted.shape)
  
            #print(str(math.sqrt(mean_squared_error(targetTest, predicted))))
            avg += math.sqrt(mean_squared_error(targetTest, predicted))/NumFolds

                 
            coef_dataset[:, foldCount] = clf.coef_                 

            foldCount = foldCount + 1
        
            #break
     
        
        coefs = coef_dataset.mean(1)
        #print(coefs)
        sorted_coefs = sorted(coefs)
        #print("len coefs: " + str(len(sorted_coefs)))

        coefsAboveZero = [i for i in coefs if i > 0.0]
        #print(str(len(coefsAboveZero)))

        print("------------------------Average: " + str(avg))
  
        if avg < bestAvg:
            bestAvg = avg
            bestAlpha = a
  
  
    print("bestAvg: " + str(bestAvg))
    print("bestAlpha: " + str(bestAlpha))
Example #31
grid_search = GridSearchCV(estimator=lasso,
                           param_grid=parameters,
                           cv=5,
                           scoring='neg_mean_squared_error',
                           n_jobs=-1)

grid_search = grid_search.fit(X_poly[:, 1:], y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

#Using elastic net regression in combination with polynomial regression at degree 2
from sklearn.linear_model import ElasticNet
elastReg = ElasticNet(normalize=True,
                      warm_start=True,
                      random_state=True,  # bool works (True == 1), but an explicit int seed is clearer
                      precompute=False,
                      selection='cyclic')

parameters = [{
    'alpha': [1, 0.99, 0.98],
    'tol': [1e+2, 1e-6, 1e-7, 1e-4, 1e-3, 1e-2, 1e-1, 1e-0],
    'max_iter': [3000, 6000, 10000, 15000],
    'l1_ratio': [0.99, 0.98, 0.95, 1]
}]
grid_search = GridSearchCV(estimator=elastReg,
                           param_grid=parameters,
                           cv=5,
                           n_jobs=-1,
                           scoring='neg_mean_squared_error')
grid_search = grid_search.fit(X_poly[:, 1:], y_train)
Example #32
def rmsle_cv(model):
    # note: get_n_splits() returns an int, so cross_val_score below uses plain
    # (unshuffled) K-fold despite the shuffle/random_state arguments
    kf = KFold(
        n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse = np.sqrt(-cross_val_score(
        model, train.values, y_train, scoring="neg_mean_squared_error", cv=kf))
    print("rmse", rmse)
    return (rmse)


# Models
# LASSO Regression :
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
# Elastic Net Regression
ENet = make_pipeline(
    RobustScaler(), ElasticNet(
        alpha=0.0005, l1_ratio=.9, random_state=3))
# Kernel Ridge Regression
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
# Gradient Boosting Regression
GBoost = GradientBoostingRegressor(
    n_estimators=3000,
    learning_rate=0.05,
    max_depth=4,
    max_features='sqrt',
    min_samples_leaf=15,
    min_samples_split=10,
    loss='huber',
    random_state=5)
#  XGboost
model_xgb = xgb.XGBRegressor(
    colsample_bytree=0.4603,
Example #33
                          cv=nr_cv,
                          verbose=1,
                          scoring=score_calc)
grid_lasso.fit(X, y)

sc_lasso = get_best_score(grid_lasso)

pred_lasso = grid_lasso.predict(X_test)

# ### Elastic Net

from sklearn.linear_model import ElasticNet

enet = ElasticNet()
parameters = {
    'alpha': [0.1, 1.0, 10],
    'max_iter': [1000000],
    'l1_ratio': [0.04, 0.05],
    'fit_intercept': [False, True],
    'normalize': [True, False],
    'tol': [1e-02, 1e-03, 1e-04]
}
grid_enet = GridSearchCV(enet,
                         parameters,
                         cv=nr_cv,
                         verbose=1,
                         scoring=score_calc)
grid_enet.fit(X_sc, y_sc)
def _select_estimator(estimator, n_jobs, n_estimators, random_state=None):
    '''Select estimator and parameters from argument name.'''
    # Regressors
    if estimator == 'RandomForestRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = RandomForestRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'ExtraTreesRegressor':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap']}
        estimator = ExtraTreesRegressor(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'GradientBoostingRegressor':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingRegressor(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'SVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='rbf', gamma='scale')
    elif estimator == 'LinearSVR':
        param_dist = {**parameters['svm'], 'epsilon': [0.0, 0.1]}
        estimator = SVR(kernel='linear')  # note: SVR with a linear kernel, not sklearn's LinearSVR
    elif estimator == 'Ridge':
        param_dist = parameters['linear']
        estimator = Ridge(solver='auto', random_state=random_state)
    elif estimator == 'Lasso':
        param_dist = parameters['linear']
        estimator = Lasso(random_state=random_state)
    elif estimator == 'ElasticNet':
        param_dist = parameters['linear']
        estimator = ElasticNet(random_state=random_state)
    elif estimator == 'KNeighborsRegressor':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsRegressor(algorithm='auto')

    # Classifiers
    elif estimator == 'RandomForestClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = RandomForestClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'ExtraTreesClassifier':
        param_dist = {**parameters['ensemble'], **parameters['bootstrap'],
                      **parameters['criterion']}
        estimator = ExtraTreesClassifier(
            n_jobs=n_jobs, n_estimators=n_estimators,
            random_state=random_state)
    elif estimator == 'GradientBoostingClassifier':
        param_dist = parameters['ensemble']
        estimator = GradientBoostingClassifier(
            n_estimators=n_estimators, random_state=random_state)
    elif estimator == 'LinearSVC':
        param_dist = parameters['linear_svm']
        estimator = LinearSVC(random_state=random_state)
    elif estimator == 'SVC':
        param_dist = parameters['svm']
        estimator = SVC(kernel='rbf', random_state=random_state, gamma='scale')
    elif estimator == 'KNeighborsClassifier':
        param_dist = parameters['kneighbors']
        estimator = KNeighborsClassifier(algorithm='auto')

    return param_dist, estimator
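A sketch of the intended call pattern (hypothetical, and assuming the module-level `parameters` dict of search spaces is defined): the returned pair feeds a hyperparameter search.

from sklearn.model_selection import RandomizedSearchCV

param_dist, estimator = _select_estimator(
    'ElasticNet', n_jobs=1, n_estimators=100, random_state=0)
search = RandomizedSearchCV(estimator, param_dist, n_iter=10, cv=5)
# search.fit(X, y)  # X, y supplied by the caller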
Example #35
LSTM_params = {'learning_rate':[1e-4, 1e-5, 1e-4, 1e-6], 'depth': [2, 2, 1, 2], 'hidden_number': [256]*4}
RNN_params = {'learning_rate':[0.1, 0.1, 0.1, 0.001], 'depth': [1, 1, 2, 1], 'hidden_number': [256]*4}


#********************** 2. Run all models on the full sample with 3/12/24/36-month rolling windows **********************#
path = r'..\DataBase\factor'  # directory containing the 96 factors
factorname = [x[1:-4] for x in os.listdir(path)]
riskfree, timeseries, factor, timeseries2, index = datatransfrom(path)[0], datatransfrom(path)[1], datatransfrom(path)[2], datatransfrom2(path)[0], datatransfrom2(path)[1]
for i in range(4):
    i = 0  # note: this overrides the loop variable, so only the first window is ever run
    output(window[i],LinearRegression(),'OLS'+str(window[i]),riskfree[i], timeseries)
    FC(window[i], riskfree[i], timeseries, 96,'FC')
    output(window[i], PLSRegression(PLS_params[i]), 'PLS' + str(window[i]), riskfree[i], timeseries)
    output(window[i],Lasso(alpha=lasso_params[i]),'Lasso'+ str(window[i]), riskfree[i], timeseries)
    output(window[i],Ridge(alpha=ridge_params[i]),'Ridge'+str(window[i]),riskfree[i], timeseries)
    output(window[i],ElasticNet(alpha= elasticnet_params['alpha'] [i],l1_ratio= elasticnet_params['l1_ratio'][i]),'ElasticNet'+str(window[i]),riskfree[i], timeseries)
    output(window[i],SVR(kernel=SVR_params['kernel'][i],gamma= SVR_params ['gamma'][i],C= SVR_params ['C'][i] ),'SVR'+str(window[i]),riskfree[i], timeseries)
    output(window[i], GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]), 'GBDT' + str(window[i]),riskfree[i], timeseries)
    output(window[i], XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]), 'XGBOOST' + str(window[i]), riskfree[i], timeseries)
    output(window[i], ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]), 'ENANN' + str(window[i]), riskfree[i], timeseries)
    output(window[i], DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]), 'DFN' + str(window[i]), riskfree[i], timeseries)
    output2(window[i], rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]), 'LSTM'+ str(window[i]) ,riskfree[i], timeseries2)
    output2(window[i], rm.lstmmodule(96,  RNN_params['hidden_number'][i], RNN_params['depth'][i], 100, 3571, lr=RNN_params['learning_rate'][i], ntype='RNN'), 'RNN'+ str(window[i]), riskfree[i], timeseries2)
    modellist = [DFN.DFN(outputdim=1, neuralset=[96, 50, 25, 10, 5, 2], ctx=gpu(0), epoch=10, batch_size=DFN_params['batch'][i], lr=DFN_params['learning_rate'][i]),
                 ensemblenn(5,modeluse = MLPRegressor(solver = 'lbfgs', max_iter=ENANN_params['max_iter'][i]), pickpercent=ENANN_params['p'][i]),
                 XGBRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i], learning_rate=GBDT_params['learning_rate'][i]),
                 GradientBoostingRegressor(n_estimators=GBDT_params['n_estimators'][i],max_depth=GBDT_params['maxdepth'][i],learning_rate=GBDT_params['learning_rate'][i]),
                 PLSRegression(PLS_params[i]),
                 Ridge(alpha=ridge_params[i]),
                 SVR(kernel=SVR_params['kernel'][i],gamma= SVR_params ['gamma'][i],C= SVR_params ['C'][i])]  # PLS must be kept third from last
    nmolist = [rm.lstmmodule(96, LSTM_params['hidden_number'][i], LSTM_params['depth'][i], 100, 3571, lr=LSTM_params['learning_rate'][i]),
Example #36
X_net_holdout[:,n_pow]=temp[1]
X_net_test[:,n_pow]=temp[2]
####################################################################################
####################################################################################
####################################################################################
#Elastic net blender
####################################################################################
####################################################################################
####################################################################################
from sklearn.linear_model import ElasticNet
# objective function: 1 / (2 * n_samples) * ||y - Xw||^2_2 +
# + alpha * l1_ratio * ||w||_1
# + 0.5 * alpha * (1 - l1_ratio) * ||w||^2_2

enet=ElasticNet(alpha=1.0, l1_ratio=0.5, fit_intercept=False, normalize=False, 
    precompute=False, max_iter=1000, copy_X=True, tol=0.0001, warm_start=False, 
    positive=False)
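# A hypothetical helper (not in the original script) that evaluates the
# objective quoted above for a coefficient vector w, handy for sanity checks:
def enet_objective(w, X, y, alpha=1.0, l1_ratio=0.5):
    import numpy as np
    n_samples = X.shape[0]
    resid = y - X.dot(w)
    return (resid.dot(resid) / (2.0 * n_samples)
            + alpha * l1_ratio * np.abs(w).sum()
            + 0.5 * alpha * (1.0 - l1_ratio) * w.dot(w))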

enet_mod=enet.fit(X_net_valid,valid_Y)
pred_holdout=enet_mod.predict(X_net_holdout)

holdout_gini=Gini(holdout_Y,pred_holdout)

valid_rmse=np.sqrt(sum( (pred_holdout[m] - holdout_Y[m])**2 for m in range(len(holdout_Y))) / float(len(holdout_Y)))
print(valid_rmse, holdout_gini)

pred_test=enet_mod.predict(X_net_test)

df=pd.DataFrame(pred_test)
df.columns=['Hazard']
indices=np.loadtxt("X_test_indices.gz",delimiter=",").astype('int32')
Example #37
    pred_train_l1 = l1Regr.predict(X_train)
    pred_test_l1 = l1Regr.predict(X_test)

    # GBR
    myGBR = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.02,
                                      max_depth=4, max_features='sqrt',
                                      min_samples_leaf=15, min_samples_split=50,
                                      loss='huber', random_state=5)

    myGBR.fit(X_train, y_train)
    pred_train_GBR = myGBR.predict(X_train)

    pred_test_GBR = myGBR.predict(X_test)

    # ENet
    ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=4.0, l1_ratio=0.005, random_state=3))
    ENet.fit(X_train, y_train)
    pred_train_ENet = ENet.predict(X_train)
    pred_test_ENet = ENet.predict(X_test)

    # LGB
    myLGB = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                              learning_rate=0.05, n_estimators=600,
                              max_bin=50, bagging_fraction=0.6,
                              bagging_freq=5, feature_fraction=0.25,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf=6, min_sum_hessian_in_leaf=11)
    myLGB.fit(X_train, y_train)
    pred_train_LGB = myLGB.predict(X_train)
    pred_test_LGB = myLGB.predict(X_test)
Example #38
 def BuildModel(self, data, labels):
   # Create and train the regressor. SElasticNet is assumed to alias sklearn's
   # ElasticNet in the original imports; note the rho -> alpha, alpha -> l1_ratio mapping.
   elasticNet = SElasticNet(alpha=self.rho,
                            l1_ratio=self.alpha)
   elasticNet.fit(data, labels)
   return elasticNet
import pandas
from sklearn import cross_validation
from sklearn.linear_model import ElasticNet
url = "https://goo.gl/sXleFv"
names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
    'PTRATIO', 'B', 'LSTAT', 'MEDV'
]
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:, 0:13]
Y = array[:, 13]
num_folds = 10
num_instances = len(X)
seed = 7
kfold = cross_validation.KFold(n=num_instances,
                               n_folds=num_folds,
                               random_state=seed)
model = ElasticNet()
scoring = 'mean_squared_error'
results = cross_validation.cross_val_score(model,
                                           X,
                                           Y,
                                           cv=kfold,
                                           scoring=scoring)
print(results.mean())
Example #40
from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as mse

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import Lasso

from sklearn.linear_model import ElasticNet

pipe = Pipeline([('sc', StandardScaler()),
                 ('poly', PolynomialFeatures(degree=2, include_bias=True)),
                 ('en', ElasticNet())])
model = GridSearchCV(pipe,
                     param_grid={
                         'en__alpha': [0.005, 0.01, 0.05, 0.1],
                         'en__l1_ratio': [0.1, 0.4, 0.8]
                     })
model.fit(train_[columns], train_['log_price'])
best_params = model.best_params_  # the original called this `degree`, but it holds the grid-searched alpha and l1_ratio
print(best_params)
pred = np.exp(model.predict(test_))
Accuracy = sqrt(mse(pred, test['price']))
print('\nRMSE for elastic net regression : ', Accuracy)

RMSE.append(Accuracy)
Models.append('ElasticNet Regression')
options = ["0.1", "0.2", "0.3", "0.4", "0.5", "5.0", "10.0", "15.0"]
signals = [options[0], options[1], options[3], options[5], options[7]]  #5 MAX
x_vals = []
x_real_vals = []
""" ELASTIC NET LEARNED PARAMETERS
    Learned from dl=10.0nm """
amax = 0.0464158883361
l1max = 2.53536449397

for i in range(len(signals)):
    yfile = path + '/Narrowband_2laser_data/2laser_dlambda=' + signals[
        i] + 'nm_v1.txt'
    yf = pd.read_csv(yfile, sep='\t', usecols=[0, 1])
    yval, OPL = yf.values[:, 1], yf.values[:, 0]

    # note: scikit-learn constrains l1_ratio to [0, 1]; the learned l1max above
    # would be rejected by recent releases
    enet = ElasticNet(alpha=amax, l1_ratio=l1max, positive=True)
    y_pred_enet = enet.fit(A1, yval).predict(A1)
    x_vals.append(enet.coef_)

    x_real_validate = np.zeros(len(A1[0]))
    x_real_validate[get_index(1560 + float(signals[i]) / 2.0,
                              wavelengths)] = 0.8
    x_real_validate[get_index(1560 - float(signals[i]) / 2.0,
                              wavelengths)] = 1.0
    x_real_vals.append(x_real_validate)
""" ---------------------------------------------------------------------------
PLOT THE RESULTS FROM x_vals & x_real_vals BELOW
--------------------------------------------------------------------------- """
font = {'size': 16}
matplotlib.rc('font', **font)
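# The hard-coded amax / l1max above are described as learned parameters. One
# plausible way to obtain such values (an assumption, not the original
# author's script) is a cross-validated search with ElasticNetCV; a minimal
# self-contained sketch with synthetic stand-ins for A1 and yval:
import numpy as np
from sklearn.linear_model import ElasticNetCV

rng = np.random.default_rng(0)
A1_demo = rng.random((200, 50))           # stand-in dictionary matrix
x_true = np.zeros(50)
x_true[[10, 40]] = [1.0, 0.8]             # two active wavelengths
y_demo = A1_demo @ x_true + 0.01 * rng.standard_normal(200)

# ElasticNetCV searches alphas for each candidate l1_ratio (valid range [0, 1])
search = ElasticNetCV(l1_ratio=[0.1, 0.5, 0.7, 0.9, 0.95, 1.0],
                      n_alphas=50, positive=True, cv=5)
search.fit(A1_demo, y_demo)
print(search.alpha_, search.l1_ratio_)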
Beispiel #42
0
def Regression(train_data, train_solution, test_data, test_solution, method):
    ## Fix Data Structure ##
    train_data = train_data.values
    train_solution = train_solution.values
    test_data = test_data.values
    test_solution = test_solution.values

    ## List of Method Options with Initialization ##
    if method == 'lin_reg':  # linear regression
        from sklearn.linear_model import LinearRegression
        reg = LinearRegression()
    elif method == 'ply_reg':  # polynomial regression
        from sklearn.linear_model import LinearRegression
        from sklearn.preprocessing import PolynomialFeatures
        reg = LinearRegression()
        poly_features = PolynomialFeatures(degree=2)
    elif method == 'rdg_reg':  # ridge regression
        from sklearn.linear_model import Ridge
        reg = Ridge()
    elif method == 'lso_reg':  # lasso regression
        from sklearn.linear_model import Lasso
        reg = Lasso(alpha=0.00001)
    elif method == 'ela_net':  # elastic net regression
        from sklearn.linear_model import ElasticNet
        reg = ElasticNet()
    elif method == 'svr_lin':  # SVM regression
        from sklearn.svm import LinearSVR
        reg = LinearSVR(epsilon=0.01, max_iter=10000)
    elif method == 'svr_2nd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=2, epsilon=0.01)  #C=100
    elif method == 'svr_3rd':  # SVR regression
        from sklearn.svm import SVR
        reg = SVR(kernel='poly', degree=3, epsilon=0.01)  #C=100
    elif method == 'dcn_tre':  # decision tree
        from sklearn.tree import DecisionTreeRegressor
        reg = DecisionTreeRegressor()
    elif method == 'rdm_for':  # random forests
        from sklearn.ensemble import RandomForestRegressor
        reg = RandomForestRegressor(n_estimators=100, random_state=3)
    elif method == 'ada_bst':  # AdaBoost Regressor
        from sklearn.ensemble import AdaBoostRegressor
        reg = AdaBoostRegressor(n_estimators=100, random_state=3)
    elif method == 'grd_bst':  # Gradient Boosting Regressor
        from sklearn.ensemble import GradientBoostingRegressor
        reg = GradientBoostingRegressor(random_state=3)
    elif method == 'gss_prc':  # Gaussian Process Regressor
        from sklearn.gaussian_process import GaussianProcessRegressor
        reg = GaussianProcessRegressor(random_state=3)
    elif method == 'knl_rdg':  # Kernel Ridge Regression
        from sklearn.kernel_ridge import KernelRidge
        reg = KernelRidge()
    elif method == 'nst_nbr_uni':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='uniform')
    elif method == 'nst_nbr_dst':  # K Nearest Neighbors Regressor
        from sklearn.neighbors import KNeighborsRegressor
        reg = KNeighborsRegressor(weights='distance')
    elif method == 'rad_nbr_uni':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='uniform')
    elif method == 'rad_nbr_dst':  # Radius Neighbor Regressor
        from sklearn.neighbors import RadiusNeighborsRegressor
        reg = RadiusNeighborsRegressor(weights='distance')
    elif method == 'mlp_reg':
        from sklearn.neural_network import MLPRegressor
        reg = MLPRegressor(random_state=3)
    else:
        print(
            'Error: Regression method not recognized.\nPlease pick a valid method key (example: xxx_xxx).'
        )
        return None

    ## Preprocessing and Setup ##
    from sklearn.preprocessing import StandardScaler
    import pandas as pd
    scaler = StandardScaler()
    data = scaler.fit_transform(train_data)
    # Reuse the scaler fitted on the training data; fitting a second scaler
    # on the test data would leak test-set statistics into preprocessing.
    test_data = scaler.transform(test_data)
    solution = train_solution.reshape(-1, )
    if method == 'ply_reg':
        data = poly_features.fit_transform(data)
    reg.fit(data, solution)

    if len(test_data) < 5:
        predictions = reg.predict(data)

    else:
        # The original 'elif len(test_data) > 5' left the len == 5 case with
        # no predictions at all; treat any non-trivial test set the same way.
        if method == 'ply_reg':
            test_data = poly_features.transform(test_data)
        test_solution = test_solution.reshape(-1, )
        predictions_test = reg.predict(test_data)
        solution = test_solution
        predictions = predictions_test

    Matrix_to_save = pd.DataFrame()
    Matrix_to_save['Solution'] = solution
    Matrix_to_save['Predictions'] = predictions

    return Matrix_to_save
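# Example usage of the dispatcher above (hypothetical DataFrames; any key
# from the if/elif chain works, e.g. 'ela_net' for elastic net):
# result = Regression(train_data, train_solution, test_data, test_solution,
#                     method='ela_net')
# print(result.head())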
importances = pd.DataFrame()
oof_reg_preds = np.zeros(df_train.shape[0])
sub_reg_preds = np.zeros(df_test.shape[0])

df_test_fullvisitorid_str = df_test["fullVisitorId"].copy()
df_test["fullVisitorId"] = df_test["fullVisitorId"].astype(float)

for fold_, (trn_, val_) in enumerate(folds):
    trn_x, trn_y = df_train.iloc[trn_], y_reg.iloc[trn_]
    val_x, val_y = df_train.iloc[val_], y_reg.iloc[val_]

    trn_x["fullVisitorId"] = trn_x["fullVisitorId"].astype(float)
    val_x["fullVisitorId"] = val_x["fullVisitorId"].astype(float)

    reg = ElasticNet(random_state=0)

    reg.fit(
        trn_x,
        np.log1p(trn_y),
    )

    oof_reg_preds[val_] = reg.predict(val_x)
    oof_reg_preds[oof_reg_preds < 0] = 0

    _preds = reg.predict(df_test)
    _preds[_preds < 0] = 0
    sub_reg_preds += np.expm1(_preds) / len(folds)

print('OOF RMSE:', mean_squared_error(np.log1p(y_reg), oof_reg_preds) ** .5)
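# Why log1p/expm1: the revenue target is heavy-tailed, so the model is fit on
# log(1 + y) and predictions are mapped back with expm1, its exact inverse,
# before averaging across folds. A quick sanity check of the round trip:
check = np.array([0.0, 10.0, 1e6])
assert np.allclose(np.expm1(np.log1p(check)), check)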
Beispiel #44
0
    plt.show()

pic = []

test = [0.1, 0.3, 0.5, 0.7, 0.9]

fig, axarr = plt.subplots(5, 5, figsize=(15, 15), sharex=True, sharey=True)

a = 0

b = 0

for i in test_array:
    for j in test:
        enet = ElasticNet(alpha=i, l1_ratio=j)  # was misleadingly named 'ridge'

        # Fit the regressor to the training data
        enet.fit(X_train, y_train)

        # Predict on the test data: y_pred
        y_pred = enet.predict(X_test)

        print("Here is the ElasticNet regression with alpha = ", i,
              " and L1 ratio = ", j, " stat data:")

        print("coef: ", enet.coef_)
        print("intercept: ", enet.intercept_)

        # Compute and print R^2 and RMSE (assumes numpy and
        # sklearn.metrics.mean_squared_error are imported in this script)
        print("R^2: {}".format(round(enet.score(X_test, y_test), 4)))
        print("RMSE: {}".format(
            round(np.sqrt(mean_squared_error(y_test, y_pred)), 4)))
Beispiel #45
0
    data = pd.read_csv(wine_path)

    # Split the data into training and test sets. (0.75, 0.25) split.
    train, test = train_test_split(data)

    # The predicted column is "quality" which is a scalar from [3, 9]
    train_x = train.drop(["quality"], axis=1)
    test_x = test.drop(["quality"], axis=1)
    train_y = train[["quality"]]
    test_y = test[["quality"]]

    alpha = float(sys.argv[1]) if len(sys.argv) > 1 else 0.5
    l1_ratio = float(sys.argv[2]) if len(sys.argv) > 2 else 0.5

    with mlflow.start_run():
        lr = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42)
        lr.fit(train_x, train_y)

        predicted_qualities = lr.predict(test_x)

        (rmse, mae, r2) = eval_metrics(test_y, predicted_qualities)

        print("Elasticnet model (alpha=%f, l1_ratio=%f):" % (alpha, l1_ratio))
        print("  RMSE: %s" % rmse)
        print("  MAE: %s" % mae)
        print("  R2: %s" % r2)

        mlflow.log_param("alpha", alpha)
        mlflow.log_param("l1_ratio", l1_ratio)
        mlflow.log_metric("rmse", rmse)
        mlflow.log_metric("r2", r2)
Beispiel #46
0
def mult_reg(p_x, p_y):
    """
    Fit several linear models to the same data.

    Parameters
    ----------

    p_x: pd.DataFrame with regressors or predictor variables

    p_y: pd.DataFrame with variable to predict

    Returns
    -------
    r_models: dict with the fitted models

    """
    xtrain, xtest, ytrain, ytest = train_test_split(p_x,
                                                    p_y,
                                                    test_size=.8,
                                                    random_state=455)

    # NOTE: the 'normalize' constructor argument was removed from these
    # estimators in scikit-learn 1.2; scale the inputs explicitly (e.g. with
    # StandardScaler) if normalization is needed.

    # fit linear regression
    linreg = LinearRegression(fit_intercept=False)
    linreg.fit(xtrain, ytrain)
    y_p_linear = linreg.predict(xtest)

    # Fit RIDGE regression
    ridgereg = Ridge()
    model = ridgereg.fit(xtrain, ytrain)
    y_p_ridge = model.predict(xtest)

    # Fit LASSO regression
    lassoreg = Lasso()
    lassoreg.fit(xtrain, ytrain)
    y_p_lasso = lassoreg.predict(xtest)

    # Fit ElasticNet regression
    enetreg = ElasticNet()
    enetreg.fit(xtrain, ytrain)
    y_p_enet = enetreg.predict(xtest)

    # RSS = residual sum of squares

    # Return the result of the model
    r_models = {
        "summary": {
            "linear rss": sum((y_p_linear - ytest)**2),
            "Ridge rss": sum((y_p_ridge - ytest)**2),
            "lasso rss": sum((y_p_lasso - ytest)**2),
            "elasticnet rss": sum((y_p_enet - ytest)**2)
        },
        "test": ytest,
        'linear': {
            'rss': sum((y_p_linear - ytest)**2),
            'predict': y_p_linear,
            'model': linreg,
            'intercept': linreg.intercept_,
            'coef': linreg.coef_
        },
        'ridge': {
            'rss': sum((y_p_ridge - ytest)**2),
            'predict': y_p_ridge,
            'model': ridgereg,
            'intercept': ridgereg.intercept_,
            'coef': ridgereg.coef_
        },
        'lasso': {
            'rss': sum((y_p_lasso - ytest)**2),
            'predict': y_p_lasso,
            'model': lassoreg,
            'intercept': lassoreg.intercept_,
            'coef': lassoreg.coef_
        },
        'elasticnet': {
            'rss': sum((y_p_enet - ytest)**2),
            'predict': y_p_enet,
            'model': enetreg,
            'intercept': enetreg.intercept_,
            'coef': enetreg.coef_
        }
    }

    return r_models
Beispiel #47
0
def quantify_isoforms(genes, genome, reads):
    """
    :param genes: the list of gene tuples generated by the parser
    :param genome: the full genome sequence
    :param reads: the list of shuffled reads
    :return: a list of tuples, where the first element of the tuple is the transcript sequence (the isoform in terms of
            the exon sequences that form it in the genome), and the second element of the tuple is the abundance of that
            specific isoform
            NOTE: this skeleton is built assuming the return value exists like this, but as long as you change the way
            the output file is generated, this can be in whatever form you like.
    """
    """
        Within this function, you should go through most of the process of quantifying isoforms given the data.
        This can be broken down into the following few steps:

            1. Align reads to the genome, exome, or isoforms
                    your choice of method, but note the length of the genome

            2. Use the generated alignment to get exon counts

            3. Formulate your RNA seq problem using the isoforms and exon counts (linear algebra)

            4. Compute the isoform abundances based on your above formulation
    """

    # Create phonebook
    pattern_length = 10
    num_genes = len(genes)
    phonebook = {}
    exon_counts = [None] * num_genes

    for i in range(num_genes):
        num_exons = len(genes[i][0])
        exons = [0] * num_exons
        exon_counts[i] = exons
        for j in range(num_exons):
            start = genes[i][0][j][0]
            end = genes[i][0][j][1]
            #print(start, end)
            for k in range(start, end - pattern_length + 2):
                pattern = genome[k:k + pattern_length]

                # if k == start:
                #     print(len(pattern))
                #     print(pattern)

                if pattern in phonebook:
                    phonebook[pattern].append((i, j))
                    #print('re')
                else:
                    pair = (i, j)
                    phonebook[pattern] = [pair]

    for read in reads:
        ll = range(len(read) - pattern_length + 1)
        read_portions = [
            read[i:i + pattern_length] for i in ll[::pattern_length]
        ]
        for read_portion in read_portions:
            if read_portion in phonebook:
                #print(phonebook[read])
                for gene_exon in phonebook[read_portion]:
                    gene_id = gene_exon[0]
                    exon_id = gene_exon[1]
                    #print(gene_id, exon_id)
                    exon_counts[gene_id][exon_id] += 1 / len(
                        phonebook[read_portion]) / (50 / pattern_length)
    #print(exon_counts)

    isoforms = []
    abundances = []
    for i in range(num_genes):
        M = np.zeros((len(genes[i][0]), len(genes[i][1])))
        for j in range(len(genes[i][1])):
            isoforms.append('')
            #print(genes[i][1][j])
            for k in genes[i][1][j]:
                start = genes[i][0][k][0]
                end = genes[i][0][k][1]
                isoforms[len(isoforms) - 1] += genome[start:end + 1]
                M[k][j] = (end - start + 1) / 50
            print(len(isoforms[len(isoforms) - 1]))

        #print(M)

        b = np.array(exon_counts[i])
        #print(b)

        regr = ElasticNet(alpha=1.5, positive=True)
        #regr = linear_model.LassoLars(alpha=0.01, positive=True)
        regr.fit(M, b)
        x = regr.coef_

        #x = np.linalg.lstsq(M, b)[0]
        x = x / sum(x)
        print(x)
        abundances.extend(x)

    iso_abund = [(isoforms[i], abundances[i]) for i in range(len(isoforms))]

    return iso_abund
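# The abundance step above solves the linear system M x = b (exon-by-isoform
# length matrix times isoform abundances equals observed exon counts) with a
# non-negativity constraint. A toy sketch of just that step, on made-up data:
import numpy as np
from sklearn.linear_model import ElasticNet

M = np.array([[1.0, 1.0],
              [1.0, 0.0],
              [0.0, 1.0]])           # exon-by-isoform membership matrix
x_true = np.array([3.0, 1.0])        # true abundances
b = M @ x_true                       # observed exon counts

regr = ElasticNet(alpha=0.01, positive=True)  # positive=True forbids x < 0
regr.fit(M, b)
x = regr.coef_ / regr.coef_.sum()    # normalize to relative abundances
print(x)                             # approximately [0.75, 0.25]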
Beispiel #48
0
def fit(self, *args, **kwargs):
    return ElasticNet.fit(self, *args, **kwargs)
Beispiel #49
0
reg_1 = Lasso()
reg_1.fit(X_train, y_train)
print("Lasso Score:", reg_1.score(X_test, y_test))

# Ridge Regressor
reg_2 = Ridge()
reg_2.fit(X_train, y_train)
print("Ridge Score:", reg_2.score(X_test, y_test))

# Bayesian Ridge Regressor
reg_3 = BayesianRidge()
reg_3.fit(X_train, y_train)
print("BayesianRidge Score:", reg_3.score(X_test, y_test))

# ElasticNet Regresor
reg_4 = ElasticNet()
reg_4.fit(X_train, y_train)
print("ElasticNet Score:", reg_4.score(X_test, y_test))

#Let us predict the stock market for the next `days` days
days = 20

data_seed = df['Adj Close'].values[-window_size:][None]

input_values = {
    'Lasso': data_seed,
    'Ridge': data_seed,
    'BayesianRidge': data_seed,
    'ElasticNet': data_seed
}
values = {'Lasso': [], 'Ridge': [], 'BayesianRidge': [], 'ElasticNet': []}
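# The two dictionaries above suggest a recursive multi-step forecast: each
# model predicts one step ahead, and the prediction is fed back into its own
# input window. A plausible continuation (an assumption, not the original
# code; assumes numpy is imported as np):
for _ in range(days):
    for name, reg in [('Lasso', reg_1), ('Ridge', reg_2),
                      ('BayesianRidge', reg_3), ('ElasticNet', reg_4)]:
        next_val = reg.predict(input_values[name])[0]
        values[name].append(next_val)
        # slide the window: drop the oldest value, append the prediction
        input_values[name] = np.append(input_values[name][:, 1:],
                                       [[next_val]], axis=1)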
print(X_test.shape)


y_train=df_train["Purchase"]
df_train=df_train.drop("Purchase", axis=1)

#from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import f_regression
#sel = SelectKBest(f_regression, k=10)
#X_tr=pd.DataFrame(sel.fit_transform(X_train,y_train))
#X_tst=pd.DataFrame(sel.transform(X_test))

#print(X_tr.shape)
#print(X_tst.shape)

from sklearn.linear_model import ElasticNet
model=ElasticNet(alpha=0.001)

model.fit(X_train,y_train)
y_pred=model.predict(X_test)
#print(y_pred.shape)
#print(key1.shape)
#print(key2.shape)


out=pd.DataFrame()
out["User_ID"]=key1
out["Product_ID"]=key2
out["Purchase"]=y_pred
out.to_csv('outavb.csv', index=False)
# by Willi Richert and Luis Pedro Coelho
# published by PACKT Publishing
#
# It is made available under the MIT License

import numpy as np
from sklearn.datasets import load_svmlight_file
# sklearn.cross_validation was removed in scikit-learn 0.20; KFold now lives
# in sklearn.model_selection and is iterated via split().
from sklearn.model_selection import KFold
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, r2_score

data, target = load_svmlight_file('data/E2006.train')

# Edit the lines below if you want to switch method:
# met = LinearRegression(fit_intercept=True)
met = ElasticNet(fit_intercept=True, alpha=.1)

kf = KFold(n_splits=5)
pred = np.zeros_like(target)
for train, test in kf.split(data):
    met.fit(data[train], target[train])
    pred[test] = met.predict(data[test])

print('[EN 0.1] RMSE on testing (5 fold), {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN 0.1] R2 on testing (5 fold), {:.2}'.format(r2_score(target, pred)))
print('')

met.fit(data, target)
pred = met.predict(data)
print('[EN 0.1] RMSE on training, {:.2}'.format(np.sqrt(mean_squared_error(target, pred))))
print('[EN 0.1] R2 on training, {:.2}'.format(r2_score(target, pred)))
# Linear regression (ordinary least squares)
from sklearn import linear_model
reg = linear_model.LinearRegression()
# Ridge regression
from sklearn import linear_model
reg = linear_model.Ridge(alpha=.5)
# Logistic regression
from sklearn.linear_model import LogisticRegression
clf_l1_LR = LogisticRegression(C=C, penalty='l1', tol=0.01)
# Kernel ridge regression
from sklearn.kernel_ridge import KernelRidge
KernelRidge(kernel='rbf', alpha=0.1, gamma=10)
# Lasso
from sklearn import linear_model
reg = linear_model.Lasso(alpha=0.1)
# Elastic Net
from sklearn.linear_model import ElasticNet
regr = ElasticNet(random_state=0)
# Bayesian regression
from sklearn import linear_model
reg = linear_model.BayesianRidge()
# Polynomial regression (polynomial basis functions)
from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(degree=2)
poly.fit_transform(X)
# Partial least squares regression (PLS)
from sklearn.cross_decomposition import PLSCanonical
PLSCanonical(algorithm='nipals',
             copy=True,
             max_iter=500,
             n_components=2,
             scale=True,
             tol=1e-06)
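# All of the estimators listed above share the same fit/predict interface; a
# minimal sketch with hypothetical toy data (values are arbitrary):
import numpy as np
from sklearn.linear_model import ElasticNet

X_toy = np.array([[0.0], [1.0], [2.0], [3.0]])
y_toy = np.array([0.1, 1.1, 1.9, 3.2])
reg = ElasticNet(alpha=0.01, random_state=0)
reg.fit(X_toy, y_toy)          # learn coefficients on the toy data
print(reg.predict([[4.0]]))    # predict for an unseen sample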
# Lasso
from sklearn.linear_model import Lasso

alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

# #############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

plt.plot(enet.coef_, color='lightgreen', linewidth=2,
         label='Elastic net coefficients')
plt.plot(lasso.coef_, color='gold', linewidth=2,
         label='Lasso coefficients')
plt.plot(coef, '--', color='navy', label='original coefficients')
plt.legend(loc='best')
plt.title("Lasso R^2: %f, Elastic Net R^2: %f"
          % (r2_score_lasso, r2_score_enet))
plt.show()
    'C': np.arange(0.25, 2, 0.25)
}
knn_params = {
    'n_neighbors': np.arange(3, 10, 2),
    'weights': ['uniform', 'distance'],
    'p': np.arange(1, 2, 0.25)
}
dt_params = {
    'criterion': ['mse', 'friedman_mse', 'mae'],
    'max_depth': np.arange(1, 50, 5)
}

models_list = [('LR', LinearRegression(), {}),
               ('Ridge', Ridge(), ridge_params),
               ('Lasso', Lasso(), lasso_params),
               ('ElasticNet', ElasticNet(), elasticnet_params),
               ('SGDRegressor', SGDRegressor(), sgdregressor_params),
               ('SVR', SVR(), svr_params),
               ('KNN', KNeighborsRegressor(), knn_params),
               ('GaussianProcess', GaussianProcessRegressor(), {}),
               ('DTree', DecisionTreeRegressor(), dt_params)]

rmsle_scores = []
r2_scores = []
model_names = []
best_estimators = []

for name, model, model_params in list(models_list):
    print('-' * 100)
    print('Fitting ', name)
    model_names.append(name)
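    # The example is truncated here. A plausible completion of the loop body
    # (an assumption, mirroring the score lists initialized above; assumes
    # X_train/y_train/X_test/y_test and the sklearn.metrics imports exist):
    # gs = GridSearchCV(model, model_params, cv=5,
    #                   scoring='neg_mean_squared_error')
    # gs.fit(X_train, y_train)
    # best_estimators.append(gs.best_estimator_)
    # preds = gs.predict(X_test)
    # rmsle_scores.append(np.sqrt(mean_squared_log_error(y_test, preds)))
    # r2_scores.append(r2_score(y_test, preds))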
Beispiel #55
0
# Lasso
from sklearn.linear_model import Lasso

alpha = 0.1
lasso = Lasso(alpha=alpha)

y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
r2_score_lasso = r2_score(y_test, y_pred_lasso)
print(lasso)
print("r^2 on test data : %f" % r2_score_lasso)

# #############################################################################
# ElasticNet
from sklearn.linear_model import ElasticNet

enet = ElasticNet(alpha=alpha, l1_ratio=0.7)

y_pred_enet = enet.fit(X_train, y_train).predict(X_test)
r2_score_enet = r2_score(y_test, y_pred_enet)
print(enet)
print("r^2 on test data : %f" % r2_score_enet)

m, s, _ = plt.stem(
    np.where(enet.coef_)[0],
    enet.coef_[enet.coef_ != 0],
    markerfmt="x",
    label="Elastic net coefficients",
    use_line_collection=True,
)
plt.setp([m, s], color="#2ca02c")
m, s, _ = plt.stem(
    np.where(lasso.coef_)[0],
    lasso.coef_[lasso.coef_ != 0],
    markerfmt="x",
    label="Lasso coefficients",
    use_line_collection=True,  # removed in Matplotlib 3.8; drop on new versions
)
plt.setp([m, s], color="#ff7f0e")
Beispiel #56
0
preds = np.zeros(XALL.shape[0])
feature_importance = []
test_preds = np.zeros((test.shape[0], 5))
for cv_idx, (train_idx, valid_idx) in enumerate(kf.split(XALL)):
    print('CV epoch[{0:2d}]:'.format(cv_idx))
    train_dat = lgb.Dataset(XALL.iloc[train_idx], yALL.iloc[train_idx])
    valid_dat = lgb.Dataset(XALL.iloc[valid_idx], yALL.iloc[valid_idx])

    gbm = lgb.train(variables.lgb_params,
                    train_dat,
                    num_boost_round=variables.num_boost_round,
                    valid_sets=valid_dat,
                    verbose_eval=100,
                    early_stopping_rounds=variables.early_stopping_rounds,
                    feval=mse)

    tree_feature_train = gbm.predict(XALL.iloc[train_idx],
                                     num_iteration=gbm.best_iteration,
                                     pred_leaf=True)
    regr = ElasticNet(**variables.ElasticNetParams)
    regr.fit(tree_feature_train, yALL.iloc[train_idx])

    test_feature = gbm.predict(test[predictor],
                               pred_leaf=True,
                               num_iteration=gbm.best_iteration)
    test_preds[:, cv_idx] = regr.predict(test_feature)

preds = test_preds.mean(axis=1)
submission = pd.DataFrame({'preds': preds})
submission.to_csv('../submission/result_lgb_en.csv', index=False, header=False)
Beispiel #57
0
"""

# Import necessary modules
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Create the hyperparameter grid
l1_space = np.linspace(0, 1, 30)
param_grid = {'l1_ratio': l1_space}

# Instantiate the ElasticNet regressor: elastic_net
elastic_net = ElasticNet()

# Setup the GridSearchCV object: gm_cv
gm_cv = GridSearchCV(elastic_net, param_grid, cv=5)

# Fit it to the training data
gm_cv.fit(X_train, y_train)

# Predict on the test set and compute metrics
y_pred = gm_cv.predict(X_test)
r2 = gm_cv.score(X_test, y_test)
mse = mean_squared_error(y_test, y_pred)
print("Tuned ElasticNet l1 ratio: {}".format(gm_cv.best_params_))
print("Tuned ElasticNet R squared: {}".format(r2))
print("Tuned ElasticNet MSE: {}".format(mse))
Beispiel #58
0
plt.xlim([-10, 50])
plt.show()
#Mean Squared Error, R^2
print('MSE train: %.3f, test: %.3f' % (mean_squared_error(
    y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
print('R^2 train: %.3f, test: %.3f' %
      (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))

#4. Elanet Regression model
X = df.iloc[:, :-1].values
# take only the target column for y; df[df.columns] would select every column
y = df.iloc[:, -1].values
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.8,
                                                    random_state=42)
elanet = ElasticNet(alpha=1.0, l1_ratio=0.5)
elanet.fit(X_train, y_train)
y_train_pred = elanet.predict(X_train)
y_test_pred = elanet.predict(X_test)
#residual plot
plt.scatter(y_train_pred,
            y_train_pred - y_train,
            c='blue',
            marker='o',
            edgecolor='white',
            label='Training data')
plt.scatter(y_test_pred,
            y_test_pred - y_test,
            c='green',
            marker='s',
            edgecolor='white',
            label='Test data')
            X_train = X_train[idx]
            y_train = y_train[idx]

            std = X_train.std(axis=0)
            mean = X_train.mean(axis=0)
            X_train = (X_train - mean) / std
            X_test = (X_test - mean) / std

            std = y_train.std(axis=0)
            mean = y_train.mean(axis=0)
            y_train = (y_train - mean) / std
            y_test = (y_test - mean) / std

            gc.collect()
            print("- benching ElasticNet")
            clf = ElasticNet(alpha=alpha, l1_ratio=0.5,  # 'rho' was renamed 'l1_ratio'
                             fit_intercept=False)
            tstart = time()
            clf.fit(X_train, y_train)
            elnet_results[i, j, 0] = mean_squared_error(clf.predict(X_test),
                                                       y_test)
            elnet_results[i, j, 1] = time() - tstart

            gc.collect()
            print("- benching SGD")
            n_iter = int(np.ceil(10 ** 4.0 / n_train))
            clf = SGDRegressor(alpha=alpha, fit_intercept=False,
                               max_iter=n_iter,  # 'n_iter' is now 'max_iter'
                               learning_rate="invscaling",
                               eta0=.01, power_t=0.25)

            tstart = time()
            clf.fit(X_train, y_train)
        sum1 = err[i] + sum1
    mse1 = sum1 / len(err)
    print('mse=' + str(mse1))


def plot(y_pred, y_test):
    plt.plot(y_test, y_pred, '.')  #actual values of x and y
    plt.title('Elastic Net Regression')
    plt.xlabel('Y_test')
    plt.ylabel('Y_pred')
    plt.show()
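
# 'accuracy' is called below but its definition is not part of this fragment;
# a plausible sketch (an assumption, inferred from the call signature):
def accuracy(y_pred, y_test):
    # report an R^2-style agreement between predictions and targets
    ss_res = np.sum((y_test - y_pred) ** 2)
    ss_tot = np.sum((y_test - np.mean(y_test)) ** 2)
    print('accuracy=' + str(1 - ss_res / ss_tot))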


aa = np.loadtxt('database.dat', unpack=True)
data = aa.T
data2 = data[~np.isnan(data).any(axis=1)]
y = np.squeeze(data2[:, 3])
x = np.squeeze(data2[:, 4:])
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.1,
                                                    random_state=0)

model = ElasticNet()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

accuracy(y_pred, y_test)
mse(y_pred, y_test)
plot(y_pred, y_test)