Example #1
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1000, return_model=False):
    """LASSO polynomial fit with cross-validation.
    
    Regularized polynomial regression (penalized least squares) with terms of
    degree up to n = max_deg. LASSO minimizes the MSE while penalizing the
    L1 norm of the parameter vector, which drives some coefficients to zero
    and yields a sparser fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default use 'x').
    - Supports NaNs.

    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')      # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)') # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:,1:max_deg+1], y_)
    y_pred = lasso.predict(X[:,1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
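A minimal usage sketch (not from the original project): the data below are synthetic, and the call assumes the imports the snippet relies on (numpy, patsy's dmatrix, sklearn's LassoCV) as well as an older scikit-learn release in which LassoCV still accepts normalize. With return_model=True the function returns a two-element list, so it can be unpacked directly.

import numpy as np

x = np.linspace(-1, 1, 100)
y = 2 * x - 0.5 * x ** 2 + np.random.normal(scale=0.1, size=x.size)
y[::10] = np.nan                      # NaN pairs are dropped inside lasso_cv
x_new = np.linspace(-1, 1, 250)       # denser grid for the prediction
y_fit, model = lasso_cv(x, y, x_pred=x_new, max_deg=3, return_model=True)
print(model.alpha_)                   # alpha selected by 10-fold CV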
Example #2
def lassoCV_regression(data, target, alphas):
    clf = LassoCV(alphas=alphas)
    # Select features with LassoCV inside SelectFromModel, raising the
    # threshold until at most two features remain.
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]

    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    # 10-fold CV on the selected features, collecting each fold's RMSE
    rmses = []
    kf = KFold(n_splits=10, shuffle=True)
    for train_index, test_index in kf.split(data_transform):
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
Example #3
	def predict(self,trains_x,train_y,tests_x,parameters,times=10,isFile=True,foldername="blend-dir"):
		"""
		Ensamble many features and regression

		:params train_X: dictionary for training
		:params train_y: testing vector
		"""
		#parameter_get
		test_data_sample = tests_x.values()[0]

		if not os.path.exists(foldername):
			os.makedirs(foldername)

		skf = None
		kfold_file = foldername + "/kfold_index.pkl"
		if os.path.exists(kfold_file):
			skf = pickle.load(open(kfold_file,"r"))
		else:
			skf = KFold(n=len(train_y),n_folds=times,shuffle=True)
			pickle.dump(skf,open(kfold_file,"w"))

		blend_train = np.zeros((len(train_y),len(parameters)))
		blend_test = np.zeros((len(test_data_sample),len(parameters)))

		for j,parameter in enumerate(parameters):
			train_x = trains_x[parameter['data']]
			test_x = tests_x[parameter['data']]

			blend_test_tmp = np.zeros((len(test_data_sample),times))  # one column per CV fold

			#file path check
			for i, (train_index,valid_index) in enumerate(skf):
				clf = model_select(parameter['parameter'])

				train = train_x[train_index]
				train_valid_y = train_y[train_index]

				kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j,i)

				if os.path.exists(kfold_filepath):
					blend_train_prediction,blend_test_prediction = pickle.load(open(kfold_filepath,"r"))
					# Cached fold predictions are applied below; clf has not been fitted in this branch.
				else:
					clf.fit(train,np.log1p(train_valid_y))
					blend_train_prediction = np.expm1(clf.predict(train))
					blend_test_prediction = np.expm1(clf.predict(test_x))
					pickle.dump((blend_train_prediction,blend_test_prediction),open(kfold_filepath,"w"))

				blend_train[train_index,j] = blend_train_prediction
				blend_test_tmp[:,i] = blend_test_prediction
			blend_test[:,j] = blend_test_tmp.mean(1)

		#Blending Model
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(blend_test)

		return y_test_predict
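The method above blends several base models by stacking fold-wise predictions and fitting a LassoCV meta-model on top of them. Below is a self-contained sketch of that idea with synthetic data; the base models, sizes, and parameters are illustrative only, and the sketch uses the standard out-of-fold variant for the blend features.

import numpy as np
from sklearn.datasets import make_regression
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LassoCV

X, y = make_regression(n_samples=200, n_features=10, noise=10.0, random_state=0)
X_test = X[:50]  # stand-in for unseen data

base_models = [RandomForestRegressor(n_estimators=50, random_state=0),
               GradientBoostingRegressor(random_state=0)]
kf = KFold(n_splits=5, shuffle=True, random_state=0)

blend_train = np.zeros((len(y), len(base_models)))
blend_test = np.zeros((len(X_test), len(base_models)))
for j, model in enumerate(base_models):
    test_fold_preds = np.zeros((len(X_test), kf.get_n_splits()))
    for i, (tr, va) in enumerate(kf.split(X)):
        model.fit(X[tr], y[tr])
        blend_train[va, j] = model.predict(X[va])      # out-of-fold predictions
        test_fold_preds[:, i] = model.predict(X_test)  # test predictions per fold
    blend_test[:, j] = test_fold_preds.mean(axis=1)    # average over folds

meta = LassoCV(cv=5, positive=True, max_iter=10000)   # blending model
meta.fit(blend_train, y)
y_test_pred = meta.predict(blend_test)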
Example #4
def fit_Lasso(features_train, labels_train, features_pred):
	model = LassoCV()
	model.fit(features_train, labels_train)
	# mse_path_ holds the cross-validated MSE for each alpha and fold
	mse = model.mse_path_
	print "LASSO - MSE path shape: ", mse.shape
	# Test the model
	labels_pred = model.predict(features_pred)
	return labels_pred
Example #5
	def bagging(self,trains,tests,train_y,model_name=None):
		blend_train = trains.T
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(tests.T)
		train_predict = bclf.predict(trains.T)

		return train_predict,y_test_predict
Example #6
def lassocv_feature_select(df):
    """
    通过LassoCV 进行特征选择
    """    
    X = df.drop(['status'],axis=1)
    y = df['status']
    model_lasso = LassoCV(alphas = [0.1,1,0.001, 0.0005])
    model_lasso.fit(X,y)
    coef = pd.Series(model_lasso.coef_,index=X.columns)
    print(coef.sort_values(ascending=False))
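A hedged usage sketch: the DataFrame and its 'status' target below are synthetic stand-ins, chosen only to show the call and the sorted coefficient printout.

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 4)), columns=['f1', 'f2', 'f3', 'f4'])
df['status'] = 3 * df['f1'] - 2 * df['f3'] + rng.normal(scale=0.1, size=200)
lassocv_feature_select(df)   # prints each feature's coefficient, largest first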
Example #7
def make_model_and_predict(train_file, test_file):
    """Given name of training csv file, name of test csv file, constructs
    a random forest model and outputs predictions to a time-stampled csv file.
    If the test_file has SalaryNormalized as an attribute, it will score the
    model and write the result in the file "score<datetime>"
    """

    train = pd.read_csv(train_file)
    valid = pd.read_csv(test_file)
    number_of_word_features = 200
    title_words = count_words_in_column(train, "Title")
    key_count_pairs = [(k,v) for (k,v) in title_words.items() if k not in
                                                stopwords.words('english')]

    key_count_pairs.sort(key=lambda (k,v): -v)

    for word, count in key_count_pairs[:number_of_word_features]:
        add_appearance_count_feature(train, word, "Title")
        add_appearance_count_feature(valid, word, "Title")


    group_features = ["LocationNormalized", "Category", "Company", "SourceName"]

    for f in group_features:
        continuize_feature(train, valid, f, "SalaryNormalized")

    feature_columns = train.columns[12:]

    feature=train[feature_columns]
    label=train.SalaryNormalized
    clf = LassoCV()
    clf.fit(feature, label)

    valid_salary_predict = clf.predict(valid[feature_columns])
    valid["SalaryNormalized_Predict"] = valid_salary_predict

    date_string = re.sub("[ :.]", "", str(datetime.datetime.now()))
    predict_filename = 'predict' + date_string + '.csv'
    score_filename = 'score' + date_string + '.txt'
    with open(predict_filename,'wb') as f:
        valid[["Id","SalaryNormalized_Predict"]].to_csv(f, index=False,
                                                    header=False)

    ##Computes average RMS error and writes score to file
    if hasattr(valid, 'SalaryNormalized'):
        score = 0
        for i,_ in enumerate(valid["SalaryNormalized_Predict"]):
            score += (valid.SalaryNormalized[i] -
                                valid.SalaryNormalized_Predict[i]) **2
        score = math.sqrt(score/len(valid["SalaryNormalized_Predict"]))
        with open (score_filename, 'wb') as f:
            f.write("Train: " + train_file + "\n")
            f.write("Test: " + test_file + "\n")
            f.write("Score: " + str(score) + "\n")
Example #8
def lassoRegularization(X,Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: None; prints the best alpha chosen by CV and the RMSE of the corresponding fit
    """
    tuningAlpha = [0.1,0.01,0.001]
    lasso = LassoCV(normalize=True, alphas=tuningAlpha, cv=10)
    lasso.fit(X,Y)
    prediction = lasso.predict(X)

    print
    print "LASSO REGULARIZATION"
    print "Best Alpha value for Lasso Regularization : " + str(lasso.alpha_)
    print 'Best RMSE for corresponding Alpha =', np.sqrt(mean_squared_error(Y, prediction))
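A brief usage sketch with synthetic inputs (shapes and coefficients are illustrative assumptions; it also assumes the snippet's own imports and a scikit-learn version that still supports the normalize argument).

import numpy as np

X = np.random.rand(100, 5)
Y = X.dot(np.array([1.0, 0.0, -2.0, 0.0, 0.5])) + np.random.normal(scale=0.1, size=100)
lassoRegularization(X, Y)   # prints the CV-selected alpha and the corresponding RMSE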
Example #9
class LocalRegression:
    """This class implements "local" regression. Given a set of training data and a set of unknown data,
           iterate through each unknown spectrum, find the nearest training spectra, and generate a model.
           Each of these local models is optimized using built-in cross validation methods from scikit."""
    def __init__(self, params, n_neighbors = 250):
        """Initialize LocalRegression

        Arguments:
        params = Dict containing the keywords and parameters for the regression method to be used.

        Keyword arguments:
        n_neighbors = User-specified number of training spectra to use to generate the local regression model for each
                      unknown spectrum.

        """
        self.model = LassoCV(**params) # For now, the only option is LASSO. Other methods to be added in the future
                                       # params is a dict containing the keywords and parameters for LassoCV

        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit_predict(self,x_train,y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            foo, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

            # Note: cross-validation of alpha is handled internally by LassoCV,
            # so no external CV splitter is needed here.
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
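A hedged usage sketch with random stand-in spectra (shapes, values, and the params dict are illustrative only; n_neighbors is reduced so the synthetic training set is large enough).

import numpy as np

x_train = np.random.rand(300, 50)             # 300 training spectra, 50 channels
y_train = 10 * x_train[:, 0] + np.random.normal(scale=0.1, size=300)
x_unknown = np.random.rand(5, 50)             # 5 spectra to predict

local = LocalRegression(params={'max_iter': 10000}, n_neighbors=50)
predictions, coeffs, intercepts = local.fit_predict(x_train, y_train, x_unknown)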
Example #10
class Trainer:
    clf = None
    svm = None

    def __init__(self):
        if config.model == 'SVM':
            self.svm = svm.SVC(kernel='linear', shrinking=True, verbose=False)
            params = {
                'C': np.logspace(-5, -1, num=20), # Range of C values
            }
            self.clf = GridSearchCV(self.svm, params,
                cv      = 5,           # k-fold CV
                n_jobs  = cpu_count(), # Parallelize over CPUs
                verbose = 1,
            )

        elif config.model == 'Regression':
            self.clf = LassoCV(
                cv         = 3,
                max_iter   = 2000,
                n_jobs     = cpu_count(),
                verbose    = True,
            )

    def train(self, featMat, persist=True):
        # Preprocess
        scaler = StandardScaler()
        featMat.X = scaler.fit_transform(featMat.X, featMat.y)

        # Save preprocess output
        self.scaler = scaler
        if persist:
            joblib.dump(scaler, 'preprocess.out')

        # Perform CV
        print('Running trainer on %d rows of data with %d features.' % featMat.X.shape)
        self.clf.fit(featMat.X, featMat.y)

        # Save CV output
        if config.model == 'SVM':
            self.estimator = self.clf.best_estimator_
        elif config.model == 'Regression':
            self.estimator = self.clf
        print(self.estimator)

        if persist:
            joblib.dump(self.clf, 'cv.out')
Example #11
    def __init__(self):
        if config.model == 'SVM':
            self.svm = svm.SVC(kernel='linear', shrinking=True, verbose=False)
            params = {
                'C': np.logspace(-5, -1, num=20), # Range of C values
            }
            self.clf = GridSearchCV(self.svm, params,
                cv      = 5,           # k-fold CV
                n_jobs  = cpu_count(), # Parallelize over CPUs
                verbose = 1,
            )

        elif config.model == 'Regression':
            self.clf = LassoCV(
                cv         = 3,
                max_iter   = 2000,
                n_jobs     = cpu_count(),
                verbose    = True,
            )
Example #12
def lassocvclassifier(training_samples, eval_samples, vectorizer, do_grid_search=False):
    X_train, Y_train = training_samples
    X_eval, Y_eval = eval_samples
    #clf = SGDClassifier(loss='log', penalty= 'l2',l1_ratio=0.0, n_iter=30, shuffle=True, verbose=False, 
    #                    n_jobs=4, alpha=1e-4, average=True, class_weight=None)
    clf = LassoCV()
   
    clf.fit(X_train, Y_train)
    #y_train_true, y_train_pred = Y_train, clf.predict(X_train)
    print_top_10_words = True
    
    
    # LassoCV is a regressor, so score it with a regression metric rather than log loss
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='neg_mean_squared_error')
    print scores, np.mean(scores), np.median(scores)

    print(clf)
    #scores = cross_validation.cross_val_score(clf.best_estimator_, X_train, Y_train, cv=10, scoring='log_loss')
    #print scores, np.mean(scores), np.median(scores)
    y_true, y_pred = Y_eval, clf.predict(X_eval)
    # LassoCV has no predict_proba; use the continuous predictions directly
    y_prob = y_pred
Example #13
	def _regression( self, i_start, i_end ):
		"""
		Model of Lasso
		"""
		X, y = self._AssembleRegressionData_i( i_start, i_end );

		lasso = LassoCV( cv = 10 );
		lasso.fit_intercept = True;
		lasso.fit( X, y );


		res = { "reg_result" : lasso,\
			# Add reg_coefficients in the future!
# Extract Coefficients from LassoCV doesn't quite work. Need to continue
# Note: this needs to be updated to show coefficients for predict!!!!!!!!
#		reg_coefficients = list( lasso.coef_ );		
#		print reg_coefficients

		};

		return res;
Example #14
def remove_foreground_glm(
        x, y,
        spatial_mask=None, spectral_mask=None,
        alphas=None, l1_ratio=1.):
    """Summary

    Args:
        x (TYPE): Description
        y (TYPE): Description
        spatial_mask (TYPE, optional): Description
        spectral_mask (TYPE, optional): Description
        alphas (TYPE, optional): Description

    Returns:
        TYPE: Description
    """

    # cast to double and reshape
    x_rs = np.float64(x.reshape((x.shape[0], -1))).T
    y_rs = np.float64(y.flatten())

    if spatial_mask is None:
        spatial_mask_rs = np.ones_like(y_rs, dtype=bool)
    else:
        spatial_mask_rs = spatial_mask.flatten()

    if spectral_mask is None:
        spectral_mask = np.ones(x_rs.shape[1], dtype=bool)

    if alphas is not None:
        alphas = np.atleast_1d(alphas)

    # fit GLM
    if l1_ratio == 1.:
        reg = LassoCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            max_iter=5000
        )
    elif l1_ratio == 0.:
        reg = RidgeCV(
            alphas=alphas,
        )
    else:
        reg = ElasticNetCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            l1_ratio=l1_ratio
        )

    reg.fit(x_rs[spatial_mask_rs][:, spectral_mask], y_rs[spatial_mask_rs])

    y_model = reg.predict(x_rs[:, spectral_mask]).reshape(y.shape)

    glm_coeffs = np.zeros(x_rs.shape[1], dtype=np.float32)
    glm_coeffs[spectral_mask] += reg.coef_

    return y_model, reg, glm_coeffs
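A hedged usage sketch with tiny synthetic arrays (shapes, coefficients, and the alpha grid are illustrative; the snippet's own imports of numpy and the sklearn CV regressors are assumed).

import numpy as np

x = np.random.rand(6, 16, 16)                      # 6 regressor maps, leading axis is "spectral"
true_coeffs = np.array([1.0, 0.0, 2.0, 0.0, 0.0, 0.5])
y = np.tensordot(true_coeffs, x, axes=1) + 0.01 * np.random.randn(16, 16)

y_model, reg, coeffs = remove_foreground_glm(x, y, alphas=np.logspace(-4, 0, 20))
print(coeffs)                                      # sparse, non-negative coefficient estimates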
Example #15
def get_model_per_cluster(X, Y):
    model_per_cluster = {}
    for c in X.cluster.unique():    
        X_cluster = X[X.cluster==c]
        Y_true = Y[Y.cluster == c].ALSFRS_slope
        
        regr = LassoCV(cv=5)
        regr.fit(X_cluster, Y_true)

        print 'cluster: %d size: %s' % (c, Y_true.shape)
        Y_predict = regr.predict(X_cluster)
        print "\t RMS error (0 is perfect): %.2f" % np.sqrt(np.mean(
            (Y_predict - Y_true) ** 2))
        residual_SS = ((Y_predict - Y_true) ** 2).sum()
        total_SS = ((Y_true - Y_true.mean()) ** 2).sum()
        print '\t coefficient of determination R^2 = %.2f ' % (1.0 - residual_SS/total_SS) # regr.score(X_cluster, Y_true)
        cov = sum((Y_predict - Y_predict.mean())*(Y_true - Y_true.mean()))
        Y_predict_std = np.sqrt(sum((Y_predict - Y_predict.mean())**2))
        Y_true_std = np.sqrt(sum((Y_true - Y_true.mean())**2))
        print '\t pearson correlation r = %.2f ' % (cov/(Y_predict_std*Y_true_std)) # scipy.stats.pearsonr(Y_predict, Y_true)[0]
        print "3 sample predictions: ", regr.predict(X_cluster)[:3]
        model_per_cluster[c] = {"cluster_train_data_means": X_cluster.mean(), "model" : regr}
    return model_per_cluster
Example #16
def Lasso_model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    real_train_tar=np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
Example #17
    def __init__(self, params, n_neighbors = 250):
        """Initialize LocalRegression

        Arguments:
        params = Dict containing the keywords and parameters for the regression method to be used.

        Keyword arguments:
        n_neighbors = User-specified number of training spectra to use to generate the local regression model for each
                      unknown spectrum.

        """
        self.model = LassoCV(**params) # For now, the only option is LASSO. Other methods to be added in the future
                                       # params is a dict containing the keywords and parameters for LassoCV

        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)
Example #18
def lassocv_n_random_lasso(X, y, n_iter = 30, test_size = 0.2,
                           max_iter = 50000, n_resampling = 2000):
    # find a good alpha using cv
    ss = ShuffleSplit(X.shape[0], n_iter, test_size)
    reg = LassoCV(normalize = True, cv = ss, max_iter = max_iter)
    reg.fit(X, y)
    reg = RandomizedLasso(alpha = reg.alpha_,
                          n_resampling = n_resampling,
                          max_iter = max_iter, normalize = True)
    reg.fit(X, y)
    rank = reg.scores_.argsort()[::-1]
    return (rank, reg.scores_[rank])
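A hedged usage sketch (synthetic data; note that the function relies on the legacy ShuffleSplit signature and on RandomizedLasso, both of which only exist in older scikit-learn releases).

import numpy as np

X = np.random.rand(200, 8)
y = X[:, 0] - 2 * X[:, 3] + 0.05 * np.random.randn(200)
rank, scores = lassocv_n_random_lasso(X, y, n_iter=10)
print(rank[:3])   # indices of the three highest-scoring features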
Example #19
    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        if feature_names is None:
            self.feature_names = ['feature_' + str(x) for x in range(0, X.shape[1])]
        else:
            self.feature_names=feature_names

        ## initialise tree generator
        if self.tree_generator is None:
            self.tree_generator = GradientBoostingRegressor()

        if type(self.tree_generator) not in [GradientBoostingRegressor,
                                             GradientBoostingClassifier,
                                             RandomForestRegressor,
                                             RandomForestClassifier]:
            raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
        ## TODO: Error if tree generator not GB nor RF

        ## fit tree generator
        self.tree_generator.fit(X, y)

        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or isinstance(self.tree_generator, RandomForestClassifier):
             tree_list = [[x] for x in self.tree_generator.estimators_]
        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list = tree_list,
                                          feature_names=self.feature_names)

        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)
        ## No rules found
        if X_rules.shape[0] == 0:
            X_concat = X
        else:
            X_concat = np.concatenate((X, X_rules), axis=1)

        ## initialise Lasso
        self.lscv = LassoCV()

        ## fit Lasso
        self.lscv.fit(X_concat, y)
        return self
Example #20
    def train_2013(self):
        #pass
        self._load_dataframe()
        self._training_frame.to_csv("Training/raw_frame.csv")
        total_features,total_labels=self._extract_features(self._training_frame,isTraining=True)
        total_labels=np.ravel(total_labels)
        
        print type(total_features)
        print type(total_labels)

                                       
        
        
        #create train and test split
        self._features, test_features, self._labels, test_labels =\
            train_test_split(total_features, total_labels, test_size = 0.33)
        
        
        
        print self._features.shape
        print self._labels.shape
        print test_features.shape
        print test_labels.shape
        
        cv_outer = KFold(self._labels.shape[0],n_folds=5)
        self._clf = LassoCV(eps=0.01, n_alphas=10,cv =5)
        cross_val_arr=cross_val_score(self._clf,self._features,self._labels,cv=cv_outer)
        print "Finished Training....."
        
        r_sq=np.mean(cross_val_arr)
        print "R Square for training set: ",r_sq
        
        self._clf.fit(self._features,self._labels)
        plt.plot(test_labels, self._clf.predict(test_features),'ro',linewidth=2)
        plt.plot(np.arange(0,1.,.1),np.arange(0,1.,.1),'b-',linewidth=2)
        plt.xlabel("Actual Gross")
        plt.ylabel("Predicted Gross")
        plt.show()
Example #21
    def pred_twist(self):

        #Database Access - the following code enables access to data stored on a database
        #username = "******"
        #password = "******"

        #sql = ("SELECT * FROM Table")
        #cnxn = pyodbc.connect('Driver={database};SERVER=server; Uid=username;Pwd=password')
        #df = pd.read_sql(sql, cnxn)

        #Historical data located in excel spreadsheet
        file_path = r'C:\Users\mattl\Documents\projects\historical_data.csv'
        df = pd.read_csv(file_path)

        #Input parameters

        #assign initial twist values from UI inputs
        passQty_txt = self.passQtyEdit.text()
        seg1_txt = self.seg1Edit.text()
        seg2_txt = self.seg2Edit.text()
        seg3_txt = self.seg3Edit.text()
        seg4_txt = self.seg4Edit.text()
        seg5_txt = self.seg5Edit.text()
        seg6_txt = self.seg6Edit.text()
        seg7_txt = self.seg7Edit.text()

        #convert UI text inputs into floats
        passQty = int(passQty_txt)
        seg1 = float(seg1_txt)
        seg2 = float(seg2_txt)
        seg3 = float(seg3_txt)
        seg4 = float(seg4_txt)
        seg5 = float(seg5_txt)
        seg6 = float(seg6_txt)
        seg7 = float(seg7_txt)

        #build input list and transform into dataframe
        twist = [seg1, seg2, seg3, seg4, seg5, seg6, seg7, passQty]
        df_twist = pd.DataFrame(twist).T

        #filters historical data by alloy selected
        alloy_filter = df[(df['ALLOY'] == '' +
                           str(self.alloy_comboBox.currentText()) + '')]

        #creates "X" dataframe for future regression
        X_df = alloy_filter[[
            'SEG1', 'SEG2', 'SEG3', 'SEG4', 'SEG5', 'SEG6', 'SEG7', 'PASS_QTY'
        ]].copy()

        #number of segments with measurements
        n_seg = 7

        #initialize predicted twist array
        a_twist = np.empty(n_seg)

        i = 0

        #iterates over each station to predict twist using historic data
        for segment in range(1, n_seg + 1, 1):

            y_df = alloy_filter[['A_SEG' + str(segment) + '']].copy()

            #number of polynomial degrees
            degree = 2

            #Model Pipeline
            steps = [('scaler', StandardScaler()),
                     ('poly', PolynomialFeatures(degree)),
                     ('model', LassoCV(n_jobs=-1, cv=4, max_iter=10000))]

            pipeline = Pipeline(steps)
            pipeline.fit(X_df, y_df)

            pred_A = pipeline.predict(df_twist)
            print("Segment " + str(segment) + " Prediction Score: " +
                  str(np.round(pipeline.score(X_df, y_df) * 100, decimals=1)) +
                  "%")

            a_twist[(segment - 1)] = np.round(pred_A, decimals=2)

            i += 1

            self.completed += (100 / (n_seg))
            self.progressBar.setValue(self.completed)

        a_twist = a_twist.T
        x = list(range(1, n_seg + 1))

        #fig, ax = plt.subplots(figsize=(8,4))
        self.MplWeldWidget.canvas.axes.cla()
        self.MplWeldWidget.canvas.axes.plot(x,
                                            twist[0:(n_seg)],
                                            marker='.',
                                            label="Initial Twist")
        self.MplWeldWidget.canvas.axes.plot(x,
                                            a_twist,
                                            marker='v',
                                            label="Predicted Twist")
        self.MplWeldWidget.canvas.axes.set_title("Predicted Twist per Segment",
                                                 fontsize=10)
        self.MplWeldWidget.canvas.axes.set_ylabel('Twist, mm', fontsize=8)
        self.MplWeldWidget.canvas.axes.set_xlabel('Segment', fontsize=8)
        self.MplWeldWidget.canvas.axes.set_xticks(x)
        self.MplWeldWidget.canvas.axes.axhline(0.75, ls='--', color='red')
        self.MplWeldWidget.canvas.axes.axhline(-0.75, ls='--', color='red')
        self.MplWeldWidget.canvas.axes.legend(loc='lower left')

        for u, v in zip(x, a_twist):
            label = "{:.2f}".format(v)
            self.MplWeldWidget.canvas.axes.annotate(label, (u, v),
                                                    textcoords="offset points",
                                                    xytext=(0, 10),
                                                    ha='center')

        self.MplWeldWidget.canvas.draw()

        #progress bar completion update
        self.completed = 0
        self.progressBar.setValue(self.completed)
Example #22
X_train = X[:train.shape[0]]
X_test = X[train.shape[0]:]
y = train.SalePrice
outliers_id = np.array([523, 1298])
X_train = X_train.drop(outliers_id)
y = y.drop(outliers_id)


def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=5))
    return (rmse)


#LASSO MODEL
clf1 = LassoCV(alphas=[1, 0.1, 0.001, 0.0005])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))
#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))
#XGBOOST
clf3 = xgb.XGBRegressor(colsample_bytree=0.4,
                        gamma=0.045,
                        learning_rate=0.07,
                        max_depth=20,
                        min_child_weight=1.5,
                        n_estimators=300,
                        reg_alpha=0.65,
                        reg_lambda=0.45,
Example #23
def cross_validate(X, y, alphas):
    lasso_cv = LassoCV(alphas=alphas)
    lasso_cv.fit(X, y)
    return lasso_cv.score(X, y), lasso_cv.alpha_
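A short usage sketch (synthetic data; the alpha grid is an illustrative assumption):

import numpy as np
from sklearn.datasets import make_regression

X, y = make_regression(n_samples=100, n_features=10, noise=5.0, random_state=0)
r2, best_alpha = cross_validate(X, y, alphas=np.logspace(-3, 1, 20))
print(r2, best_alpha)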
Example #24
                               param_grid=cv_grid,
                               cv=StratifiedKFold(y[0],
                                                  n_folds=5,
                                                  shuffle=True),
                               n_jobs=-1,
                               scoring="accuracy")
            clf.fit(x, y[0])
            prob = [row[1] for row in clf.predict_proba(tx)]
            pred = [row for row in clf.predict(tx)]

        if classifier == "LASSO":
            x = np.loadtxt(dir + '/benchmark_train_data.csv', delimiter=',')
            y = np.loadtxt(dir + '/benchmark_train_labels.csv', delimiter=',')
            tx = np.loadtxt(dir + '/benchmark_test_data.csv', delimiter=',')
            ty = np.loadtxt(dir + '/benchmark_test_labels.csv', delimiter=',')
            clf = LassoCV(alphas=np.logspace(-4, -0.5, 50), cv=5, n_jobs=-1)
            clf.fit(x, y)
            prob = [row for row in clf.predict(tx)]
            pred = [int(i > 0.5) for i in prob]

        accuracy.append(clf.score(tx, ty))
        roc_auc.append(roc_auc_score(ty, prob))
        precision.append(precision_score(ty, pred, average='weighted'))
        recall.append(recall_score(ty, pred, average='weighted'))
        f_score.append(f1_score(ty, pred, average='weighted'))
        pred_list.append(pred)
        prob_list.append(prob)

        if classifier == "RF":
            i = 0
            for f in features:
Example #25
print("PCA completed successfully ...")

# save data
# np.save('X_train_pca',X_train_pca)
# X_train_pca = np.load('X_train_pca.npy')

# Load csv file into numpy array
age_train = np.genfromtxt(os.path.join(curr_path, 'targets.csv'),
                          delimiter=',')

# Regression model
# Ridge regression with lambda optimized using generalized cross validation
# reg = RidgeCV()
#degree = 5
#reg = make_pipeline(PolynomialFeatures(degree), RidgeCV())
reg = LassoCV()
reg.fit(X_train_pca, age_train)
print("Data fitted with CV Ridge Regression")

# Prediction Error
age_train_predict = reg.predict(X_train_pca)
training_error = (
    (age_train_predict - age_train).dot(age_train_predict - age_train)) / n
print('Training Error: %f' % (training_error))

# for j in range(0,int(n_features/3)):
#    save_name = "{0}_{1}-{2}.npy".format('eigenimages',3*j+1,3*(j+1))
#    np.save(save_name,eigenimages[3*j:3*(j+1),:])

#n_test=48
n_test = 138
Example #26
#selecting based on best performance
#    predictors = np.column_stack((NBO[:,0],sa[:,0],sa[:,1],bv[:,0],CoHOMO[:,0],CoHOMO[:,1],CoHOMO[:,2],CoLUMO[:,0],CoLUMO[:,1],ba[:,0],ba[:,1],ba[:,2],ba[:,3],ba[:,4],ba[:,5],lt[:,0],lt[:,1],lt[:,2],lt[:,4],lt[:,5]))



#######Training targets  ###
#    hydricities = CoHOMO[:,1]
#    hyduns = np.column_stack((therm[:,1])).reshape((-1,1))
#    scaler = StandardScaler()
#    hydricities2 = hydricities1.reshape((-1,1))
#    hydricities=scale(hydricities2)
#    print(hyd1)

    # compound features
    polyFeatures = PolynomialFeatures(degree=1,interaction_only=True)
    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000, cv=KFold(n_splits=5, shuffle=True)))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LassoCV(max_iter=60000))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Ridge())
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0.7, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), Lasso(alpha=0, max_iter=70000))#, fit_intercept=True))
#    regressor = make_pipeline(polyFeatures, StandardScaler(), LinearRegression())
#    regressor = RandomForestRegressor(oob_score=True,n_estimators=2000)
    





####Make the output, print statements with detailed analysis of each step ##### 
    regressor.fit(predictors, hydricities)
    predictions = regressor.predict(predictors)
Example #27
def regression(seed, start, end, step, cv=3, comment=''):

    logger = logging.getLogger(__name__)

    ##
    logger.info('read provided data')
    X_test, X_train, y_train = read_data()
    std_train, std_test, = transform_data(X_train=X_train, X_test=X_test)

    ##
    removed = 0
    for col in std_train.columns:
        data = std_train[col].copy()
        mask = numpy.abs(data) > data.mean() + 3.5 * data.std()
        std_train.loc[mask, col] = numpy.NaN
        removed += sum(mask)
        del data, mask
    logger.info('removed a total of [{}] elements'.format(removed))

    ##
    if True:
        logger.info(
            'fill NaN with a small constant (1e-3), close to the mean of the standardized variables'
        )
        std_train.fillna(1e-3, inplace=True)
        std_test.fillna(1e-3, inplace=True)

    elif False:
        logger.info('fill NaN with linear regression model of X_i = f(y)')

        std_train = clean_data(predictors=std_train_temp,
                               response=y_train,
                               clean_mode=CLEAN_MODE.RESPONSE)

        std_test.fillna(0.0, inplace=True)
        std_test = std_test.reindex(columns=choose)
        del choose

    ##
    logger.info('feature engineering')
    base_columns = std_train.copy().columns
    base_train = std_train.copy()
    base_test = std_test.copy()

    names = base_columns + '_sq'
    train_sq = base_train.pow(2)
    train_sq.columns = names
    std_train = pandas.concat([std_train, train_sq], axis=1)

    test_sq = base_test.pow(2)
    test_sq.columns = names
    std_test = pandas.concat([std_test, test_sq], axis=1)

    names = base_columns + '_sin'
    train_sq = numpy.sin(base_train)
    train_sq.columns = names
    std_train = pandas.concat([std_train, train_sq], axis=1)

    test_sq = numpy.sin(base_test)
    test_sq.columns = names
    std_test = pandas.concat([std_test, test_sq], axis=1)

    ##
    logger.info('use lasso regression with custom set of lambda parameters')
    alphas = seed**numpy.arange(start, end, step)
    logger.info('alpha parameters := {}'.format(
        str(["{0:0.2f}".format(i) for i in alphas]).replace("'", "")))
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train.values, y_train.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    pred = model_cv.predict(std_test)
    resid = y_train.values.flatten() - model_cv.predict(std_train)

    ##
    logger.info('plotting of first stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('first stage')
    ax1.plot(resid, 'bo')
    tau = numpy.mean(resid) + 1.64 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    ax2.set_title('Fitted vs. Actual')

    ##
    logger.info(
        'use second lasso regression, removing large error inducing observations'
    )
    std_train_ = std_train[~mask]
    y_train_ = y_train[~mask]
    reg = LassoCV(alphas=alphas, cv=cv, n_jobs=2, random_state=12357)
    model_cv = reg.fit(std_train_.values, y_train_.values.flatten())
    logger.info('alpha := {:f}'.format(float(model_cv.alpha_)))
    pred = model_cv.predict(std_test)
    resid = y_train_.values.flatten() - model_cv.predict(std_train_)

    ##
    logger.info('plotting of second stage results')
    f, (ax1, ax2) = plt.subplots(1, 2, sharey=False, figsize=(17, 10))
    f.suptitle('second stage')
    ax1.plot(resid, 'bo')
    tau = numpy.mean(resid) + 1.6 * numpy.std(resid)
    mask = numpy.abs(resid) > tau
    ax1.plot([i if numpy.abs(i) > tau else None for i in resid], 'ro')
    ax1.set_title('Residuals')
    ax2.scatter(model_cv.predict(std_train), y_train)
    x0, x1 = ax2.get_xlim()
    y0, y1 = ax2.get_ylim()
    ax2.set_aspect((x1 - x0) / (y1 - y0))
    ax2.set_title('Fitted vs. Actual, RMSE := {:.6f}'.format(
        numpy.sqrt(mean_squared_error(y_train, model_cv.predict(std_train)))))

    ##
    logger.info('write to pandas Series object')
    write_to_file = pandas.Series(pred,
                                  index=X_test.index.astype(int),
                                  name='y')
    write_to_file.to_csv(os.path.join(dt.output_dir(),
                                      'task1_solution_{}.csv'.format(comment)),
                         index=True,
                         header=['y'],
                         index_label=['id'])
Example #28
#Create your training and testing data
test_x = all_data[train_x.shape[0]:]

train_x = all_data[:train_x.shape[0]]

####################################################################### modeling #######################################################################

#Import Linear Modeling Modules to run models
import time
from sklearn import svm, datasets, linear_model, preprocessing, tree
from sklearn.linear_model import LassoCV, ElasticNetCV, RidgeCV, Lasso
from sklearn.model_selection import *

# #Use Simple Lasso to Cross Validate
t1 = time.time()
model_lasso = LassoCV(alphas=alphas, random_state=0).fit(train_x, y)
t_lasso = time.time() - t1
print("\nLasso Score with CV: " + str(-1 * cross_val_score(
    model_lasso, train_x, y, scoring='neg_mean_squared_error',
    cv=kfold).mean()) + "\nTime of " + str(t_lasso))
pred_lasso = pd.DataFrame(
    data=np.expm1(model_lasso.predict(test_x)),  # values
    index=range(TRAIN_ROWS, TRAIN_COLS),  #Set Index
    columns=['SalePrice'])  # 1st column as index

#Use Elastic Net to Cross Validate
t2 = time.time()
model_elastic = ElasticNetCV(alphas=alphas).fit(train_x, y)
t_elastic = time.time() - t2
print("\nElastic Net Score with CV: " + str(-1 * cross_val_score(
    model_elastic, train_x, y, scoring='neg_mean_squared_error',
Example #29
features = features.dropna(axis=1)

alpha_values = []
for a in range(1, 10001):
    alpha_values.append(a / 100.0)  # float division so the grid runs 0.01 .. 100.00

print "Started at " + str(datetime.now())

estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores


estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print "ElasticNet alpha " + str(estimator_elastic_net.alpha_)
print str(np.mean(scores))
print scores

print "Finished at " + str(datetime.now())
Example #30
def train_and_analyse(_X, _y, features):
	X = _X
	Y = _y
	cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
								shuffle=True, random_state=1)
	ranks = {}

	lr = LinearRegression(normalize=True)
	lr.fit(X, Y)
	ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)
	

	ridge = RidgeCV(cv=cv_l)
	ridge.fit(X, Y)
	ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)
	
	# Run LassoCV (and later RandomizedLasso at the selected alpha); the path goes
	# down to .1*alpha_max to avoid exploring the regime in which very noisy
	# variables enter the model
	lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
	lasso.fit(X, Y)
	ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)
	
	rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
	rlasso.fit(X, Y)
	ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)
	
	rfe = RFE(lr, n_features_to_select=1)
	rfe.fit(X,Y)
	ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

	rf = RandomForestRegressor(n_estimators=500)
	rf.fit(X,Y)
	ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

	f, pval  = f_regression(X, Y, center=True)
	ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

	mine = MINE()
	mic_scores = []
	for i in range(X.shape[1]):
	   mine.compute_score(X[:,i], Y)
	   m = mine.mic()
	   mic_scores.append(m)
	
	ranks["MIC"] = rank_to_dict(mic_scores, features) 

	r = {}
	for name in features:
	    r[name] = round(np.mean([ranks[method][name] 
	                             for method in ranks.keys()]), 2)
	 
	methods = sorted(ranks.keys())
	ranks["Mean"] = r
	methods.append("Mean")
	
	ranks = pd.DataFrame(ranks)

	selection_feature = ranks[ranks.Mean > 0.12].index.values

	return ranks, selection_feature
Example #31
#
# Another possibility to take into account correlated variables in the dataset,
# is to estimate sparse coefficients. In some way we already did it manually
# when we dropped the AGE column in a previous Ridge estimation.
#
# Lasso models (see the :ref:`lasso` User Guide section) estimate sparse
# coefficients. LassoCV applies cross-validation in order to
# determine which value of the regularization parameter (`alpha`) is best
# suited for the model estimation.

from sklearn.linear_model import LassoCV

model = make_pipeline(
    preprocessor,
    TransformedTargetRegressor(regressor=LassoCV(alphas=np.logspace(
        -10, 10, 21),
                                                 max_iter=100000),
                               func=np.log10,
                               inverse_func=sp.special.exp10))

_ = model.fit(X_train, y_train)

# %%
# First we verify which value of :math:`\alpha` has been selected.

model[-1].regressor_.alpha_

# %%
# Then we check the quality of the predictions.

y_pred = model.predict(X_train)
Example #32
lassoreg = Lasso(alpha=0.01, normalize=True)
lassoreg.fit(X_train, y_train)
print lassoreg.coef_


# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# - [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html): lasso regression with built-in cross-validation of the alpha parameter
# - **n_alphas:** number of alpha values (automatically chosen) to try

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_


# examine the coefficients
print lassoregcv.coef_


# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# ## Part 5: Regularized classification in scikit-learn
# 
Example #33

# FILLING NULL VALUES AND GETTING DUMMIES

combined = pd.get_dummies(combined)
combined = combined.fillna(combined.mean())
#print(combined.isnull().values.sum())

# DIVIDING TRAIN AND TEST DATA
train = combined[0:train_set.shape[0]]
test = combined[train_set.shape[0]:]
#print(test.shape)
#print(test_set.shape)


# MODELLING

from sklearn.linear_model import LassoCV,Lasso
model_lasso = LassoCV(alphas=[10,1,0.1,0.001,0.0005]).fit(train,target)
lasso_preds = model_lasso.predict(test)

# CREATING CSV FILE
df_op = pd.DataFrame()
df_op["Id"] = test["Id"]
df_op["SalePrice"] = lasso_preds
df_op[["Id","SalePrice"]].to_csv("/home/vic/Desktop/Kaggle/House Prices/output.csv",index=False)




Example #34
File: eg4.py Project: insomnia250/K
# The main tuning parameter for the Ridge model is alpha - a regularization parameter that measures how flexible our model is.
# The higher the regularization the less prone our model will be to overfit.
# However it will also lose flexibility and might not capture all of the signal in the data.
plt.figure(2)
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]
cv_ridge = pd.Series(cv_ridge, index=alphas)
cv_ridge.plot(title="Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

print cv_ridge.min()
# So for the Ridge regression we get an RMSLE of about 0.127.
# Let's try out the Lasso model. We will take a slightly different approach here and
# use the built-in LassoCV to figure out the best alpha for us. For some reason the
# alphas in LassoCV are really the inverse of the alphas in Ridge.
model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)
print rmse_cv(model_lasso).mean()
'''
Nice! The lasso performs even better so we'll just use this one to predict on the test set. 
Another neat thing about the Lasso is that it does feature selection for you - setting coefficients of features it deems unimportant to zero. 
Let's take a look at the coefficients:
'''
coef = pd.Series(model_lasso.coef_, index=X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) +
      " variables and eliminated the other " + str(sum(coef == 0)) +
      " variables")

plt.figure(3)
imp_coef = pd.concat(
    [coef.sort_values().head(10),
     coef.sort_values().tail(10)])
Example #35
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from mlxtend.regressor import StackingRegressor
import matplotlib.pyplot as pl
beijing_dataset_2 = pd.read_csv(r'../resource/beijing_dataset2.csv')
print(beijing_dataset_2)
########################################################O3########################################################
# To avoid overfitting, hold out a validation set (20% of the data) with a fixed random seed (random_state)
target = beijing_dataset_2['O3']
train_X, test_X, train_y, test_y = train_test_split(
    beijing_dataset_2.drop(columns=['pressure', 'O3']),
    target,
    test_size=0.2,
    random_state=0)

lassocv = LassoCV(alphas=[0.01, 0.1, 0.5, 1, 3, 5, 7, 10, 20, 100],
                  cv=5)  #0.329379497536
# Fit on the training set
#lassocv.fit(train_X, train_y)
# Print the optimal alpha value
#print ("optimal alpha: "+str(lassocv.alpha_.astype('float')))
# Print the model coefficients
#print (lassocv.intercept_)
#print (lassocv.coef_)
rfg = RandomForestRegressor(
    bootstrap=True,
    max_features=0.005,
    min_samples_leaf=11,
    min_samples_split=10,  #0.459946225678
    n_estimators=100)
svr_rbf = SVR(kernel='rbf')
lr = LinearRegression()  #0.70
Example #36
def best_lasso(df, resp_var, exp_vars, kcv=3, cv_path=False, 
               hists=False):
    """ Find the best lasso model through cross-validation.
    
    Args:
        df:       Dataframe
        resp_var: String. Response variable
        exp_vars: List of strings. Explanatory variables
        kcv:      Number of cross-validation folds
        cv_path:  Whether to plot the path of cross-validation
                  scores
        hists:    Whether to plot histograms of coefficient
                  estimates based on bootstrapping
    
    Returns:
        Dataframe of coefficients for best model and histograms
        of coefficient variability based on bootstrap resampling.
    """
    import seaborn as sn
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LassoCV
    from sklearn.utils import resample

    # Standardise the feature data and response
    feat_std = StandardScaler().fit_transform(df[[resp_var,] + exp_vars])

    model = LassoCV(fit_intercept=False, 
                    normalize=False, 
                    max_iter=10000,
                    cv=kcv,
                    eps=1e-3)

    # Train model on full dataset
    model.fit(feat_std[:, 1:], feat_std[:, 0])

    print model

    # Get param estimates
    params = pd.DataFrame(pd.Series(model.coef_, index=exp_vars))
    
    if cv_path:
        # Display results
        m_log_alphas = -np.log10(model.alphas_)

        plt.figure()
        plt.plot(m_log_alphas, model.mse_path_, ':')
        plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha: CV estimate')

        plt.legend()

        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.axis('tight')

        plt.show()

    if hists:
        # Estimate confidence using bootstrap
        # i.e. what is the std. dev. of the estimates for each parameter
        # based on 1000 resamplings
        err = np.array([model.fit(*resample(feat_std[:, 1:], 
                                            feat_std[:, 0])).coef_ for i in range(1000)])
        err_df = pd.DataFrame(data=err, columns=exp_vars)

        # Melt for plotting with seaborn
        err_df = pd.melt(err_df)
        g = sn.FacetGrid(err_df, col="variable", col_wrap=4)
        g = g.map(plt.hist, "value", bins=20)

        # Vertical line at 0
        g.map(sn.plt.axvline, x=0, c='k', lw=2)
    
    return params
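A hedged usage sketch with a synthetic DataFrame (column names and sizes are illustrative; like the function itself, it assumes a legacy environment with Python 2 style prints and a scikit-learn release where LassoCV still accepts normalize).

import numpy as np
import pandas as pd

rng = np.random.RandomState(1)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['a', 'b', 'c'])
df['resp'] = 2 * df['a'] - df['b'] + rng.normal(scale=0.1, size=100)
params = best_lasso(df, resp_var='resp', exp_vars=['a', 'b', 'c'], kcv=3)
print(params)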
Example #37
def test_explain_linear_unsupported_multiclass(clf, newsgroups_train):
    docs, y, target_names = newsgroups_train
    vec = TfidfVectorizer()
    clf.fit(vec.fit_transform(docs), y)
    expl = explain_prediction(clf, docs[0], vec=vec)
    assert 'supported' in expl.error


@pytest.mark.parametrize(['reg'], [
    [ElasticNet(random_state=42)],
    [ElasticNetCV(random_state=42)],
    [HuberRegressor()],
    [Lars()],
    [LarsCV(max_n_alphas=10)],
    [Lasso(random_state=42)],
    [LassoCV(n_alphas=10)],
    [LassoLars(alpha=0.1)],
    [LassoLarsCV(max_n_alphas=10)],
    [LassoLarsIC()],
    [LinearRegression()],
    [LinearRegression(fit_intercept=False)],
    [LinearSVR(random_state=42)],
    [OrthogonalMatchingPursuit(n_nonzero_coefs=10)],
    [OrthogonalMatchingPursuitCV()],
    [PassiveAggressiveRegressor(C=0.1)],
    [Ridge(random_state=42)],
    [RidgeCV()],
    [SGDRegressor(**SGD_KWARGS)],
    [TheilSenRegressor()],
    [SVR(kernel='linear')],
    [NuSVR(kernel='linear')],
Example #38
def crossval(x,y,z,k,o,Type, RealDat=False):
    kfold = KFold(n_splits=k)

    poly = PolynomialFeatures(degree=o)
    X = X_Mat(x,y,o)
    Type = Type.lower()
    if Type not in ['ols', 'ridge', 'lasso']:
        raise ValueError('Not accepted method. Try OLS, Ridge or Lasso')
        
    if Type == 'ols':
        model = LinearRegression()
        estimated_mse_sklearn = np.zeros(o)
        
    elif Type == 'ridge':
        nlambdas = 500
        scoresKfold = np.zeros((nlambdas, k))
        lambdas = np.logspace(-5, 3, nlambdas)
        estimated_mse_sklearn = np.zeros(nlambdas)
        #scoresSK = np.zeros(nlambdas)
        i = 0
        sneed = []
        for lmb in lambdas:
            j = 0
            model = Ridge(alpha=lmb)
            for train_inds, test_inds in kfold.split(X):
                x_train = x[train_inds]
                y_train = y[train_inds]
                x_test = x[test_inds]
                y_test = y[test_inds]
                X_train =  poly.fit_transform(x_train[:,np.newaxis])
                model.fit(X_train, y_train[:,np.newaxis])
                
                X_test = poly.fit_transform(x_test[:,np.newaxis])
                ypred = model.predict(X_test)
                scoresKfold[i,j] = np.sum((ypred-y_test[:,np.newaxis])**2)/np.size(ypred)
                j += 1
                sneed.append(ypred)
            i += 1
        """
        i = 0
        #scikit solution, return scoresSK, although it is (pretty much) identical to the manual solution 
        for lmb in lambdas:
            ridge = Ridge(alpha = lmb)
        
            X = poly.fit_transform(x[:, np.newaxis])
            estimated_mse_folds = cross_val_score(ridge, X, y[:, np.newaxis], scoring='neg_mean_squared_error', cv=kfold)
            scoresSK[i] = np.mean(-estimated_mse_folds)

            i += 1
        """
        if RealDat == True:
            return scoresKfold, lambdas, sneed
        else:
            return scoresKfold, lambdas
    elif Type == 'lasso':
        model = LassoCV(cv=k)
        estimated_mse_sklearn = np.zeros(o)
        
    if Type in ('ols', 'lasso'):
        for polydegree in range(1, o):
            for degree in range(polydegree):
                X = X_Mat(x,y,degree)
            estimated_mse_folds = cross_val_score(model, X, z, scoring='neg_mean_squared_error', cv=kfold)
            estimated_mse_sklearn[polydegree] = np.mean(-estimated_mse_folds)  
        
        return estimated_mse_sklearn
Example #39
            compact=False)
 build_auto(
     BaggingRegressor(DecisionTreeRegressor(random_state=13,
                                            min_samples_leaf=5),
                      random_state=13,
                      n_estimators=3,
                      max_features=0.5), "DecisionTreeEnsembleAuto")
 build_auto(DummyRegressor(strategy="median"), "DummyAuto")
 build_auto(ElasticNetCV(random_state=13), "ElasticNetAuto")
 build_auto(ExtraTreesRegressor(random_state=13, min_samples_leaf=5),
            "ExtraTreesAuto")
 build_auto(GradientBoostingRegressor(random_state=13, init=None),
            "GradientBoostingAuto")
 build_auto(HuberRegressor(), "HuberAuto")
 build_auto(LarsCV(), "LarsAuto")
 build_auto(LassoCV(random_state=13), "LassoAuto")
 build_auto(LassoLarsCV(), "LassoLarsAuto")
 build_auto(OptimalLGBMRegressor(objective="regression",
                                 n_estimators=17,
                                 num_iteration=11),
            "LGBMAuto",
            num_iteration=11)
 build_auto(LinearRegression(), "LinearRegressionAuto")
 build_auto(
     BaggingRegressor(LinearRegression(),
                      random_state=13,
                      max_features=0.75), "LinearRegressionEnsembleAuto")
 build_auto(OrthogonalMatchingPursuitCV(), "OMPAuto")
 build_auto(RandomForestRegressor(random_state=13, min_samples_leaf=3),
            "RandomForestAuto",
            flat=True)
Example #40
d = 200
beta_star = np.zeros(d)
beta_star[:5] = [5, -10, 0, 0, 3]

adaptiveCV = AdaptiveHuberCV({
    'c_tau':
    np.arange(start=0.5, stop=1.5, step=.5),
    'c_lamb':
    np.arange(start=0.005, stop=0.03, step=.005)
})
adaptive = AdaptiveHuber(c_lamb=0.005, c_tau=0.5)

N = 100
algos = {
    "OLS": linear_model.LinearRegression(fit_intercept=False),
    "LassoCV": LassoCV(cv=3),
    "Huber": linear_model.HuberRegressor(),
    #"MedianReg" : QuantRegScikit(q = 0.5),
    "AdaptiveCV": adaptiveCV
}

tails = {
    "normal": stats.norm(loc=0, scale=4),
    "student": stats.t(df=1.5),
    "lognormal": stats.lognorm(1, loc=0, scale=4)
}

errors = get_errors_for(algos, tails, N, d, n, beta_star)
#errors.to_pickle("{}_{}_errors.pickle".format(n, d))

# the table of the paper
Example #41
X = pd.DataFrame(housevalue.data)
y = housevalue.target
X.columns = [
    "住户收入中位数", "房屋使用年代中位数", "平均房间数目", "平均卧室数目", "街区人口", "平均入住率", "街区的纬度",
    "街区的经度"
]
X.head()
Xtrain, Xtest, Ytrain, Ytest = TTS(X, y, test_size=0.3, random_state=420)

for i in [Xtrain, Xtest]:
    i.index = range(i.shape[0])

alpharange = np.logspace(-10, -2, 200, base=10)

lasso_ = LassoCV(
    alphas=alpharange,  # user-supplied range of alpha values
    cv=5                # number of cross-validation folds
).fit(Xtrain, Ytrain)
alpha = lasso_.alpha_
score = lasso_.score(Xtest, Ytest)

reg = LinearRegression().fit(Xtrain, Ytrain)
reg_score = reg.score(Xtest, Ytest)

ls_ = LassoCV(eps=0.00001, n_alphas=300, cv=5).fit(Xtrain, Ytrain)
ls_alpha = ls_.alpha_
ls_shape = ls_.alphas_.shape
ls_.score(Xtest, Ytest)
ls_coef = ls_.coef_
Example #42
modelList = [
    [KNeighborsRegressor(), 'K近邻'],
    [
        Pipeline([('poly', PolynomialFeatures(degree=7)),
                  ('linear', LinearRegression(fit_intercept=False))]), '线性回归'
    ],
    [
        Pipeline([('poly', PolynomialFeatures(degree=5)),
                  ('linear',
                   RidgeCV(alphas=np.logspace(-3, 2, 50),
                           fit_intercept=False))]), '岭回归'
    ],
    [
        Pipeline([('poly', PolynomialFeatures(degree=6)),
                  ('linear',
                   LassoCV(alphas=np.logspace(-3, 2, 50),
                           fit_intercept=False))]), 'LASSO回归'
    ],
    [
        Pipeline([('poly', PolynomialFeatures(degree=6)),
                  ('linear',
                   ElasticNetCV(alphas=np.logspace(-3, 2, 50),
                                l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
                                fit_intercept=False))]), '弹性网'
    ], [DecisionTreeRegressor(criterion='mse', max_depth=4), '决策树'],
    [RandomForestRegressor(n_estimators=10, max_depth=3), '随机森林'],
    [GradientBoostingRegressor(n_estimators=100, max_depth=1), 'GBRT'],
    [xgb.XGBRegressor(n_estimators=200, max_depth=4), 'XGBoosting'],
    [AdaBoostRegressor(n_estimators=100), 'AdaBoost'],
    [svm.SVR(kernel='rbf', gamma=0.2, C=100), 'SVR']
]
train = pd.read_csv("train_afterchange.csv")
test = pd.read_csv("test_afterchange.csv")
alldata = pd.concat((train.iloc[:, 1:-1], test.iloc[:, 1:]), ignore_index=True)
alldata.shape

X_train = train.iloc[:, 1:-1]
y = train.iloc[:, -1]
X_test = test.iloc[:, 1:]

# define the validation function: RMSE via 5-fold cross-validation
def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv=5))
    return (rmse)

# Lasso
clf1 = LassoCV(alphas=[1, 0.1, 0.001, 0.0005, 0.0003, 0.0002, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))  # expm1(x) == exp(x) - 1 is the inverse of log1p(x) == log(1 + x)
score1 = rmse_cv(clf1)
print("\nLasso score: {:.4f} ({:.4f})\n".format(score1.mean(), score1.std()))

#ElasticNet
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

score2 = rmse_cv(clf2)
print("\nElasticNet score: {:.4f} ({:.4f})\n".format(score2.mean(), score2.std()))

# print(lasso_preds)
# print(elas_preds)
def Lasso_Mode(X_train,y_train,X_test,y_test,num_class):
    algo_name = 'Lasso Regression'
    lasso_model = LassoCV(alphas=[0.01,0.05,0.10,0.20,0.50,1])
    lasso_model.fit(X_train,y_train)
    y_pred_lm = lasso_model.predict(X_test)
    PRAF(y_test, y_pred_lm,num_class,algo_name)
Ejemplo n.º 45
0

plt_sth()
"""
# #############################################################################
Bonus: how much can you trust the selection of alpha?

To answer this question we use the LassoCV object that sets its alpha
parameter automatically from the data by internal cross-validation 
(i.e. it performs cross-validation on the training data it receives).

We use external cross-validation to see how much the automatically obtained
alphas differ across different cross-validation folds.

"""
lasso_cv = LassoCV(alphas=alphas, random_state=0)
k_fold = KFold(3)

print("Answer to the bonus question:",
      "how much can you trust the selection of alpha?")
print()
print("Alpha parameters maximising the generalization score on different")
print("subsets of the data:")
for k, (train, test) in enumerate(k_fold.split(X, y)):
    lasso_cv.fit(X[train], y[train])
    print("[fold {0}] alpha: {1:.5f}, score: {2:.5f}".format(
        k, lasso_cv.alpha_, lasso_cv.score(X[test], y[test])))
print()
print("Answer: Not very much since we obtained different alphas for different")
print("subsets of the data and moreover, the scores for these alphas differ")
print("quite substantially.")
Ejemplo n.º 46
0
from sklearn.linear_model import LassoCV, Lasso

# Load the data
df = pd.read_csv(
    "/Users/kidashuhei/deleted_NaN_risk_factors_cervical_cancer.csv",
    index_col=0)
# Use the columns from Age through Schiller as features
# and Citology as the target variable
X = df.loc[:, 'Age':'Schiller']
y = df.loc[:, "Citology"]
# Store the column names in a list
data = list(df.columns)

# Run Lasso regression with cross-validation
# The alphas grid controls the strength of the regularization
lasso = LassoCV(cv=5, alphas=10**np.arange(-6, 1, 0.1))
lasso.fit(X, y)

# Regularization parameter (lambda) at which the mean cross-validated MSE is smallest
minlam = np.argmin(lasso.mse_path_.mean(axis=-1))
minmse = np.amin(lasso.mse_path_.mean(axis=-1))
print(lasso.alphas_[minlam])  # alpha with the lowest mean MSE (this should equal lasso.alpha_)
print(minmse)

# Plot
plt.figure()
plt.xlim(plt.xlim()[::-1])
plt.semilogx(lasso.alphas_, lasso.mse_path_.mean(axis=-1), "r.")
plt.semilogx(lasso.alphas_[minlam], minmse, "g o")
plt.xlabel('Lambda')
plt.ylabel('MSE')
                    cross_val = _validation.cross_val_score(current_estimator, x, y, cv = RepeatedStratifiedKFold(n_splits = cv_testfolds, n_repeats = n_iter_test))
                    print(str(np.mean(cross_val)) + "\tAggregated cross validation accuracy for healthy samples from " + str(data_sets_healthy) +
                          " and diseased samples from " + str(data_sets_diseased) + 
                          " with model " + learn_type + " and kmer size " + str(kmer_size) + " with params " + str(all_params[i]))
                    '''

            elif learn_type == "enet" or learn_type == "lasso":
                accuracies = []
                if (learn_type == "enet"):
                    estimator = ElasticNetCV(alphas=param_grid["alpha"][0],
                                             l1_ratio=param_grid["l1"],
                                             cv=cv_gridsearch,
                                             n_jobs=-1)
                else:
                    estimator = LassoCV(alphas=param_grid["alpha"][0],
                                        cv=cv_gridsearch,
                                        n_jobs=-1)
                skf = RepeatedStratifiedKFold(n_splits=cv_testfolds,
                                              n_repeats=n_iter_test)
                for train_i, test_i in skf.split(x, y):
                    x_train, x_test = x[train_i], x[test_i]
                    y_train, y_test = y[train_i], y[test_i]
                    y_train = list(map(int, y_train))
                    y_test = list(map(int, y_test))

                    estimator.fit(x_train, y_train)

                    accuracy = evaluate(estimator, x_test, y_test)
                    accuracies.append(accuracy)

                    print("Best params for healthy samples from " +
Ejemplo n.º 48
0
ridge.fit(x_train, y_train)

y_ridge_pred = ridge.predict(x_test)

ridge_error = mean_squared_error(y_pred=ss_y.inverse_transform(y_ridge_pred),
                                 y_true=y_test)

# Lasso regression
lasso = Lasso(alpha=0.01)
lasso.fit(x_train, y_train)

y_lasso_pred = lasso.predict(x_test)

lasso_error = mean_squared_error(y_pred=y_lasso_pred, y_true=y_test)

alphas = [0.01, 0.1, 1, 5, 10, 20, 50, 100]
# Ridge regression with built-in cross-validation (RidgeCV)
rig_cv = RidgeCV(alphas=alphas)
rig_cv.fit(x_train, y_train)

# Lasso regression with built-in cross-validation (LassoCV)
lasso_cv = LassoCV(alphas=alphas)
lasso_cv.fit(x_train, y_train)

print("feature_data:\n", feature_data[:10], "\ntarget:\n", target_data[:10],
      "\n正规方程的均方误差为:\n", lr_error, "\n正规方程的回归系数:\n", lr.coef_,
      "\n梯度下降的均方误差:\n", sgd_error, "\n梯度下降的回归系数:\n", sgd.coef_, "\n岭回归方程误差:\n",
      ridge_error, "\n岭回归均系数:\n", ridge.coef_, "\nlasso回归方程误差:\n", lasso_error,
      "\nlasso回归均方系数:\n", lasso.coef_, "\n岭回归最优的正则化力度:\n", rig_cv.alpha_,
      "\nlasso回归最优的正则化力度:\n", lasso_cv.alpha_)
Ejemplo n.º 49
0
                clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
            else:
                clf.fit(X_train, Y_train)

            one_result = clf.predict(X_cv)
            blend_train[cv_index, j] = one_result
            cv_score = gini_normalized(Y_cv, blend_train[cv_index, j])
            cv_results[j, i] = cv_score
            score_mae = metrics.mean_absolute_error(Y_cv, one_result)
            print ('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' % (i, cv_score, score_mae))
            blend_test_j[:, i] = clf.predict(X_test)
        blend_test[:, j] = blend_test_j.mean(1)
        print ('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' % (j, cv_results[j,].mean(), cv_results[j,].std()))

    end_time = datetime.now()
    time_taken = (end_time - start_time)
    print ("Time taken for pre-blending calculations: {0}".format(time_taken))
    print ("CV-Results", cv_results)
    print ("Blending models.")

    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, Y_dev)

    Y_test_predict = bclf.predict(blend_test)

    cv_score = cv_results.mean()
    print ('Avg. CV-Score = %s' % (cv_score))
    submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
    submission = submission.set_index('Id')
    submission.to_csv("farons_solution.csv")
Ejemplo n.º 50
0
 def __init__(self, eps=1e-4, n_alphas=200, cv=5):
     self.model = LassoCV(eps=eps, n_alphas=n_alphas, cv=cv)
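# Only the constructor of this wrapper class is shown above. Below is a minimal,
# hypothetical sketch of how the full class might look, assuming it simply delegates
# fit/predict to the underlying LassoCV model; the class name and the fit/predict
# methods are assumptions, not part of the original code.
from sklearn.linear_model import LassoCV

class LassoCVWrapper:
    """Hypothetical thin wrapper that delegates to sklearn's LassoCV."""

    def __init__(self, eps=1e-4, n_alphas=200, cv=5):
        self.model = LassoCV(eps=eps, n_alphas=n_alphas, cv=cv)

    def fit(self, X, y):
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)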
from sklearn.linear_model import LassoCV
import pandas as pd

train_data=pd.read_csv(r'D:\sufe\A\data_train_changed.csv')
train_data=train_data.iloc[0:,1:].drop(['REPORT_ID',"ID_CARD",'LOAN_DATE'],axis=1)
train_data=train_data.dropna()
# print(train_data.info())
X=train_data.drop(['Y'],axis=1).values  # feature matrix
y=train_data['Y'].values  # target vector
lassocv = LassoCV()
lassocv.fit(X,y)
print(train_data.columns.drop('Y'),lassocv.coef_)
Ejemplo n.º 52
0
print(__doc__)

#import matplotlib.pyplot as plt
import numpy as np

from sklearn.datasets import load_boston
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, f_classif

# Load the boston dataset.
boston = load_boston()
X, y = boston['data'], boston['target']

# We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV()

# Set a minimum threshold of 0.25
sfm = SelectFromModel(clf, threshold=0.25)
sfm.fit(X, y)
n_features = sfm.transform(X).shape[1]

# Reset the threshold till the number of features equals two.
# Note that the attribute can be set directly instead of repeatedly
# fitting the metatransformer.
while n_features > 2:
    sfm.threshold += 0.1
    X_transform = sfm.transform(X)
    n_features = X_transform.shape[1]
print(X)
# Plot the selected two features from X.
Ejemplo n.º 53
0
print X_uni_bi_gram

#X = np.array(tfidf_array)
X = X_uni_bi_gram
y = np.array(engagement_rate)
print X

binary_y_pre = []

for i in range(len(y)):
	if y[i]>0: binary_y_pre.append(1)
	else: binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)

coef_path_linear_cv = LinearRegression(normalize=Normalize,fit_intercept=Fit_Intercept) 
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True, cv=CV, verbose=Verbose, fit_intercept=Fit_Intercept, tol=Tol)#, alphas=Alphas) 
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize,max_iter=Max_Iter, tol=Tol)#,alphas=Alphas)
coef_path_logistic_cv = LogisticRegression( tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression( tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators = N_Estimators, max_features=number_of_features)

binary_X = vectorizer_binary.fit_transform(corpus)
coef_path_forest_cv.fit(X,binary_y)
coef_path_lasso_cv.fit(X,y)
coef_path_binary_x_logistic_cv.fit(binary_X,binary_y)
coef_path_logistic_cv.fit(X,binary_y)
coef_path_elastic_cv.fit(X,y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
Ejemplo n.º 54
0
# bar chart with bars extending up or down relative to the baseline
baseline = 1
plt.bar(range(len(both['coef_value'])),[x-baseline for x in both['coef_value']])
plt.xticks(np.arange(10), (both.index.values))
plt.xticks(rotation=90)
plt.title("Top 10 Most Important Predictors")
plt.show()

################################### LASSO ####################################

####### a) looking for best parameters
# reference: https://www.scikit-yb.org/en/latest/api/regressor/alphas.html
# Create a list of alphas to cross-validate against
alphas2=np.arange(1, 200, 5) #range for alpha
# Instantiate the linear model and visualizer
model2 = LassoCV(alphas=alphas2)
visualizer2 = AlphaSelection(model2)
visualizer2.fit(X, y)
g = visualizer2.poof()
visualizer2.alpha_

####### b) Implement Lasso Regression
las = linear_model.Lasso(alpha=visualizer2.alpha_)
las.fit(X_train, y_train)  
coefs_lasso = pd.DataFrame(las.coef_.T, index =[X.columns])
coefs_lasso = coefs_lasso.rename(columns={0:'coef_value'})   

# TRAIN SET
pred_train_las = las.predict(X_train) # Use this model to predict the train data
# Calculate RMSE train
print("RMSE for Train:",sqrt(mean_squared_error(y_train, pred_train_las))) #RMSE
Ejemplo n.º 55
0
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the experiment `output` directory.
    figdir : str
        Path to the experiment `figure` directory.

    Returns
    -------
    learner : skll Learner object
        SKLL Learner object containing the coefficients
        learned by training the built-in model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)
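        # (worked example: if the positive betas sum to 1.0, RT = 0.05 and there are two
        #  negative betas, then delta = 1.0 * 0.05 / 2 = 0.025; each positive beta shrinks
        #  by 5%, so the rebalanced betas still sum to the original positive total of 1.0)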

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train  = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
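        # (added note on the assumed correspondence: sklearn's Lasso minimizes
        #  (1 / (2 * n_samples)) * ||y - Xw||^2 + alpha * ||w||_1, whereas the preset
        #  lambda presumably belongs to the unscaled objective
        #  (1 / 2) * ||y - Xw||^2 + lambda * ||w||_1; dividing that objective by
        #  n_samples gives alpha = lambda / n_samples, which is the conversion below)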
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
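        # (nnls constrains every coefficient to be non-negative, so a negative intercept
        #  can only be represented by switching the intercept column to -1 and letting
        #  its non-negative coefficient absorb the sign)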
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train  = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train  = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
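    # (standardized beta_j = coefficient_j * sd(feature_j) / sd(sc1);
    #  relative beta_j = standardized beta_j / sum_k |standardized beta_k|)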
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
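# A minimal usage sketch for train_builtin_model (the CSV path, experiment id and output
# directories below are hypothetical; df_train is assumed to contain a 'spkitemid' column,
# the human score 'sc1' and numeric feature columns, as described in the docstring):
df_train = pd.read_csv('train_features.csv')   # hypothetical input file
learner = train_builtin_model('PositiveLassoCV',
                              df_train,
                              experiment_id='exp001',
                              csvdir='output',
                              figdir='figure')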
X = X[~np.isnan(X).any(axis=1)]

y = np.zeros(Y.shape)
class_names = list(np.unique(Y))
class_num = 0
number_of_classes = np.unique(Y).shape[0]
for classes in np.unique(Y):
    y[Y == classes] = int(class_num)
    print('Class ' + classes + ': ' + str(class_num))
    class_num = class_num + 1

X = StandardScaler().fit_transform(X)
#X = MinMaxScaler().fit_transform(X)

## We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
clf = LassoCV(max_iter=10000)

# Set a minimum threshold of 0.25
#sfm = SelectFromModel(clf, threshold=0.25)
sfm = SelectFromModel(clf, max_features=100)

#

X_select = sfm.fit_transform(X, y)
idx_sorted = sfm.get_support(indices=True)

select_features = list(feature_set[i] for i in idx_sorted)
print('Selected features: ')
print(select_features)
print('\n')
    print("R2:",r2_score(y_test,y_p))
    print("EVS:",explained_variance_score(y_test,y_p))

md=dnn_reg(X_train,y_train,X_test,y_test)
reg_eval(X_test,y_test,md)

###Lasso CV regression

def reg_eval2(y_test,model):
    y_pred=model.predict(X_test)
    print("evaluation the results for model:",model)
    print("MSE:",mean_squared_error(y_test,y_pred))
    print("R2:",r2_score(y_test,y_pred))
    print("EVS:",explained_variance_score(y_test,y_pred))

lasso = LassoCV(cv=5, random_state=0,max_iter=10000)
lasso.fit(X_train,y_train)
reg_eval2(y_test,lasso)

#ElasticNet Regression
ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77)
ela.fit(X_train,y_train)
print("R square:",ela.score(X_test,y_test))
reg_eval2(y_test,ela)


#SVR Regression
from sklearn.svm import LinearSVR
LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
model.fit(x_train, y_train)
# Model prediction
y_test_hat = model.predict(x_test)

# Evaluate the model
score = model.score(x_test, y_test)
print("Score:", score)

# Build a linear regression model
lr = LinearRegression()
lr.fit(x_train, y_train)
lr_y_test_hat = lr.predict(x_test)
lr_score = lr.score(x_test, y_test)
print("lr:", lr_score)
# Build a LassoCV model
lasso = LassoCV(alphas=np.logspace(-3, 1, 20))
lasso.fit(x_train, y_train)
lasso_y_test_hat = lasso.predict(x_test)
lasso_score = lasso.score(x_test, y_test)
print("lasso:", lasso_score)
# Build a RidgeCV model
ridge = RidgeCV(alphas=np.logspace(-3, 1, 20))
ridge.fit(x_train, y_train)
ridge_y_test_hat = ridge.predict(x_test)
ridge_score = ridge.score(x_test, y_test)
print("ridge:", ridge_score)

## 7. Plot the results
plt.figure(figsize=(12, 6), facecolor='w')
ln_x_test = range(len(x_test))
Ejemplo n.º 59
0
#formula_rhs = " + ".join(elorange_cols)

formula = "elo ~ " + " + ".join(rhs_cols)

msg("Fitting!")

weights = np.ones(train.shape[0])

do_statsmodels=True
if do_statsmodels:
    ols = sm.wls(formula=formula, data=train, weights=weights).fit()
    print ols.summary()
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X,y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg({'mean' : np.mean, 'median' : np.median, 'stdev': np.std})
print insample_scores

msg("Error summary by ELO:")
elo_centuries = cut(yy_df['elo'], 20)
print yy_df.groupby(elo_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean})

msg("Error summary by gamenum:")
Ejemplo n.º 60
0
	# Reshape x and y into column vectors
	x.shape = -1, 1
	y.shape = -1, 1
	# Build four Pipeline models: plain polynomial regression, RidgeCV, LassoCV and ElasticNetCV
	models = [
		Pipeline([
			('poly', PolynomialFeatures()),
			('linear', LinearRegression(fit_intercept=False))
		]),
		Pipeline([
			('poly', PolynomialFeatures()),
			('linear', RidgeCV(alphas=np.logspace(-3, 2, 10), fit_intercept=False))
		]),
		Pipeline([
			('poly', PolynomialFeatures()),
			('linear', LassoCV(alphas=np.logspace(-3, 2, 10), fit_intercept=False))
		]),
		Pipeline([
			('poly', PolynomialFeatures()),
			('linear', ElasticNetCV(alphas=np.logspace(-3, 2, 10), l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=False))
		])
	]

	mpl.rcParams['font.sans-serif'] = [u'simHei']
	mpl.rcParams['axes.unicode_minus'] = False
	np.set_printoptions(suppress=True)

	plt.figure(figsize=(18, 12), facecolor='w')
	d_pool = np.arange(1, N, 1)    # polynomial degrees
	m = d_pool.size
	clrs = []    # colors