def remove_foreground_glm(
        x, y,
        spatial_mask=None, spectral_mask=None,
        alphas=None, l1_ratio=1.):
    """Summary

    Args:
        x (TYPE): Description
        y (TYPE): Description
        spatial_mask (TYPE, optional): Description
        spectral_mask (TYPE, optional): Description
        alphas (TYPE, optional): Description

    Returns:
        TYPE: Description
    """

    # cast to double and reshape
    x_rs = np.float64(x.reshape((x.shape[0], -1))).T
    y_rs = np.float64(y.flatten())

    if spatial_mask is None:
        spatial_mask_rs = np.ones_like(y_rs, dtype=bool)
    else:
        spatial_mask_rs = spatial_mask.flatten()

    if spectral_mask is None:
        spectral_mask = np.ones(x_rs.shape[1], dtype=bool)

    if alphas is not None:
        alphas = np.atleast_1d(alphas)

    # fit GLM
    if l1_ratio == 1.:
        reg = LassoCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            max_iter=5000
        )
    elif l1_ratio == 0.:
        reg = RidgeCV(
            alphas=alphas,
        )
    else:
        reg = ElasticNetCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            l1_ratio=l1_ratio
        )

    reg.fit(x_rs[spatial_mask_rs][:, spectral_mask], y_rs[spatial_mask_rs])

    y_model = reg.predict(x_rs[:, spectral_mask]).reshape(y.shape)

    glm_coeffs = np.zeros(x_rs.shape[1], dtype=np.float32)
    glm_coeffs[spectral_mask] += reg.coef_

    return y_model, reg, glm_coeffs
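A minimal usage sketch for the function above (the synthetic shapes and data are assumptions for illustration; the function itself expects numpy and the scikit-learn CV regressors to be in scope):

import numpy as np
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV

rng = np.random.default_rng(0)
x_cube = rng.random((8, 32, 32))            # 8 spectral channels over a 32x32 map (hypothetical)
truth = np.array([0.5, 0., 1.2, 0., 0., 0.3, 0., 0.])
y_map = np.tensordot(truth, x_cube, axes=1) + 0.01 * rng.standard_normal((32, 32))

y_model, reg, coeffs = remove_foreground_glm(x_cube, y_map, l1_ratio=1.)
residual = y_map - y_model                  # foreground-subtracted map
print(coeffs)                               # sparse, close to `truth` on this toy problem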
Example #2
from math import sqrt

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cross_validation import KFold  # legacy API, matching the call below
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV


def lassoCV_regression(data,target,alphas):
    clf=LassoCV()
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)
    n_features = data_transform.shape[1]

    # raise the threshold until at most two features remain
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses=[]
    kf=KFold(len(target),10,True,None)
    for train_index, test_index in kf:
        data_train,data_test=data_transform[train_index],data_transform[test_index]
        target_train,target_test=target[train_index],target[test_index]
        clf.fit(data_train,target_train)
        rmse=sqrt(np.mean((clf.predict(data_test)-target_test)**2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
	def predict(self,trains_x,train_y,tests_x,parameters,times=10,isFile=True,foldername="blend-dir"):
		"""
		Ensamble many features and regression

		:params train_X: dictionary for training
		:params train_y: testing vector
		"""
		#parameter_get
		test_data_sample = tests_x.values()[0]

		if not os.path.exists(foldername):
			os.makedirs(foldername)

		skf = None
		kfold_file = foldername + "/kfold_index.pkl"
		if os.path.exists(kfold_file):
			skf = pickle.load(open(kfold_file,"rb"))
		else:
			skf = KFold(n=len(train_y),n_folds=times,shuffle=True)
			pickle.dump(skf,open(kfold_file,"wb"))

		blend_train = np.zeros((len(train_y),len(parameters)))
		blend_test = np.zeros((len(test_data_sample),len(parameters)))

		for j,parameter in enumerate(parameters):
			train_x = trains_x[parameter['data']]
			test_x = tests_x[parameter['data']]

			blend_test_tmp = np.zeros((len(test_data_sample),times))  # one column per fold

			#file path check
			for i, (train_index,valid_index) in enumerate(skf):
				clf = model_select(parameter['parameter'])

				train = train_x[train_index]
				train_valid_y = train_y[train_index]

				kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j,i)

				if os.path.exists(kfold_filepath):
					# reuse the cached predictions; clf is not fitted in this branch
					blend_train_prediction,blend_test_prediction = pickle.load(open(kfold_filepath,"rb"))
				else:
					clf.fit(train,np.log1p(train_valid_y))
					blend_train_prediction = np.expm1(clf.predict(train))
					blend_test_prediction = np.expm1(clf.predict(test_x))
					pickle.dump((blend_train_prediction,blend_test_prediction),open(kfold_filepath,"wb"))

				blend_train[train_index,j] = blend_train_prediction
				blend_test_tmp[:,i] = blend_test_prediction
			blend_test[:,j] = blend_test_tmp.mean(1)

		#Blending Model
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(blend_test)

		return y_test_predict
	def bagging(self,trains,tests,train_y,model_name=None):
		blend_train = trains.T
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(tests.T)
		train_predict = bclf.predict(trains.T)

		return train_predict,y_test_predict
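The two methods above implement a standard stacking/blending recipe: out-of-fold predictions from base models become the input matrix for a LassoCV meta-learner. A compact, self-contained sketch of that idea (the base models and synthetic data are placeholders, not the original pipeline):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV, Ridge
from sklearn.model_selection import KFold

X, y = make_regression(n_samples=300, n_features=20, noise=10., random_state=0)
X_train, X_test, y_train = X[:200], X[200:], y[:200]

base_models = [Ridge(alpha=1.0), RandomForestRegressor(n_estimators=50, random_state=0)]
blend_train = np.zeros((len(X_train), len(base_models)))
blend_test = np.zeros((len(X_test), len(base_models)))

kf = KFold(n_splits=5, shuffle=True, random_state=0)
for j, model in enumerate(base_models):
    fold_test = np.zeros((len(X_test), kf.get_n_splits()))
    for i, (tr, va) in enumerate(kf.split(X_train)):
        model.fit(X_train[tr], y_train[tr])
        blend_train[va, j] = model.predict(X_train[va])   # out-of-fold predictions
        fold_test[:, i] = model.predict(X_test)
    blend_test[:, j] = fold_test.mean(axis=1)             # average test predictions over folds

blender = LassoCV(cv=5, positive=True)                    # meta-learner, as in the code above
blender.fit(blend_train, y_train)
y_test_pred = blender.predict(blend_test)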
Example #5
def fit_Lasso(features_train, labels_train, features_pred):
	model = LassoCV()
	model.fit(features_train, labels_train)
	# mse_path_ holds the cross-validation MSE for each alpha and fold
	print "LASSO - mean CV MSE per alpha: ", model.mse_path_.mean(axis=1)
	# Test the model
	labels_pred = model.predict(features_pred)
	return labels_pred
Example #6
def lassocv_feature_select(df):
    """
    通过LassoCV 进行特征选择
    """    
    X = df.drop(['status'],axis=1)
    y = df['status']
    model_lasso = LassoCV(alphas = [0.1,1,0.001, 0.0005])
    model_lasso.fit(X,y)
    coef = pd.Series(model_lasso.coef_,index=X.columns)
    print(coef.sort_values(ascending=False))
def make_model_and_predict(train_file, test_file):
    """Given name of training csv file, name of test csv file, constructs
    a random forest model and outputs predictions to a time-stampled csv file.
    If the test_file has SalaryNormalized as an attribute, it will score the
    model and write the result in the file "score<datetime>"
    """

    train = pd.read_csv(train_file)
    valid = pd.read_csv(test_file)
    number_of_word_features = 200
    title_words = count_words_in_column(train, "Title")
    key_count_pairs = [(k,v) for (k,v) in title_words.items() if k not in
                                                stopwords.words('english')]

    key_count_pairs.sort(key=lambda kv: -kv[1])

    for word, count in key_count_pairs[:number_of_word_features]:
        add_appearance_count_feature(train, word, "Title")
        add_appearance_count_feature(valid, word, "Title")


    group_features = ["LocationNormalized", "Category", "Company", "SourceName"]

    for f in group_features:
        continuize_feature(train, valid, f, "SalaryNormalized")

    feature_columns = train.columns[12:]

    feature=train[feature_columns]
    label=train.SalaryNormalized
    clf = LassoCV()
    clf.fit(feature, label)

    valid_salary_predict = clf.predict(valid[feature_columns])
    valid["SalaryNormalized_Predict"] = valid_salary_predict

    date_string = re.sub("[ :.]", "", str(datetime.datetime.now()))
    predict_filename = 'predict' + date_string + '.csv'
    score_filename = 'score' + date_string + '.txt'
    with open(predict_filename,'wb') as f:
        valid[["Id","SalaryNormalized_Predict"]].to_csv(f, index=False,
                                                    header=False)

    ##Computes average RMS error and writes score to file
    if hasattr(valid, 'SalaryNormalized'):
        score = 0
        for i,_ in enumerate(valid["SalaryNormalized_Predict"]):
            score += (valid.SalaryNormalized[i] -
                                valid.SalaryNormalized_Predict[i]) **2
        score = math.sqrt(score/len(valid["SalaryNormalized_Predict"]))
        with open (score_filename, 'wb') as f:
            f.write("Train: " + train_file + "\n")
            f.write("Test: " + test_file + "\n")
            f.write("Score: " + str(score) + "\n")
def lassocv_n_random_lasso(X, y, n_iter = 30, test_size = 0.2,
                           max_iter = 50000, n_resampling = 2000):
    # find a good alpha using cv
    ss = ShuffleSplit(X.shape[0], n_iter, test_size)
    reg = LassoCV(normalize = True, cv = ss, max_iter = max_iter)
    reg.fit(X, y)
    reg = RandomizedLasso(alpha = reg.alpha_,
                          n_resampling = n_resampling,
                          max_iter = max_iter, normalize = True)
    reg.fit(X, y)
    rank = reg.scores_.argsort()[::-1]
    return (rank, reg.scores_[rank])
def lassoRegularization(X,Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: prints the CV-selected alpha and the training RMSE at that alpha
    """
    tuningAlpha = [0.1,0.01,0.001]
    lasso = LassoCV(normalize=True, alphas=tuningAlpha, cv=10)
    lasso.fit(X,Y)
    prediction = lasso.predict(X)

    print
    print "LASSO REGULARIZATION"
    print "Best Alpha value for Lasso Regularization : " + str(lasso.alpha_)
    print 'Training RMSE at that Alpha =', np.sqrt(mean_squared_error(Y, prediction))
Example #10
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1e3, return_model=False):
    """LASSO polynomial fit with cross-validation.
    
    Regularized polynomial regression (by penalized least-squares) from a
    range of degrees up to n = max_deg. The LASSO regression minimises MSE and
    penalizes the size of the parameter vector using L1-norm, which leads to
    fewer coefficients in the fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default use 'x').
    - Supports NaNs.

    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')      # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)') # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:,1:max_deg+1], y_)
    y_pred = lasso.predict(X[:,1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
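A rough usage sketch of the same idea; to stay self-contained it builds the polynomial design with np.vander instead of patsy's Poly coding, so read it as an illustration of CV-selected LASSO polynomial fitting rather than a drop-in call to lasso_cv above:

import numpy as np
from sklearn.linear_model import LassoCV

rng = np.random.default_rng(1)
x = np.linspace(-1, 1, 200)
y = 1.0 - 2.0 * x + 0.5 * x**3 + 0.1 * rng.standard_normal(x.size)
y[[5, 50]] = np.nan                        # NaN samples are simply dropped before fitting

ok = ~np.isnan(y)
max_deg = 5
X = np.vander(x, N=max_deg + 1, increasing=True)[:, 1:]   # columns x, x^2, ..., x^5

lasso = LassoCV(cv=10, max_iter=10000).fit(X[ok], y[ok])
y_pred = lasso.predict(X)                  # predict on all x, including the NaN rows
print("alpha:", lasso.alpha_, "nonzero terms:", int(np.sum(lasso.coef_ != 0)))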
Example #11
class Trainer:
    clf = None
    svm = None

    def __init__(self):
        if config.model == 'SVM':
            self.svm = svm.SVC(kernel='linear', shrinking=True, verbose=False)
            params = {
                'C': np.logspace(-5, -1, num=20), # Range of C values
            }
            self.clf = GridSearchCV(self.svm, params,
                cv      = 5,           # k-fold CV
                n_jobs  = cpu_count(), # Parallelize over CPUs
                verbose = 1,
            )

        elif config.model == 'Regression':
            self.clf = LassoCV(
                cv         = 3,
                max_iter   = 2000,
                n_jobs     = cpu_count(),
                verbose    = True,
            )

    def train(self, featMat, persist=True):
        # Preprocess
        scaler = StandardScaler()
        featMat.X = scaler.fit_transform(featMat.X, featMat.y)

        # Save preprocess output
        self.scaler = scaler
        if persist:
            joblib.dump(scaler, 'preprocess.out')

        # Perform CV
        print('Running trainer on %d rows of data with %d features.' % featMat.X.shape)
        self.clf.fit(featMat.X, featMat.y)

        # Save CV output
        if config.model == 'SVM':
            self.estimator = self.clf.best_estimator_
        elif config.model == 'Regression':
            self.estimator = self.clf
        print(self.estimator)

        if persist:
            joblib.dump(self.clf, 'cv.out')
class LocalRegression:
    """This class implements "local" regression. Given a set of training data and a set of unknown data,
           iterate through each unknown spectrum, find the nearest training spectra, and generate a model.
           Each of these local models is optimized using built-in cross validation methods from scikit."""
    def __init__(self, params, n_neighbors = 250):
        """Initialize LocalRegression

        Arguments:
        params = Dict containing the keywords and parameters for the regression method to be used.

        Keyword arguments:
        n_neighbors = User-specified number of training spectra to use to generate the local regression model for each
                      unknown spectrum.

        """
        self.model = LassoCV(**params) # For now, the only option is LASSO. Other methods to be added in the future
                                       # params is a dict containing the keywords and parameters for LassoCV

        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit_predict(self,x_train,y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            _, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

            # LassoCV runs its own internal cross-validation during fit;
            # the GroupKFold splitter previously built here was never used.
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
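A minimal usage sketch for LocalRegression (synthetic spectra; the params dict simply forwards keyword arguments to LassoCV):

import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.neighbors import NearestNeighbors

rng = np.random.default_rng(2)
x_train = rng.random((500, 40))                 # 500 training "spectra"
y_train = 3.0 * x_train[:, 0] + 0.05 * rng.standard_normal(500)
x_unknown = rng.random((3, 40))                 # 3 unknown spectra

local = LocalRegression({'cv': 3, 'max_iter': 10000}, n_neighbors=100)
preds, coeffs, intercepts = local.fit_predict(x_train, y_train, x_unknown)
print(preds)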
Example #13
def lassocvclassifier(training_samples, eval_samples, vectorizer, do_grid_search=False):
    X_train, Y_train = training_samples
    X_eval, Y_eval = eval_samples
    #clf = SGDClassifier(loss='log', penalty= 'l2',l1_ratio=0.0, n_iter=30, shuffle=True, verbose=False, 
    #                    n_jobs=4, alpha=1e-4, average=True, class_weight=None)
    clf = LassoCV()
   
    clf.fit(X_train, Y_train)
    #y_train_true, y_train_pred = Y_train, clf.predict(X_train)
    print_top_10_words = True
    
    
    scores = cross_validation.cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5, scoring='log_loss')
    print scores, np.mean(scores), np.median(scores)

    print(clf)
    #scores = cross_validation.cross_val_score(clf.best_estimator_, X_train, Y_train, cv=10, scoring='log_loss')
    #print scores, np.mean(scores), np.median(scores)
    y_true, y_pred = Y_eval, clf.predict(X_eval)
    # Note: LassoCV is a regressor and has no predict_proba; its continuous
    # predictions are used as stand-in scores here.
    y_prob = y_pred
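The commented-out SGDClassifier and the log-loss scoring suggest a classifier was intended here; if so, a cross-validated L1-penalized logistic regression is the closer tool (a sketch, not the original author's code):

from sklearn.linear_model import LogisticRegressionCV

clf = LogisticRegressionCV(
    Cs=10, cv=5, penalty='l1', solver='liblinear', scoring='neg_log_loss'
)
clf.fit(X_train, Y_train)
y_prob = clf.predict_proba(X_eval)      # genuine class probabilities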
Example #14
	def _regression( self, i_start, i_end ):
		"""
		Model of Lasso
		"""
		X, y = self._AssembleRegressionData_i( i_start, i_end )

		lasso = LassoCV( cv = 10, fit_intercept = True )
		lasso.fit( X, y )

		# TODO: also return the regression coefficients. Extracting them
		# from LassoCV doesn't quite work yet, and the output needs to be
		# updated to show coefficients for predict.
		#	reg_coefficients = list( lasso.coef_ )
		#	print reg_coefficients
		res = { "reg_result" : lasso }

		return res
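For the TODO above: after fitting, the selected penalty and the coefficient vector are plain attributes of the LassoCV object, so they can be returned directly (a sketch of the missing piece):

lasso = LassoCV( cv = 10 ).fit( X, y )
res = {
    "reg_result": lasso,
    "reg_alpha": lasso.alpha_,                 # CV-selected penalty
    "reg_coefficients": list( lasso.coef_ ),   # one weight per column of X
    "reg_intercept": lasso.intercept_,
}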
def get_model_per_cluster(X, Y):
    model_per_cluster = {}
    for c in X.cluster.unique():    
        X_cluster = X[X.cluster==c]
        Y_true = Y[Y.cluster == c].ALSFRS_slope
        
        regr = LassoCV(cv=5)
        regr.fit(X_cluster, Y_true)

        print 'cluster: %d size: %s' % (c, Y_true.shape)
        Y_predict = regr.predict(X_cluster)
        print "\t RMS error (0 is perfect): %.2f" % np.sqrt(np.mean(
            (Y_predict - Y_true) ** 2))
        residual_SS = ((Y_predict - Y_true) ** 2).sum()
        total_SS = ((Y_true - Y_true.mean()) ** 2).sum()
        print '\t coefficient of determination R^2 = %.2f ' % (1.0 - residual_SS/total_SS) # regr.score(X_cluster, Y_true)
        cov = sum((Y_predict - Y_predict.mean())*(Y_true - Y_true.mean()))
        Y_predict_std = np.sqrt(sum((Y_predict - Y_predict.mean())**2))
        Y_true_std = np.sqrt(sum((Y_true - Y_true.mean())**2))
        print '\t pearson correlation r = %.2f ' % (cov/(Y_predict_std*Y_true_std)) # scipy.stats.pearsonr(Y_predict, Y_true)[0]
        print "3 sample predictions: ", regr.predict(X_cluster)[:3]
        model_per_cluster[c] = {"cluster_train_data_means": X_cluster.mean(), "model" : regr}
    return model_per_cluster
Example #16
def Lasso_model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    real_train_tar=np.expm1(train_linear_tar)
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400), )
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    write_pkl(lassocv_alpha, '/Users/vickywinter/Documents/NYC/Machine Learning Proj/Pickle/lasso_params.pkl')
    return test_prediction_lasso
feature_engg_linreg_mse_train = metrics.mean_squared_error(
    train_data["casual_log"], feature_engg_linreg_model.predict(train_data.drop(target, axis=1))
)

feature_engg_linreg_mse_test = metrics.mean_squared_error(
    test_data["casual_log"], feature_engg_linreg_model.predict(test_data.drop(target, axis=1))
)


# Not much difference? > Doesn't look like we are overfitting!

# But how to perform shrinkage/penalized regression in general?

from sklearn.linear_model import LassoCV

feature_engg_lassocv_model = LassoCV(max_iter=50, cv=3, n_jobs=-1, random_state=42)

feature_engg_lassocv_model.fit(train_data.drop(target, axis=1), train_data["casual_log"])

feature_engg_lassocv_mse_train = metrics.mean_squared_error(
    train_data["casual_log"], feature_engg_lassocv_model.predict(train_data.drop(target, axis=1))
)

feature_engg_lassocv_mse_test = metrics.mean_squared_error(
    test_data["casual_log"], feature_engg_lassocv_model.predict(test_data.drop(target, axis=1))
)


# Check the performance on test set
print feature_engg_linreg_mse_test
print feature_engg_lassocv_mse_test
# Penalization decreases performance?
Example #18

"""Вопрос 2. Какой признак линейная регрессия считает наиболее сильно влияющим на качество вина?"""
linreg_coef = pd.DataFrame({'coef': linreg.coef_}, index=X_train.columns)
linreg_coef.sort_values(by='coef', inplace=True)


lasso1 = Lasso(alpha=0.01, random_state=17)
lasso1.fit(X_train_scaled, y_train)

lasso1_coef = pd.DataFrame({'coef': lasso1.coef_}, index=X_train.columns)
lasso1_coef.sort_values(by='coef', inplace=True)

alphas = np.logspace(-6, 2, 200)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=17)
lasso_cv.fit(X_train_scaled, y_train)


"""Вопрос 3. Какой признак "обнулился первым" в настроенной модели LASSO?"""
lasso_cv_coef = pd.DataFrame({'coef': lasso_cv.coef_}, index=X_train.columns)
lasso_cv_coef.sort_values(by='coef', inplace=True)

print("Mean squared error (train): %.3f" % mean_squared_error(y_train, lasso_cv.predict(X_train_scaled)))
print("Mean squared error (test): %.3f" % mean_squared_error(y_holdout, lasso_cv.predict(X_holdout_scaled)))


forest = RandomForestRegressor(random_state=17)
forest.fit(X_train, y_train)

print("Mean squared error (train): %.3f" % mean_squared_error(y_train, forest.predict(X_train)))
print("Forest Mean squared error (cv): %.3f" % cross_val_score(forest, X_train, y_train, scoring='neg_mean_squared_error').mean())
rf_mask = rf.feature_importances_ > 0.10
#reduced_X = X.loc[:, rf_mask]

#3.2 RF + RFE

rfe = RFE(estimator=rf, n_features_to_select=5, step=2, verbose=0)
rfe.fit(X_train, y_train)
rfe_mask = rfe.support_ 
print("{0:.1} RF RFE R^2 on test set.".format(rfe.score(X_test, y_test)))
mse = MSE(y_test, rfe.predict(X_test))
print("{} RF RFE RMSE on test set.".format(mse**0.5))


#3.3 LassoCV
lasso_CV = LassoCV(n_alphas=250, cv=4, n_jobs=2)
lasso_CV.fit(X_train, y_train)
lcv_mask = lasso_CV.coef_ != 0
print('{} features out of {} selected'.format(sum(lcv_mask), len(lcv_mask)))
lasso_CV_coefs =  dict(zip(lasso_CV.coef_.round(4), X_sc.columns))
print("{0:.1} LassoCV R^2 on test set.".format(lasso_CV.score(X_test, y_test)))
mse = MSE(y_test, lasso_CV.predict(X_test))
print("{} LassoCV RFE RMSE on test set.".format(mse**0.5))

#3.4 Gradient Boosting
gbr = GBR(n_estimators=250)
rfe = RFE(estimator=gbr, n_features_to_select=5, step=2, verbose=0)
rfe.fit(X_train, y_train)
gbr_mask = rfe.support_ 
print("{0:.1} GBR RFE R^2 on test set.".format(rfe.score(X_test, y_test)))
mse = MSE(y_test, rfe.predict(X_test))
print("{} GBR RFE RMSE on test set.".format(mse**0.5))
Example #20
y_val = map(lambda x: int(x), y_test)
test_pred = map(lambda x: int(x), test_pred)

print("="*60)
print("Logistic Regression with L1-based feature selection(0.001)")
print("="*60)

print(classification_report(y_test, test_pred))
print('Accuracy: %s\n' % accuracy_score(test_pred, y_test))

######################## Logistic Regression with L1-based feature selection (CV) ################################

features = tfidf.fit_transform(X_train).toarray()

lassocv = LassoCV()
lassocv.fit(features, y_train)
alpha = lassocv.alpha_

lor_l1 = Pipeline([
	('vect', tfidf),
	('tfidf', TfidfTransformer()),
	('predict', Lasso(alpha=alpha, max_iter=15)),
	])

lor_l1.fit(X_train, y_train)
test_pred = lor_l1.predict(X_test)

y_val = map(lambda x: int(x), y_test)
test_pred = map(lambda x: int(x), test_pred)

print("="*60)
Example #21
# for alpha in alphas:
#     clf = Lasso(alpha)
#     test_score = np.sqrt(-cross_val_score(clf, X_train, sale_price, cv=10, scoring='neg_mean_squared_error'))
#     test_scores.append(np.mean(test_score))
# plt.plot(alphas, test_scores)
# plt.title("Alpha vs CV Error")
# plt.show()

# Parameter tuning (alpha) for Lasso regression - method 2
# Rewritten (inspired by a public kernel)
lscv = LassoCV(alphas=None, copy_X=True, cv=10, eps=0.001, fit_intercept=True,
               max_iter=5000, n_alphas=1000, n_jobs=1, normalize=False, positive=False,
               precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
               verbose=False)

lscv.fit(X_train, sale_price)
# the best alpha is lscv.alpha_
# print (lscv.alpha_)
# gives 0.000482366054317

lasso = Lasso(alpha=lscv.alpha_, max_iter=5000)
lasso.fit(X_train, sale_price)
# Since the model is trained on log of sale price, back transforming it for output
y_lasso = np.exp(lasso.predict(X_test))
print y_lasso

y_final = (0.7 * y_ridge) + (0.3 * y_lasso)

final_output = pd.DataFrame(data= {'Id' : test.index, 'SalePrice': y_final})
print final_output.head(10)
final_output.to_csv('output_ridge.csv', index=False)
Example #22
training_data.loc[training_data['smoker'] == 'yes', 'smoker'] = 1
training_data.loc[training_data['smoker'] == 'no', 'smoker'] = 0

training_data.loc[:, 'northeast'] = training_data['region'] == 'northeast'
training_data.loc[:, 'northwest'] = training_data['region'] == 'northwest'
training_data.loc[:, 'southeast'] = training_data['region'] == 'southeast'
training_data.loc[:, 'southwest'] = training_data['region'] == 'southwest'
del training_data['region']

x_train = training_data[['age', 'is_male', 'bmi', 'children', 'smoker', 'northeast', 'northwest', 'southeast', 'southwest']]\
    .to_numpy(dtype=np.float64)
y_train = training_data['charges'].to_numpy(dtype=np.float64)

model = LassoCV(normalize=True)
model.fit(x_train, y_train)

raw_test_data = pd.read_csv('public_dataset/test_sample.csv')
test_data = raw_test_data.copy()

test_data.loc[test_data['sex'] == 'female', 'sex'] = 0
test_data.loc[test_data['sex'] == 'male', 'sex'] = 1
test_data.rename(columns={'sex': 'is_male'}, inplace=True)

test_data.loc[test_data['smoker'] == 'yes', 'smoker'] = 1
test_data.loc[test_data['smoker'] == 'no', 'smoker'] = 0

test_data.loc[:, 'northeast'] = test_data['region'] == 'northeast'
test_data.loc[:, 'northwest'] = test_data['region'] == 'northwest'
test_data.loc[:, 'southeast'] = test_data['region'] == 'southeast'
test_data.loc[:, 'southwest'] = test_data['region'] == 'southwest'
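The manual region one-hot encoding above can be written more compactly with pandas (an equivalent alternative, not the original code):

import pandas as pd

# expands 'region' into boolean northeast/northwest/southeast/southwest columns
test_data = pd.get_dummies(test_data, columns=['region'], prefix='', prefix_sep='')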
    def get_keywords_with_regression(self,
                                     random_seed=923,
                                     ngram_range=(1, 1),
                                     min_df=0.01,
                                     max_df=0.85,
                                     apply_smote=True):
        """
        Get the keywords using LASSO's feature selection technique
        """
        # a dataframe of all the reviews
        df = pd.DataFrame({"review_text": self.corpus, "labels": self.labels})
        # Get positive Examples
        pos_df = df.loc[df['labels'] == 1, :].reset_index(drop=True)

        # Get negative examples
        neg_df = df.loc[df['labels'] == 0, :].reset_index(drop=True)

        # get the shape
        pos_count = pos_df.shape[0]
        neg_count = neg_df.shape[0]

        if min([pos_count, neg_count]) < 100:
            print('Warning: Number of minority class less than 100')

        if min([pos_count, neg_count]) / max([pos_count, neg_count]) < 0.333:
            print("Class imbalance detected")
        # initialize the variables
        #salient_terms = dict()

        # removed the iteration
        df_combine = pos_df.append(neg_df)
        df_combine.reset_index(drop=True, inplace=True)
        target = df_combine['labels']

        # Fit the TFidf vectorizer
        vec = TfidfVectorizer(ngram_range=ngram_range,
                              tokenizer=self.tokenizer,
                              min_df=min_df,
                              max_df=max_df)
        vec_f = vec.fit(df_combine['review_text'])

        # Create the training document-term matrix
        train_dtm = vec_f.transform(df_combine['review_text'])

        if apply_smote:
            # Apply SMOTE to fix the class imbalance problem
            sm = SMOTE(random_state=923, ratio=1)

            X_input, y_input = sm.fit_resample(train_dtm, target)
        else:
            X_input, y_input = train_dtm, target
        """
        Modeling
        """
        lasso = LassoCV(cv=5)
        lasso_f = lasso.fit(X_input, y_input)
        """
        Construct the coeficient table
        """
        coef_table = pd.DataFrame({
            "feature": vec_f.get_feature_names(),
            "coef": lasso_f.coef_
        })

        # Select only the positive coefficients
        key_terms_df = coef_table[['feature', 'coef']][coef_table['coef'] > 0]

        key_terms_df.reset_index(drop=True, inplace=True)
        """
        Next find the DF for the terms 
        """
        # Get the index for each of the terms with positive label
        train_feature_tb = pd.DataFrame(train_dtm.toarray())
        train_feature_tb.columns = vec_f.get_feature_names()

        dtm_key_terms = train_feature_tb[list(key_terms_df['feature'])]

        for term in dtm_key_terms:
            dtm_key_terms[term] = [x != 0 for x in dtm_key_terms[term]]
            # replace the vectors in the tf-idf table with a boolean vector

        frequency_count = pd.DataFrame(dtm_key_terms.sum(axis=0)).reset_index()
        frequency_count.columns = ['feature', 'count']
        frequency_count[
            'prop'] = frequency_count['count'] / dtm_key_terms.shape[0]

        # Join the two tables on features to get all the info needed
        significant_terms = pd.merge(key_terms_df,
                                     frequency_count,
                                     how="left",
                                     on="feature")
        """
        As a last step, let's tag each term and separate into Nouns, Verbs and Adjs
        """
        class output:
            significant_terms_tb = significant_terms
            dtm_boolean = dtm_key_terms
            #salient_terms_dict = salient_terms

        return output
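A condensed, self-contained sketch of the core move in get_keywords_with_regression: fit LassoCV on a TF-IDF matrix against binary labels and keep the terms with positive coefficients (the toy corpus and parameters are made up for illustration, and SMOTE is omitted):

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LassoCV

corpus = ["great battery life", "battery died fast", "great screen",
          "screen cracked fast", "great value", "died on arrival"]
labels = [1, 0, 1, 0, 1, 0]

vec = TfidfVectorizer()
X = vec.fit_transform(corpus).toarray()

lasso = LassoCV(cv=3).fit(X, labels)
coef_table = pd.DataFrame({"feature": vec.get_feature_names_out(),
                           "coef": lasso.coef_})
print(coef_table[coef_table["coef"] > 0])   # terms pushing toward label 1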
Example #24
#miss_cal = test_data[quality].isnull().sum().sort_values(ascending=False)
#miss = miss_cal[miss_cal>0].index
#test_data[miss] = test_data[miss].fillna(test_data[miss].mean())
#print test_data[miss].isnull().sum()
#print train_data_x.shape
#print train_data_y
#print test_data[np.isnan(test_data).any]

#line_reg = LinearRegression()#0.508
#line_reg.fit(new_tr_x,train_data_y)
re1 = list(test_data['Id'])
ind = test_data['Id']
train_data_x = train_data_x.drop('Id', axis=1)
test_data = test_data.drop('Id', axis=1)
las = LassoCV()
las.fit(new_tr_x, train_data_y)
para = {
    'n_alphas': [i for i in range(50, 200)],
    'max_iter': [j for j in range(500, 2000)]
}
clt = GridSearchCV(las, param_grid=para)
clt.fit(train_data_x, train_data_y)
test_data_y3 = clt.predict(test_data)
#train_data_x = preprocessing.normalize(train_data_x)
#test_data = preprocessing.normalize(test_data)
boostreg = GradientBoostingRegressor(
)  #learning_rate=0.016,n_estimators=1000)#0.161
#boostreg.fit(new_tr_x, train_data_y)
#train_data_y = np.log1p(train_data_y)
boostreg.fit(train_data_x, train_data_y)
#my_pip = RandomForestRegressor()#0.172
def yuchuli(file_path):
    global special
    alphaslist = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100, 1000,
                  10000]  # list of candidate alphas
    df = pd.read_excel(file_path)
    head_list = df.columns.values
    special_head = head_list[special]
    drop_index.remove(special)
    dataset = df.drop(df.columns[drop_index], axis=1)
    new_head_list = dataset.columns.values
    new_special = list(new_head_list).index(special_head)
    special = new_special  # column index of the target feature in the filtered dataset

    # h = 13  # which city
    # dataset = dataset[20 * h - 20:20 * h]
    dataset = dataset.fillna(0.1)
    y = dataset.iloc[:, special]
    x = dataset.drop(dataset.columns[[special]], axis=1)
    po = PolynomialFeatures(degree=2,
                            interaction_only=False,
                            include_bias=False)
    x_poly = po.fit_transform(x)
    x_change = pd.DataFrame(x_poly, columns=po.get_feature_names())
    labels = x_change.columns
    X_train, X_test, Y_train, Y_test = train_test_split(x_change,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)
    model2 = LassoCV(alphas=alphaslist)  # build the model with the candidate alphas
    model2.fit(X_train, Y_train)  # fit on the training data
    # model2 = Lasso(max_iter = iterations, alphas=1).fit(X_train,Y_train)
    index = model2.coef_
    newindex = []
    if len(labels) == len(index):
        for i in range(len(index)):
            if abs(index[i]) >= 0.00001:  # keep coefficients with |value| >= 0.00001
                newindex.append(i)
    ypre = model2.predict(X_test)
    x_changenew = x_change.iloc[:, newindex]  # form the reduced dataset

    # Redo the regression using the newly selected features --------------------------------------------------------

    X_train, X_test, Y_train, Y_test = train_test_split(x_changenew,
                                                        y,
                                                        test_size=0.2,
                                                        random_state=1)
    model2 = RidgeCV(alphas=alphaslist)  # build the model with the candidate alphas
    model2.fit(X_train, Y_train)  # fit on the training data
    # model2 = Lasso(max_iter=iterations).fit(X_train, Y_train)
    index = model2.coef_
    gongshi = []
    for i in range(len(index)):
        if index[i] >= 0 and i != 0:
            gongshi.append('+')
        gongshi.append('%.5f' % index[i] + labels[i])

    return ypre, Y_test, gongshi
Example #26
print('Linear Regression RMSE:%0.5f'%np.sqrt(metrics.mean_squared_error(y_train,y_pred_lr)))
print('Linear Regression R^2: %0.5f' %metrics.explained_variance_score(y_train,y_pred_lr))
print ("predict time:", round(time.time()-t1, 3), "s")

# Store Standard Error
se_lr = stats.sem(y_pred_lr)

"""<h2 id="4.-Lasso-Regression-model">4. Lasso Regression model<a class="anchor-link" href="#4.-Lasso-Regression-model">¶</a></h2><hr/>

#### For Train mse
"""

# Initialise Lasso Regression model
lcv = LassoCV()
t0=time.time()
lcv.fit(X_train, y_train)

# Make Prediction
t1=time.time()
y_pred_lcv = lcv.predict(X_test)

# Return Results
print('Lasso Regression MAE: %0.5f'%metrics.mean_absolute_error(y_test,y_pred_lcv))
print('Lasso Regression MSE:%0.5f'%metrics.mean_squared_error(y_test,y_pred_lcv))
print('Lasso Regression RMSE:%0.5f'%np.sqrt(metrics.mean_squared_error(y_test,y_pred_lcv)))
print('Lasso Regression R^2: %0.5f' %metrics.explained_variance_score(y_test,y_pred_lcv))
print ("fitting time:", round(time.time()-t0, 3), "s")
print ("predict time:", round(time.time()-t1, 3), "s")

se_lcv = stats.sem(y_pred_lcv)
Example #27
y_pred1=LR.predict(scaler.transform([[300,110,5,5,5,10,1]]))


# In[31]:


y_pred1


# In[32]:


# Lasso Regularization
# LassoCV returns the best alpha and coefficients after performing 10-fold cross-validation
lasscv = LassoCV(alphas = None,cv =10, max_iter = 100000, normalize = True)
lasscv.fit(x_train, y_train)


# In[33]:


# best alpha parameter
alpha = lasscv.alpha_
alpha


# In[34]:


#now that we have best parameter, let's use Lasso regression and see how well our data has fitted before
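The excerpt ends here; the step the comment announces would plausibly look like this (a sketch, assuming the same x_train/y_train and a held-out x_test/y_test split from earlier cells):

from sklearn.linear_model import Lasso

lasso_reg = Lasso(alpha=alpha, max_iter=100000, normalize=True)  # alpha from LassoCV above
lasso_reg.fit(x_train, y_train)
print(lasso_reg.score(x_test, y_test))   # R^2 with the tuned alpha (x_test/y_test assumed)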
Example #28
#%% LASSO
"""
Lasso Regression : The cost function for Lasso (least absolute shrinkage and selection operator) 
regression can be written as
Just like Ridge regression cost function, for lambda =0, the equation above reduces the equation.
The only difference is instead of taking the square of the coefficients, magnitudes are 
taken into account. This type of regularization (L1) can lead to zero coefficients i.e. some 
of the features are completely neglected for the evaluation of output. So Lasso regression not 
only helps in reducing over-fitting but it can help us in feature selection. 
Just like Ridge regression the regularization parameter (lambda) can be controlled.
"""
#%% LASSO CV test different alphas
lass_cv = LassoCV(alphas=[1, 0.7, 0.5, 0.3, 0.1, 0.01, 0.001, .0005, .0001, .00001], cv=3, tol=.001, random_state=67, normalize=True)

# fit to data
lass_cv.fit(X_sample, y_sample)

# score on the same sample data (a training score, not a held-out test set)
lass_cv.score(X_sample, y_sample)

# best alpha
lass_cv.alpha_

#%% Test more alphas based off of last cell
lass_cv = LassoCV(alphas=[.008, .009, .0001, .0002, .0003, .0004], cv=3, random_state=67, normalize=True)

# fit to data
lass_cv.fit(X_sample, y_sample)

# score on the same sample data (a training score, not a held-out test set)
lass_cv.score(X_sample, y_sample)
Example #29
def reg_pro_aprobacion():
    datos = pd.read_csv('Datos/Datos_MEN.csv', header=0)
    # get the year column from the data
    datos['AÑO'] = datos['AÑO'].astype('int64')
    # take only the values of that column
    X = datos['AÑO'].values
    X = X[:, np.newaxis, ]
    # get the approval-rate column from the data
    y = datos['APROBACIÓN_MEDIA'].values

    # Mean approval rate
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5)

    # Create the linear models
    lr = LinearRegression()
    rgl = LassoCV(cv=4)
    rgr = RidgeCV(alphas=[0.1, 0.2, 0.5, 1.0, 3.0, 5.0, 10.0])

    # # train the models

    # Mean approval rate
    lr.fit(X_train, y_train)
    rgl.fit(X_train, y_train)
    rgr.fit(X_train, y_train)

    # make the predictions
    # Mean approval rate
    y_pred = lr.predict(X_test)
    y_predrgl = rgl.predict(X_test)
    y_predrgr = rgr.predict(X_test)

    # periods to project -----------------------------------------------------------------------------
    año_actual = datetime.datetime.now().year
    periodos = [
        año_actual, año_actual + 1, año_actual + 2, año_actual + 3,
        año_actual + 4
    ]
    periodos = np.reshape(periodos, (-1, 1))

    # Mean approval rate
    y_pro2 = lr.predict(periodos)
    y_prorgl2 = rgl.predict(periodos)
    y_prorgr2 = rgr.predict(periodos)

    print('APPROVAL MODEL DATA')
    print('Ordinary Least Squares regression')
    # Coefficient
    # m_coe =lr.coef_
    print('Coefficients:', lr.coef_)
    # MSE
    # m_mse = "{0:.4f}".format(np.mean((lr.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean(
        (lr.predict(X_test) - y_test)**2))
    # Explained variance
    # m_ve = "{0:.4f}".format(abs(lr.score(X_test,y_test)))

    print('Explained variance: %.2f\n' % lr.score(X_test, y_test))

    print('Lasso regression')
    # Coefficient
    # l_coe =  rgl.coef_
    print('Coefficients:', rgl.coef_)
    # MSE
    # l_mse = "{0:.4f}".format(np.mean((rgl.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean(
        (rgl.predict(X_test) - y_test)**2))
    # Explained variance
    # l_ve = "{0:.4f}".format(abs(rgl.score(X_test, y_test)))
    print('Explained variance: %.2f\n' % rgl.score(X_test, y_test))

    print('Ridge regression')
    # Coefficient
    # r_coe = rgr.coef_
    print('Coefficients:', rgr.coef_)
    # MSE
    # r_mse = "{0:.4f}".format((np.mean(rgr.predict(X_test) - y_test) ** 2))
    print("Residual sum of squares: %.2f" % np.mean(
        (rgr.predict(X_test) - y_test)**2))
    # Explained variance
    # r_ve = "{0:.4f}".format(abs(rgr.score(X_test,y_test)))
    print('Explained variance: %.2f\n' % rgr.score(X_test, y_test))

    fig1 = plt.figure(figsize=(12, 8), dpi=120)

    fig1.subplots_adjust(hspace=0.5, wspace=0.5)
    ax = fig1.add_subplot(2, 1, 1)
    ax.scatter(X, y)
    ax.set_xlabel('Periods')
    ax.set_ylabel('Approval rate')
    ax.set_title('Approval plot')

    ax2 = fig1.add_subplot(2, 1, 2)
    ax2.scatter(X_test, y_test, color='black')
    ax2.plot(X_test, y_pred, color='blue', linewidth=3, label=u'OLS regression')
    ax2.plot(X_test,
             y_predrgl,
             color='yellow',
             linewidth=3,
             label=u'Lasso regression')
    ax2.plot(X_test,
             y_predrgr,
             color='green',
             linewidth=3,
             label=u'Ridge regression')
    ax2.set_title(u'Mean approval regressed by 3 different methods')
    ax2.set_xlabel('Periods')
    ax2.set_ylabel('Mean-approval regression')
    # plt.legend()
    # ax2.set_xticks(())
    # ax2.set_yticks(())
    ruta_reg = "static/file/regresion_aprobacion.png"
    fig1.savefig(ruta_reg)
    # plt.show()

    fig01 = plt.figure(figsize=(12, 8), dpi=120)

    fig01.subplots_adjust(hspace=0.5, wspace=0.5)
    ax = fig01.add_subplot(1, 1, 1)
    ax.scatter(periodos, y_pro2, color='blue')
    ax.scatter(periodos, y_prorgl2, color='yellow')
    ax.scatter(periodos, y_prorgr2, color='green')
    # ax2.plot(periodos, y_pred, color='blue',linewidth=3, label=u'OLS regression')
    # ax2.plot(periodos, y_predrgl, color='yellow',linewidth=3, label=u'Lasso regression')
    # ax2.plot(periodos, y_predrgr, color='green',linewidth=3, label=u'Ridge regression')
    ax.set_title(u'Projection of the next 5 school periods')
    ax.set_xlabel('Periods')
    ax.set_ylabel('Projected mean approval')
    ruta_pro = "static/file/pro_aprobacion.png"
    fig01.savefig(ruta_pro)

    return ruta_reg, ruta_pro
Example #30
def pert_test(clusters, energies, counts, comps, noise=0.1, Normalize=True, Intercept=True, Energy_above_hull = True, name=''):
    fold_pick = 10
    lasso_coefs = []
    ridge_coefs = []
    linreg_coefs = []
    counts = np.array(counts)
    energies = np.array(energies)
    ###- scale to energy above hull -###
    if Energy_above_hull == True:
        y1 = min(energies)
        y2 = max(energies)
        x2 = min(comps)
        x1 = max(comps)
        scale = lambda x0, y0, x1, y1, x2, y2: abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt(
            (y2 - y1) ** 2 + (x2 - x1) ** 2)
        energies = [scale(comps[i], energies[i], x1, y1, x2, y2) for i in range(len(energies))]

    ###- Set up output file -###
    file_out = 'pert_summary.txt'
    file = open(file_out, 'w')

    ###- Set up alphas for CV -###
    alpha_range = [-10, 10]
    alpha_lasso = np.logspace(alpha_range[0], alpha_range[1], num=1000)
    n_alphas = 1010
    alpha_ridge = np.logspace(-15, 10, n_alphas)

    # LASSO and RIDGE, Cross-Validation, Lin Reg without CV
    lassocv = LassoCV(alphas=alpha_lasso, normalize=Normalize, fit_intercept=Intercept, cv=fold_pick, max_iter=1e5)
    ridgecv = RidgeCV(alphas=alpha_ridge, normalize=Normalize, fit_intercept=Intercept, cv=None, store_cv_values=True)
    linreg = LinearRegression(fit_intercept=Intercept, normalize=Normalize)
    # Fit to data for each method
    noise = np.linspace(0.001,1,25)
    lasso_vars = [[] for _ in range(len(clusters))]
    for n in noise:
        lasso_coefs = []
        ridge_coefs = []
        linreg_coefs = []
        lassocv.fit(counts, energies)
        lasso_coefs.append(lassocv.coef_)
        ridgecv.fit(counts, energies)
        ridge_coefs.append(ridgecv.coef_)
        linreg.fit(counts, energies)
        linreg_coefs.append(linreg.coef_)
        for i in range(100):
            data_noise = np.random.normal(0, n, counts.shape)
            counts_new = counts + data_noise
            data_noise = np.random.normal(0, n, energies.shape)
            energies_new = energies + data_noise
            lassocv.fit(counts_new, energies_new)
            lasso_coefs.append(lassocv.coef_)
            ridgecv.fit(counts_new, energies_new)
            ridge_coefs.append(ridgecv.coef_)
            linreg.fit(counts_new, energies_new)
            linreg_coefs.append(linreg.coef_)
        lasso_coefs = np.array(lasso_coefs)
        ridge_coefs = np.array(ridge_coefs)
        linreg_coefs = np.array(linreg_coefs)
        for i in range(len(clusters)):
            data = np.transpose(lasso_coefs[:, i])
            var = data.var()
            lasso_vars[i].append(var)
    for i in range(len(lasso_vars)):
        plt.plot(noise,lasso_vars[i])
    file.close()
    return
predictors = df_train.drop([target], axis=1)
X_train, X_test, y_train, y_test = train_test_split(predictors,
                                                    train_target,
                                                    train_size=1 - tam_test,
                                                    test_size=tam_test,
                                                    random_state=0)
X_pred = df_pred

# In[]: Lasso Model
lasso = LassoCV(alphas=[
    0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3,
    0.6, 1, 3, 6, 10, 30, 60, 100
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Try again for more precision with alphas centered around " + str(alpha))
lasso = LassoCV(alphas=[
    alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
    alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
    alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)
Example #32
class MovieTrainer(object):
    
    def __init__(self,training_file,test_file):
        self._training_pickle=training_file
        self._test_pickle=test_file
        
        #to be defined later
        self._list_of_dicts=None
        self._dataframe=None
        self._features=None
        self._test_features=None
        self._labels=None
        self._clf=None
        
        self._training_frame=None
        self._test_frame=None
        self._prediction_frame=None
        
        #dicts
        self._actor_dict=None
        self._director_dict=None
        self._genre_dict=None
        self._production_house=None
        
    
    def _load_dataframe(self):
        if os.path.isfile(self._training_pickle) ==True:
            self._training_dict=pickle.load(file(self._training_pickle))
        else:
            raise AttributeError("Cannot find pickle file:%s"%self._training_pickle)
        
        if os.path.isfile(self._test_pickle) ==True:
            self._test_dict=pickle.load(file(self._test_pickle))
        else:
            raise AttributeError("Cannot find pickle file:%s"%self._test_pickle)

        #load pandas frame
        self._training_frame=pd.DataFrame(self._training_dict)
        self._test_frame=pd.DataFrame(self._test_dict)
        
        #drop movies with no names (dropna returns a copy, so reassign)
        self._training_frame=self._training_frame.dropna(subset=["moviename"])
        self._test_frame=self._test_frame.dropna(subset=["moviename"])
        return
    
    def _addtodict(self,name,this_dict):
        if this_dict.has_key(name):
            this_dict[name]+=1
        else:
            this_dict[name]=1
        return
        
    def _modify_string(self,playername):
        playername = re.sub('^\s+|\s+$','', playername)
        playername=re.sub('\s+','_',playername)
        playername=re.sub('\*','',playername)
        return playername
    
    #this function creates a list of features
    #corresponding to the most frequent actors 
    #in a movie
    def _create_playerdict(self,frame,colname,num_features):
        
        playerdict={}
        
        
        for index in frame.index:
            #for each row, we have list of actors
            #like ['Sandra Bullock', 'Melissa McCarthy']
            playerlist=frame.ix[index,colname] 
            
            
            if type(playerlist)!=float:
                #only actors have multiple list members, other players
                #like director don't
                if colname=="actors":
                    for playername in playerlist:
                        #remove spaces, *, leading trailing spaces
                        playername=self._modify_string(playername)
                        self._addtodict(playername,playerdict)

                else:
                    playerlist=self._modify_string(playerlist)
                    self._addtodict(playerlist,playerdict)   
        
        
        
        counter=0
        feature_list=[]
        #sort the dict to get players with highest number of movies
        for key,value in sorted(playerdict.items(),key=lambda x:x[1],reverse=True):
            #print key,value
            feature_list.append(key)
            counter+=1
            if counter>num_features:
                break
        return feature_list
    
    #this function returns a value of the player features for 
    #each movie
    
    def _create_player_features(self,frame,colname,num_features):
        #feature_list is all names of players with most movies
        feature_list=self._create_playerdict(frame,colname,num_features)
        actor_frame = pd.DataFrame()
        
        # create the big-player indicator once, outside the loop
        bigplayer_name="feature:big_"+colname
        actor_frame[bigplayer_name]=pd.Series(0,index=frame.index) #big actors directors present or not?
        for player in feature_list:
            feature_name=colname+":"+player
            actor_frame[feature_name]=pd.Series(0,index=frame.index)
            
        for index in frame.index:
            playerval=frame.ix[index,colname]
            if type(playerval)!=float: #playerval is not None
                if colname=="actors":
                    for actor in playerval:
                        actor=self._modify_string(actor)
                        if actor in feature_list:
                            thisfeature=colname+":"+actor
                            actor_frame.loc[index,thisfeature]=1
                else:
                    playerval=self._modify_string(playerval)
                    if playerval in feature_list:
                        thisfeature=colname+":"+playerval
                        actor_frame.loc[index,thisfeature]=1
                actor_frame.loc[index,bigplayer_name]=1
            else:
                actor_frame.loc[index,bigplayer_name]=0
                        
        return actor_frame
                        
                
        
    def _create_theater_features(self,frame):
        
        #add feature column
        theater_frame=pd.DataFrame()
        theater_frame["feature:num_theaters"]=pd.Series(0,index=frame.index)
        
        
        for index in frame.index:
            theater_list=frame.ix[index,"theater_list"]
            if type(theater_list)==list and len(theater_list)>0:
                theater=theater_list[0]
                theater=re.sub(',','',theater)
                if re.search('\d+',theater) is not None:
                    theater_frame.loc[index,"feature:num_theaters"]=int(theater)
                else:
                    theater_frame.loc[index,"feature:num_theaters"]=0
            else:
                theater_frame.loc[index,"feature:num_theaters"]=0
        
        return theater_frame
        
    def _first_weekend_rank(self,frame):
        #todo: try to merge with create theater features
        weekend_frame = pd.DataFrame()
        weekend_frame["feature:rank"]=pd.Series(0,index=frame.index)
        for index in frame.index:
            rank_list=frame.ix[index,"rank_list"]
            if type(rank_list)==list and len(rank_list)>0:
                rank=rank_list[0]
                rank=re.sub(',','',rank)
                if re.search('\d+',rank) is not None:
                    weekend_frame.loc[index,"feature:rank"]=int(rank)
                else:
                    weekend_frame.loc[index,"feature:rank"]=1000#some large number? or zero?
            else:
                weekend_frame.loc[index,"feature:rank"]=1000
        
        return weekend_frame
    
    def _create_running_time_feature(self,frame):
        runtime_frame = pd.DataFrame()
        runtime_frame["feature:runtime"]=pd.Series(0,index=frame.index)
        
        for index in frame.index:
            running_time=frame.ix[index,"runtime"]
            if type(running_time)!= float: #not NaN
                pattern='(\d+).+\s(\d+)'
                hrmin=re.match(pattern,running_time)
                if hrmin is not None:
                    hrs=hrmin.group(1)
                    mins=hrmin.group(2)
                    tot_time=int(hrs)*60+int(mins)
                    runtime_frame.loc[index,"feature:runtime"]=tot_time
                    
                else:
                    runtime_frame.loc[index,"feature:runtime"]=0
                    
            else:
                runtime_frame.loc[index,"feature:runtime"]=0
            
        return runtime_frame
                
    
    def _create_release_date_feature(self,frame):
        monthlist=["January","February","March","April","May","June"\
                "July","August","September","October","November","December"]
        month_frame = pd.DataFrame()
        for month in monthlist:
            feature_name="feature:release_"+month
            month_frame[feature_name]=pd.Series(0,index=frame.index) 
        
        for index in frame.index:
            release_date=frame.ix[index,"release_date"]
            if type(release_date)!=float:
                pattern='(\S+)\s(\d+)'
                monthday=re.match(pattern,release_date)
                if monthday is not None:
                    month=monthday.group(1)
                    day=monthday.group(2)
                    
                    if month in monthlist:
                        thisfeature="feature:release_"+month
                        month_frame.loc[index,thisfeature]=1
        return month_frame
                        
                       
    
    def _extract_features(self,frame,isTraining=True):
        """
        extracts features from training and test frame
        all major data munging, cleaning takes place here
        
        """ 
        #pass
        #we will make clean_frame as the data frame, 
        #then we will define the training/test frame
        #and add each feature as a dataframe
        #and finally concatenate the features
    
                
        #check if labels exist for these movies 
        clean_data=frame[pd.notnull(frame["domestic_gross"])]
        
        list_of_frames=[]
        
        #no of theaters it opened at in the first week
        #keep this as first feature so that you can plot using this
        list_of_frames.append( self._create_theater_features(clean_data) )
        print "Created Theater Feature..."
        list_of_frames.append( self._first_weekend_rank(clean_data) )
        print "Created Rank Feature..."
        list_of_frames.append( self._create_running_time_feature(clean_data) )
        print "Created running time Feature..."
        list_of_frames.append( self._create_release_date_feature(clean_data) )
        print "Created release date Feature..."
        #create player features
        list_of_frames.append( \
                    self._create_player_features(clean_data,"actors",5) )
        list_of_frames.append( \
                    self._create_player_features(clean_data,"director",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"distributor",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"genre_toplist",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"mpaa_rating",5) )
        print "Created player Features..."
        
        #check dataframe shapes
        for frames in list_of_frames:
            assert frames.shape[0] == clean_data.shape[0]
            
        
        #concatenate the dataframes
        final_frame = pd.concat(list_of_frames,axis = 1)
        
        final_frame.to_csv("Training/training_frame.csv")
        
        #get training labels
        if isTraining == True:
            labels_arr=self._extract_labels(clean_data)
        else:
            prediction_frame=clean_data[["moviename","genre_toplist","actors"]]
       

        n_samples=len(final_frame.index)
        n_features=len(final_frame.columns)
        
        #from Dataframe to numpy array
        feature_arr=final_frame.values.reshape(n_samples,n_features)
        
        print "Created All Features....."
        
        if isTraining is True:
            return feature_arr,labels_arr
        else:
            return feature_arr,prediction_frame

        
    
    def _extract_labels(self,frame):
        
        df_Y=frame["domestic_gross"].values
        gross_list=df_Y.tolist()
        for i in range(len(gross_list)):
            gross_list[i]=int(gross_list[i])
        
        max_gross=np.max(gross_list)
        #print max_gross
        gross_list=[float(x)/max_gross for x in gross_list]  # avoid Python 2 integer division
        n_samples=len(gross_list)
        gross_arr=np.array(gross_list).reshape(n_samples,1)
        
        return gross_arr
        
    def _get_top_actors(self,actorlist):
        top_actors=[None,None,None]
        if type(actorlist) ==float:
            return top_actors;
        
        counter=0
        for actor in actorlist:
            top_actors[counter]=self._modify_string(actor)
            counter+=1
            if counter==2:
                break
        return top_actors
                
                
             
        
    def explore_data(self):
        """
        plots and prints various kinds of stuff to test out the data
        change, comment and uncomment here directly
        """
        if self._training_frame is None:
            self._load_dataframe()
        
        
        #col_list.remove('actors')
        #print col_list
        #self._training_frame.drop(col_list,axis=1,inplace=True)
        
        #print self._training_frame.ix[500:510]
        #print len(self._training_frame.index)
        #only_budget=self._training_frame[pd.isnull(self._training_frame["domestic_gross"])]
        #print len(only_budget.index)
        
        #actors_there=self._training_frame[pd.notnull(self._training_frame["actors"])]
        #print len(actors_there.index)
        #print actors_there.head()
        
        #director_there=self._training_frame[pd.notnull(self._training_frame["director"])]
        #print len(director_there.index)
        #print director_there.head()
        pass
    
    def top_5_genres(self):
        if self._training_frame is None:
            self._load_dataframe()
        genre_list=self._create_playerdict(self._training_frame,"genre_toplist",5)
        print genre_list
        
    
    
    def train_2013(self):
        self._load_dataframe()
        self._training_frame.to_csv("Training/raw_frame.csv")
        total_features,total_labels=self._extract_features(self._training_frame,isTraining=True)
        total_labels=np.ravel(total_labels)
        
        print type(total_features)
        print type(total_labels)
        
        #create train and test split
        self._features, test_features, self._labels, test_labels =\
            train_test_split(total_features, total_labels, test_size = 0.33)
        
        print self._features.shape
        print self._labels.shape
        print test_features.shape
        print test_labels.shape
        
        cv_outer = KFold(self._labels.shape[0],n_folds=5)
        self._clf = LassoCV(eps=0.01, n_alphas=10, cv=5)
        cross_val_arr=cross_val_score(self._clf,self._features,self._labels,cv=cv_outer)
        print "Finished Training....."
        
        r_sq=np.mean(cross_val_arr)
        print "R Square for training set: ",r_sq
        
        self._clf.fit(self._features,self._labels)
        plt.plot(test_labels, self._clf.predict(test_features),'ro',linewidth=2)
        plt.plot(np.arange(0,1.,.1),np.arange(0,1.,.1),'b-',linewidth=2)
        plt.xlabel("Actual Gross")
        plt.ylabel("Predicted Gross")
        plt.show()
        
    
    def test_2014(self):
        #check if already trained
        if self._clf is None:
            self.train_2013()
        
        print "Generating Test Features..."
        self._test_features,self._prediction_frame=self._extract_features(\
                                           self._test_frame,isTraining=False)
        
        self._prediction_frame["prediction"]=self._clf.predict(self._test_features)
        print "Finished Testing..."
                
        #sanity check and normalize
        self._prediction_frame["prediction"]=self._prediction_frame["prediction"].apply(\
                                             lambda x: 0 if x<0 else x)
        maxpred=self._prediction_frame["prediction"].max()
        if maxpred>1:
            self._prediction_frame["prediction"]=self._prediction_frame["prediction"].apply(\
                                                 lambda x: x/maxpred)
            
        
        
        print self._prediction_frame.head()
        
        
        
        
    
    def save_db(self,filename):
        con=sqlite3.connect(filename)
        cursor=con.cursor()
        cursor.execute('DROP TABLE IF EXISTS currentmovies')
        cursor.execute('CREATE TABLE currentmovies(\
                                     moviename VARCHAR(255) ,\
                                     genre VARCHAR(255),\
                                     prediction INT,\
                                     actor1 VARCHAR(255),\
                                     actor2 VARCHAR(255),\
                                     actor3 VARCHAR(255))')
        
        for index in self._prediction_frame.index:
            movname=self._prediction_frame.ix[index]["moviename"]
            #skip rows whose movie name is missing (parsed as NaN, a float);
            #this check must come before encoding, or encode() raises first
            if type(movname)==float and math.isnan(movname):
                continue
            movname=movname.encode('utf-8')
            pred=self._prediction_frame.ix[index]["prediction"]
            genre=self._prediction_frame.ix[index]["genre_toplist"].encode('utf-8')
            (actor1,actor2,actor3)=self._get_top_actors(self._prediction_frame.ix[index]["actors"])
            print movname,genre,pred
            
            cursor.execute('INSERT INTO currentmovies\
                             VALUES(?,?,?,?,?,?)',(movname,genre,pred,actor1,actor2,actor3))
        
        con.commit()
        con.close()
Example #33
0
    def select(self,X,y,weight):
        lm = LassoCV(cv=self.cv,normalize=False,max_iter=2000)
        lm.fit(X,y)
        #indices of the features with non-zero lasso coefficients
        f_indices = np.argwhere(lm.coef_ != 0).T[0]
        return f_indices
Example #34
0
lassoreg.fit(X_train, y_train)
print lassoreg.coef_


# calculate RMSE (for alpha=0.01)
y_pred = lassoreg.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# - [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html): lasso regression with built-in cross-validation of the alpha parameter
# - **n_alphas:** number of alpha values (automatically chosen) to try

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_


# examine the coefficients
print lassoregcv.coef_


# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print np.sqrt(metrics.mean_squared_error(y_test, y_pred))


# ## Part 5: Regularized classification in scikit-learn
# 
# - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)

# In[11]:


X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size = 0.15)
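
# As a minimal sketch of the regularized classification this part names
# (an illustrative addition; it assumes the last column of df holds the wine
# class labels), L1-penalized logistic regression is the classifier analogue
# of the lasso:
from sklearn.linear_model import LogisticRegression

logreg_l1 = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
logreg_l1.fit(xtrain, ytrain)
print(logreg_l1.coef_)                # the L1 penalty zeroes out weak features
print(logreg_l1.score(xtest, ytest))  # mean accuracy on the held-out 15%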


# In[12]:


cv = RepeatedKFold(n_splits = 5, n_repeats=3)
lasso = LassoCV(alphas=None, cv = cv, max_iter = 100000)
lasso.fit(xtrain, ytrain)


# In[13]:


ypred = lasso.predict(xtest)
r2 = r2_score(ytest, ypred)
print('R2 Score: ', r2)
score = lasso.score(xtrain, ytrain)
print("R-squared:", score)


# In[14]:

Example #36
0
def train_and_analyse(_X, _y, features):
	X = _X
	Y = _y
	cv_l = cross_validation.KFold(X.shape[0], n_folds=10,
								shuffle=True, random_state=1)
	ranks = {}

	lr = LinearRegression(normalize=True)
	lr.fit(X, Y)
	ranks["Linear reg"] = rank_to_dict(np.abs(lr.coef_), features)
	

	ridge = RidgeCV(cv=cv_l)
	ridge.fit(X, Y)
	ranks["Ridge"] = rank_to_dict(np.abs(ridge.coef_), features)
	
	# LassoCV first, then RandomizedLasso below: we use a path going down
	# to .1*alpha_max to avoid exploring the regime in which very noisy
	# variables enter the model
	lasso = LassoCV(cv=cv_l, n_jobs=2, normalize=True, tol=0.0001, max_iter=170000)
	lasso.fit(X, Y)
	ranks["Lasso"] = rank_to_dict(np.abs(lasso.coef_), features)
	
	rlasso = RandomizedLasso(alpha=lasso.alpha_, random_state=42)
	rlasso.fit(X, Y)
	ranks["Stability"] = rank_to_dict(np.abs(rlasso.scores_), features)
	
	rfe = RFE(lr, n_features_to_select=1)
	rfe.fit(X,Y)
	ranks["RFE"] = rank_to_dict(np.array(rfe.ranking_).astype(float), features, order=-1)

	rf = RandomForestRegressor(n_estimators=500)
	rf.fit(X,Y)
	ranks["RF"] = rank_to_dict(rf.feature_importances_, features)

	f, pval  = f_regression(X, Y, center=True)
	ranks["Corr."] = rank_to_dict(np.nan_to_num(f), features)

	mine = MINE()
	mic_scores = []
	for i in range(X.shape[1]):
	   mine.compute_score(X[:,i], Y)
	   m = mine.mic()
	   mic_scores.append(m)
	
	ranks["MIC"] = rank_to_dict(mic_scores, features) 

	r = {}
	for name in features:
	    r[name] = round(np.mean([ranks[method][name] 
	                             for method in ranks.keys()]), 2)
	 
	methods = sorted(ranks.keys())
	ranks["Mean"] = r
	methods.append("Mean")
	
	ranks = pd.DataFrame(ranks)

	selection_feature = ranks[ranks.Mean > 0.12].index.values

	return ranks, selection_feature
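
# A minimal usage sketch for train_and_analyse on synthetic data; it assumes
# rank_to_dict and the imports used inside the function (cross_validation,
# RandomizedLasso, MINE, ...) are available, as in the snippet above.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 6)
y_demo = 3 * X_demo[:, 0] - 2 * X_demo[:, 2] + rng.randn(200)
names = ['f%d' % i for i in range(X_demo.shape[1])]

ranks_df, selected = train_and_analyse(X_demo, y_demo, names)
print(ranks_df)   # one normalized score column per method, plus the Mean
print(selected)   # features whose mean score exceeds the 0.12 threshold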
    def fit(self, X, y):
        LassoCV.fit(self, X, y)
        #estimate the noise variance from the in-sample residuals
        pred = self.predict(X)
        self.sigma2_ = np.mean((y - pred)**2)
def SolveJointRegression(X1, X2, Y1, Y2, fused=True, s1=15, s=2):
    mean_Y1_s = Y1.mean()
    mean_Y2_s = Y2.mean()
    sigma1 = (Y1 - mean_Y1_s).dot(
        Y1 - mean_Y1_s
    ) / n1  #estimate of the noise variance of the feature being regressed on (TODO: make sigma a subfunction with options)
    Sigma1 = np.dot(
        X1[:, regressor_ind].T,
        X1[:, regressor_ind]) / n1  #empirical covariance matrix for the X_s^c
    sigma2 = (Y2 - mean_Y2_s).dot(Y2 - mean_Y2_s) / n2
    Sigma2 = np.dot(X2[:, regressor_ind].T, X2[:, regressor_ind]) / n2

    Ys = [Y1, Y2]
    Xs = [X1[:, regressor_ind], X2[:, regressor_ind]]

    lam_L1 = np.sqrt(sigma1) * np.sqrt(
        2. * (np.log(n_vars - 1)) / n1) * 2  #order sigma*sqrt(log(p)/n)
    alphas = np.array([0.1, 0.5, 1., 5., 10., 50., 100.]) * lam_L1

    if fused:
        lam_L2 = np.sqrt(sigma2) * np.sqrt(2. *
                                           (0.01 + np.log(n_vars - 1)) / n2)
        lam_F = np.sqrt(sigma2) * np.sqrt((np.log((n_vars - 1)) / (n2)))

        fm_cv = FusedMultiTaskLassoCV(cv=5,
                                      gammas=np.array([0.1, 1., 10., 100.]) *
                                      lam_F,
                                      minLam=0.1 * lam_F,
                                      n_jobs=n_jobs)
        fm_cv.fit(Xs, Ys)
        coefs = np.reshape(fm_cv.coefs, (2, n_vars - 1)).T

        mu1 = 4. * (fm_cv.alpha_ * s1 + fm_cv.gamma * s)
        mu1 = (1. / mu1) / (n2**(0.01))

        mu = 2. * (fm_cv.alpha_) * s
        mu = (1. / mu) / (n2**(0.01))

        Theta1, Theta2 = DecorrInverseCovDiff(X1[:, regressor_ind],
                                              X2[:, regressor_ind],
                                              mu=mu,
                                              mu1=mu1)

    else:
        #Two debiased lasso
        coefs = np.zeros((n_vars - 1, 2))
        ls_cv = LassoCV(cv=5, fit_intercept=False, alphas=alphas)
        ls_cv.fit(X1[:, regressor_ind], Y1)
        coefs[:, 0] = ls_cv.coef_

        lam_L2 = np.sqrt(sigma2) * np.sqrt(
            2. * (0.01 + np.log(n_vars - 1)) / n2) * 2
        alphas = np.array([0.1, 0.5, 1., 5., 10., 50., 100.]) * lam_L2
        ls_cv2 = LassoCV(cv=5, fit_intercept=False, alphas=alphas)
        ls_cv2.fit(X2[:, regressor_ind], Y2)
        coefs[:, 1] = ls_cv2.coef_

        mu_L = (1. / np.sqrt(n1)) * scipy.stats.norm.ppf(1 -
                                                         (0.1 /
                                                          ((n_features**2))))
        mu_L2 = (1. / np.sqrt(n2)) * scipy.stats.norm.ppf(1 -
                                                          (0.1 /
                                                           ((n_features**2))))
        Theta1 = DecorrInverseCovQP(X1[:, regressor_ind], mu=mu_L)
        Theta2 = DecorrInverseCovQP(X2[:, regressor_ind], mu=mu_L2)

    coefs_debiased1 = coefs[:, 0] + Theta1.dot(X1[:, regressor_ind].T.dot(
        Y1 - X1[:, regressor_ind].dot(coefs[:, 0]))) / n1
    coefs_debiased2 = coefs[:, 1] + Theta2.dot(X2[:, regressor_ind].T.dot(
        Y2 - X2[:, regressor_ind].dot(coefs[:, 1]))) / n2
    coefs_debiased = coefs_debiased1 - coefs_debiased2
    var_components1 = sigma1 * np.diag(Theta1.dot(Sigma1).dot(Theta1.T) / n1)
    var_components2 = sigma2 * np.diag(Theta2.dot(Sigma2).dot(Theta2.T) / n2)

    std_components = np.sqrt(var_components1 + var_components2)
    interval = scipy.stats.norm.ppf(1. - alp / 2.) * std_components

    LowerConfInterval = coefs_debiased - interval  # confidence interval
    UpperConfInterval = coefs_debiased + interval
    Pvals = 2 * (1. -
                 scipy.stats.norm.cdf(np.abs(coefs_debiased /
                                             (std_components))))

    return coefs_debiased, Pvals, LowerConfInterval, UpperConfInterval
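
# The debiasing above is the standard desparsified-lasso correction; as a
# sketch in the code's own notation (Theta the decorrelating inverse-covariance
# estimate, Sigma the empirical covariance, z the standard-normal quantile):
#
#   beta_debiased = beta_hat + Theta.dot(X.T.dot(Y - X.dot(beta_hat))) / n
#   var_j         = sigma2 * (Theta.dot(Sigma).dot(Theta.T))[j, j] / n
#   CI_j          = beta_debiased[j] +/- z_{1 - alp/2} * sqrt(var1_j + var2_j)
#
# which is exactly what coefs_debiased, std_components and interval compute
# for the difference of the two debiased estimates.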
def alasso(X_Train, X_Test, Y_Train, Y_Test, target):
    X_Tr = copy.deepcopy(X_Train)
    X_Te = copy.deepcopy(X_Test)
    Y_Tr = copy.deepcopy(Y_Train)
    Y_Te = copy.deepcopy(Y_Test)
    [Tr_sample, num_feature] = X_Tr.shape
    #(lat, lon) of each target region; renamed to avoid shadowing built-in dict
    target_locations = {
        'Brazil': np.array([-10.0, 310.0]),
        'Peru': np.array([-5.75, 283.0]),
        'Asia': np.array([-10.0, 137.0])
    }
    target_location = target_locations[target]
    resolution = {
        332: [10, 'lat_lon_10x10.mat'],
        1257: [5, 'lat_lon_5x5.mat'],
        5881: [2.5, 'lat_lon.mat']
    }
    step = resolution[num_feature][0]
    data = loadmat(resolution[num_feature][1])
    position = data['lat_lon_data']
    [row, column] = position.shape
    lat_block = np.zeros(row)
    lon_block = np.zeros(column)
    for i in range(row):
        lat_block[i] = -90 + step * i
    for i in range(column):
        lon_block[i] = 0 + i * step

    distance = np.zeros((row, column))
    for i in range(row):
        for j in range(column):
            distance[i][j] = cal_dis(lat_block[i], lon_block[j],
                                     target_location[0], target_location[1])
    max_value = np.amax(distance)

    for i in range(row):
        for j in range(column):
            distance[i][j] = distance[i][j] / max_value

    weight = np.zeros(num_feature)
    count = -1
    for i in range(row):
        for j in range(column):
            if position[i][j] == 0:
                count = count + 1
                weight[count] = distance[i][j]

    X_w = np.zeros((Tr_sample, num_feature))
    for i in range(num_feature):
        for j in range(Tr_sample):
            X_w[j][i] = X_Tr[j][i] / weight[i]

    lasso_cv = LassoCV(alphas=np.linspace(0.01, 0.5, 25),
                       fit_intercept=True,
                       normalize=False,
                       precompute='auto',
                       max_iter=10000,
                       tol=0.00001,
                       copy_X=True,
                       cv=10,
                       verbose=False,
                       n_jobs=1,
                       positive=False,
                       random_state=None,
                       selection='cyclic')

    lasso_cv.fit(X_w, Y_Tr)
    beta = lasso_cv.coef_
    #undo the distance weighting to recover coefficients on the original scale
    beta_update = np.zeros(beta.shape)
    for i in range(num_feature):
        beta_update[i] = beta[i] / weight[i]

    Tr_pred = X_Tr.dot(beta_update)
    Te_pred = X_Te.dot(beta_update)

    r2_train = r2_score(Y_Tr, Tr_pred)
    r2_test = r2_score(Y_Te, Te_pred)
    rmse_train = sqrt(mean_squared_error(Y_Tr, Tr_pred))
    rmse_test = sqrt(mean_squared_error(Y_Te, Te_pred))
    return (Tr_pred, Te_pred)
features = features.dropna(axis=1)

alpha_values = []
for a in range(1, 10001):
    alpha_values.append(a / 100.0)  #float division gives alphas 0.01 .. 100.00

print "Started at " + str(datetime.now())

estimator_ridge = RidgeCV(alphas=alpha_values, cv=3)
estimator_ridge.fit(features, goal)
scores = cross_val_score(Ridge(alpha=estimator_ridge.alpha_), features, goal, cv=5)
print "Ridge alpha " + str(estimator_ridge.alpha_)
print str(np.mean(scores))
print scores

estimator_lasso = LassoCV(alphas=alpha_values, cv=3)
estimator_lasso.fit(features, goal)
scores = cross_val_score(Lasso(alpha=estimator_lasso.alpha_), features, goal, cv=5)
print "Lasso alpha " + str(estimator_lasso.alpha_)
print str(np.mean(scores))
print scores


estimator_elastic_net = ElasticNetCV(alphas=alpha_values, cv=3, n_jobs=-1)
estimator_elastic_net.fit(features, goal)
scores = cross_val_score(ElasticNet(alpha=estimator_elastic_net.alpha_), features, goal, cv=5)
print "ElasticNet alpha " + str(estimator_elastic_net.alpha_)
print str(np.mean(scores))
print scores

print "Finished at " + str(datetime.now())
Example #41
0
def train_and_score(X, y, occ_codes, lookup):
    lasso = LassoCV(max_iter=10000, cv = LeaveOneOut(), n_jobs = -1)
    lasso.fit(X, y)
    return score_model(lasso.alpha_, X, y, occ_codes, lookup)
Example #42
0
def best_lasso(df, resp_var, exp_vars, kcv=3, cv_path=False, 
               hists=False):
    """ Find the best lasso model through cross-validation.
    
    Args:
        df:       Dataframe
        resp_var: String. Response variable
        exp_vars: List of strings. Explanatory variables
        kcv:      Number of cross-validation folds
        cv_path:  Whether to plot the path of cross-validation
                  scores
        hists:    Whether to plot histograms of coefficient
                  estimates based on bootstrapping
    
    Returns:
        Dataframe of coefficients for best model and histograms
        of coefficient variability based on bootstrap resampling.
    """
    import seaborn as sn
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LassoCV
    from sklearn.utils import resample

    # Standardise the feature data and response
    feat_std = StandardScaler().fit_transform(df[[resp_var,] + exp_vars])

    model = LassoCV(fit_intercept=False, 
                    normalize=False, 
                    max_iter=10000,
                    cv=kcv,
                    eps=1e-3)

    # Train model on full dataset
    model.fit(feat_std[:, 1:], feat_std[:, 0])

    print model

    # Get param estimates
    params = pd.DataFrame(pd.Series(model.coef_, index=exp_vars))
    
    if cv_path:
        # Display results
        m_log_alphas = -np.log10(model.alphas_)

        plt.figure()
        plt.plot(m_log_alphas, model.mse_path_, ':')
        plt.plot(m_log_alphas, model.mse_path_.mean(axis=-1), 'k',
                 label='Average across the folds', linewidth=2)
        plt.axvline(-np.log10(model.alpha_), linestyle='--', color='k',
                    label='alpha: CV estimate')

        plt.legend()

        plt.xlabel('-log(alpha)')
        plt.ylabel('Mean square error')
        plt.axis('tight')

        plt.show()

    if hists:
        # Estimate confidence using bootstrap
        # i.e. what is the std. dev. of the estimates for each parameter
        # based on 1000 resamplings
        err = np.array([model.fit(*resample(feat_std[:, 1:], 
                                            feat_std[:, 0])).coef_ for i in range(1000)])
        err_df = pd.DataFrame(data=err, columns=exp_vars)

        # Melt for plotting with seaborn
        err_df = pd.melt(err_df)
        g = sn.FacetGrid(err_df, col="variable", col_wrap=4)
        g = g.map(plt.hist, "value", bins=20)

        # Vertical line at 0
        g.map(sn.plt.axvline, x=0, c='k', lw=2)
    
    return params
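
# A minimal usage sketch for best_lasso on synthetic data (the column names
# are hypothetical):
import numpy as np
import pandas as pd

rng = np.random.RandomState(42)
demo = pd.DataFrame(rng.randn(100, 4), columns=['resp', 'x1', 'x2', 'x3'])
demo['resp'] = 2 * demo['x1'] - demo['x3'] + 0.1 * rng.randn(100)

params = best_lasso(demo, 'resp', ['x1', 'x2', 'x3'], kcv=5)
print params  # one standardised lasso coefficient per explanatory variable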
Example #43
0
mpl = MLPRegressor()
poly = make_pipeline(PolynomialFeatures(2), Ridge())
knn = neighbors.KNeighborsRegressor(5, weights='distance')
lasso = linear_model.Lasso(alpha=0.1, max_iter=100000)
methods = [mpl, lassoCV, linear, poly, knn]
m_names = ['Neural Network', 'Lasso', 'Linear', 'Polynomial', 'KNN', 'Q-Lasso']
type = 'ny'
X = joblib.load(model_place + type + 'X.pkl')
y = joblib.load(model_place + type + 'y.pkl')
y_mean = np.mean(y)

type = 'big'
Xbig = joblib.load(model_place + type + 'X.pkl')
ybig = joblib.load(model_place + type + 'y.pkl')

print lassoCV.fit(X, y).coef_
print Xbig.shape
print y_mean
f, ax = plt.subplots(3, 2)
for k in range(0, len(methods), 1):
    pred = cross_val_predict(methods[k], X, y, cv=10, verbose=True)
    temp_ax = ax[k / 2, k % 2]
    temp_ax.scatter(y, pred, marker='x', label=m_names[k], color=colors[1], alpha=0.15, s=1)
    temp_ax.plot([y.min(), y.max()], [y.min(), y.max()], 'k--', lw=1)
    scores(y, pred)
    temp_ax.legend(loc='upper left', fontsize=10)
    temp_ax.tick_params(labelsize=8)

pred = cross_val_predict(lasso, Xbig, ybig, cv=10, verbose=True)
k = 5
temp_ax = ax[2, 1]
#now, let's fit it to the total model, and compute predictions.
model2_real = Lasso(alpha=0.01, max_iter=50000)
model2_real.fit(scaled_X, y_log)
reg = LassoCV(cv=5, random_state=0).fit(scaled_X, y_log)
reg.score(scaled_X, y_log)
Lasso_predictions = reg.predict(scaled_X_test)

#let's try a new model: Random Forests
from sklearn.ensemble import RandomForestRegressor
reg_RF = RandomForestRegressor()
reg_RF.fit(X_train, y_train)
pred_cv = reg_RF.predict(X_cv)
error_RF = np.sqrt(mean_squared_log_error(y_cv, pred_cv))
error_RF
#that looks about the same
#note: reg is still the LassoCV model, so despite the name these are lasso predictions
reg.fit(scaled_X, y_log)
RF_predictions = reg.predict(scaled_X_test)
#let's compare Random Forests with Gradient Boosting
from sklearn.ensemble import GradientBoostingRegressor
reg_GB = GradientBoostingRegressor()
reg_GB.fit(X_train, y_train)
pred_cv = reg_GB.predict(X_cv)
error_GB = np.sqrt(mean_squared_log_error(y_cv, pred_cv))
error_GB
#Let's use GridSearchCV to find the best hyperparameters of our gradient boosting algo. Note: I've already completed this step, the results are below.
#from sklearn.model_selection import GridSearchCV
#param_grid = {
#        'alpha' : [0.1,0.5,0.9],
#        'n_estimators': [50,100,800,1500],
#        'max_features': [20,15],
#	    'max_depth': [3,5,8,10],
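
# A runnable sketch of the grid-search step described above; the grid values
# and scoring choice are illustrative assumptions, not the author's actual run:
from sklearn.model_selection import GridSearchCV

demo_param_grid = {'n_estimators': [100, 800],
                   'max_depth': [3, 5],
                   'learning_rate': [0.05, 0.1]}
gb_grid = GridSearchCV(GradientBoostingRegressor(), demo_param_grid,
                       scoring='neg_mean_squared_log_error', cv=5, n_jobs=-1)
gb_grid.fit(X_train, y_train)
gb_grid.best_params_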
Example #45
0
    data = data.values
    x = data[:, 2:].astype(np.float)
    y = data[:, 1].astype(np.int)
    columns = columns[2:]

    ss = StandardScaler()
    x = ss.fit_transform(x)

    # append a column of ones as the intercept term
    t = np.ones(x.shape[0]).reshape((-1, 1))
    print t.shape, x.shape
    x = np.hstack((t, x))

    # model = ElasticNetCV(alphas=np.logspace(-3, 2, 50), l1_ratio=[.1, .5, .7, .9, .95, .99, 1], fit_intercept=False)
    model = LassoCV(alphas=np.logspace(-3, 2, 50), fit_intercept=False)
    model.fit(x, y)
    y_hat = model.predict(x)
    y_hat[y_hat < 0] = 0
    print 'model.alpha = \t', model.alpha_
    # print 'model.l1_ratio = \t', model.l1_ratio_
    print 'model.coef_ = \n', model.coef_
    print 'model.predict(x) = \n', y_hat
    print 'Actual = \n', y
    print 'RMSE:\t', np.sqrt(np.mean((y_hat-y)**2))
    print 'R2:\t', model.score(x, y)
    for theta, col in zip(model.coef_[1:], columns):
        if theta > 0.01:
            print col, theta

    plt.figure(facecolor='w')
    t = np.arange(len(y))
Example #46
0
def three_way_fit(clusters, energies, counts, comps, fold_pick=10, Normalize=True, Intercept=True, Energy_above_hull = True, name=''):
    ###- Lambda expression for scaling to energy above hull -###
    scale = lambda x0, y0, x1, y1, x2, y2: abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt(
        (y2 - y1) ** 2 + (x2 - x1) ** 2)
    ###- scale to energy above hull -###
    if Energy_above_hull == True: # If you want to FIT to energy above hull (which you don't)
        y1 = min(energies)
        y2 = max(energies)
        x2 = min(comps)
        x1 = max(comps)
        if x2==x1:
            energies = [energies[i]-y1 for i in range(len(energies))]
        else:
            energies = [scale(comps[i], energies[i], x1, y1, x2, y2) for i in range(len(energies))]

    ###- Set up text for output file -###
    file_out = 'Fit_summary.txt'
    file = open(file_out, 'w')
    file.write('Data set: ' + name + '\n\n' + 'Clusters:  [[species],[distance],[chem (0) / spin (1)]]' + '\n')
    [file.write(str(clusters[i]) + '\n') for i in range(len(clusters))]
    file.write('\n\nEnergy per Atom (eV) : Cluster Count per Atom\n')
    for i in range(len(energies)):
        file.write(str(energies[i]) + ' : ' + str(counts[i]) + '\n')

    ###- Set up alphas for CV -###
    alpha_range = [-10, 10]
    alpha_lasso = np.logspace(alpha_range[0], alpha_range[1], num=1000) # for lasso cv
    n_alphas = 1010
    alpha_ridge = np.logspace(-15, 10, n_alphas) # for ridge cv

    ###- Set range for plot -###
    axis_range = [min(energies) * 1.0001, max(energies) * .9999]

    # LASSO and RIDGE, Cross-Validation, Lin Reg without CV
    lassocv = LassoCV(alphas=alpha_lasso, normalize=Normalize, fit_intercept=Intercept, cv=fold_pick, max_iter=1e5)
    ridgecv = RidgeCV(alphas=alpha_ridge, normalize=Normalize, fit_intercept=Intercept, cv=None, store_cv_values=True)
    linreg = LinearRegression(fit_intercept=Intercept, normalize=Normalize)
    # Fit to data for each method
    lassocv.fit(counts, energies) # do fit for lasso
    ridgecv.fit(counts, energies) # do fit for ridge
    linreg.fit(counts, energies) # do fit for linreg
    lassocv_rmse = np.sqrt(lassocv.mse_path_)
    ridgecv_rmse = np.sqrt(ridgecv.cv_values_)
    # Set up Random Forest regression; max_depth is hard-coded to 5 but this can be played with
    RandF_reg = RandomForestRegressor(max_depth=5, random_state=0)
    RandF_reg.fit(counts, energies) # do fit for random forest

    ### Get results ready for energy above hull plots ###
    y1 = min(energies)
    y2 = max(energies)
    x2 = min(comps)
    x1 = max(comps)
    scale = lambda x0, y0, x1, y1, x2, y2: abs((y2 - y1) * x0 - (x2 - x1) * y0 + x2 * y1 - y2 * x1) / np.sqrt(
        (y2 - y1) ** 2 + (x2 - x1) ** 2)
    eahDFT = [scale(comps[i], energies[i], x1, y1, x2, y2) for i in range(len(energies))]
    axis_rangeEAH = [-0.002, max(eahDFT) * 1.1]


    ########################################################################################################################
    ################ RANDOM FOREST FIT #####################################################################################
    ########################################################################################################################
    file.write("\n\n#### Random Forest #### ")
    # Plot Fit vs DFT

    cluster_energy_RF = RandF_reg.predict(counts)
    #print(RandF_reg.estimators_) # Comment in if you want to see the estimators generated by random forest (it's a bit messy)
    #print(RandF_reg.get_params()) # ''
    plt.figure()
    plt.scatter(energies, cluster_energy_RF, color='b', alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('Random Forest Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_RF[i], x1, y1, x2, y2) for i in range(len(cluster_energy_RF))]
    plt.scatter(eahDFT, eahCE, color='b', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('Random Forest Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()


    ########################################################################################################################
    ################ LASSO FIT #############################################################################################
    ########################################################################################################################

    file.write("\n\n#### LASSO #### \nk-folds cross validation\n")
    file.write("alpha: %7.6f\n" % lassocv.alpha_)
    file.write("avg rmse: %7.4f\n" % min(lassocv_rmse.mean(axis=-1)))
    file.write("score: %7.4f\n" % lassocv.score(counts, energies))
    file.write("non-zero coefficient: %7.4f\n" % np.count_nonzero(lassocv.coef_))
    file.write('Intercept: ')
    file.write(str(lassocv.intercept_))
    file.write('\n')
    # Show data from cross validation / cluster picking
    plt.figure()
    m_log_alphas = -np.log10(lassocv.alphas_)
    plt.plot(m_log_alphas, lassocv_rmse, ':')
    plt.plot(m_log_alphas, lassocv_rmse.mean(axis=-1), 'k', label='Average across the folds', linewidth=2)
    plt.axvline(-np.log10(lassocv.alpha_), linestyle='--', color='k', label='alpha: CV estimate')
    plt.xlabel('-log(alpha)')
    plt.ylabel('Root-mean-square error')
    plt.title('Root-mean-square error on each fold: ' + name)
    plt.legend()
    plt.tight_layout()
    plt.show()
    # Plot Fit vs DFT
    cluster_energy_ce = lassocv.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color='b', alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('LASSO Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2) for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='b', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('LASSO Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()

    # Show Non-zero clusters
    cluster_energy_new = []
    for i in range(len(energies)):
        cluster_energy_new.append(energies[i] - cluster_energy_ce[i])
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(lassocv.intercept_)
    cluster_coef_all = lassocv.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all) if abs(v) >= 0.00000000001 ]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write("\n Clusters \n")
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' + str(cluster_pick[i][1][0]) + ':' + str(
                cluster_coef[i + 1]) + '\n')
        else:
            file.write(
                str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' + str(cluster_pick[i][2][0]) + ':' + str(
                    cluster_coef[i + 1]) + '\n')
    file.write("\n")
    file.write("\n")

    ########################################################################################################################
    ############# RIDGE FIT ################################################################################################
    ########################################################################################################################

    file.write("### RIDGE ### \nk-folds cross validation\n")
    file.write("alpha: %7.6f\n" % ridgecv.alpha_)
    file.write("avg rmse: %7.4f\n" % min(ridgecv_rmse.mean(axis=-1)))
    file.write("score: %7.4f\n" % ridgecv.score(counts, energies))
    file.write("non-zero coefficient: %7.4f\n" % np.count_nonzero(ridgecv.coef_))
    # Plot Fit vs DFT
    cluster_energy_ce = ridgecv.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color="r", alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('RIDGE Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2) for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='r', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('RIDGE Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    # Show Non-zero clusters
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(ridgecv.intercept_)
    cluster_coef_all = ridgecv.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all) if abs(v) >= 0.00000000001]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write("\n Clusters\n")
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' + str(cluster_pick[i][1][0]) + ':' + str(
                cluster_coef[i + 1]) + '\n')
        else:
            file.write(
                str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' + str(cluster_pick[i][2][0]) + ':' + str(
                    cluster_coef[i + 1]) + '\n')

    ########################################################################################################################
    ############# LIN REG FIT ##############################################################################################
    ########################################################################################################################

    file.write("\n #### Lin Reg #### \nNo cross validation\n")
    file.write("score: %7.4f\n" % ridgecv.score(counts, energies))
    file.write("non-zero coefficient: %7.4f\n" % np.count_nonzero(ridgecv.coef_))
    file.write('Intercept: %7.4f\n' % linreg.intercept_)
    # Plot Fit vs DFT
    cluster_energy_ce = linreg.predict(counts)
    plt.figure()
    plt.scatter(energies, cluster_energy_ce, color="g", alpha=0.5)
    plt.plot(axis_range, axis_range, 'k', alpha=0.5)
    plt.xlim(axis_range)
    plt.ylim(axis_range)
    plt.gca().set_aspect('equal')
    plt.xlabel('Energy, DFT')
    plt.ylabel('Energy, CE')
    plt.title('LinReg Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    eahCE = [scale(comps[i], cluster_energy_ce[i], x1, y1, x2, y2) for i in range(len(cluster_energy_ce))]
    plt.scatter(eahDFT, eahCE, color='g', alpha=0.5)
    plt.plot(axis_rangeEAH, axis_rangeEAH, 'k', alpha=0.5)
    plt.xlim(axis_rangeEAH)
    plt.ylim(axis_rangeEAH)
    plt.gca().set_aspect('equal')
    plt.xlabel('EAH, DFT')
    plt.ylabel('EAH, CE')
    plt.title('LinReg Fit Comparison: ' + name)
    plt.tight_layout()
    plt.show()
    # Show Non-zero clusters
    cluster_coef = []
    cluster_pick = []
    cluster_coef.append(linreg.intercept_)
    cluster_coef_all = linreg.coef_
    cluster_nonzero = [c for c, v in enumerate(cluster_coef_all) if abs(v) >= 0.00000000001]
    for i in cluster_nonzero:
        cluster_coef.append(cluster_coef_all[i])
        cluster_pick.append(clusters[i])
    file.write('\nClusters\n')
    for i in range(len(cluster_pick)):
        if len(cluster_pick[i]) == 2:
            file.write(str(cluster_pick[i][0]) + ':' + '[0]' + ':' + str(cluster_pick[i][1][0]) + ':' + str(
                cluster_coef[i + 1]) + '\n')
        else:
            file.write(
                str(cluster_pick[i][0]) + ':' + str(cluster_pick[i][1]) + ':' + str(cluster_pick[i][2][0]) + ':' + str(
                    cluster_coef[i + 1]) + '\n')
    file.write('\n')
    file.close()
    return
Example #47
0
def Model(train_linear, test_linear):
    train_linear_fea=train_linear.drop(columns=['SalePrice'])
    train_linear_tar=train_linear.SalePrice
    x_train, x_test, y_train, y_test = train_test_split(train_linear_fea, train_linear_tar,test_size=0.2, random_state=0)
    def evaluate(model, test_features, test_labels,train_features, train_labels):
        predictions = model.predict(test_features)
        errors = abs(predictions - test_labels)
        mape = 100 * np.mean(errors / test_labels)
        accuracy = 100 - mape
        print('Model Performance')
        print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
        print('Accuracy = {:0.2f}%.'.format(accuracy))    
        print("MSE for train data is: %f" % mean_squared_error(y_train, model.predict(x_train)))
        print("MSE for validation data is: %f" % mean_squared_error(y_test, model.predict(x_test)))
        return accuracy
    real_train_tar=np.expm1(train_linear_tar)
    """
        . Lasso model
    """
    
    lassocv = LassoCV(alphas = np.logspace(-5, 4, 400))
    lassocv.fit(train_linear_fea, train_linear_tar)
    lassocv_score = lassocv.score(train_linear_fea, train_linear_tar)
    lassocv_alpha = lassocv.alpha_
    print("Best alpha : ", lassocv_alpha, "Score: ",lassocv_score)
    
    start=time.time()
    lasso =Lasso(normalize = True)
    lasso.set_params(alpha=lassocv_alpha,max_iter = 10000)
    lasso.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, lasso.predict(x_test))
    coef_lasso=pd.Series(lassocv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(lasso,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_lasso_predict=lasso.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_lasso_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_lasso=np.expm1(lasso.predict(test_linear))
    
    
    """
        . Ridge model
    """
    
    ridgecv = RidgeCV(alphas = np.logspace(-5, 4, 400))
    ridgecv.fit(x_train, y_train)
    ridgecv_score = ridgecv.score(x_train, y_train)
    ridgecv_alpha = ridgecv.alpha_
    print("Best alpha : ", ridgecv_alpha, "Score: ",ridgecv_score)
    coef=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    
    start=time.time()
    ridge =Ridge(normalize = True)
    ridge.set_params(alpha=ridgecv_alpha,max_iter = 10000)
    ridge.fit(x_train, y_train)
    end=time.time()
    mean_squared_error(y_test, ridge.predict(x_test))
    coef_ridge=pd.Series(ridgecv.coef_, index=x_train.columns).sort_values(ascending =False)
    evaluate(ridge,x_test,y_test,x_train,y_train)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_ridge_predict=ridge.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_ridge_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    
    test_prediction_ridge=np.expm1(ridge.predict(test_linear))
    
    
    """
        . Random Forest
    """
    #train=train.drop(columns=['DateSold'])
    #test=test.drop(columns=['DateSold'])
    #X_train=train.drop(columns=['SalePrice'])
    #Y_train=train['SalePrice']
    X_train=train_linear_fea
    Y_train=train_linear_tar
    x_train_rf, x_test_rf, y_train_rf, y_test_rf = train_test_split(X_train, Y_train,test_size=0.2, random_state=0)
    
    
    n_estimators = [int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    max_features = ['auto', 'sqrt']
    max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
    min_samples_split = [2, 5, 10]
    min_samples_leaf = [1, 2, 4]
    bootstrap = [True, False]
    random_grid = {'n_estimators': n_estimators,
                   'max_features': max_features,
                   'max_depth': max_depth,
                   'min_samples_split': min_samples_split,
                   'min_samples_leaf': min_samples_leaf,
                   'bootstrap': bootstrap}
    
    rf = RandomForestRegressor()
    # Random search of parameters, using 3 fold cross validation, 
    # search across 100 different combinations, and use all available cores
    #
    rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
    rf_random.fit(X_train, Y_train)
    #rf_random.fit(x_train_rf, y_train_rf)
    rf_random.best_params_
    
    #Random search allowed us to narrow down the range for each hyperparameter. Now that we know where to concentrate our search,
    # we can explicitly specify every combination of settings to try. 
    param_grid = {
        'bootstrap': [False],
        'max_depth': [80, 90, 100, 110,120,130],
        'max_features': [2, 3],
        'min_samples_leaf': [1,2,3, 4],
        'min_samples_split': [2,4,6,8, 10, 12],
        'n_estimators': [600,700, 800, 900, 1000]
    }
    # Create a based model
    rf = RandomForestRegressor()
    # Instantiate the grid search model
    grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, cv = 3, n_jobs = -1, verbose = 2)
    #grid_search.fit(x_train, y_train)
    grid_search.fit(X_train, Y_train)
    grid_search.best_params_
    
    best_random = grid_search.best_estimator_
    start=time.time()
    best_random.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(best_random, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    y_rf_predict=best_random.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_rf_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_rf = pd.DataFrame({'features':train_linear_fea.columns, 'imp':best_random.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_rf = importance_rf.iloc[:20,]
    
    plt.barh(importance_top20_rf.features, importance_top20_rf.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_rf=np.expm1(best_random.predict(test_linear))
    
    """
        . Xgboost
    """
    
    learning_rate = [round(float(x), 2) for x in np.linspace(start = .1, stop = .2, num = 11)]
    # Minimum sum of instance weights required in a child node
    min_child_weight = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
    # Maximum depth of each tree
    max_depth = [int(x) for x in np.linspace(1, 10, num = 10)]
    n_estimators=[int(x) for x in np.linspace(start = 100, stop = 2000, num = 20)]
    subsample=[0.3, 0.4,0.5,0.6, 0.7]
    model = xgb.XGBRegressor()
    random_grid = {'learning_rate': learning_rate,
                    'max_depth': max_depth,
                    'min_child_weight': min_child_weight,
                    'subsample': subsample,
                    'n_estimators':n_estimators
                    }
    
    # Make a RandomizedSearchCV object with the correct model and specified hyperparams
    xgb_random = RandomizedSearchCV(estimator=model, param_distributions=random_grid, n_iter=1000, cv=5, verbose=2, random_state=42, n_jobs=-1)
    start = time.time()
    # Fit the model
    xgb_random.fit(X_train, Y_train)
    xgb_random.best_params_
    
    
    """
    best_params_={'learning_rate': 0.1,
     'max_depth': 2,
     'min_child_weight': 4,
     'n_estimators': 900,
     'subsample': 0.5}
    """
    model_xgb = XGBRegressor(**xgb_random.best_params_)
    #model_xgb = XGBRegressor(**best_params_)
    start=time.time()
    model_xgb.fit(x_train_rf,y_train_rf)
    end=time.time()
    evaluate(model_xgb, x_test_rf, y_test_rf,x_train_rf,y_train_rf)
    print('Time elapsed: %.4f seconds' % (end-start))
    
    
    
    y_xgb_predict=model_xgb.predict(train_linear_fea)
    x_line = np.arange(700000)
    y_line=x_line
    plt.scatter(real_train_tar,np.expm1(y_xgb_predict))
    plt.plot(x_line, y_line, color='r')
    plt.xlabel('Actual Sale Price')
    plt.ylabel('Predicted Sale Price')
    importance_xgb = pd.DataFrame({'features':train_linear_fea.columns, 'imp':model_xgb.feature_importances_}).\
                            sort_values('imp',ascending=False)
    
    importance_top20_xgb = importance_xgb.iloc[:20,]
    
    plt.barh(importance_top20_xgb.features, importance_top20_xgb.imp)
    plt.xlabel('Feature Importance')
    
    test_prediction_xgb=np.expm1(model_xgb.predict(test_linear))
    
    return(test_prediction_lasso, test_prediction_ridge, test_prediction_rf, test_prediction_xgb,y_lasso_predict, y_ridge_predict, y_rf_predict, y_xgb_predict)
Example #48
0
    lasso_cofficients.append(lasso.coef_)

# plot the relationship between lambda and the regression coefficients
plt.plot(Lambdas, lasso_cofficients)
# log-scale the x-axis
plt.xscale('log')
# set the x- and y-axis labels of the line plot
plt.xlabel('Lambda')
plt.ylabel('Coefficients')
# show the figure
plt.show()


# cross-validation for the LASSO regression model
lasso_cv = LassoCV(alphas = Lambdas, normalize=True, cv = 10, max_iter=10000)
lasso_cv.fit(X_train, y_train)
# report the best lambda value
lasso_best_alpha = lasso_cv.alpha_
lasso_best_alpha


# refit the model using the best lambda
lasso = Lasso(alpha = lasso_best_alpha, normalize=True, max_iter=10000)
lasso.fit(X_train, y_train)
# return the LASSO regression coefficients
pd.Series(index = ['Intercept'] + X_train.columns.tolist(),data = [lasso.intercept_] + lasso.coef_.tolist())

# predict
lasso_predict = lasso.predict(X_test)
# validate prediction accuracy
RMSE = np.sqrt(mean_squared_error(y_test,lasso_predict))
Example #49
0
                clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
            else:
                clf.fit(X_train, Y_train)

            one_result = clf.predict(X_cv)
            blend_train[cv_index, j] = one_result
            cv_score = gini_normalized(Y_cv, blend_train[cv_index, j])
            cv_results[j, i] = cv_score
            score_mae = metrics.mean_absolute_error(Y_cv, one_result)
            print ('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' % (i, cv_score, score_mae))
            blend_test_j[:, i] = clf.predict(X_test)
        blend_test[:, j] = blend_test_j.mean(1)
        print ('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' % (j, cv_results[j,].mean(), cv_results[j,].std()))

    end_time = datetime.now()
    time_taken = (end_time - start_time)
    print ("Time taken for pre-blending calculations: {0}".format(time_taken))
    print ("CV-Results", cv_results)
    print ("Blending models.")

    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, Y_dev)

    Y_test_predict = bclf.predict(blend_test)

    cv_score = cv_results.mean()
    print ('Avg. CV-Score = %s' % (cv_score))
    submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
    submission = submission.set_index('Id')
    submission.to_csv("farons_solution.csv")
Example #50
0
lassocv = LassoCV(cv=10, fit_intercept=False)

l = 1
seed(l)

for i in np.arange(len(ds)):
    d = ds[i]
    X = np.random.multivariate_normal(np.zeros(d), cov_mat[i], N)
    inv_cov = ndws(X)
    for ii in np.arange(len(ss)):
        s = ss[ii]
        beta_s = np.concatenate((np.ones(s), np.zeros(d - s)))
        mu = X.dot(beta_s)
        y = np.random.normal(mu, 1)
        try:
            lassocv.fit(X, y)
            beta_h = lassocv.coef_
            g = -X.T.dot(y - X.dot(beta_h)) / N
            beta_db = beta_h - inv_cov.dot(g)
            beta_d = beta_db - beta_s
            ts[i, ii, :, 0] = np.array([
                norm(beta_d, np.inf),
                norm(beta_d, 2),
                norm(beta_d, 1),
                np.abs(beta_d[1])
            ])
            ts[i, ii, :, 1] = np.array([
                norm(beta_d[:s], np.inf),
                norm(beta_d[:s], 2),
                norm(beta_d[:s], 1),
                np.abs(beta_d[:s][1])
from sklearn.linear_model import LassoCV
import pandas as pd

train_data=pd.read_csv(r'D:\sufe\A\data_train_changed.csv')  #raw string so the backslashes are not treated as escapes
train_data=train_data.ix[0:,1:].drop(['REPORT_ID',"ID_CARD",'LOAN_DATE'],1)
train_data=train_data.dropna()
# print(train_data.info())
X=train_data.drop(['Y'],1).as_matrix()#7
y=train_data['Y'].as_matrix()#1
lassocv = LassoCV()
lassocv.fit(X,y)
print(train_data.columns.drop('Y'),lassocv.coef_)
Example #52
0
    for feat in feats:
        print(gene_cols[feat])

    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    feat1 = X_transform[:, 0]
    feat2 = X_transform[:, 1]
    ax.scatter(feat1, feat2, tol)
    plt.xlabel("feat1")
    plt.ylabel("feat2")
    plt.show()

use_svr = input("Use svr? ")
if "yes" in use_svr:
    clf = SVR(kernel='linear', C=1.0, epsilon=0.2)
    clf.fit(X, y)
    selector = RFE(clf, 5, step=5)
    selector = selector.fit(X, y)
    gene_pos = selector.get_support(indices=True)
    for pos in gene_pos:
        print(gene_cols[pos] + '\t' + str(pos))

if False:  #disabled exploratory PCA / variance-threshold block
    pca = PCA()
    pca.fit(exp_matrix)

    selector = VarianceThreshold()
    selector.fit(exp_matrix)
    exp_vars = selector.variances_
    av_var = sum(exp_vars) / len(exp_vars)
    std_var = np.std(exp_vars)
for i in range(len(y)):
	if y[i]>0: binary_y_pre.append(1)
	else: binary_y_pre.append(0)
binary_y = np.array(binary_y_pre)

coef_path_linear_cv = LinearRegression(normalize=Normalize,fit_intercept=Fit_Intercept) 
coef_path_lasso_cv = LassoCV(normalize=Normalize, max_iter=Max_Iter, copy_X=True, cv=CV, verbose=Verbose, fit_intercept=Fit_Intercept, tol=Tol)#, alphas=Alphas) 
coef_path_elastic_cv = ElasticNetCV(normalize=Normalize,max_iter=Max_Iter, tol=Tol)#,alphas=Alphas)
coef_path_logistic_cv = LogisticRegression( tol=Tol)
coef_path_binary_x_logistic_cv = LogisticRegression( tol=Tol)
coef_path_forest_cv = RandomForestClassifier(n_estimators = N_Estimators, max_features=number_of_features)

binary_X = vectorizer_binary.fit_transform(corpus)
coef_path_forest_cv.fit(X,binary_y)
coef_path_lasso_cv.fit(X,y)
coef_path_binary_x_logistic_cv.fit(binary_X,binary_y)
coef_path_logistic_cv.fit(X,binary_y)
coef_path_elastic_cv.fit(X,y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(coef_path_binary_x_logistic_cv, binary_X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')

forest_results_parameters = [ coef_path_forest_cv.predict(X), coef_path_forest_cv.get_params, coef_path_forest_cv.feature_importances_, 
				coef_path_forest_cv.classes_, coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score, classification_report(binary_y, forest_results_parameters[0]), 'forest']

lasso_results_parameters = [coef_path_lasso_cv.predict(X), coef_path_lasso_cv.get_params, coef_path_lasso_cv.alphas_, coef_path_lasso_cv.coef_]  
Example #54
0
def feature_selection(method, instance_db, target, percentage):
    instance_db_values = instance_db.values
    if method == "lasso":
        lassocv = LassoCV(max_iter=5000, normalize=True, alphas=[0.0001])
        lassocv.fit(instance_db_values, target)

        # sort features according to coef_
        coef = abs(lassocv.coef_)
        feature = zip(instance_db.columns, coef)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])

        # the features that were chosen
        mask = pd.DataFrame([
            item[1] for item in feature
        ]) >= feature[int(len(feature) * percentage) - 1][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(
            -1)]
        feature_selected = [
            item for items in feature_selected.values for item in items
        ]

    if method == "pearson":
        scores, _ = pearson(instance_db_values, target)
        scores = np.nan_to_num(scores)
        scores = [abs(term) for term in scores]

        # sort features according to scores
        feature = zip(instance_db.columns, scores)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])

        # the features that were chosen
        mask = pd.DataFrame([
            item[1] for item in feature
        ]) >= feature[int(len(feature) * percentage - 1)][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(
            -1)]
        feature_selected = [
            item for items in feature_selected.values for item in items
        ]

    if method == "reliefF":
        select = ReliefF(n_neighbors=5)
        instance_db_values = np.array(instance_db_values, np.float64)
        max_batch_instance = 5000
        if instance_db_values.shape[0] > max_batch_instance:
            for i in range(
                    int(instance_db_values.shape[0] / max_batch_instance)):
                instance = instance_db_values[i * max_batch_instance:(i + 1) *
                                              max_batch_instance]
                tar = target[i * max_batch_instance:(i + 1) *
                             max_batch_instance]
                select.fit_transform(instance, tar)
                if i == 0:
                    feature_importances = select.feature_importances_
                else:
                    feature_importances += np.asarray(
                        select.feature_importances_)
            feature_importances = list(feature_importances)
        else:
            select.fit_transform(instance_db_values, target)
            feature_importances = list(select.feature_importances_)
        # sort features according to importance
        feature = zip(instance_db.columns, feature_importances)
        feature = sorted(feature, key=lambda tup: tup[1], reverse=True)
        features = pd.DataFrame([tup[0] for tup in feature])

        # the features that were chosen
        mask = pd.DataFrame([
            item[1] for item in feature
        ]) >= feature[int(len(feature) * percentage - 1)][1]
        feature_selected = pd.DataFrame(features)[:][np.asarray(mask).reshape(
            -1)]
        feature_selected = [
            item for items in feature_selected.values for item in items
        ]

    return feature_selected
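
# A minimal usage sketch for feature_selection on synthetic data (assumes the
# older scikit-learn API used throughout these examples); it keeps roughly the
# top 20% of columns ranked by absolute lasso coefficient:
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
demo_db = pd.DataFrame(rng.randn(100, 10),
                       columns=['g%d' % i for i in range(10)])
demo_target = 2 * demo_db['g0'].values - demo_db['g3'].values

kept = feature_selection('lasso', demo_db, demo_target, percentage=0.2)
print(kept)  # names of the retained feature columns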
Example #55
0
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
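        # (each positive beta is shrunk by the factor (1 - RT); the total mass given
        # up, RT * sum(positive betas), equals delta times the number of negative
        # betas, so the adjusted betas sum to the original positive-beta total)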
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
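        # (sklearn's Lasso minimizes ||y - Xw||^2 / (2 * n_samples) + alpha * ||w||_1,
        # whereas this lambda is defined for ||y - Xw||^2 / 2 + lambda * ||w||_1,
        # so the equivalent alpha is lambda / n_samples)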
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
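        # (NNLS constrains every coefficient to be non-negative, including the one
        # on the intercept column; refitting with a column of -1s lets a negative
        # intercept be expressed as a positive coefficient on that column)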
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
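    # (standardized beta_i = b_i * sd(feature_i) / sd(sc1);
    #  relative beta_i = beta_i / sum_j |beta_j|)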
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
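
# A minimal, self-contained sketch of the NNLR recipe above: use NNLS to pick
# the features with non-zero non-negative weights, then refit a plain OLS on
# just those features to recover R^2 and the usual fit statistics. The data
# and names below are illustrative only, not from the original experiment.
import numpy as np
from scipy.optimize import nnls
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X_demo = rng.rand(200, 5)
y_demo = X_demo @ np.array([1.5, 0.0, 2.0, 0.0, 0.5]) + rng.normal(scale=0.1, size=200)

# NNLS with an explicit intercept column of ones
X_aug = np.concatenate([np.ones((200, 1)), X_demo], axis=1)
coefs_demo, _ = nnls(X_aug, y_demo)

# keep only the features whose NNLS coefficient is non-zero
kept = [j for j, c in enumerate(coefs_demo[1:]) if c != 0]

# refit an unconstrained OLS on the kept features
lr = LinearRegression().fit(X_demo[:, kept], y_demo)
print(kept, lr.score(X_demo[:, kept], y_demo))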
print(n)
p = 101
K = 10  # K-fold CV
y = y.reshape(n)

alphas = np.exp(np.linspace(np.log(0.01), np.log(1), 100))  # Using log-scale
N = len(alphas)  # Number of lasso parameters

scores = np.zeros(N)
alpha = np.zeros(N)
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
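# repeat the random split N times; each LassoCV call cross-validates the full
# alpha grid on its own training split, and we record the winning alpha and
# the held-out score for that split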
for i in range(N):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
    clf = LassoCV(alphas=alphas, cv=K)  # search over the log-spaced grid defined above
    clf = clf.fit(X_train, y_train)
    scores[i] = clf.score(X_test, y_test)
    alpha[i] = clf.alpha_

scores = np.asarray(scores)
max_score_index = np.argmax(scores)
best_alpha = alpha[max_score_index]

print(best_alpha)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
clf = Lasso(alpha=best_alpha)
#clf = LassoCV(n_alphas = 100, cv = K, precompute='auto', n_jobs=2, normalize='True')
clf = clf.fit(X_train, y_train)
scores = clf.score(X_test, y_test)
print(predictor_var[0])
    print("EVS:",explained_variance_score(y_test,y_p))

md=dnn_reg(X_train,y_train,X_test,y_test)
reg_eval(X_test,y_test,md)

### LassoCV regression

def reg_eval2(y_test,model):
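    # note: X_test is read from the enclosing scope rather than passed in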
    y_pred=model.predict(X_test)
    print("evaluation the results for model:",model)
    print("MSE:",mean_squared_error(y_test,y_pred))
    print("R2:",r2_score(y_test,y_pred))
    print("EVS:",explained_variance_score(y_test,y_pred))

lasso = LassoCV(cv=5, random_state=0,max_iter=10000)
lasso.fit(X_train,y_train)
reg_eval2(y_test,lasso)

#ElasticNet Regression
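# l1_ratio=0.8 mixes the penalties: 80% L1 (sparsity) and 20% L2 (shrinkage)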
ela = ElasticNetCV(l1_ratio=0.8,normalize=True,max_iter=5000,random_state=77)
ela.fit(X_train,y_train)
print("R square:",ela.score(X_test,y_test))
reg_eval2(y_test,ela)


#SVR Regression
from sklearn.svm import LinearSVR
LSVR=LinearSVR(epsilon=0.1,random_state=0, tol=1e-5,max_iter=10000)
# scaler=RobustScaler()
# pipe=Pipeline(steps=[("scaling",scaler),("rg",LSVR)])
LSVR.fit(X_train,y_train)
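
# a sketch of the scaled variant hinted at by the commented-out lines above,
# assuming the usual sklearn imports; scaling often helps LinearSVR converge
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
pipe = Pipeline(steps=[("scaling", RobustScaler()),
                       ("rg", LinearSVR(epsilon=0.1, random_state=0, tol=1e-5, max_iter=10000))])
pipe.fit(X_train, y_train)
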
#display the coefficients for alpha = 0.25
print(pandas.DataFrame({'Variables': nom_var, 'Coefficients': coefs25}))

#cross-validation for the Lasso
from sklearn.linear_model import LassoCV

#tool for finding the best-performing solution under cross-validation
lcv = LassoCV(alphas=my_alphas,
              normalize=False,
              fit_intercept=False,
              random_state=0,
              cv=5)

#fit on the training sample
lcv.fit(ZTrain[:, :16], ZTrain[:, 16])

#alpha values that were tested
print(lcv.alphas_)

#cross-validated MSE values
print(lcv.mse_path_)

#mean cross-validated MSE for each alpha
avg_mse = numpy.mean(lcv.mse_path_, axis=1)

#alphas vs. cross-validated MSE
print(pandas.DataFrame({'alpha': lcv.alphas_, 'MSE': avg_mse}))

#as a plot
plt.plot(lcv.alphas_, avg_mse)
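
# the alpha ultimately selected by cross-validation (lowest mean MSE)
print(lcv.alpha_)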
msg("Fitting!")

weights = np.ones(train.shape[0])

do_statsmodels=True
if do_statsmodels:
    ols = sm.wls(formula=formula, data=train, weights=weights).fit()
    print(ols.summary())
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X,y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg({'mean' : np.mean, 'median' : np.median, 'stdev': np.std})
print(insample_scores)

msg("Error summary by ELO:")
elo_centuries = cut(yy_df['elo'], 20)
print(yy_df.groupby(elo_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean}))

msg("Error summary by gamenum:")
gamenum_centuries = cut(yy_df['gamenum'], 20)
print(yy_df.groupby(gamenum_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean}))
    print(train_data.shape)
    print(test_data.shape)

    params = {'loss': 'ls',
              'learning_rate': 0.022,
              'n_estimators': 2825,
              'max_depth': 4,
              'subsample': 0.9,
              'min_samples_split': 2,
              'min_samples_leaf': 1,
              'random_state': 1,
              'max_features': 'log2',
              'alpha': 0.9}
    model_gbr = GradientBoostingRegressor(**params)
    y_predict_gbr=model_gbr.fit(train_data, train_data_y).predict(test_data)
    model_Lasso = LassoCV(normalize=False, alphas=np.arange(0.0001,0.01,0.0001),cv=ShuffleSplit(n_splits=5,test_size=0.2),n_jobs=-1)
    y_predict_lasso=model_Lasso.fit(train_data, train_data_y).predict(test_data)
    model_bridge = BayesianRidge()
    y_predict_bridge=model_bridge.fit(train_data, train_data_y).predict(test_data)
    answer_true=pd.read_csv(r'D:\desktop\天池\AI\season two\[new] fusai_answer_a_20180127.csv',header=None).iloc[:,-1]

    p=np.arange(0.1,1.0,0.001)
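    # grid-search the blend weight between the GBR and Lasso predictions,
    # printing any mix whose MSE beats the 0.026 threshold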
    for i in list(p):
        y_predict=i*y_predict_gbr+(1-i)*y_predict_lasso
        mse1 = metrics.mean_squared_error(y_predict, answer_true)
        if mse1<0.026:
            print(mse1)