def bagging(self,trains,tests,train_y,model_name=None):
		blend_train = trains.T
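		# NOTE: normalize=True (below) was removed in scikit-learn 1.2; on modern
		# versions, standardize the inputs with a StandardScaler pipeline instead.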
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(tests.T)
		train_predict = bclf.predict(trains.T)

		return train_predict,y_test_predict
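# A minimal usage sketch (hypothetical data, not from the original source):
# `trains`/`tests` are assumed to have shape (n_models, n_samples), one row of
# base-model predictions per model, which `.T` turns into blending features.
import numpy as np

rng = np.random.default_rng(0)
demo_y = rng.normal(size=100)
demo_trains = np.vstack([demo_y + rng.normal(scale=s, size=100) for s in (0.1, 0.5)])
demo_tests = rng.normal(size=(2, 30))
# train_pred, test_pred = blender.bagging(demo_trains, demo_tests, demo_y)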
Example #2
def lassoCV_regression(data, target, alphas):
    clf = LassoCV(alphas=alphas)  # the alphas argument was previously unused
    sfm = SelectFromModel(clf, threshold=0.25)
    sfm.fit(data, target)
    data_transform = sfm.transform(data)  # defined up front in case the loop never runs
    n_features = data_transform.shape[1]

    # Raise the threshold until at most two features remain.
    while n_features > 2:
        sfm.threshold += 0.1
        data_transform = sfm.transform(data)
        n_features = data_transform.shape[1]

    rmses = []
    kf = KFold(n_splits=10, shuffle=True)  # modern sklearn KFold API
    for train_index, test_index in kf.split(data_transform):
        data_train, data_test = data_transform[train_index], data_transform[test_index]
        target_train, target_test = target[train_index], target[test_index]
        clf.fit(data_train, target_train)
        rmse = sqrt(np.mean((clf.predict(data_test) - target_test) ** 2))
        rmses.append(rmse)
        
    x0=np.arange(1,11)
    
    plt.figure()
    plt.plot(x0,rmses,label='LassoCV')
    plt.legend()
    plt.show()
    
    return rmses
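# A hedged usage sketch on synthetic data (imports and names here are
# assumptions, since the snippet above omits its own imports):
import numpy as np
import matplotlib.pyplot as plt
from math import sqrt
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.model_selection import KFold

X_demo, y_demo = make_regression(n_samples=200, n_features=10, noise=5.0,
                                 random_state=0)
fold_rmses = lassoCV_regression(X_demo, y_demo, alphas=np.logspace(-3, 1, 30))
print(np.mean(fold_rmses))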
Example #3
def lasso_cv(x, y, x_pred=None, max_deg=3, cv=10, max_iter=1000, return_model=False):
    """LASSO polynomial fit with cross-validation.
    
    Regularized polynomial regression (by penalized least-squares) from a
    range of degrees up to n = max_deg. The LASSO regression minimises MSE and
    penalizes the size of the parameter vector using L1-norm, which leads to
    fewer coefficients in the fitted model.

    - The 'alpha' parameter (amount of penalization) is selected by k-fold CV.
    - Predicts fitted model on given values 'x_pred' (default use 'x').
    - Supports NaNs.

    """
    ind, = np.where((~np.isnan(x)) & (~np.isnan(y)))
    x_, y_ = x[ind], y[ind]
    X_ = dmatrix('C(x_, Poly)')
    if x_pred is None:
        X = dmatrix('C(x, Poly)')      # predict on original values
    else:
        X = dmatrix('C(x_pred, Poly)') # predict on given values
    lasso = LassoCV(cv=cv, copy_X=True, normalize=True, max_iter=max_iter)
    lasso = lasso.fit(X_[:,1:max_deg+1], y_)
    y_pred = lasso.predict(X[:,1:max_deg+1])
    if return_model:
        y_pred = [y_pred, lasso]
    return y_pred
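# A minimal usage sketch (synthetic data; assumes patsy's dmatrix and numpy are
# imported as in the function above, and a scikit-learn version that still
# accepts LassoCV's old `normalize` flag):
import numpy as np

x_demo = np.linspace(-1.0, 1.0, 200)
y_demo = 1.5 * x_demo**3 - x_demo + np.random.default_rng(0).normal(scale=0.1, size=200)
y_demo[::17] = np.nan            # NaNs are dropped inside lasso_cv
y_fit = lasso_cv(x_demo, y_demo, max_deg=3, cv=10)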
def remove_foreground_glm(
        x, y,
        spatial_mask=None, spectral_mask=None,
        alphas=None, l1_ratio=1.):
    """Summary

    Args:
        x (TYPE): Description
        y (TYPE): Description
        spatial_mask (TYPE, optional): Description
        spectral_mask (TYPE, optional): Description
        alphas (TYPE, optional): Description

    Returns:
        TYPE: Description
    """

    # cast to double and reshape
    x_rs = np.float64(x.reshape((x.shape[0], -1))).T
    y_rs = np.float64(y.flatten())

    if spatial_mask is None:
        spatial_mask_rs = np.ones_like(y_rs, dtype=bool)
    else:
        spatial_mask_rs = spatial_mask.flatten()

    if spectral_mask is None:
        spectral_mask = np.ones(x_rs.shape[1], dtype=bool)

    if alphas is not None:
        alphas = np.atleast_1d(alphas)

    # fit GLM
    if l1_ratio == 1.:
        reg = LassoCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            max_iter=5000
        )
    elif l1_ratio == 0.:
        reg = RidgeCV(
            alphas=alphas,
        )
    else:
        reg = ElasticNetCV(
            positive=True,
            alphas=alphas,
            n_jobs=-1,
            l1_ratio=l1_ratio
        )

    reg.fit(x_rs[spatial_mask_rs][:, spectral_mask], y_rs[spatial_mask_rs])

    y_model = reg.predict(x_rs[:, spectral_mask]).reshape(y.shape)

    glm_coeffs = np.zeros(x_rs.shape[1], dtype=np.float32)
    glm_coeffs[spectral_mask] += reg.coef_

    return y_model, reg, glm_coeffs
	def predict(self,trains_x,train_y,tests_x,parameters,times=10,isFile=True,foldername="blend-dir"):
		"""
		Ensamble many features and regression

		:params train_X: dictionary for training
		:params train_y: testing vector
		"""
		# grab any one test matrix to size the blend arrays
		test_data_sample = list(tests_x.values())[0]

		if not os.path.exists(foldername):
			os.makedirs(foldername)

		skf = None
		kfold_file = foldername + "/kfold_index.pkl"
		if os.path.exists(kfold_file):
			skf = pickle.load(open(kfold_file, "rb"))  # pickle needs binary mode
		else:
			skf = list(KFold(n_splits=times, shuffle=True).split(train_y))
			pickle.dump(skf, open(kfold_file, "wb"))

		blend_train = np.zeros((len(train_y),len(parameters)))
		blend_test = np.zeros((len(test_data_sample),len(parameters)))

		for j,parameter in enumerate(parameters):
			train_x = trains_x[parameter['data']]
			test_x = tests_x[parameter['data']]

			blend_test_tmp = np.zeros((len(test_data_sample), times))  # one column per fold, not per parameter

			#file path check
			for i, (train_index,valid_index) in enumerate(skf):
				clf = model_select(parameter['parameter'])

				train = train_x[train_index]
				train_valid_y = train_y[train_index]

				kfold_filepath = "./" + foldername + "/parameter_{}_kfold_{}.pkl".format(j,i)

				if os.path.exists(kfold_filepath):
					# reuse the cached fold predictions; the original also called
					# clf.predict here, but clf is unfitted at this point
					blend_train_prediction, blend_test_prediction = pickle.load(open(kfold_filepath, "rb"))
				else:
					clf.fit(train,np.log1p(train_valid_y))
					blend_train_prediction = np.expm1(clf.predict(train))
					blend_test_prediction = np.expm1(clf.predict(test_x))
					pickle.dump((blend_train_prediction, blend_test_prediction), open(kfold_filepath, "wb"))

				blend_train[train_index,j] = blend_train_prediction
				blend_test_tmp[:,i] = blend_test_prediction
			blend_test[:,j] = blend_test_tmp.mean(1)

		#Blending Model
		bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
		bclf.fit(blend_train, train_y)
		y_test_predict = bclf.predict(blend_test)

		return y_test_predict
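# The schema of `parameters` is not documented above; a hedged sketch of a
# plausible call, inferred from the loop (every name here is hypothetical,
# including `model_select`, which must map a parameter dict to an estimator):
parameters_demo = [
    {"data": "tfidf",  "parameter": {"model": "ridge", "alpha": 1.0}},
    {"data": "counts", "parameter": {"model": "gbr", "n_estimators": 200}},
]
# trains_x/tests_x map each feature-set name to its feature matrix:
# y_test_pred = ensembler.predict({"tfidf": tr1, "counts": tr2}, train_y,
#                                 {"tfidf": te1, "counts": te2}, parameters_demo)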
def fit_Lasso(features_train, labels_train, features_pred):
	model = LassoCV()
	model.fit(features_train, labels_train)
	mse = model.mse_path_
	print("LASSO - MSE path shape (n_alphas, n_folds):", mse.shape)
	# Test the model
	labels_pred = model.predict(features_pred)
	return labels_pred
def make_model_and_predict(train_file, test_file):
    """Given the name of a training csv file and the name of a test csv file,
    constructs a LassoCV model and writes predictions to a time-stamped csv
    file. If the test file has SalaryNormalized as an attribute, it also scores
    the model and writes the result to the file "score<datetime>".
    """

    train = pd.read_csv(train_file)
    valid = pd.read_csv(test_file)
    number_of_word_features = 200
    title_words = count_words_in_column(train, "Title")
    key_count_pairs = [(k,v) for (k,v) in title_words.items() if k not in
                                                stopwords.words('english')]

    key_count_pairs.sort(key=lambda kv: -kv[1])

    for word, count in key_count_pairs[:number_of_word_features]:
        add_appearance_count_feature(train, word, "Title")
        add_appearance_count_feature(valid, word, "Title")


    group_features = ["LocationNormalized", "Category", "Company", "SourceName"]

    for f in group_features:
        continuize_feature(train, valid, f, "SalaryNormalized")

    feature_columns = train.columns[12:]

    feature=train[feature_columns]
    label=train.SalaryNormalized
    clf = LassoCV()
    clf.fit(feature, label)

    valid_salary_predict = clf.predict(valid[feature_columns])
    valid["SalaryNormalized_Predict"] = valid_salary_predict

    date_string = re.sub("[ :.]", "", str(datetime.datetime.now()))
    predict_filename = 'predict' + date_string + '.csv'
    score_filename = 'score' + date_string + '.txt'
    with open(predict_filename, 'w') as f:
        valid[["Id","SalaryNormalized_Predict"]].to_csv(f, index=False,
                                                    header=False)

    ##Computes average RMS error and writes score to file
    if hasattr(valid, 'SalaryNormalized'):
        score = 0
        for i,_ in enumerate(valid["SalaryNormalized_Predict"]):
            score += (valid.SalaryNormalized[i] -
                                valid.SalaryNormalized_Predict[i]) **2
        score = math.sqrt(score/len(valid["SalaryNormalized_Predict"]))
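        # A vectorized equivalent of the loop above (same RMSE), shown as a sketch
        # using sklearn's metric rather than the original author's code:
        from sklearn.metrics import mean_squared_error
        score_check = math.sqrt(mean_squared_error(
            valid["SalaryNormalized"], valid["SalaryNormalized_Predict"]))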
        with open(score_filename, 'w') as f:
            f.write("Train: " + train_file + "\n")
            f.write("Test: " + test_file + "\n")
            f.write("Score: " + str(score) + "\n")
def get_model_per_cluster(X, Y):
    model_per_cluster = {}
    for c in X.cluster.unique():    
        X_cluster = X[X.cluster==c]
        Y_true = Y[Y.cluster == c].ALSFRS_slope
        
        regr = LassoCV(cv=5)
        regr.fit(X_cluster, Y_true)

        print('cluster: %d size: %s' % (c, Y_true.shape))
        Y_predict = regr.predict(X_cluster)
        print "\t RMS error (0 is perfect): %.2f" % np.sqrt(np.mean(
            (Y_predict - Y_true) ** 2))
        regression_SS = ((Y_predict - Y_true) ** 2).sum()
        residual_SS =((Y_true - Y_true.mean()) ** 2).sum()
        print '\t coefficient of determination R^2 = %.2f ' % (1.0 - regression_SS/residual_SS) # regr.score(X_cluster, Y_true)
        cov = sum((Y_predict - Y_predict.mean())*(Y_true - Y_true.mean()))
        Y_predict_std = np.sqrt(sum((Y_predict - Y_predict.mean())**2))
        Y_true_std = np.sqrt(sum((Y_true - Y_true.mean())**2))
        print('\t pearson correlation r = %.2f ' % (cov / (Y_predict_std * Y_true_std)))  # scipy.stats.pearsonr(Y_predict, Y_true)[0]
        print("3 sample predictions: ", regr.predict(X_cluster)[:3])
        model_per_cluster[c] = {"cluster_train_data_means": X_cluster.mean(), "model" : regr}
    return model_per_cluster
def lassoRegularization(X,Y):
    """
    :param X: data consisting of features (excluding class variable)
    :param Y: column vector consisting of class variable
    :return: None; prints the best alpha and the corresponding RMSE
    """
    tuningAlpha = [0.1,0.01,0.001]
    lasso = LassoCV(normalize=True, alphas=tuningAlpha, cv=10)
    lasso.fit(X,Y)
    prediction = lasso.predict(X)

    print()
    print("LASSO REGULARIZATION")
    print("Best Alpha value for Lasso Regularization : " + str(lasso.alpha_))
    print('Best RMSE for corresponding Alpha =', np.sqrt(mean_squared_error(Y, prediction)))
class LocalRegression:
    """This class implements "local" regression. Given a set of training data and a set of unknown data,
           iterate through each unknown spectrum, find the nearest training spectra, and generate a model.
           Each of these local models is optimized using built-in cross validation methods from scikit."""
    def __init__(self, params, n_neighbors = 250):
        """Initialize LocalRegression

        Arguments:
        params = Dict containing the keywords and parameters for the regression method to be used.

        Keyword arguments:
        n_neighbors = User-specified number of training spectra to use to generate the local regression model for each
                      unknown spectrum.

        """
        self.model = LassoCV(**params) # For now, the only option is LASSO. Other methods to be added in the future
                                       # params is a dict containing the keywords and parameters for LassoCV

        self.neighbors = NearestNeighbors(n_neighbors=n_neighbors)

    def fit_predict(self,x_train,y_train, x_predict):
        """Use local regression to predict values for unknown data.

        Arguments:
            x_train = The training data spectra.
            y_train = The values of the quantity being predicted for the training data
            x_predict = The unknown spectra for which y needs to be predicted.
        """
        self.neighbors.fit(x_train)
        predictions = []
        coeffs = []
        intercepts = []
        for i in range(x_predict.shape[0]):
            print('Predicting spectrum ' + str(i + 1))
            x_temp = np.array(x_predict[i])
            _, ind = self.neighbors.kneighbors([x_temp])
            x_train_local = np.squeeze(x_train[ind])
            y_train_local = np.squeeze(y_train[ind])

            # LassoCV performs its own internal cross-validation when fitting;
            # the unused GroupKFold splitter that was built here has been removed.
            self.model.fit(x_train_local, y_train_local)
            predictions.append(self.model.predict([x_temp])[0])
            coeffs.append(self.model.coef_)
            intercepts.append(self.model.intercept_)
        return predictions, coeffs, intercepts
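# A hedged usage sketch with synthetic "spectra" (all data hypothetical):
import numpy as np

rng = np.random.default_rng(0)
x_tr = rng.normal(size=(500, 40))        # 500 training spectra, 40 channels
y_tr = x_tr[:, :3].sum(axis=1)           # quantity to predict
x_unk = rng.normal(size=(5, 40))         # unknown spectra

local = LocalRegression(params={"max_iter": 10000}, n_neighbors=100)
preds, coeffs, intercepts = local.fit_predict(x_tr, y_tr, x_unk)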
Example #11
def lassocvclassifier(training_samples, eval_samples, vectorizer, do_grid_search=False):
    X_train, Y_train = training_samples
    X_eval, Y_eval = eval_samples
    #clf = SGDClassifier(loss='log', penalty= 'l2',l1_ratio=0.0, n_iter=30, shuffle=True, verbose=False, 
    #                    n_jobs=4, alpha=1e-4, average=True, class_weight=None)
    clf = LassoCV()
   
    clf.fit(X_train, Y_train)
    #y_train_true, y_train_pred = Y_train, clf.predict(X_train)
    print_top_10_words = True

    # NOTE: LassoCV is a regressor, so classification scorers such as
    # 'log_loss' and the predict_proba call that followed are unavailable
    # on it; score it with a regression metric instead.
    scores = cross_val_score(clf, X_train, Y_train, cv=5, n_jobs=5,
                             scoring='neg_mean_squared_error')
    print(scores, np.mean(scores), np.median(scores))

    print(clf)
    #scores = cross_validation.cross_val_score(clf.best_estimator_, X_train, Y_train, cv=10, scoring='log_loss')
    #print scores, np.mean(scores), np.median(scores)
    y_true, y_pred = Y_eval, clf.predict(X_eval)
    test_data["casual_log"], feature_engg_linreg_model.predict(test_data.drop(target, axis=1))
)


# Not much difference? > Doesn't look like we are overfitting!

# But how to perform shrinkage/penalized regression in general?

from sklearn.linear_model import LassoCV

feature_engg_lassocv_model = LassoCV(max_iter=50, cv=3, n_jobs=-1, random_state=42)

feature_engg_lassocv_model.fit(train_data.drop(target, axis=1), train_data["casual_log"])

feature_engg_lassocv_mse_train = metrics.mean_squared_error(
    train_data["casual_log"], feature_engg_lassocv_model.predict(train_data.drop(target, axis=1))
)

feature_engg_lassocv_mse_test = metrics.mean_squared_error(
    test_data["casual_log"], feature_engg_lassocv_model.predict(test_data.drop(target, axis=1))
)


# Check the performance on test set
print(feature_engg_linreg_mse_test)
print(feature_engg_lassocv_mse_test)
# Penalization decreases performance?

# Compare coefficients with non penalized model
print(feature_engg_linreg_model.coef_[1:10])
print(feature_engg_lassocv_model.coef_[1:10])
Example #13
print("Try again for more precision with alphas centered around " + str(alpha))
lasso = LassoCV(alphas=[
    alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
    alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
    alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y_train)
alpha = lasso.alpha_
print("Best alpha :", alpha)

print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())
print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
y_train_las = lasso.predict(X_train)
y_test_las = lasso.predict(X_test)

# Plot residuals
plt.scatter(y_train_las,
            y_train_las - y_train,
            c="blue",
            marker="s",
            label="Training data")
plt.scatter(y_test_las,
            y_test_las - y_test,
            c="lightgreen",
            marker="s",
            label="Validation data")
plt.title("Linear regression with Lasso regularization")
plt.xlabel("Predicted values")
Example #14
print(y_test)

# train

alpha = [0.01, 0.1, 1, 10, 100, 1000]
lasso = LassoCV(alphas=alpha, cv=5)
lasso.fit(x_train, y_train)

# alpha

alpha = lasso.alpha_
print('best alpha is : ' + str(alpha))

# test

y_train_pred = lasso.predict(x_train)
y_test_pred = lasso.predict(x_test)

# calculate

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print('RMSE of training dataset is : ' + str(rmse_train))
print('RMSE of test dataset is : ' + str(rmse_test))
# print(y_test_pred)

# plot

# y_test_pred_mean = y_test_pred.mean()
# y_test_pred_std = y_test_pred.std()
Example #15
# What does our prediction error look like?
from yellowbrick.regressor import PredictionError
prederr = PredictionError(lasso)
prederr.fit(Xtrain, ytrain)
prederr.score(Xtest, ytest)
g = prederr.poof()  # poof() was renamed show() in later yellowbrick releases


# Next, we pull out our fitted values (yhat) and actuals (ytest) to see how they compare.
# We also calculate our residuals by subtracting our fitted values from the actuals.
import matplotlib.pyplot as plt

lasso.fit(Xtrain, ytrain)

yhat = lasso.predict(Xtest)
error = ytest - yhat

data = pd.DataFrame({'t': range(1, len(yhat) + 1),
                     'ytest': ytest,
                     'yhat': yhat,
                     'error': error})

plt.plot('t', 'ytest', data=data, color='blue', linewidth=1, label='actual')
plt.plot('t', 'yhat', data=data, color='orange', marker='o', linestyle="None", label='predicted', alpha=0.5)
plt.plot('t', 'error', data=data, color='gray')
plt.legend()
plt.show()

# Pickle model
import joblib  # sklearn.externals.joblib was removed in scikit-learn 0.23
Example #16
#print('\nTarget on train data',predict_train)
#
## Accuray Score on train dataset
#accuracy_train = accuracy_score(train_y,predict_train)
#print('accuracy_score on train dataset : ', accuracy_train)
#
## predict the target on the test dataset
#predict_test = model.predict(test_x)
#print('Target on test data',predict_test)
#
## Accuracy Score on test dataset
#accuracy_test = accuracy_score(test_y,predict_test)
#print('accuracy_score on test dataset : ', accuracy_test)

###########################################################################
#############   LINEAR REGRESSION       ####################################
###########################################################################

reg = LinearRegression()
reg.fit(X_train, Y_train)
reg.score(X_test, Y_test)
from sklearn import ensemble
clf = ensemble.GradientBoostingRegressor(n_estimators=400,
                                         max_depth=5,
                                         min_samples_split=2,
                                         learning_rate=0.1,
                                         loss='ls')
clf.fit(X_train, Y_train)
clf.score(X_test, Y_test)
y_pred = reg.predict(X_test)
rmse_cv(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index = X_train.columns)
print("Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " +  str(sum(coef == 0)) + " variables")

imp_coef = pd.concat([coef.sort_values().head(10),
                     coef.sort_values().tail(10)])

matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind = "barh")
plt.title("Coefficients in the Lasso Model")

#let's look at the residuals as well:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

preds = pd.DataFrame({"preds":model_lasso.predict(X_train), "true":y})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x = "preds", y = "residuals",kind = "scatter")

preds = np.expm1(model_lasso.predict(X_test))

#Adding an xgboost model
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label = y)
dtest = xgb.DMatrix(X_test)

params = {"max_depth":2, "eta":0.1}
model = xgb.cv(params, dtrain,  num_boost_round=500, early_stopping_rounds=100)
model.loc[30:,["test-rmse-mean", "train-rmse-mean"]].plot()
plt.ylim(ymin, ymax)  # ymin/ymax come from earlier in the original script
##get the coeff for each column
coef = pd.Series(model_lasso.coef_, index = X_train1.columns).sort_values(ascending=False) 

plt.figure(figsize=(10, 5))
coef.head(20).plot(kind='bar')
plt.title('Feature Importance in Lasso Model')
plt.tight_layout()

imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])
plt.figure(figsize=(8, 10))
imp_coef.plot(kind = "barh")
plt.title("Coefficients in Lasso Model")
plt.tight_layout()

preds_lasso = pd.DataFrame({"preds":model_lasso.predict(X_test1), "true":y_test1})
preds_lasso["residuals"] = preds_lasso["true"] - preds_lasso["preds"]
preds_lasso["residuals"].abs().mean() #0.0807606
preds_lasso.plot(x = "preds", y = "residuals",kind = "scatter")

plt.figure(figsize=(10, 5))
plt.scatter(y_test1, preds_lasso["preds"], s=20)
plt.title('Predicted vs. Actual')
plt.xlabel('Actual Sale Price')
plt.ylabel('Predicted Sale Price')
plt.plot([min(y_test1), max(y_test1)], [min(y_test1), max(y_test1)])
plt.tight_layout()

#####xgboost model##########
import xgboost as xgb
Example #19
def lasso_model(data, y):
    X_train, X_test, y_train, y_test = train_test_split(data,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)
    scorer = make_scorer(mean_squared_error, greater_is_better=False)

    def rmse_cv_train(model):
        rmse = np.sqrt(
            -cross_val_score(model, X_train, y_train, scoring=scorer, cv=10))
        return (rmse)

    def rmse_cv_test(model):
        rmse = np.sqrt(
            -cross_val_score(model, X_test, y_test, scoring=scorer, cv=10))
        return (rmse)

    lasso = LassoCV(alphas=[
        0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1,
        0.3, 0.6, 1
    ],
                    max_iter=50000,
                    cv=10)
    lasso.fit(X_train, y_train)
    alpha = lasso.alpha_
    print("Best alpha :", alpha)

    print("Try again for more precision with alphas centered around " +
          str(alpha))
    lasso = LassoCV(alphas=[
        alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8,
        alpha * .85, alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1,
        alpha * 1.15, alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
    ],
                    max_iter=50000,
                    cv=10)
    lasso.fit(X_train, y_train)
    alpha = lasso.alpha_
    print("Best alpha :", alpha)

    print("Lasso RMSE on Training set :", rmse_cv_train(lasso).mean())
    print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
    y_train_las = lasso.predict(X_train)
    y_test_las = lasso.predict(X_test)

    # Plot residuals
    plt.scatter(y_train_las,
                y_train_las - y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_las,
                y_test_las - y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Residuals")
    plt.legend(loc="upper left")
    plt.hlines(y=0, xmin=10.5, xmax=13.5, color="red")
    plt.show()

    # Plot predictions
    plt.scatter(y_train_las,
                y_train,
                c="blue",
                marker="s",
                label="Training data")
    plt.scatter(y_test_las,
                y_test,
                c="lightgreen",
                marker="s",
                label="Validation data")
    plt.title("Linear regression with Lasso regularization")
    plt.xlabel("Predicted values")
    plt.ylabel("Real values")
    plt.legend(loc="upper left")
    plt.plot([10.5, 13.5], [10.5, 13.5], c="red")
    plt.show()

    # Plot important coefficients
    coefs = pd.Series(lasso.coef_, index=X_train.columns)
    print("Lasso picked " + str(sum(coefs != 0)) + " features and eliminated the other " + \
          str(sum(coefs == 0)) + " features")
    imp_coefs = pd.concat(
        [coefs.sort_values().head(10),
         coefs.sort_values().tail(10)])
    imp_coefs.plot(kind="barh")
    plt.title("Coefficients in the Lasso Model")
    plt.show()

    return lasso
Example #20

# Train the model.
xgbReg.fit(x_train, y_train)

# Compute the error.
print("xgb score:")
get_score(prediction=xgbReg.predict(x_train), lables=y_train)
y_pred_xgb = xgbReg.predict(x_test)

# In[72]:

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(x_train, y_train)
#model_lasso = Lasso(alpha=0.00099, max_iter=50000)
#model_lasso.fit(x_train_st,y_train)
get_score(prediction=model_lasso.predict(x_train), lables=y_train)
y_pred_lasso = model_lasso.predict(x_test)

# In[78]:

model_elastic = ElasticNetCV(alphas=[0.0001, 0.0005, 0.001, 0.01, 0.1, 1, 10],
                             l1_ratio=[.01, .1, .5, .9, .99],
                             max_iter=5000).fit(x_train, y_train)
get_score(prediction=model_elastic.predict(x_train), lables=y_train)
y_pred_elastic = model_elastic.predict(x_test)

y_pred = (y_pred_xgb * 0.3 + y_pred_lasso * 0.4 + y_pred_elastic * 0.3
          )  # submission-31-12-4
# y_pred = (y_pred_xgb*0.3 + y_pred_lasso*0.45 + y_pred_elastic*0.25) # submission-31-12-5
# y_pred = (y_pred_xgb + y_pred_elastic+ y_pred_lasso)/3 # submission-31-12-6
y_pred = np.exp(y_pred)
Example #21
print(coef_path_forest_cv.get_params)
print(coef_path_forest_cv.feature_importances_)
forest_prediction = coef_path_forest_cv.predict(X)
forest_score = coef_path_forest_cv.score(X, y)
print("Forest_score: %.3g" % forest_score)
# sklearn.cross_validation was removed in 0.20; cross_val_score now lives in sklearn.model_selection
forest_cv_score = cross_val_score(coef_path_forest_cv, X, y, n_jobs=2, cv=5)
print(forest_cv_score)

print("########LASSO######")
coef_path_lasso_cv.fit(X, y)
print(coef_path_lasso_cv.get_params)
print("alphas:")
print(coef_path_lasso_cv.alphas_)
print("coef_:")
print(coef_path_lasso_cv.coef_)
lasso_prediction = coef_path_lasso_cv.predict(X)
lasso_score = coef_path_lasso_cv.score(X, y)
print("Lasso_score: %.3g" % lasso_score)
#print("Lasso precision: %.3g" % precision_score(y, lasso_prediction))
#print("Lasso_confusion matrix:")
#print(confusion_matrix(y, lasso_prediction))
lasso_cv_score = cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=5)
print(lasso_cv_score)
plt.figure()
plt.hist2d(y, lasso_prediction)
plt.ylabel("Predicted Values")
plt.xlabel("Truth Values")
plt.title("Lasso Linear Regression")
plt.savefig("figures/lasso_predicted_truth.png")
print "#######ELASTIC#####"
coef_path_elastic_cv.fit(X,y)
Example #22
test = pd.read_csv('test.csv')
ids = test['User_ID'].values
pid = test['Product_ID'].values
features_test = test.drop('User_ID', axis=1)


le = LabelEncoder()
print("assuming text variables are categorical & replacing them with numeric ids\n")
for c in featureNames:
    # features_train[c] = features_train[np.isnan(features_train[c])] = -1
    # features_test[c] = features_test[np.isnan(features_test[c])] = -1
    if features_train[c].dtype.name == 'object':
        le.fit(np.append(features_train[c], features_test[c]))
        features_train[c] = le.transform(features_train[c]).astype(int)
        features_test[c] = le.transform(features_test[c]).astype(int)

features_train = features_train.fillna(0)
features_test = features_test.fillna(0)
dhackLassoModel = LassoCV(cv = 10).fit(features_train,labels_train)
pred = dhackLassoModel.predict(features_test)

submission = pd.DataFrame({"User_ID": ids, "Product_ID": pid, "Purchase": pred})
#print out the value of alpha that minimizes the CV-error
print("alpha Value that Minimizes CV Error ",dhackLassoModel.alpha_)
print("Minimum MSE ", min(dhackLassoModel.mse_path_.mean(axis=-1)))

submission.to_csv("submissionLassocv.csv", index=False)


Example #23
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_generator: object GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_generator=None):
        self.tree_generator = tree_generator


    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        ## Enumerate features if feature names not provided
        if feature_names is None:
            self.feature_names = ['feature_' + str(x) for x in range(0, X.shape[1])]
        else:
            self.feature_names=feature_names

        ## initialise tree generator
        if self.tree_generator is None:
            self.tree_generator = GradientBoostingRegressor()

        if type(self.tree_generator) not in [GradientBoostingRegressor,
                                             GradientBoostingClassifier,
                                             RandomForestRegressor,
                                             RandomForestClassifier]:
            raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
        ## TODO: Error if tree generator not GB nor RF

        ## fit tree generator
        self.tree_generator.fit(X, y)

        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or isinstance(self.tree_generator, RandomForestClassifier):
             tree_list = [[x] for x in self.tree_generator.estimators_]
        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list = tree_list,
                                          feature_names=self.feature_names)

        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)
        ## No rules found
        if X_rules.shape[0] == 0:
            X_concat = X
        else:
            X_concat = np.concatenate((X, X_rules), axis=1)

        ## initialise Lasso
        self.lscv = LassoCV()

        ## fit Lasso
        self.lscv.fit(X_concat, y)
        return self

    def predict(self, X):
        """Predict outcome for X

        """

        X_rules = self.rule_ensemble.transform(X)
        X_concat = np.concatenate((X, X_rules), axis=1)

        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self, exclude_zero_coef=True):
        """Return the estimated rules

        Parameters
        ----------
        exclude_zero_coef: If True (default), returns only the rules with an estimated
                           coefficient not equal to zero.

        Returns
        -------
        rules: pandas.DataFrame with the rules. Column 'rule' describes the rule, 'coef' holds
               the coefficients and 'support' the support of the rule in the training
               data set (X)
        """

        n_features= len(self.lscv.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        ## Add coefficients for linear effects
        for i in range(0, n_features):
            output_rules += [(self.feature_names[i], 'linear', self.lscv.coef_[i], 1)]
        ## Add rules
        for i in range(0, len(self.rule_ensemble.rules)):
            rule = rule_ensemble[i]
            output_rules += [(rule.__str__(), 'rule', self.lscv.coef_[i + n_features],  rule.support)]
        rules = pd.DataFrame(output_rules, columns=["rule", "type","coef", "support"])
        if exclude_zero_coef:
            rules = rules[rules.coef != 0]  # DataFrame.ix was removed from pandas
        return rules
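# A hedged usage sketch (assumes the RuleEnsemble dependency used by the class
# is importable; the data is synthetic):
from sklearn.datasets import make_friedman1

X_demo, y_demo = make_friedman1(n_samples=300, n_features=10, random_state=0)
rf = RuleFit()
rf.fit(X_demo, y_demo)
print(rf.predict(X_demo[:5]))
print(rf.get_rules().head())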
Example #24
print "2.5.b) Lambda déterminé par LassoCV:", clf.alpha_
# plot lasso path with CV choice
ax1.axvline(clf.alpha_, color='K',linestyle='-', linewidth= 3)
plt.annotate('CV',
         xy=(1.1*clf.alpha_,0.2), xycoords='data',
         xytext=(0, 0), textcoords='offset points', fontsize=18)
plt.show(block=False)
filename="lassoCV"
image_name=dirname+filename+imageformat
fig2.savefig(image_name)


################------ Exercice 2.5c ------###############################

xnew = [6,0.3,0.2,6,0.053,25,149,0.9934,3.24,0.35,10]
scorenew = clf.predict(xnew)
print "2.5.c) Prédiction de score pour xnew = \
    [6,0.3,0.2,6,0.053,25,149,0.9934,3.24,0.35,10]: ", scorenew 



################------ Exercise 2.5d ------###############################

mymodel = linear_model.LinearRegression(fit_intercept=False)
mymodel.fit(X_cr, y_cr)
print("OLS coefficients")
#print([mymodel.coef_, mymodel.intercept_])
print(mymodel.coef_)
theta_LR = mymodel.coef_

# mymodel = linear_model.LinearRegression(fit_intercept=True)
Example #25
# - [LassoCV](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html): lasso regression with built-in cross-validation of the alpha parameter
# - **n_alphas:** number of alpha values (automatically chosen) to try

# select the best alpha with LassoCV
from sklearn.linear_model import LassoCV
lassoregcv = LassoCV(n_alphas=100, normalize=True, random_state=1)
lassoregcv.fit(X_train, y_train)
lassoregcv.alpha_


# examine the coefficients
print(lassoregcv.coef_)


# predict method uses the best alpha value
y_pred = lassoregcv.predict(X_test)
print(np.sqrt(metrics.mean_squared_error(y_test, y_pred)))


# ## Part 5: Regularized classification in scikit-learn
# 
# - Wine dataset from the UCI Machine Learning Repository: [data](http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data), [data dictionary](http://archive.ics.uci.edu/ml/datasets/Wine)
# - **Goal:** Predict the origin of wine using chemical analysis

# ### Load and prepare the wine dataset

# read in the dataset
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data'
wine = pd.read_csv(url, header=None)
wine.head()
Example #26
class RuleFit(BaseEstimator, TransformerMixin):
    """Rulefit class


    Parameters
    ----------
        tree_generator: object GradientBoostingRegressor or GradientBoostingClassifier, optional (default=None)

    Attributes
    ----------
    rule_ensemble: RuleEnsemble
        The rule ensemble

    feature_names: list of strings, optional (default=None)
        The names of the features (columns)

    """
    def __init__(self,
                 tree_generator=None):
        self.tree_generator = tree_generator


    def fit(self, X, y=None, feature_names=None):
        """Fit and estimate linear combination of rule ensemble

        """
        self.feature_names=feature_names

        ## initialise tree generator
        if self.tree_generator is None:
            self.tree_generator = GradientBoostingRegressor()

        if type(self.tree_generator) not in [GradientBoostingRegressor,
                                             GradientBoostingClassifier,
                                             RandomForestRegressor,
                                             RandomForestClassifier]:
            raise ValueError("RuleFit only works with RandomForest and BoostingRegressor")
        ## TODO: Error if tree generator not GB nor RF

        ## fit tree generator
        self.tree_generator.fit(X, y)

        tree_list = self.tree_generator.estimators_
        if isinstance(self.tree_generator, RandomForestRegressor) or isinstance(self.tree_generator, RandomForestClassifier):
             tree_list = [[x] for x in self.tree_generator.estimators_]
        ## extract rules
        self.rule_ensemble = RuleEnsemble(tree_list = tree_list,
                                          feature_names=self.feature_names)

        ## concatenate original features and rules
        X_rules = self.rule_ensemble.transform(X)
        ## No rules found
        if X_rules.shape[0] == 0:
            X_concat = X
        else:
            X_concat = np.concatenate((X, X_rules), axis=1)

        ## initialise Lasso
        self.lscv = LassoCV()

        ## fit Lasso
        self.lscv.fit(X_concat, y)
        return self

    def predict(self, X):
        """Predict outcome for X

        """

        X_rules = self.rule_ensemble.transform(X)
        X_concat = np.concatenate((X, X_rules), axis=1)

        return self.lscv.predict(X_concat)

    def transform(self, X=None, y=None):
        """Transform dataset.

        Parameters
        ----------
        X : array-like matrix, shape=(n_samples, n_features)
            Input data to be transformed. Use ``dtype=np.float32`` for maximum
            efficiency.

        Returns
        -------
        X_transformed: matrix, shape=(n_samples, n_out)
            Transformed data set
        """
        return self.rule_ensemble.transform(X)

    def get_rules(self):
        n_features= len(self.lscv.coef_) - len(self.rule_ensemble.rules)
        rule_ensemble = list(self.rule_ensemble.rules)
        output_rules = []
        for i in range(0, len(self.rule_ensemble.rules)):  # the original "- 1" skipped the last rule
            rule = rule_ensemble[i]
            output_rules += [(rule.__str__(), self.lscv.coef_[i + n_features], rule.support)]
        return pd.DataFrame(output_rules, columns=["rule", "coef", "support"])
msg("Fitting!")

weights = np.ones(train.shape[0])

do_statsmodels = True
if do_statsmodels:
    ols = smf.wls(formula=formula, data=train, weights=weights).fit()  # formula API lives in statsmodels.formula.api (smf)
    print(ols.summary())
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X, y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg(
    ['mean', 'median', 'std'])  # dict-of-names .agg() was removed from pandas
print(insample_scores)

msg("Error summary by ELO:")
elo_centuries = pd.cut(yy_df['elo'], 20)
print(
    yy_df.groupby(elo_centuries)['ols_error'].agg({
        'sum': np.sum,
Example #28
pd.options.display.max_rows = 1999
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

df_list = [df_train, df_test]

df_final = pd.concat(df_list)

df_train_age = df_final[~np.logical_or(df_final['Age'].isnull(), df_final['Fare'].isnull())].copy()
df_train_age.loc[df_train_age['Fare'].isnull(), 'Fare'] = df_train_age['Fare'].dropna().median()

df_train_age['Sex'] = df_train_age['Sex'].map({'female': 0, 'male':
    1}).astype(int)

df_train_age.head(10)

predictors = ['Sex', 'SibSp', 'Parch', 'Fare']
print(df_train_age[predictors])
print('==================')
print(df_train_age[df_train_age['Age'].isnull()])
model = LassoCV(cv=10).fit(df_train_age[predictors], df_train_age['Age'])

df_test = pd.read_csv("test.csv")
df_test.loc[df_test['Fare'].isnull(), 'Fare'] = df_test['Fare'].dropna().median()
df_test['Sex'] = df_test['Sex'].map({'female': 0, 'male':
    1}).astype(int)
df_test['AgeFill'] = model.predict(df_test[predictors])

print(df_test[['Name', 'Sex', 'Age', 'AgeFill']])

        sample[fold_size * 2:fold_size * 3],
        sample[fold_size * 3:fold_size * 4],
        sample[fold_size * 4:fold_size * 5],
        sample[fold_size * 5:fold_size * 6],
        sample[fold_size * 6:fold_size * 7],
        sample[fold_size * 7:fold_size * 8],
        sample[fold_size * 8:fold_size * 9], sample[fold_size * 9:]
    ]
    for fold in tenFold_sample:
        test = fold
        train = [s for s in sample if s not in test]
        y_train = com_P2T_df.loc[train, :][column]
        x_train = com_comb_df.loc[train, :][lasso_otusPairs[column]]
        model = LinearRegression().fit(x_train, y_train)
        x_test = com_comb_df.loc[test, :][lasso_otusPairs[column]]
        P2T_pred_df.loc[test, column] = model.predict(x_test)

P2T_pred_df.fillna(0, inplace=True)

# pearson and spearman correlation
from scipy import stats

index_names = P2T_pred_df.columns.values
correlation_df = pd.DataFrame(index=index_names,
                              columns=[
                                  'pearsonCor', 'pearsonPvalue', 'spearmanCor',
                                  'spearmanPvalue', 'coef_num', 'zeroCoef_num'
                              ])
#print(correlation_df.head())

for column in P2T_pred_df:
Example #30
def stacking_train(x_train,y_train,list_index1,list_index2,x):

    k_fold=len(list_index1)
    
    m=5  # number of first-level models, i.e. feature count of the stacked training set
    k_fold_train_num=486  # number of samples in each fold


    n_train=len(x_train)  # number of samples in the stacked training set
    data_stacking_train=pd.DataFrame(np.zeros((n_train,m)))

    data_stacking_submit=pd.DataFrame(np.zeros((len(x),m)))

    models=[LassoCV(),
    RidgeCV(),
    GradientBoostingRegressor(n_estimators=150,max_depth=3),
    XGBRegressor(n_estimators=470,max_depth=2),
    BaggingRegressor(base_estimator=LassoCV(),n_estimators=50, max_samples=0.6, max_features=0.8,),

    ]



    for i in list(range(k_fold)):  # fold by fold, turn first-level model predictions into features of the stacking training set
        
        pred=[]
        pred_test=[]
        pred_submit=[]

        index1=list_index1[i]
        index2=list_index2[i]

        for model in models: 

            for n in index1:
                if n not in list(y_train.index):
                    print('no',n)
            
            model.fit(x_train.iloc[index1,:],y_train.iloc[index1,:])
            pred.append(model.predict(x_train.iloc[index2,:]))

            pred_submit.append(model.predict(x))


        for j in range(k_fold_train_num):  # write the first-level predictions into the newly built stacking training set
            b=index2[0]
    
            
            data_stacking_train.iloc[j+b,0]=pred[0][j]
            data_stacking_train.iloc[j+b,1]=pred[1][j]
            data_stacking_train.iloc[j+b,2]=pred[2][j]
            data_stacking_train.iloc[j+b,3]=pred[3][j]
            data_stacking_train.iloc[j+b,4]=pred[4][j]


        for k in range(len(x)):  # use the first-level models to generate features for the submission set
        
            data_stacking_submit.iloc[k,0]+=pred_submit[0][k]/k_fold
            data_stacking_submit.iloc[k,1]+=pred_submit[1][k]/k_fold
            data_stacking_submit.iloc[k,2]+=pred_submit[2][k]/k_fold
            data_stacking_submit.iloc[k,3]+=pred_submit[3][k]/k_fold
            data_stacking_submit.iloc[k,4]+=pred_submit[4][k]/k_fold
        print(i)

    ######### train the second-level model on data_stacking_train
    model_2=LassoCV()
    model_2.fit(data_stacking_train,y_train)

    return (model_2.predict(data_stacking_submit))
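# The loop above is the classic stacking recipe; scikit-learn's
# StackingRegressor implements the same idea (out-of-fold first-level
# predictions feeding a second-level LassoCV). A hedged equivalent sketch:
from sklearn.ensemble import (BaggingRegressor, GradientBoostingRegressor,
                              StackingRegressor)
from sklearn.linear_model import LassoCV, RidgeCV
from xgboost import XGBRegressor

stack = StackingRegressor(
    estimators=[
        ("lasso", LassoCV()),
        ("ridge", RidgeCV()),
        ("gbr", GradientBoostingRegressor(n_estimators=150, max_depth=3)),
        ("xgb", XGBRegressor(n_estimators=470, max_depth=2)),
        # `estimator` was called `base_estimator` before scikit-learn 1.2
        ("bag", BaggingRegressor(estimator=LassoCV(), n_estimators=50,
                                 max_samples=0.6, max_features=0.8)),
    ],
    final_estimator=LassoCV(),
    cv=5,
)
# stack.fit(x_train, y_train.values.ravel()); stack.predict(x)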
    # prepare features
    X_train = alldata.iloc[:train.shape[0], :]
    X_test = alldata.iloc[train.shape[0]:, :]

    # prepare target
    train['SalePrice_log'] = np.log1p(train['SalePrice'])
    y = train.SalePrice_log

    # model
    alphas = np.linspace(0.0001, 0.001, 100)
    cv = 5
    model_lasso = LassoCV(alphas=alphas, cv=cv)
    res = model_lasso.fit(X_train, y)
    score = cross_val_score(model_lasso, X_train, y, cv=cv).mean()
    coef = pd.Series(model_lasso.coef_, index=X_train.columns)
    print('Lasso has chosen alpha to be %f.' % res.alpha_)
    print('The cross validation score is %f.' % score)

    # plot the most significant features
    plot_import_vars(coef, 5)

    # prediction
    preds = np.expm1(model_lasso.predict(X_test))
    solution = pd.DataFrame({'id': test.Id, 'SalePrice': preds})
    solution.to_csv('house_price.csv', index=False)

# to do:
# fill missing value by distribution
# create new features
# more samples
Example #32
train_num = len(train)

all_data = pd.concat((train.loc[:,'MSSubClass':'SaleCondition'],
                      test.loc[:,'MSSubClass':'SaleCondition']))
#log transform the target:
train["SalePrice"] = np.log1p(train["SalePrice"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])
all_data = pd.get_dummies(all_data)
#filling NA's with the mean of the column:
all_data = all_data.fillna(all_data.mean())

X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice

model_lasso = LassoCV(alphas = [1, 0.1, 0.001, 0.0005]).fit(X_train, y)
print('generating file')
y_test_pred = model_lasso.predict(X_test)
submission = pd.DataFrame({"Id": test["Id"],"SalePrice": y_test_pred})
submission.loc[submission['SalePrice'] <= 0, 'SalePrice'] = 0
fileName = "submission_.csv"
submission.to_csv(fileName, index=False)
def Lasso_Mode(X_train, y_train, X_test, y_test, num_class):
    algo_name = 'Lasso Regression'
    lasso_model = LassoCV(alphas=[0.01, 0.05, 0.10, 0.20, 0.50, 1])
    lasso_model.fit(X_train, y_train)
    y_pred_lm = lasso_model.predict(X_test)
    PRAF(y_test, y_pred_lm, num_class, algo_name)
Example #34
df.index = range(1994, 2014)
df.loc[2014] = None
df.loc[2015] = None
l = ['x1', 'x2', 'x3', 'x4', 'x5', 'x7']
for i in l:
    f = GM11(df[i][list(range(1994, 2014))].values)[0]
    df.loc[2014, i] = f(len(df) - 1)  # .loc avoids pandas chained-assignment pitfalls
    df.loc[2015, i] = f(len(df))
    df[i] = df[i].round(2)

features = ['x1', 'x2', 'x3', 'x4', 'x5', 'x7']
train = df.loc[list(range(1994, 2014)), features + ['y']].copy()

scaler = StandardScaler()
train = scaler.fit_transform(train)
x_train = train[:, :-1]
y_train = train[:, -1]

model = Sequential()
model.add(Dense(12, input_shape=(6, )))
model.add(Activation('relu'))
model.add(Dense(1, input_shape=(12, )))
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, epochs=10000, batch_size=16)
model.save_weights('1-net.model')

x = (df[features] - scaler.mean_[:-1]) / scaler.scale_[:-1]
df['y_pred'] = model.predict(x) * scaler.scale_[-1] + scaler.mean_[-1]

df[['y', 'y_pred']].plot(subplots=True, style=['b-o', 'r-*'])
plt.show()
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.SalePrice
print(y)


def rmse_cv(model):
    rmse = np.sqrt(-cross_val_score(
        model, X_train, y, scoring="neg_mean_squared_error", cv=5))
    return (rmse)


model_ridge = Ridge()
alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
cv_ridge = [rmse_cv(Ridge(alpha=alpha)).mean() for alpha in alphas]
cv_ridge = pd.Series(cv_ridge, index=alphas)
cv_ridge.plot(title="Validation - Just Do It")
plt.xlabel("alpha")
plt.ylabel("rmse")

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)

rmse_cv(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index=X_train.columns)

preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})
preds["residuals"] = preds["true"] - preds["preds"]
preds = np.expm1(model_lasso.predict(X_test))
solution = pd.DataFrame({"id": test.Id, "SalePrice": preds})
solution.to_csv("ridge_sol.csv", index=False)
print('RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds)))

# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
print 'RMSE (Lasso reg.) =', np.sqrt(metrics.mean_squared_error(y_test, preds))

# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV
lascv = LassoCV(normalize=True, alphas=alpha_range)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
print('RMSE (Lasso CV reg.) =', np.sqrt(
    metrics.mean_squared_error(y_test, preds)))

###############################################################################
##### Regularization with Logistic Regression
###############################################################################

## TASK: Regularized classification
## FUNCTION: LogisticRegression
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Titanic (n=891, p=5 selected, type=classification)
## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data

########## Prepare data ##########
# Get and prepare data
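# A hedged sketch of the task the comment block above describes (the original
# data-preparation code is missing at this point in the snippet; the file name
# and feature choice here are assumptions):
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

titanic = pd.read_csv('titanic.csv')        # hypothetical local copy of the Kaggle train set
X = titanic[['Pclass', 'Parch']]
y = titanic['Survived']
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=1)
logreg = LogisticRegression(C=1.0)          # smaller C = stronger L2 penalty
logreg.fit(X_tr, y_tr)
print(logreg.score(X_te, y_te))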
watchlist = [(dtrain, 'train')]  # list of things to evaluate and print
gbm = xgb.train(params,
                dtrain,
                num_boost_round,
                evals=watchlist,
                early_stopping_rounds=early_stopping_rounds,
                verbose_eval=True)  # find the best score
x_pred = np.expm1(gbm.predict(dtest))

# In[188]:

elastic = ElasticNet(alpha=0.0005, l1_ratio=0.9)
elastic.fit(X_train, y)
elas_preds = np.expm1(elastic.predict(X_test))

# In[189]:

lasso_preds = np.expm1(lasso_model.predict(X_test))
final_result = 0.8 * lasso_preds + 0.2 * x_pred
#+0.1*elas_preds

solution = pd.DataFrame({
    "id": test.Id,
    "SalePrice": final_result
},
                        columns=['id', 'SalePrice'])
solution.to_csv("final_upload_11PM2.csv", index=False)

# In[ ]:
Example #38
# # Plot to compare the distributions of the predictions and the true values:
# plt.plot(pre[0:100],c='red',label="pre")
# plt.plot(data_test_6_label[0:100],c='black',label='true')
# plt.title("lasso pre and label distribute circumstance")
# plt.legend()
# plt.show()
# # 0.77679992833927
# # 9.687457422736461
## The predictions were too poor here, with almost no variation.

# Use lasso instead (alphas must be strictly positive, so the grid starts at
# the step size; a grid of two million alphas is also extremely slow to fit)
from sklearn.linear_model import LassoCV
model_lasso = LassoCV(alphas=np.arange(0.000001, 2, 0.000001)).fit(
    train, train_label)

y_redge = np.expm1(model_lasso.predict(test))
print(mean_absolute_error(data_test_6_label, y_redge))

# Plot
plt.plot(y_redge[0:100], c='red', label="pre")
plt.plot(data_test_6_label[0:100], c='black', label='true')
plt.title("lasso pre and label distribute circumstance")
plt.legend()
plt.show()
'''
The skew transform pays off: the lasso performs better, so we will use it to
predict on the test set. Another nice property of the lasso is that it does
feature selection for us, setting the coefficients of features it deems
unimportant to zero. Let's look at the coefficients:
'''
print(lasso.coef_)
print(lasso.score(X_test, y_test))

y_pred = lasso.predict(X_test)

mse_lasso = mean_squared_error(y_true=y_test, y_pred=y_pred)

num_alphas = 200
alphas = np.linspace(0.01, 10, num_alphas)
lasso_cv = LassoCV(alphas=alphas, cv=5, random_state=SEED)
lasso_cv.fit(X_train, y_train)

print(lasso_cv.intercept_)
print(lasso_cv.coef_)
print(lasso_cv.score(X_test, y_test))

y_pred = lasso_cv.predict(X_test)
mse_lasso_cv = mean_squared_error(y_true=y_test, y_pred=y_pred)

n_alphas = 200
ridge_alphas = np.logspace(-2, 6, n_alphas)
ridge_cv = RidgeCV(alphas=ridge_alphas, scoring="neg_mean_squared_error", cv=3)
ridge_cv.fit(X_train, y_train)

print(ridge_cv.intercept_)
print(ridge_cv.coef_)
print(ridge_cv.score(X_test, y_test))

y_pred = ridge_cv.predict(X_test)
mse_ridge_cv = mean_squared_error(y_true=y_test, y_pred=y_pred)
Example #40
#                        'hot',
#                        'frigid',
#                        'all_high_snow',
#                        'all_high_precip',
                        'cold'
                        ]
        X_total = store_df[columns_list]
        X_train = df_train[columns_list]
        X_test = df_test[columns_list]

        total_data = X_total.values
        train_data = X_train.values
        test_data = X_test.values
        regr = regr.fit( train_data[0::,1::], train_data[0::,0] )
        #print(regr.alpha_,store,item)
        prediction = regr.predict(test_data[0::,1::])
        prediction = np.maximum(prediction, 0.)
        prediction_total = regr.predict(total_data[0::,1::])
        prediction_total = np.maximum(prediction_total, 0.)
        total_series = pd.Series(prediction_total, unique_dates_int)

        rmse = np.sqrt(((test_data[0::,0] - prediction) ** 2).mean())
        se = ((test_data[0::,0] - prediction) ** 2).sum()
#        print(rmse,store,item)
        rmse_total = rmse_total + rmse
        se_total = se_total + se

#        plt.scatter(df_test.index,test_data[0::,0] - prediction)
#        plt.xlabel('date')
#        plt.xlim(0,1050)
#        plt.ylabel('truth - pred')
Example #41
#outliers_id = np.array([31, 463, 524, 633, 969, 971, 1299, 1325])
outliers_id = np.array([523,1298])


X_train = X_train.drop(outliers_id)
y = y.drop(outliers_id)        

def rmse_cv(model):
    rmse= np.sqrt(-cross_val_score(model, X_train, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)


#LASSO MODEL
clf1 = LassoCV(alphas = [1, 0.1, 0.001, 0.0005, 5e-4])
clf1.fit(X_train, y)
lasso_preds = np.expm1(clf1.predict(X_test))

#ELASTIC NET
clf2 = ElasticNet(alpha=0.0005, l1_ratio=0.9)
clf2.fit(X_train, y)
elas_preds = np.expm1(clf2.predict(X_test))

#XGBOOST
clf3=xgb.XGBRegressor(colsample_bytree=0.4,
                 gamma=0.045,                 
                 learning_rate=0.07,
                 max_depth=20,
                 min_child_weight=1.5,
                 n_estimators=300,                                                                    
                 reg_alpha=0.65,
                 reg_lambda=0.45,
Example #42
print('Condition:')  # the original passed uline=True, presumably to a custom print helper
for i in range(10):
    plt.plot(tcmax_condition.alphas_, tcmax_condition.mse_path_[:, i])
plt.xlim([max(tcmax_condition.alphas_), min(tcmax_condition.alphas_)])
plt.axvline(tcmax_condition.alpha_, c='r')
plt.xlabel('Alpha')
plt.ylabel('CVE')
plt.title('TcMax Condition')
plt.show()

## Check Heat vs Control
p = np.array(
    list(
        zip([
            'Heat'
            if i > np.mean(tcmax_condition.predict(test.T)) else 'Control'
            for i in tcmax_condition.predict(test.T)
        ], [
            samples['Condition'][i] for i in samples.index if i in test.columns
        ])))
print(p)
p = sum([i[0] == i[1] for i in p])

## Check empirical p-value for Heat vs Control
labels = [
    1 if samples['Condition'][i] == 'Heat' else 0 for i in samples.index
    if i in base.columns
]
dist = []
for i in range(50):
    np.random.shuffle(labels)  # in place
Example #43
rfRegressorPredicts = np.expm1(rfRegressor.predict(dataTest))
print('The rfRegressor achieves RMSE of ', rmseCV(rfRegressor).mean())

# Train the Lasso Regressor
from sklearn.linear_model import Lasso, LassoCV  # Ridge, RidgeCV, ElasticNet, LassoCV, LassoLarsCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.00075, 0.0005, 0.0004],
                cv=5).fit(dataTrain, targetTrain)
print('The amount of penalization in LASSO chosen by cross validation is',
      lasso.alpha_)
# Make it more robust to outliers using the sklearn's Robustscaler() method on pipeline
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
lasso.fit(dataTrain, targetTrain)
print('The lasso model achieves RMSE of ', rmseCV(lasso).mean())
lassoPredicts = np.expm1(lasso.predict(dataTest))

# Train the GradientBoostingRegressor (using huber loss for robustness to outliers)
gboost = GradientBoostingRegressor(n_estimators=3000,
                                   learning_rate=0.05,
                                   max_depth=4,
                                   max_features='sqrt',
                                   min_samples_leaf=15,
                                   min_samples_split=10,
                                   loss='huber',
                                   random_state=5)
gboost.fit(dataTrain, targetTrain)
gboostPredicts = np.expm1(gboost.predict(dataTest))
# print('The GradientBoostingRegressor achieves RMSE of ', rmseCV(gboost))

predictions = lassoPredicts  # 1.0/3.0*(rfRegressorPredicts + lassoPredicts + gboostPredicts)
Example #44
print("Try again for more precision with alphas centered around " + str(alpha))
mod_lasso = LassoCV(alphas=[
    alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
    alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
    alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
],
                    max_iter=50000,
                    cv=10)
mod_lasso.fit(X_train, y_train)
alpha = mod_lasso.alpha_
#print("Lasso RMSE on Test set :", rmse_cv_test(lasso).mean())
print("Best alpha :", alpha)
print("Lasso RMSE on Training set :", rmse_cv_train(mod_lasso).mean())

y_train_las = mod_lasso.predict(X_train)
#y_test_las = lasso.predict(X_test)

y_pred1 = mod_lasso.predict(X_train)
y_pred2 = mod_lasso.predict(X_test)
score1 = np.mean(
    np.abs((np.expm1(y_train) - np.expm1(y_pred1)) / np.expm1(y_train))) * 100
score2 = np.mean(
    np.abs((np.expm1(y_test) - np.expm1(y_pred2)) / np.expm1(y_test))) * 100
print("\nLASSO Model Report")
print("train {:.2f} | valid {:.2f}".format(float(score1), float(score2)))

# Plot residuals
plt.scatter(y_train_las,
            y_train_las - y_train,
            c="blue",
            marker="s",
            label="Training data")

model_lasso = LassoCV(alphas=[1, 0.1, 0.001, 0.0005]).fit(X_train, y)
print "lasso rmse: "
print rmse_cv(model_lasso).mean()

coef = pd.Series(model_lasso.coef_, index=X_train.columns)

print(
"Lasso picked " + str(sum(coef != 0)) + " variables and eliminated the other " + str(sum(coef == 0)) + " variables")

imp_coef = pd.concat([coef.sort_values().head(10), coef.sort_values().tail(10)])

imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

# Show the model's predictions
preds = pd.DataFrame({"preds": model_lasso.predict(X_train), "true": y})
preds["residuals"] = preds["true"] - preds["preds"]
preds.plot(x="preds", y="residuals", kind="scatter")
plt.show()

## Show the distribution of the estimation errors
diff = preds["true"] - preds["preds"]
diff.hist()
plt.show()





Example #46
        ana_ind_val = ana_clf.predict(X_val_2)
        X_train_2 = X_tr_2[msk]
        train_ind = ana_clf.predict(X_train_2)

        ## fit
        est1 = LassoCV(normalize=True,
                       random_state=0).fit(X_train_1_scaled[train_ind == 1],
                                           np.log1p(y_train[train_ind == 1]))
        est2 = GradientBoostingRegressor(random_state=0,
                                         n_estimators=500,
                                         subsample=0.9).fit(
                                             X_train_1_scaled,
                                             np.log1p(y_train))

        ## predict
        preds1 = np.expm1(est1.predict(X_val_1_scaled))
        preds2 = np.expm1(est2.predict(X_val_1_scaled))
        fin_preds = []

        ana_val_scores = ana_clf.decision_function(X_val_2)
        ana_val_ind = np.argsort(ana_val_scores)
        sorted_ana_val_scores = ana_val_scores[ana_val_ind]

        # get worst outliers
        ana_val_ind_worst = ana_val_ind[:3]
        print(sorted_ana_val_scores[:3])

        for idx in range(len(y_val)):
            if idx in ana_val_ind_worst:
                fin_preds.append(preds2[idx])  # robust GBR prediction for the flagged outliers
            else:
                fin_preds.append(preds1[idx])  # lasso prediction otherwise
Example #47
      " variables")

imp_coef = pd.concat(
    [coef.sort_values().head(10),
     coef.sort_values().tail(10)])
import matplotlib
matplotlib.rcParams['figure.figsize'] = (8.0, 10.0)
imp_coef.plot(kind="barh")
plt.title("Coefficients in the Lasso Model")
plt.show()

#let's look at the residuals as well:
matplotlib.rcParams['figure.figsize'] = (6.0, 6.0)

preds = pd.DataFrame({
    "preds": model_lasso.predict(train),
    "true": train_label
})

preds["residuals"] = preds["true"] - preds["preds"]
print(preds.head())
preds.plot(x="preds",
           y="true",
           kind="scatter",
           title="preds-true-distribute-lasso2,xgboost"
           )  # when plotting directly from a pandas DataFrame, passing the column labels is enough
plt.show()

import xgboost as xgb

dtrain = xgb.DMatrix(train, label=train_label)
#split the data into train(0.75) and test(0.25)
from sklearn.model_selection import train_test_split
train, test = train_test_split(movie_df, test_size = 0.25)

#fit lasso CV model
#predictor_list is the list of variables that we want to use for modelling
#here I get it from the column names -predictors and movie title. you
#can create your own directly from the name list
predictor_list = list(train.columns.values)
predictor_list.remove('domestic_gross')
predictor_list.remove('tomatoRating')
predictor_list.remove('new_title')

#fit the lasso model, which performs variable selection and estimation in one step.
#if you want to see which variables matter, there is an easy way:
#just look at the coefficients; a nonzero coefficient means the variable is used for prediction

from sklearn.linear_model import LassoCV
clf = LassoCV(cv=20).fit(train[predictor_list],train.domestic_gross)
Y_pred = clf.predict(test[predictor_list])

print(clf.coef_)
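
# to list the surviving predictors by name (a small helper sketch built on the
# predictor_list defined above):
selected = [name for name, c in zip(predictor_list, clf.coef_) if c != 0]
print("predictors kept by the lasso:", selected)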

#calculate the mean_squared_error
from sklearn.metrics import mean_squared_error
print(mean_squared_error(Y_pred, test.domestic_gross))
lasso.fit(X, y)
ridge.fit(X, y)

lr.coef_.astype(str)

lasso.coef_

ridge.coef_
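
# The coefficient vectors tell the regularization story: L1 zeroes most
# polynomial terms outright, while L2 only shrinks them. A quick side-by-side
# view (a sketch; assumes the three fitted models above share the same features):
import pandas as pd
coef_table = pd.DataFrame({"no_reg": lr.coef_.ravel(),
                           "lasso_L1": lasso.coef_.ravel(),
                           "ridge_L2": ridge.coef_.ravel()})
print(coef_table)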

x_pred = np.linspace(0, 1, 100)
X_pred = to_polynomial(x_pred)

#plt.plot(x, y_real, '--', alpha=0.5, label='Real function')
plt.scatter(x, y, s=100, c='g', label='Data points')
plt.plot(x_pred, lr.predict(X_pred), c='red', label='Prediction - No regularization')
plt.plot(x_pred, lasso.predict(X_pred), c='blue',  alpha=0.5, label='Prediction - L1 regularization')
plt.plot(x_pred, ridge.predict(X_pred), c='purple', alpha=0.5,  label='Prediction - L2 regularization')
_ = plt.legend(loc='best')

"""# Impact of different levels of regularization."""

from sklearn.linear_model import Ridge
import seaborn as sns

x_pred = np.linspace(0, 1, 100)
X_pred = to_polynomial(x_pred)

plt.scatter(x, y, s=100, c='g', label='Data points')
for i, alpha in enumerate([1.0, 0.01, 0.0001, 0.]):
  ridge = Ridge(alpha=alpha)
  ridge.fit(X, y)
Example #50
class MovieTrainer(object):
    
    def __init__(self,training_file,test_file):
        self._training_pickle=training_file
        self._test_pickle=test_file
        
        #to be defined later
        self._list_of_dicts=None
        self._dataframe=None
        self._features=None
        self._test_features=None
        self._labels=None
        self._clf=None
        
        self._training_frame=None
        self._test_frame=None
        self._prediction_frame=None
        
        #dicts
        self._actor_dict=None
        self._director_dict=None
        self._genre_dict=None
        self._production_house=None
        
    
    def _load_dataframe(self):
        if os.path.isfile(self._training_pickle) ==True:
            self._training_dict=pickle.load(file(self._training_pickle))
        else:
            raise AttributeError("Cannot find pickle file:%s"%self._training_pickle)
        
        if os.path.isfile(self._test_pickle) ==True:
            self._test_dict=pickle.load(file(self._test_pickle))
        else:
           raise AttributeError("Cannot find pickle file:%s"%self._test_pickle)
          
         #load pandas frame
         
        self._training_frame=pd.DataFrame(self._training_dict)
        self._test_frame=pd.DataFrame(self._test_dict)
        
        #drop movies with no names 
        self._training_frame.dropna(subset=["moviename"])
        self._test_frame.dropna(subset=["moviename"])
        return
            #raise error?
    
    def _addtodict(self,name,this_dict):
        if this_dict.has_key(name):
            this_dict[name]+=1
        else:
            this_dict[name]=1
        return
        
    def _modify_string(self,playername):
        playername = re.sub('^\s+|\s+$','', playername)
        playername=re.sub('\s+','_',playername)
        playername=re.sub('\*','',playername)
        return playername
    
    #this function creates a list of features
    #corresponding to the most frequent actors 
    #in a movie
    def _create_playerdict(self,frame,colname,num_features):
        
        playerdict={}
        
        
        for index in frame.index:
            #for each row, we have list of actors
            #like ['Sandra Bullock', 'Melissa McCarthy']
            playerlist=frame.ix[index,colname] 
            
            
            if type(playerlist)!=float:
                #only actors have multiple list members, other players
                #like director don't
                if colname=="actors":
                    for playername in playerlist:
                        #remove spaces, *, leading trailing spaces
                        playername=self._modify_string(playername)
                        self._addtodict(playername,playerdict)

                else:
                    playerlist=self._modify_string(playerlist)
                    self._addtodict(playerlist,playerdict)   
        
        
        
        counter=0
        feature_list=[]
        #sort the dict to get players with highest number of movies
        for key,value in sorted(playerdict.items(),key=lambda x:x[1],reverse=True):
            #print key,value
            feature_list.append(key)
            counter+=1
            if counter>=num_features: #keep exactly num_features names
                break
        return feature_list
    
    #this function returns a value of the player features for 
    #each movie
    
    def _create_player_features(self,frame,colname,num_features):
        #feature_list is all names of players with most movies
        feature_list=self._create_playerdict(frame,colname,num_features)
        actor_frame = pd.DataFrame()
        
        for player in feature_list:
            feature_name=colname+":"+player
            actor_frame[feature_name]=pd.Series(0,index=frame.index)
            bigplayer_name="feature:big_"+colname #TODO: take out of loop
            actor_frame[bigplayer_name]=pd.Series(0,index=frame.index)#big actors directors present or not?
            
        for index in frame.index:
            playerval=frame.ix[index,colname]
            if type(playerval)!=float: #playerval is not None
                if colname=="actors":
                    for actor in playerval:
                        actor=self._modify_string(actor)
                        if actor in feature_list:
                            thisfeature=colname+":"+actor
                            actor_frame.loc[index,thisfeature]=1
                else:
                    playerval=self._modify_string(playerval)
                    if playerval in feature_list:
                        thisfeature=colname+":"+playerval
                        actor_frame.loc[index,thisfeature]=1
                actor_frame.loc[index,bigplayer_name]=1
            else:
                actor_frame.loc[index,bigplayer_name]=0
                        
        return actor_frame
                        
                
        
    def _create_theater_features(self,frame):
        
        #add feature column
        theater_frame=pd.DataFrame()
        theater_frame["feature:num_theaters"]=pd.Series(0,index=frame.index)
        
        
        for index in frame.index:
            theater_list=frame.ix[index,"theater_list"]
            if type(theater_list)==list and len(theater_list)>0:
                theater=theater_list[0]
                theater=re.sub(',','',theater)
                if re.search('\d+',theater) is not None:
                    theater_frame.loc[index,"feature:num_theaters"]=int(theater)
                else:
                    theater_frame.loc[index,"feature:num_theaters"]=0
            else:
                theater_frame.loc[index,"feature:num_theaters"]=0
        
        return theater_frame
        
    def _first_weekend_rank(self,frame):
        #todo: try to merge with create theater features
        weekend_frame = pd.DataFrame()
        weekend_frame["feature:rank"]=pd.Series(0,index=frame.index)
        for index in frame.index:
            rank_list=frame.ix[index,"rank_list"]
            if type(rank_list)==list and len(rank_list)>0:
                rank=rank_list[0]
                rank=re.sub(',','',rank)
                if re.search('\d+',rank) is not None:
                    weekend_frame.loc[index,"feature:rank"]=int(rank)
                else:
                    weekend_frame.loc[index,"feature:rank"]=1000#some large number? or zero?
            else:
                weekend_frame.loc[index,"feature:rank"]=1000
        
        return weekend_frame
    
    def _create_running_time_feature(self,frame):
        runtime_frame = pd.DataFrame()
        runtime_frame["feature:runtime"]=pd.Series(0,index=frame.index)
        
        for index in frame.index:
            running_time=frame.ix[index,"runtime"]
            if type(running_time)!= float: #not NaN
                pattern='(\d+).+\s(\d+)'
                hrmin=re.match(pattern,running_time)
                if hrmin is not None:
                    hrs=hrmin.group(1)
                    mins=hrmin.group(2)
                    tot_time=int(hrs)*60+int(mins)
                    runtime_frame.loc[index,"feature:runtime"]=tot_time
                    
                else:
                    runtime_frame.loc[index,"feature:runtime"]=0
                    
            else:
                runtime_frame.loc[index,"feature:runtime"]=0
            
        return runtime_frame
                
    
    def _create_release_date_feature(self,frame):
        monthlist=["January","February","March","April","May","June"\
                "July","August","September","October","November","December"]
        month_frame = pd.DataFrame()
        for month in monthlist:
            feature_name="feature:release_"+month
            month_frame[feature_name]=pd.Series(0,index=frame.index) 
        
        for index in frame.index:
            release_date=frame.ix[index,"release_date"]
            if type(release_date)!=float:
                pattern='(\S+)\s(\d+)'
                monthday=re.match(pattern,release_date)
                if monthday is not None:
                    month=monthday.group(1)
                    day=monthday.group(2)
                    
                    if month in monthlist:
                        thisfeature="feature:release_"+month
                        month_frame.loc[index,thisfeature]=1
        return month_frame
                        
                       
    
    def _extract_features(self,frame,isTraining=True):
        """
        extracts features from training and test frame
        all major data munging, cleaning takes place here
        
        """ 
        #pass
        #we will make clean_frame as the data frame, 
        #then we will define the training/test frame
        #and add each feature as a dataframe
        #and finally concatenate the features
    
                
        #check if labels exist for these movies 
        clean_data=frame[pd.notnull(frame["domestic_gross"])]
        
        list_of_frames=[]
        
        #no of theaters it opened at in the first week
        #keep this as first feature so that you can plot using this
        list_of_frames.append( self._create_theater_features(clean_data) )
        print "Created Theater Feature..."
        list_of_frames.append( self._first_weekend_rank(clean_data) )
        print "Created Rank Feature..."
        list_of_frames.append( self._create_running_time_feature(clean_data) )
        print "Created running time Feature..."
        list_of_frames.append( self._create_release_date_feature(clean_data) )
        print "Created release date Feature..."
        #create player features
        list_of_frames.append( \
                    self._create_player_features(clean_data,"actors",5) )
        list_of_frames.append( \
                    self._create_player_features(clean_data,"director",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"distributor",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"genre_toplist",5) )
        list_of_frames.append( 
                    self._create_player_features(clean_data,"mpaa_rating",5) )
        print "Created player Features..."
        
        #check dataframe shapes
        for frames in list_of_frames:
            assert frames.shape[0] == clean_data.shape[0]
            
        
        #concatenate the dataframes
        final_frame = pd.concat(list_of_frames,axis = 1)
        
        final_frame.to_csv("Training/training_frame.csv")
        
        #get training labels
        if isTraining == True:
            labels_arr=self._extract_labels(clean_data)
        else:
            prediction_frame=clean_data[["moviename","genre_toplist","actors"]]
       

        n_samples=len(final_frame.index)
        n_features=len(final_frame.columns)
        
        #from Dataframe to numpy array
        feature_arr=final_frame.values.reshape(n_samples,n_features)
        
        print "Created All Features....."
        
        if isTraining is True:
            return feature_arr,labels_arr
        else:
            return feature_arr,prediction_frame

        
        #plt.plot(theater_arr,self._clf.predict(theater_arr),'r-',linewidth=2)
        
        #plt.show()
        return
    
    def _extract_labels(self,frame):
        
        df_Y=frame["domestic_gross"].values
        gross_list=df_Y.tolist()
        for i in range(len(gross_list)):
            gross_list[i]=int(gross_list[i])
        
        max_gross=np.max(gross_list)
        #print max_gross
        gross_list=[float(x)/max_gross for x in gross_list] #float division; int/int truncates in Python 2
        n_samples=len(gross_list)
        gross_arr=np.array(gross_list).reshape(n_samples,1)
        
        return gross_arr
        
    def _get_top_actors(self,actorlist):
        top_actors=[None,None,None]
        if type(actorlist) ==float:
            return top_actors
        
        counter=0
        for actor in actorlist:
            top_actors[counter]=self._modify_string(actor)
            counter+=1
            if counter==3: #fill all three slots
                break
        return top_actors
                
                
             
        
    def explore_data(self):
        """
        plots and prints various kinds of stuff to test out the data
        change, comment and uncomment here directly
        """
        if self._training_frame is None:
            self._load_dataframe()
        
        
        #col_list.remove('actors')
        #print col_list
        #self._training_frame.drop(col_list,axis=1,inplace=True)
        
        #print self._training_frame.ix[500:510]
        #print len(self._training_frame.index)
        #only_budget=self._training_frame[pd.isnull(self._training_frame["domestic_gross"])]
        #print len(only_budget.index)
        
        #actors_there=self._training_frame[pd.notnull(self._training_frame["actors"])]
        #print len(actors_there.index)
        #print actors_there.head()
        
        #director_there=self._training_frame[pd.notnull(self._training_frame["director"])]
        #print len(director_there.index)
        #print director_there.head()
        pass
    
    def top_5_genres(self):
        if self._training_frame is None:
            self._load_dataframe()
        genre_list=self._create_playerdict(self._training_frame,"genre_toplist",5)
        print genre_list
        
    
    
    def train_2013(self):
        #pass
        self._load_dataframe()
        self._training_frame.to_csv("Training/raw_frame.csv")
        total_features,total_labels=self._extract_features(self._training_frame,isTraining=True)
        total_labels=np.ravel(total_labels)
        
        print type(total_features)
        print type(total_labels)

                                       
        
        
        #create train and test split
        self._features, test_features, self._labels, test_labels =\
            train_test_split(total_features, total_labels, test_size = 0.33)
        
        
        
        print self._features.shape
        print self._labels.shape
        print test_features.shape
        print test_labels.shape
        
        cv_outer = KFold(self._labels.shape[0],n_folds=5)
        self._clf = LassoCV(eps=0.01, n_alphas=10,cv =5)
        cross_val_arr=cross_val_score(self._clf,self._features,self._labels,cv=cv_outer)
        print "Finished Training....."
        
        r_sq=np.mean(cross_val_arr)
        print "R Square for training set: ",r_sq
        
        self._clf.fit(self._features,self._labels)
        plt.plot(test_labels, self._clf.predict(test_features),'ro',linewidth=2)
        plt.plot(np.arange(0,1.,.1),np.arange(0,1.,.1),'b-',linewidth=2)
        plt.xlabel("Actual Gross")
        plt.ylabel("Predicted Gross")
        plt.show()
        
    
    def test_2014(self):
        #check if already trained
        if self._clf is None:
            self.train_2013()
        
        print "Generating Test Features..."
        self._test_features,self._prediction_frame=self._extract_features(\
                                           self._test_frame,isTraining=False)
        
        self._prediction_frame["prediction"]=self._clf.predict(self._test_features)
        print "Finished Testing..."
                
        #sanity check and normalize
        self._prediction_frame["prediction"]=self._prediction_frame["prediction"].apply(\
                                             lambda x: 0 if x<0 else x)
        maxpred=self._prediction_frame["prediction"].max()
        if maxpred>1:
            self._prediction_frame["prediction"]=self._prediction_frame["prediction"].apply(\
                                                 lambda x: x/maxpred)
            
        
        
        print self._prediction_frame.head()
        
        
        
        
    
    def save_db(self,filename):
        con=sqlite3.connect(filename)
        cursor=con.cursor()
        cursor.execute('DROP TABLE IF EXISTS currentmovies')
        cursor.execute('CREATE TABLE currentmovies(\
                                     moviename VARCHAR(255) ,\
                                     genre VARCHAR(255),\
                                     prediction INT,\
                                     actor1 VARCHAR(255),\
                                     actor2 VARCHAR(255),\
                                     actor3 VARCHAR(255))')
        
        for index in self._prediction_frame.index:
            movname=self._prediction_frame.ix[index]["moviename"]
            #skip missing movie names before encoding; a float NaN has no .encode()
            if type(movname)==float and math.isnan(movname)==True:
                continue
            movname=movname.encode('utf-8')
            pred=self._prediction_frame.ix[index]["prediction"]
            genre=self._prediction_frame.ix[index]["genre_toplist"].encode('utf-8')
            (actor1,actor2,actor3)=self._get_top_actors(self._prediction_frame.ix[index]["actors"])
            print movname,genre,pred
            
            cursor.execute('INSERT INTO currentmovies\
                             VALUES(?,?,?,?,?,?)',(movname,genre,pred,actor1,actor2,actor3))
        
        con.commit()
        con.close()
# try a smaller alpha
las = Lasso(alpha=0.0001, normalize=True)
las.fit(X_train, y_train)
las.coef_
preds = las.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, preds))

# use LassoCV to select best alpha (tries 100 alphas by default)
from sklearn.linear_model import LassoCV

lascv = LassoCV(normalize=True)
lascv.fit(X_train, y_train)
lascv.alpha_
lascv.coef_
preds = lascv.predict(X_test)
np.sqrt(metrics.mean_squared_error(y_test, preds))


## TASK: Regularized classification
## FUNCTION: LogisticRegression
## DOCUMENTATION: http://scikit-learn.org/stable/modules/linear_model.html
## DATA: Titanic (n=891, p=5 selected, type=classification)
## DATA DICTIONARY: https://www.kaggle.com/c/titanic-gettingStarted/data

# define X and y
feature_cols = ["pclass", "sex", "age", "embarked_Q", "embarked_S"]
X = titanic[feature_cols]
y = titanic.survived

# split into train/test
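# the split call itself is cut off here; a typical version (an assumption,
# not the original author's line) would be:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)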
Example #52
                clf.fit_cv(X_train, Y_train, [(X_cv, Y_cv)])
            else:
                clf.fit(X_train, Y_train)

            one_result = clf.predict(X_cv)
            blend_train[cv_index, j] = one_result
            cv_score = gini_normalized(Y_cv, blend_train[cv_index, j])
            cv_results[j, i] = cv_score
            score_mae = metrics.mean_absolute_error(Y_cv, one_result)
            print ('Fold [%s] norm. Gini = %0.5f, MAE = %0.5f' % (i, cv_score, score_mae))
            blend_test_j[:, i] = clf.predict(X_test)
        blend_test[:, j] = blend_test_j.mean(1)
        print ('Clf_%d Mean norm. Gini = %0.5f (%0.5f)' % (j, cv_results[j,].mean(), cv_results[j,].std()))

    end_time = datetime.now()
    time_taken = (end_time - start_time)
    print ("Time taken for pre-blending calculations: {0}".format(time_taken))
    print ("CV-Results", cv_results)
    print ("Blending models.")

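    # A positive-constrained LassoCV as the meta-learner: it learns sparse,
    # non-negative blending weights over the base models' out-of-fold
    # predictions, so weak or redundant level-0 models can be weighted to zero.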
    bclf = LassoCV(n_alphas=100, alphas=None, normalize=True, cv=5, fit_intercept=True, max_iter=10000, positive=True)
    bclf.fit(blend_train, Y_dev)

    Y_test_predict = bclf.predict(blend_test)

    cv_score = cv_results.mean()
    print ('Avg. CV-Score = %s' % (cv_score))
    submission = pd.DataFrame({"Id": test_ids, "Hazard": Y_test_predict})
    submission = submission.set_index('Id')
    submission.to_csv("farons_solution.csv")
Example #53
my_l = my_l[:-1]

my_list_arr = np.array(my_l)

X_train = df.values[:1495, :126]
Y_train = df.values[:1495, 126:]

X_test = df.values[1495:, :126]
Y_test = df.values[1495:, 126:]
arr = np.linspace(0.001, 100.1, 3000)
print(arr)

model = LassoCV(alphas=arr, cv=5)
model.fit(X_train, Y_train.ravel())  # LassoCV expects a 1-D target

preds = model.predict(X_test)
error = mean_squared_error(Y_test, preds)

print("Score:", model.score(X_test, Y_test))
print("Test Error:", error)

print("penalty", model.alpha_)

final_features = model.coef_
# print("Coef_path:",model.coef_)
# print(final_features.shape)
print(type(final_features))
temp = 0
for i in range(0, 126):
    if (final_features[i] != 0):
        print(my_list_arr[i], ":", final_features[i])
coef_path_lasso_cv.fit(X,y)
coef_path_binary_x_logistic_cv.fit(binary_X,binary_y)
coef_path_logistic_cv.fit(X,binary_y)
coef_path_elastic_cv.fit(X,y)

forest_cv_score = cross_validation.cross_val_score(coef_path_forest_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
lasso_cv_score = cross_validation.cross_val_score(coef_path_lasso_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
elastic_cv_score = cross_validation.cross_val_score(coef_path_elastic_cv, X, y, n_jobs=2, cv=CV, scoring=Scoring)
logistic_cv_score = cross_validation.cross_val_score(coef_path_logistic_cv, X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')
binary_x_logistic_cv_score = cross_validation.cross_val_score(coef_path_binary_x_logistic_cv, binary_X, binary_y, n_jobs=2, cv=CV, scoring='roc_auc')

forest_results_parameters = [ coef_path_forest_cv.predict(X), coef_path_forest_cv.get_params, coef_path_forest_cv.feature_importances_, 
				coef_path_forest_cv.classes_, coef_path_forest_cv.n_classes_]
forest_scores = [forest_cv_score, classification_report(binary_y, forest_results_parameters[0]), 'forest']

lasso_results_parameters = [coef_path_lasso_cv.predict(X), coef_path_lasso_cv.get_params, coef_path_lasso_cv.alphas_, coef_path_lasso_cv.coef_]  

lasso_scores = [lasso_cv_score, r2_score(y,lasso_results_parameters[0]), 'lasso']

elastic_results_parameters = [ coef_path_elastic_cv.predict(X), coef_path_elastic_cv.get_params, coef_path_elastic_cv.alphas_ ,
				coef_path_elastic_cv.coef_]
elastic_scores = [elastic_cv_score, r2_score(y,elastic_results_parameters[0]), 'elastic']

logistic_results_parameters = [coef_path_logistic_cv.predict(X), coef_path_logistic_cv.get_params, coef_path_logistic_cv.coef_]

logistic_scores = [logistic_cv_score, classification_report(binary_y, logistic_results_parameters[0]), 'logistic']

binary_x_logistic_results_parameters = [coef_path_binary_x_logistic_cv.predict(X), coef_path_binary_x_logistic_cv.get_params, coef_path_binary_x_logistic_cv.coef_]

binary_x_logistic_scores = [binary_x_logistic_cv_score, classification_report(binary_y, binary_x_logistic_results_parameters[0]), 'binary_logistic']
Example #55
lasso = LassoCV(alphas=[
    0.0001, 0.0003, 0.0006, 0.001, 0.003, 0.006, 0.01, 0.03, 0.06, 0.1, 0.3,
    0.6, 1
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y)
alpha = lasso.alpha_
print "Best alpha :", alpha

print "Trying alphas centered around " + str(alpha)
lasso = LassoCV(alphas=[
    alpha * .6, alpha * .65, alpha * .7, alpha * .75, alpha * .8, alpha * .85,
    alpha * .9, alpha * .95, alpha, alpha * 1.05, alpha * 1.1, alpha * 1.15,
    alpha * 1.25, alpha * 1.3, alpha * 1.35, alpha * 1.4
],
                max_iter=50000,
                cv=10)
lasso.fit(X_train, y)
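
# The coarse-then-fine alpha search above generalizes to a small helper
# (a sketch under the same assumptions as the surrounding code):
from sklearn.linear_model import LassoCV

def refine_alpha(X, y, coarse_alphas, cv=10, max_iter=50000):
    best = LassoCV(alphas=coarse_alphas, max_iter=max_iter, cv=cv).fit(X, y).alpha_
    fine = [best * f for f in (0.6, 0.7, 0.8, 0.9, 0.95, 1.0, 1.05, 1.1, 1.25, 1.4)]
    return LassoCV(alphas=fine, max_iter=max_iter, cv=cv).fit(X, y)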

lasso_pred1 = np.expm1(lasso.predict(X_test))

#----------------------------Take weighted sum of lasso predictions and XGB predictions.------------#
final1234 = 0.7 * lasso_pred1 + 0.3 * xgb_p

# final1234

f_sub = pd.DataFrame({"id": test.Id, "SalePrice": final1234})

f_sub.to_csv("Sol_1234.csv", index=False)
msg("Fitting!")

weights = np.ones(train.shape[0])

do_statsmodels=True
if do_statsmodels:
    ols = sm.wls(formula=formula, data=train, weights=weights).fit()
    print ols.summary()
    msg("Making predictions for all playergames")
    yy_df['ols_prediction'] = ols.predict(yy_df)
else:
    ols_lr = LassoCV(n_jobs=-1, verbose=True)
    X = train[rhs_cols]
    y = train['elo']
    ols_lr.fit(X,y)
    yy_df['ols_prediction'] = ols_lr.predict(X)

yy_df['ols_error'] = (yy_df['ols_prediction'] - yy_df['elo']).abs()
yy_df['training'] = (yy_df['gamenum'] % 3)
insample_scores = yy_df.groupby('training')['ols_error'].agg({'mean' : np.mean, 'median' : np.median, 'stdev': np.std})
print insample_scores

msg("Error summary by ELO:")
elo_centuries = cut(yy_df['elo'], 20)
print yy_df.groupby(elo_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean})

msg("Error summary by gamenum:")
gamenum_centuries = cut(yy_df['gamenum'], 20)
print yy_df.groupby(gamenum_centuries)['ols_error'].agg({'sum': np.sum, 'count': len, 'mean': np.mean})

msg("Writing yy_df back out with ols predictions inside")
#############################################################################################################
# 2. Lasso
#############################################################################################################
# The parameter alpha controls the strength of the L1 penalty: larger alpha means
# heavier shrinkage, and alpha = 0 reduces to ordinary linear regression.
# LassoCV iterates through many alphas and does the CV internally. Lasso can also be used
# for dimensionality reduction, since it is able to set coefficients exactly to 0.
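
# A quick standalone illustration of the sparsity claim above (not part of the
# original pipeline): larger alpha zeroes out more coefficients.
import numpy as np
from sklearn.linear_model import Lasso
rng = np.random.RandomState(0)
X_demo = rng.randn(100, 20)
y_demo = 3.0 * X_demo[:, 0] + 0.1 * rng.randn(100)
for a in [0.001, 0.1, 1.0]:
    n_nonzero = np.sum(Lasso(alpha=a).fit(X_demo, y_demo).coef_ != 0)
    print("alpha=%g -> %d nonzero coefficients" % (a, n_nonzero))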
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

lasso = LassoCV(alphas=np.logspace(-10, 10, 10), normalize=True, cv=10, positive=False)  
lasso.fit(xtrain, ytrain)

# Train dataset performance
lasso_train_pred = lasso.predict(xtrain)
lasso_train_r2 = r2_score(ytrain, lasso_train_pred)
lasso_train_error = np.sqrt(mean_squared_error(ytrain, lasso_train_pred))

# Test dataset performance
lasso_test_pred = lasso.predict(xtest)
lasso_test_r2 = r2_score(ytest, lasso_test_pred)
lasso_test_error = np.sqrt(mean_squared_error(ytest, lasso_test_pred))

# Build coefficients table
from pandas import DataFrame
lassocoeff = DataFrame(data.columns, columns = ['Features'])
lassocoeff['Coefficients'] = lasso.coef_

print 'LASSO  ------------------------------------------------------------------------'
print '\nThe alpha (L1) level selected: {}'.format(lasso.alpha_)