Example #1
def main():
    start = time.time()

    print "Reading train data and its features from: " + train_file
    data = cu.get_dataframe(train_file)
    global fea
    fea = features.extract_features(feature_names, data)

    # l1_ratio was called 'rho' in older scikit-learn releases; normalize=False
    # was the default and that argument has since been removed
    mten = MultiTaskElasticNet(alpha=0.1,
                               l1_ratio=0.5,
                               fit_intercept=True,
                               copy_X=True,
                               max_iter=1000,
                               tol=0.0001,
                               warm_start=False)

    X = []
    for i in data["OwnerUndeletedAnswerCountAtPostTime"]:
        X.append([i])
    # Must be an array-like object. Strings must be converted to
    # integer values, otherwise the fit method raises ValueError
    global y
    y = []

    print "Collecting statuses"

    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print "Fitting"
    mten.fit(fea, y)
    '''Make sure you have the up to date version of sklearn; v0.12 has the
           predict_proba method; http://scikit-learn.org/0.11/install.html '''

    print "Reading test data and features"
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print "Making predictions"
    global probs
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    probs = np.resize(probs, (len(probs) / 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print "writing submission to " + submission_file
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print "completed in %0.4f seconds" % (finish - start)
Example #2
def test_enet_float_precision():
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                clf = ElasticNet(alpha=0.5,
                                 max_iter=100,
                                 precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)

                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert clf.coef_.dtype == dtype

                # test precompute Gram array
                Gram = X.T.dot(X)
                clf_precompute = ElasticNet(alpha=0.5,
                                            max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X, y)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5,
                    max_iter=100,
                    fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                assert clf_multioutput.coef_.dtype == dtype  # the multi-task estimator should preserve dtype too

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)
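Outside the test harness, the same float32/float64 agreement can be checked standalone; a minimal sketch (make_regression and the tolerance are illustrative choices, not part of the original test):

import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import MultiTaskElasticNet

X, y = make_regression(n_samples=50, n_features=10, n_targets=2, random_state=0)
coefs = {}
for dtype in (np.float64, np.float32):
    est = MultiTaskElasticNet(alpha=0.5, max_iter=1000)
    est.fit(X.astype(dtype), y.astype(dtype))
    coefs[dtype] = est.coef_.astype(np.float64)
# the two precisions should agree to a few decimal places
print(np.allclose(coefs[np.float32], coefs[np.float64], atol=1e-4))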
Example #4
class MultiTaskElasticNetImpl:
    def __init__(self, **hyperparams):
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        if y is not None:
            self._wrapped_model.fit(X, y)
        else:
            self._wrapped_model.fit(X)
        return self

    def predict(self, X):
        return self._wrapped_model.predict(X)
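The wrapper simply delegates to Op; a minimal usage sketch, assuming Op is bound to sklearn's MultiTaskElasticNet (the binding is not shown in the snippet, and the import must precede the class definition in a standalone script):

import numpy as np
from sklearn.linear_model import MultiTaskElasticNet as Op  # assumed binding for Op

impl = MultiTaskElasticNetImpl(alpha=0.1, l1_ratio=0.5)
X = np.random.rand(20, 3)
Y = np.random.rand(20, 2)
impl.fit(X, Y)
print(impl.predict(X).shape)  # (20, 2)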
Example #5
    def mtelastic_model(self, X_train, y_train, X_test, y_test):
        # Multi-task Elastic-Net Regression Model

        mten_model = MultiTaskElasticNet(alpha=.1918)

        mten_model.fit(X_train, y_train)

        y_train_pred = mten_model.predict(X_train)
        y_test_pred = mten_model.predict(X_test)

        # To score the model, use .score from sklearn (which returns R^2 for
        # regressors) or compute MSE and R^2 directly, as below
        print(mten_model.score(X_train, y_train))
        print(mten_model.score(X_test, y_test))
        print('MSE train: %.6f, MSE test: %.6f' % (mean_squared_error(
            y_train, y_train_pred), mean_squared_error(y_test, y_test_pred)))
        print('R^2 train: %.6f, R^2 test: %.6f' %
              (r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)))
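For regressors, .score is the R^2 of the predictions, so the score and r2_score printouts above should agree; a minimal check on synthetic data (shapes are illustrative):

import numpy as np
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.metrics import r2_score

rng = np.random.RandomState(0)
X, Y = rng.rand(50, 4), rng.rand(50, 2)
m = MultiTaskElasticNet(alpha=0.1918).fit(X, Y)
# both default to uniform averaging over the output columns
print(np.isclose(m.score(X, Y), r2_score(Y, m.predict(X))))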
Example #6
def train_metaregressor(stack_path, train, labels, run_sequence, scale_data, models, predict_mode_all, full=True, verbose=False):

    if full: model_suffix = "_30"
    else: model_suffix = "_8"

    print("".join(["\n", "=" * 50, "".join(["\nTraining Metaregressor", model_suffix, " (Level 2)\n"]), "=" * 50, "\n"]))

    from sklearn.base import clone  # used below to avoid refitting one shared estimator

    # Model definition for metaregressor
    if predict_mode_all:
        model = MultiTaskElasticNet(random_state = 42, max_iter = 1000, l1_ratio = 1.0, alpha = 0.1)
    else:
        model = ElasticNet(random_state = 42, max_iter = 1000, l1_ratio = 1.0, alpha = 0.1)
    
    print('Training linear metaregressors for %d models and %d total independent variables.\n' % (len(models), train.shape[1]))
    
    reg_models, rmse = [], []
    if predict_mode_all:
        print("// MODE: All-in-One Pass //\n")
        model.fit(train.values, labels.values)
        rmse = [np.sqrt(mean_squared_error(y_true = labels.values, y_pred = model.predict(train.values)))]
        reg_models.append(model)
    else:
        print("// MODE: One-at-a-Time //\n")
        # iterate and build a model over all dependent variables (30)
        for f in range(len(TRAIN_COLS)):
            # get the list of values to predict, column-wise
            predict_me = labels.values[:,f]
            # build the list of independent variables 
            for i in range((0+f), ((30 * len(models)) + f), 30):
                if i == 0+f:
                    train_me = train.values[:,i].reshape(-1, 1)
                else:
                    train_me = np.hstack((train_me, train.values[:,i].reshape(-1, 1)))
            # fit a fresh clone and store it; refitting the shared `model` object
            # would leave reg_models holding N references to the last fit only
            fitted = clone(model)
            fitted.fit(train_me, predict_me)
            reg_models.append(fitted)
            score = np.sqrt(mean_squared_error(y_true=predict_me, y_pred=fitted.predict(train_me)))
            rmse.append(score)
            print("Metaregressor #%d of %d trained for feature '%s'; RMSE was: %.5f" % 
                ((f + 1), len(TRAIN_COLS), TRAIN_COLS[f], score))
    
    print("\nAll metaregressors trained; average RMSE: %.5f" % np.mean(rmse))

    print("".join(["\n", "=" * 50, "".join(["\nMetaregressor", model_suffix, " Training Complete\n"]), "=" * 50, "\n"]))

    return reg_models
Example #7
    def predict(
        self,
        forecast_length: int,
        future_regressor=[],
        just_point_forecast: bool = False,
    ):
        """Generates forecast data immediately following dates of index supplied to .fit()

        Args:
            forecast_length (int): Number of periods of data to forecast ahead
            future_regressor (numpy.Array): additional regressor
            just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

        Returns:
            Either a PredictionObject of forecasts and metadata, or
            if just_point_forecast == True, a dataframe of point forecasts
        """
        if not _has_tsfresh:
            raise ImportError("Package tsfresh is required")
        # num_subsamples = 10
        predictStartTime = datetime.datetime.now()

        # from tsfresh import extract_features
        from tsfresh.utilities.dataframe_functions import make_forecasting_frame

        # from sklearn.ensemble import AdaBoostRegressor
        from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

        # from tsfresh.feature_extraction import EfficientFCParameters, MinimalFCParameters

        max_timeshift = self.max_timeshift
        regression_model = self.regression_model
        feature_selection = self.feature_selection

        sktraindata = self.df_train.copy()

        X = pd.DataFrame()
        y = pd.DataFrame()
        counter = 0
        for column in sktraindata.columns:
            df_shift, current_y = make_forecasting_frame(
                sktraindata[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            # disable_progressbar = True MinimalFCParameters EfficientFCParameters
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                default_fc_parameters=EfficientFCParameters(),
                n_jobs=1,
            )  #
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x,
                             inplace=True)

            X = pd.concat([X, current_X], axis=1)
            y = pd.concat([y, current_y], axis=1)
            counter += 1

        # drop constant features
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)
        y = y.fillna(method='ffill').fillna(method='bfill')

        if feature_selection == 'Variance':
            from sklearn.feature_selection import VarianceThreshold

            sel = VarianceThreshold(threshold=(0.15))
            X = pd.DataFrame(sel.fit_transform(X))
        if feature_selection == 'Percentile':
            from sklearn.feature_selection import SelectPercentile, chi2

            X = pd.DataFrame(
                SelectPercentile(chi2, percentile=20).fit_transform(
                    X, y[y.columns[0]]))
        if feature_selection == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.feature_selection import SelectFromModel

            clf = DecisionTreeRegressor()
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)

            X = model.transform(X)
        if feature_selection == 'Lasso':
            from sklearn.linear_model import MultiTaskLasso
            from sklearn.feature_selection import SelectFromModel

            clf = MultiTaskLasso(max_iter=2000)
            clf = clf.fit(X, y)
            model = SelectFromModel(clf, prefit=True)

            X = model.transform(X)
        """
         decisionTreeList = X.columns[model.get_support()]
         LassoList = X.columns[model.get_support()]
         
         feature_list = decisionTreeList.to_list()
         set([x for x in feature_list if feature_list.count(x) > 1])
         from collections import Counter
         repeat_features = Counter(feature_list)
         repeat_features = repeat_features.most_common(20)
        """

        # Drop first line
        X = X.iloc[1:, ]
        y = y.iloc[1:]

        y = y.fillna(method='ffill').fillna(method='bfill')

        index = self.create_forecast_index(forecast_length=forecast_length)

        if regression_model == 'ElasticNet':
            from sklearn.linear_model import MultiTaskElasticNet

            regr = MultiTaskElasticNet(alpha=1.0,
                                       random_state=self.random_seed)
        elif regression_model == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor

            regr = DecisionTreeRegressor(random_state=self.random_seed)
        elif regression_model == 'MLP':
            from sklearn.neural_network import MLPRegressor

            # relu/tanh lbfgs/adam layer_sizes (100) (10)
            regr = MLPRegressor(
                hidden_layer_sizes=(10, 25, 10),
                verbose=self.verbose_bool,
                max_iter=200,
                activation='tanh',
                solver='lbfgs',
                random_state=self.random_seed,
            )
        elif regression_model == 'KNN':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.neighbors import KNeighborsRegressor

            # KNeighborsRegressor is deterministic and accepts no random_state
            regr = MultiOutputRegressor(KNeighborsRegressor())
        elif regression_model == 'Adaboost':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.ensemble import AdaBoostRegressor

            regr = MultiOutputRegressor(AdaBoostRegressor(
                n_estimators=200))  # , random_state=self.random_seed))
        else:
            regression_model = 'RandomForest'
            from sklearn.ensemble import RandomForestRegressor

            regr = RandomForestRegressor(random_state=self.random_seed,
                                         n_estimators=1000,
                                         verbose=self.verbose)

        regr.fit(X, y)

        combined_index = self.df_train.index.append(index)
        forecast = pd.DataFrame()
        sktraindata.columns = [x for x in range(len(sktraindata.columns))]

        for x in range(forecast_length):
            x_dat = pd.DataFrame()
            y_dat = pd.DataFrame()
            counter = 0
            for column in sktraindata.columns:
                df_shift, current_y = make_forecasting_frame(
                    sktraindata.tail(max_timeshift)[column],
                    kind="time_series",
                    max_timeshift=max_timeshift,
                    rolling_direction=1,
                )
                # disable_progressbar = True MinimalFCParameters EfficientFCParameters
                current_X = extract_features(
                    df_shift,
                    column_id="id",
                    column_sort="time",
                    column_value="value",
                    impute_function=tsfresh_impute,
                    show_warnings=False,
                    n_jobs=1,
                    default_fc_parameters=EfficientFCParameters(),
                )  # default_fc_parameters=MinimalFCParameters(),
                current_X["feature_last_value"] = current_y.shift(1)

                current_X.rename(columns=lambda x: str(counter) + '_' + x,
                                 inplace=True)

                x_dat = pd.concat([x_dat, current_X], axis=1)
                y_dat = pd.concat([y_dat, current_y], axis=1)
                counter += 1

            x_dat = x_dat[X.columns]
            rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

            forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
            sktraindata = pd.concat([sktraindata, rfPred],
                                    axis=0,
                                    ignore_index=True)
            sktraindata.index = combined_index[:len(sktraindata.index)]

        forecast.columns = self.column_names
        forecast.index = index

        if just_point_forecast:
            return forecast
        else:
            upper_forecast, lower_forecast = Point_to_Probability(
                self.df_train,
                forecast,
                prediction_interval=self.prediction_interval)

            predict_runtime = datetime.datetime.now() - predictStartTime
            prediction = PredictionObject(
                model_name=self.name,
                forecast_length=forecast_length,
                forecast_index=forecast.index,
                forecast_columns=forecast.columns,
                lower_forecast=lower_forecast,
                forecast=forecast,
                upper_forecast=upper_forecast,
                prediction_interval=self.prediction_interval,
                predict_runtime=predict_runtime,
                fit_runtime=self.fit_runtime,
                model_parameters=self.get_params(),
            )
            return prediction
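The per-column windowing that predict repeats can be exercised in isolation; a minimal sketch of tsfresh's make_forecasting_frame on a toy series (the series and max_timeshift are illustrative):

import pandas as pd
from tsfresh.utilities.dataframe_functions import make_forecasting_frame

series = pd.Series(range(10), index=pd.date_range("2021-01-01", periods=10, freq="D"))
# df_shift holds the rolling windows in long format; y holds the value following each window
df_shift, y = make_forecasting_frame(series, kind="time_series", max_timeshift=3, rolling_direction=1)
print(df_shift.head())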
Example #8
                                      model_name=f'best_model_batch{ind}.h5')
              ])
    all_predictions.append(model.predict(X_test))

model = create_model()
model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=1)
all_predictions.append(model.predict(X_test))

kf = KFold(n_splits=5, random_state=2019, shuffle=True)
for ind, (tr, val) in enumerate(kf.split(X_train)):
    X_tr = X_train[tr]
    y_tr = y_train[tr]
    X_vl = X_train[val]
    y_vl = y_train[val]

    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(X_tr, y_tr)
    all_predictions.append(model.predict(X_test))

model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(X_train, y_train)
all_predictions.append(model.predict(X_test))

test_preds = np.array([
    np.array([rankdata(c) for c in p.T]).T for p in all_predictions
]).mean(axis=0)
max_val = test_preds.max() + 1
test_preds = test_preds / max_val + 1e-12
submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv'))
submission[targets] = test_preds
submission.to_csv("submission.csv", index=False)
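The blend converts each model's prediction matrix to per-column ranks before averaging, so differently scaled outputs contribute equally; a minimal sketch of that rank-averaging step (shapes are illustrative):

import numpy as np
from scipy.stats import rankdata

predictions = [np.random.rand(5, 3) for _ in range(2)]  # two models, 5 rows, 3 targets
# rank each target column within each model, then average the rank matrices
blended = np.array([
    np.array([rankdata(column) for column in p.T]).T for p in predictions
]).mean(axis=0)
print(blended.shape)  # (5, 3)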
Example #9
class model:
    def __init__(self,params,X,Y):
        self.params=params
        self.original_predictors=list(X)
#        if 'time' in self.original_predictors:
#            self.original_predictors.remove('time')
        if params['NONLIN_TYPE']=='POLY':
            #add non linear terms
            self.X=self.add_nonlinear_terms(X)
            #print(self.X)
            self.Y=Y
        if params['STANDARDIZE']:
            #standardize
            self.standardize()
        self.predictor_names=list(self.X)
        self.target_names=list(Y)
        self.Y_final=self.Y.iloc[-1,:]
        self.time=self.X.iloc[-1, X.columns.get_loc('time')]
        self.date=self.X.index[-1]
#        print(self.X)
#        print(self.Y)
#        print(self.Y_final)
#        print(self.time)
        self.make_model()

    def add_nonlinear_terms(self,X):
        df,var_names=add_polynomial_terms(X,list(X),self.params['ORDER'])
        return(df)
    
    def standardize(self):
        self.X_mean=self.X.mean()
        self.Y_mean=self.Y.mean()
        self.X_std=self.X.std()
        self.Y_std=self.Y.std()
        self.X=(self.X-self.X_mean)/self.X_std
        self.Y=(self.Y-self.Y_mean)/self.Y_std
        

    def make_model(self):
        max_iter=1000
        tol=0.015
        l1_ratio=0.8 # we want a relatively sparse model
        elastic=MultiTaskElasticNet(fit_intercept=True, max_iter=max_iter,tol=tol,l1_ratio=l1_ratio)
        
        #Note that we are assuming that error are independent of each other GIVEN THE PREDICTORS
        #Otherwise cross validation won't be applicable
        #We will perform a grid search to find best parameters
        
        print('################ Find hyper-parameter values#######################')
        search=GridSearchCV(estimator=elastic,param_grid={'alpha':np.logspace(-5,2,8)},scoring='neg_mean_squared_error',n_jobs=1,refit=True,cv=10)
        search.fit(self.X,self.Y)
        
        #Now create a final elastic net model using the optimal hyper parameters
        print('################ Build final model ##############################')
        optimal_alpha=search.best_params_['alpha']
        #optimal_l1_ratio=search.best_params_['l1_ratio']
        self.model=MultiTaskElasticNet(fit_intercept=True,alpha=optimal_alpha,l1_ratio=l1_ratio,max_iter=max_iter,tol=tol)
        self.model.fit(self.X.values,self.Y.values)
        self.predicted=pd.DataFrame(index=self.Y.index, columns= self.Y.columns, data=self.model.predict(self.X.values))
        self.predicted=self.predicted*self.Y_std+self.Y_mean
        #second_model=(mean_squared_error(y_true=Y_train,y_pred=elastic.predict(X_train)))

    
    def predict(self,X,plot=False,Y_True=None,plot_list=None):
        # If plot=True, Y_True should contain the true values; the function then
        # plots a comparison of true vs. predicted values.
        if self.params['STANDARDIZE'] and self.params['NONLIN_TYPE']=='POLY':
            #X1=X.copy() # don't modify the original
            X1=self.add_nonlinear_terms(X)
            #print('Unnormalized predictors: ',X1)
            X1=(X1-self.X_mean)/self.X_std # standardized
            #print('Normalized predictors: ',X1)
            Y1=self.model.predict(X1.values)
            
            dfY1=pd.DataFrame(index=Y_True.index,columns=list(Y_True),data=Y1)
            dfY1=dfY1*self.Y_std+self.Y_mean
            #Y1=Y1*
            if plot:
                X_ax=Y_True.index
                label_true=[l+'_True' for l in plot_list]
                label_pred=[l+'_Pred' for l in plot_list]
                plt.figure(figsize=(6,4))
                plt.plot(X_ax,Y_True[plot_list],label=label_true)
                plt.plot(X_ax,dfY1[plot_list],label=label_pred)
                plt.legend(loc='best')
                plt.show()
            return(dfY1)

    def forecast(self):
        pred=None
        if self.params['STANDARDIZE'] and self.params['NONLIN_TYPE']=='POLY': #if standardized and polynomial
            
            Xp=self.Y_final*self.Y_std + self.Y_mean # destandardize Y, this is needed to calculate the non linear term
            #print(Xp)
            Xp['time']=self.time+1 #- self.X_mean['time'])/self.X_std
            dfp=pd.DataFrame(index=[self.date],columns=self.original_predictors,data=Xp.values.reshape(1,-1))
            dfp=self.add_nonlinear_terms(dfp) # add the non linear terms
            #print(dfp)
            dfp=(dfp-self.X_mean)/self.X_std # standardize, then predict
            #print(dfp)
            pred=self.model.predict(dfp.values)
            self.time=self.time+1
            #print(self.date)
            self.date=self.date+MonthEnd(1)
            
            df=pd.DataFrame(index=[self.date],columns=self.target_names,data=pred)
            self.Y_final=df
            #print(self.date)
        return(pred,self.date)
    
    def multistep_forecast(self,steps):
        df=pd.DataFrame(columns=self.target_names)
        for i in range(steps):
            pred,date=self.forecast()
            print(pred.shape)
            df.loc[date,:]=np.multiply(pred,self.Y_std.values.reshape(1,-1)) + self.Y_mean.values.reshape(1,-1)
        return df
            
    def plot_coeffs(self):
        C=self.model.coef_[-1,:]
        indexes=np.where(np.abs(C)>0.0001)
        
        #significant predictors
        C_sig=C[indexes[0]]
        preds_sig=[self.predictor_names[int(i)] for i in indexes[0]]
        
        f,ax=plt.subplots()
        f.set_size_inches((10,2))
        ax.bar(range(len(C_sig)),C_sig)
        ax.set_xticks(range(len(C_sig)))
        ax.set_xticklabels(labels=preds_sig)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()
    
    def variable_importance(self,orig_var_names,labels):
        all_preds=list(self.X)# all predictors
        imp=[]
        for v in orig_var_names:
            v1=[ap for ap in all_preds if v in ap]
            print(v1)
            X1=self.X.copy()
            X1[v1]=0
            Y1=self.model.predict(X1)
            imp.append(np.sum((self.Y.values-Y1)**2))
        #print(imp)
        indexes=np.argsort(np.array(imp))
        #print(indexes)
        preds1=[labels[i] for i in indexes]
        imps1=[imp[i] for i in indexes]
        imps1=imps1/np.max(imps1)
        #plot importance
        f,ax=plt.subplots()
        ax.barh(range(len(imp)),imps1)
        ax.set_yticks(range(len(imp)))
        ax.set_yticklabels(labels=preds1)
        ax.set_xlabel(xlabel='Importance',fontsize=12)
        plt.tight_layout()
        plt.show()
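make_model's grid search can be reproduced standalone; a minimal sketch with synthetic data (shapes and the 10-fold split are illustrative):

import numpy as np
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.model_selection import GridSearchCV

rng = np.random.RandomState(0)
X, Y = rng.randn(100, 5), rng.randn(100, 2)
elastic = MultiTaskElasticNet(fit_intercept=True, max_iter=1000, tol=0.015, l1_ratio=0.8)
search = GridSearchCV(estimator=elastic,
                      param_grid={'alpha': np.logspace(-5, 2, 8)},
                      scoring='neg_mean_squared_error', refit=True, cv=10)
search.fit(X, Y)
print(search.best_params_['alpha'])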
Example #10
def test(test_id, dir, strength_scale, n_samples, num_features,
         num_instruments, num_treatments, num_outcomes):
    def tau_fn(x, p):
        # alternatives explored in the original comments: np.abs(x), np.sin(x),
        # 2/(1+np.exp(-2*x)), 1.*(x<0) + 2.5*(x>=0), -1.5*x + .9*(x**2) + x**3
        return (-1.5 * x + .9 * (x**2)) * p

    iv_strength = strength_scale * np.random.uniform(
        1., 1.1, size=(num_instruments, 1))
    degree_benchmarks = 3

    # Network parameters
    hidden_layers = [1000, 1000, 1000]

    # Generate data
    data_x, data_z, data_treatment, data_y = get_data(n_samples,
                                                      num_instruments,
                                                      iv_strength, tau_fn,
                                                      num_features)
    data_z = np.concatenate((data_z, data_x), axis=1)
    data_p = np.concatenate((data_treatment, data_x), axis=1)
    num_instruments = num_features + num_instruments
    num_treatments = num_features + num_treatments
    print(data_p.shape)
    print(data_z.shape)
    print(data_y.shape)
    if num_instruments >= 2:
        plt.figure()
        plt.subplot(1, 4, 1)
        plt.scatter(data_z[:, 0], data_p[:, 0], label='p vs z1')
        plt.legend()
        plt.subplot(1, 4, 2)
        plt.scatter(data_z[:, 1], data_p[:, 0], label='p vs z2')
        plt.legend()
        plt.subplot(1, 4, 3)
        plt.scatter(data_p[:, 0], data_y)
        plt.legend()
        plt.subplot(1, 4, 4)
        plt.scatter(data_p[:, 1], data_y)
        plt.legend()
        plt.savefig(os.path.join(dir, 'data_{}.png'.format(test_id)))

    # We reset the whole graph
    dgmm = DeepGMM(
        n_critics=70,
        num_steps=200,
        store_step=5,
        learning_rate_modeler=0.01,
        learning_rate_critics=0.01,
        critics_jitter=True,
        dissimilarity_eta=0.0,
        cluster_type='kmeans',
        critic_type='Gaussian',
        critics_precision=None,
        min_cluster_size=200,  #num_trees=5,
        eta_hedge=0.16,
        bootstrap_hedge=False,
        l1_reg_weight_modeler=0.0,
        l2_reg_weight_modeler=0.0,
        dnn_layers=hidden_layers,
        dnn_poly_degree=1,
        log_summary=False,
        summary_dir='./graphs_monte')
    dgmm.fit(data_z, data_p, data_y)

    test_min = np.percentile(data_p, 10)
    test_max = np.percentile(data_p, 90)
    test_grid = np.array(
        list(
            itertools.product(np.linspace(test_min, test_max, 100),
                              repeat=num_treatments)))
    print(test_grid.shape)

    test_data_x, _, test_data_treatment, _ = get_data(5 * n_samples,
                                                      num_instruments,
                                                      iv_strength, tau_fn,
                                                      num_features)
    test_data_p = np.concatenate((test_data_treatment, test_data_x), axis=1)
    print(test_data_p.shape)
    clip_edges = (np.all((test_data_p > test_min), axis=1) & np.all(
        (test_data_p < test_max), axis=1)).flatten()
    test_data_p = test_data_p[clip_edges, :]
    test_data_treatment = test_data_treatment[clip_edges, :]
    test_data_x = test_data_x[clip_edges, :]
    print(test_data_p.shape)

    best_fn_grid = dgmm.predict(test_grid, model='best')
    final_fn_grid = dgmm.predict(test_grid, model='final')
    avg_fn_grid = dgmm.predict(test_grid, model='avg')
    best_fn_dist = dgmm.predict(test_data_p, model='best')
    final_fn_dist = dgmm.predict(test_data_p, model='final')
    avg_fn_dist = dgmm.predict(test_data_p, model='avg')

    ##################################
    # Benchmarks
    ##################################
    from sklearn.linear_model import LinearRegression, MultiTaskElasticNet, ElasticNet
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor

    direct_poly = Pipeline([('poly',
                             PolynomialFeatures(degree=degree_benchmarks)),
                            ('linear', LinearRegression())])
    direct_poly.fit(data_p, data_y.flatten())
    direct_poly_fn_grid = direct_poly.predict(test_grid)
    direct_poly_fn_dist = direct_poly.predict(test_data_p)

    direct_nn = MLPRegressor(hidden_layer_sizes=hidden_layers)
    direct_nn.fit(data_p, data_y.flatten())
    direct_nn_fn_grid = direct_nn.predict(test_grid)
    direct_nn_fn_dist = direct_nn.predict(test_data_p)

    plf = PolynomialFeatures(degree=degree_benchmarks)
    sls_poly_first = MultiTaskElasticNet()
    sls_poly_first.fit(plf.fit_transform(data_z), plf.fit_transform(data_p))
    sls_poly_second = ElasticNet()
    sls_poly_second.fit(sls_poly_first.predict(plf.fit_transform(data_z)),
                        data_y)
    sls_poly_fn_grid = sls_poly_second.predict(plf.fit_transform(test_grid))
    sls_poly_fn_dist = sls_poly_second.predict(plf.fit_transform(test_data_p))

    sls_first = LinearRegression()
    sls_first.fit(data_z, data_p)
    sls_second = LinearRegression()
    sls_second.fit(sls_first.predict(data_z), data_y)
    sls_fn_grid = sls_second.predict(test_grid)
    sls_fn_dist = sls_second.predict(test_data_p)

    ######
    # Deep IV
    #####
    # We reset the whole graph
    with tf.name_scope("DeepIV"):
        deep_iv = deep_iv_fit(data_x,
                              data_z,
                              data_treatment,
                              data_y,
                              epochs=10,
                              hidden=hidden_layers)
        deep_iv_fn_grid = deep_iv.predict([test_grid[:, 1], test_grid[:, 0]])
        deep_iv_fn_dist = deep_iv.predict([test_data_x, test_data_treatment])

    plt.figure()
    plot_3d(test_grid, tau_fn(test_grid[:, [1]], test_grid[:, [0]]).flatten())
    plt.savefig(os.path.join(dir, 'true_{}.png'.format(test_id)))

    print(avg_fn_grid.shape)
    plt.figure()
    plot_3d(test_grid, avg_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'avg_fn_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, best_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'best_fn_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, final_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'final_fn_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, deep_iv_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'deep_iv_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, sls_poly_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'sls_poly_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, sls_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'sls_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, direct_poly_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'direct_poly_{}.png'.format(test_id)))

    plt.figure()
    plot_3d(test_grid, direct_nn_fn_grid.flatten())
    plt.savefig(os.path.join(dir, 'direct_nn_{}.png'.format(test_id)))

    def mse_test(y_true, y_pred):
        return 1 - np.mean((y_pred.flatten() - y_true.flatten())**2) / np.var(
            y_true.flatten())

    mse_best = mse_test(tau_fn(test_data_x, test_data_treatment), best_fn_dist)
    mse_final = mse_test(tau_fn(test_data_x, test_data_treatment),
                         final_fn_dist)
    mse_avg = mse_test(tau_fn(test_data_x, test_data_treatment), avg_fn_dist)
    mse_2sls_poly = mse_test(tau_fn(test_data_x, test_data_treatment),
                             sls_poly_fn_dist)
    mse_direct_poly = mse_test(tau_fn(test_data_x, test_data_treatment),
                               direct_poly_fn_dist)
    mse_direct_nn = mse_test(tau_fn(test_data_x, test_data_treatment),
                             direct_nn_fn_dist)
    mse_2sls = mse_test(tau_fn(test_data_x, test_data_treatment), sls_fn_dist)
    mse_deep_iv = mse_test(tau_fn(test_data_x, test_data_treatment),
                           deep_iv_fn_dist)

    on_p_dist = [
        mse_best, mse_final, mse_avg, mse_deep_iv, mse_2sls_poly, mse_2sls,
        mse_direct_poly, mse_direct_nn
    ]

    mse_best = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                        best_fn_grid)
    mse_final = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                         final_fn_grid)
    mse_avg = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                       avg_fn_grid)
    mse_2sls_poly = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                             sls_poly_fn_grid)
    mse_direct_poly = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                               direct_poly_fn_grid)
    mse_direct_nn = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                             direct_nn_fn_grid)
    mse_2sls = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                        sls_fn_grid)
    mse_deep_iv = mse_test(tau_fn(test_grid[:, [1]], test_grid[:, [0]]),
                           deep_iv_fn_grid)

    on_p_grid = [
        mse_best, mse_final, mse_avg, mse_deep_iv, mse_2sls_poly, mse_2sls,
        mse_direct_poly, mse_direct_nn
    ]

    return on_p_dist, on_p_grid
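The linear 2SLS benchmark above is just two stacked regressions; a minimal standalone sketch on synthetic data (this data-generating process is illustrative, not the one in get_data):

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
z = rng.randn(500, 2)                                 # instruments
p = z @ np.array([[1.0], [0.5]]) + rng.randn(500, 1)  # treatment
y = np.abs(p).ravel() + rng.randn(500)                # outcome
# stage 1: project the treatment onto the instruments
sls_first = LinearRegression().fit(z, p)
# stage 2: regress the outcome on the projected treatment
sls_second = LinearRegression().fit(sls_first.predict(z), y)
print(sls_second.predict(sls_first.predict(z))[:3])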
Example #11
print "MultiTaskLasso", mtl.score(features_test, labels_test)

######################################################################
#this part calculates the Multi-Task Elastic Net's score with an optimal hyper-parameter setting

#load necessary libs
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in scikit-learn 0.20

#split the dataset to get the necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)

#pre-process: dimensionality reduction (SVD); fit on the training set only and
#apply the same transform to the test set, otherwise the two sets land in
#different latent spaces
svd = TruncatedSVD(n_components=9, random_state=1).fit(features_train)
features_train = svd.transform(features_train)
features_test = svd.transform(features_test)

#do regression
mte = MultiTaskElasticNet(alpha=1e-9, l1_ratio=0.01, random_state=1)
mte.fit(features_train, labels_train)
print("MultiTaskElasticNet", mte.score(features_test, labels_test))
##########################################################################

#All of the code ends here.
#Thank you!
Example #12
def test(test_id, dir, strength_scale=.5, n_samples=4000, num_instruments=2, num_treatments=1, num_outcomes=1,
         num_steps=100, jitter=True, n_critics=50, func='abs', radius=50, dgp_two=False):
    print("Parameters: {}".format(locals()))
    with open(os.path.join(dir, "params_{}.txt".format(test_id)), 'w') as f:
        f.write("Parameters: {}".format(locals()))

    np.random.seed(test_id)

    if func=='abs':
        def tau_fn(x): return np.abs(x)
    elif func=='2dpoly':
        def tau_fn(x): return -1.5 * x + .9 * (x**2)
    elif func=='sigmoid':
        def tau_fn(x): return 2/(1+np.exp(-2*x))
    elif func=='sin':
        def tau_fn(x): return np.sin(x)
    elif func=='step':
        def tau_fn(x): return 1. * (x<0) + 2.5 * (x>=0)
    elif func=='3dpoly':
        def tau_fn(x): return -1.5 * x + .9 * (x**2) + x**3
    elif func=='linear':
        def tau_fn(x): return x
    elif func=='rand_pw':
        pw_linear = generate_random_pw_linear()
        def tau_fn(x):             
            return np.reshape(np.array([pw_linear(x_i) for x_i in x.flatten()]), x.shape)

    iv_strength = strength_scale
    degree_benchmarks = 3

    # Network parameters
    hidden_layers = [1000, 1000, 1000]

    # Generate data
    data_z, data_p, data_y = get_data(
        n_samples, num_instruments, iv_strength, tau_fn, dgp_two)
    print(data_p.shape)
    print(data_z.shape)
    print(data_y.shape)
    if num_instruments >= 2:
        plt.figure()
        plt.subplot(1, 3, 1)
        plt.scatter(data_z[:, 0], data_p, label='p vs z1')
        plt.legend()
        plt.subplot(1, 3, 2)
        plt.scatter(data_z[:, 1], data_p, label='p vs z2')
        plt.legend()
        plt.subplot(1, 3, 3)
        plt.scatter(data_p, data_y, label='y vs p')
        plt.legend()
        plt.savefig(os.path.join(dir, 'data_{}.png'.format(test_id)))

    # We reset the whole graph
    dgmm = DeepGMM(n_critics=n_critics, num_steps=num_steps, store_step=5, learning_rate_modeler=0.007,
                   learning_rate_critics=0.007, critics_jitter=jitter, dissimilarity_eta=0.0,
                   cluster_type='kmeans', critic_type='Gaussian', critics_precision=None,
                   min_cluster_size=radius,  # num_trees=5,
                   eta_hedge=0.11, bootstrap_hedge=False,
                   l1_reg_weight_modeler=0.0, l2_reg_weight_modeler=0.0,
                   dnn_layers=hidden_layers, dnn_poly_degree=1,
                   log_summary=False, summary_dir='./graphs_monte', display_step=20, random_seed=test_id)
    inst_inds = np.arange(num_instruments)
    np.random.shuffle(inst_inds)
    dgmm.fit(data_z[:, inst_inds], data_p, data_y)

    test_min = np.percentile(data_p, 10)
    test_max = np.percentile(data_p, 90)
    test_grid = np.array(list(itertools.product(
        np.linspace(test_min, test_max, 100), repeat=num_treatments)))
    print(test_grid.shape)

    _, test_data_p, _ = get_data(
        5 * n_samples, num_instruments, iv_strength, tau_fn, dgp_two)
    print(test_data_p.shape)
    clip_edges = ((test_data_p > test_min) & (
        test_data_p < test_max)).flatten()
    test_data_p = test_data_p[clip_edges, :]

    best_fn_grid = dgmm.predict(test_grid.reshape(-1, 1), model='best')
    final_fn_grid = dgmm.predict(test_grid.reshape(-1, 1), model='final')
    avg_fn_grid = dgmm.predict(test_grid.reshape(-1, 1), model='avg')
    best_fn_dist = dgmm.predict(test_data_p, model='best')
    final_fn_dist = dgmm.predict(test_data_p, model='final')
    avg_fn_dist = dgmm.predict(test_data_p, model='avg')

    ########################
    # Plot alone
    ########################
    plt.figure(figsize=(10, 10))
    plt.plot(test_grid, avg_fn_grid, label='AvgANN y=g(p)')
    plt.plot(test_grid, best_fn_grid, label='BestANN y=g(p)')
    plt.plot(test_grid, final_fn_grid, label='FinalANN y=g(p)')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.savefig(os.path.join(dir, 'deep_gmm_{}.png'.format(test_id)))

    ##################################
    # Benchmarks
    ##################################
    from sklearn.linear_model import LinearRegression, MultiTaskElasticNet, ElasticNet
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor

    direct_poly = Pipeline([('poly', PolynomialFeatures(
        degree=degree_benchmarks)), ('linear', LinearRegression())])
    direct_poly.fit(data_p, data_y.flatten())
    direct_poly_fn_grid = direct_poly.predict(test_grid.reshape(-1, 1))
    direct_poly_fn_dist = direct_poly.predict(test_data_p)

    direct_nn = MLPRegressor(hidden_layer_sizes=hidden_layers)
    direct_nn.fit(data_p, data_y.flatten())
    direct_nn_fn_grid = direct_nn.predict(test_grid.reshape(-1, 1))
    direct_nn_fn_dist = direct_nn.predict(test_data_p)

    plf = PolynomialFeatures(degree=degree_benchmarks)
    sls_poly_first = MultiTaskElasticNet()
    sls_poly_first.fit(plf.fit_transform(data_z), plf.fit_transform(data_p))
    sls_poly_second = ElasticNet()
    sls_poly_second.fit(sls_poly_first.predict(
        plf.fit_transform(data_z)), data_y)
    sls_poly_fn_grid = sls_poly_second.predict(
        plf.fit_transform(test_grid.reshape(-1, 1)))
    sls_poly_fn_dist = sls_poly_second.predict(plf.fit_transform(test_data_p))

    sls_first = LinearRegression()
    sls_first.fit(data_z, data_p)
    sls_second = LinearRegression()
    sls_second.fit(sls_first.predict(data_z), data_y)
    sls_fn_grid = sls_second.predict(test_grid.reshape(-1, 1))
    sls_fn_dist = sls_second.predict(test_data_p)

    ######
    # Deep IV
    #####
    # We reset the whole graph
    with tf.name_scope("DeepIV"):
        deep_iv = deep_iv_fit(data_z, data_p, data_y,
                              epochs=100, hidden=hidden_layers)
        deep_iv_fn_grid = deep_iv.predict(test_grid.reshape(-1, 1))
        deep_iv_fn_dist = deep_iv.predict(test_data_p)

    plt.figure(figsize=(40, 10))
    plt.subplot(1, 7, 1)
    plt.plot(test_grid, avg_fn_grid, label='AvgANN y=g(p)')
    plt.plot(test_grid, best_fn_grid, label='BestANN y=g(p)')
    plt.plot(test_grid, final_fn_grid, label='FinalANN y=g(p)')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 2)
    plt.plot(test_grid, deep_iv_fn_grid, label='DeepIV')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 3)
    plt.plot(test_grid, sls_poly_fn_grid, label='2SLS_poly')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 4)
    plt.plot(test_grid, sls_fn_grid, label='2SLS')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 5)
    plt.plot(test_grid, direct_poly_fn_grid, label='Direct poly')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 6)
    plt.plot(test_grid, direct_nn_fn_grid, label='Direct ANN')
    plt.plot(test_grid, tau_fn(test_grid), label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.subplot(1, 7, 7)
    plt.scatter(data_p, data_y, color='blue', label='Data')
    plt.plot(test_grid, tau_fn(test_grid), color='red', label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.savefig(os.path.join(dir, 'benchmarks_{}.png'.format(test_id)))

    def mse_test(y_true, y_pred):
        return 1 - np.mean((y_pred.flatten() - y_true.flatten())**2) / np.var(y_true.flatten())

    mse_best = mse_test(tau_fn(test_data_p), best_fn_dist)
    mse_final = mse_test(tau_fn(test_data_p), final_fn_dist)
    mse_avg = mse_test(tau_fn(test_data_p), avg_fn_dist)
    mse_2sls_poly = mse_test(tau_fn(test_data_p), sls_poly_fn_dist)
    mse_direct_poly = mse_test(tau_fn(test_data_p), direct_poly_fn_dist)
    mse_direct_nn = mse_test(tau_fn(test_data_p), direct_nn_fn_dist)
    mse_2sls = mse_test(tau_fn(test_data_p), sls_fn_dist)
    mse_deep_iv = mse_test(tau_fn(test_data_p), deep_iv_fn_dist)

    on_p_dist = [mse_best, mse_final, mse_avg, mse_deep_iv,
                 mse_2sls_poly, mse_2sls, mse_direct_poly, mse_direct_nn]

    mse_best = mse_test(tau_fn(test_grid), best_fn_grid)
    mse_final = mse_test(tau_fn(test_grid), final_fn_grid)
    mse_avg = mse_test(tau_fn(test_grid), avg_fn_grid)
    mse_2sls_poly = mse_test(tau_fn(test_grid), sls_poly_fn_grid)
    mse_direct_poly = mse_test(tau_fn(test_grid), direct_poly_fn_grid)
    mse_direct_nn = mse_test(tau_fn(test_grid), direct_nn_fn_grid)
    mse_2sls = mse_test(tau_fn(test_grid), sls_fn_grid)
    mse_deep_iv = mse_test(tau_fn(test_grid), deep_iv_fn_grid)

    on_p_grid = [mse_best, mse_final, mse_avg, mse_deep_iv,
                 mse_2sls_poly, mse_2sls, mse_direct_poly, mse_direct_nn]

    return on_p_dist, on_p_grid
Example #13
    n_samples = 100
    n_features = 40
    n_tasks = 12
    rel_f = 7
    coef = np.zeros((n_tasks, n_features))
    times = np.linspace(0, 2 * np.pi, n_tasks)
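    # rr is assumed to be a NumPy RandomState seeded earlier in the original
    # script, e.g. rr = np.random.RandomState(42)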
    for k in range(rel_f):
        coef[:, k] = np.sin((1.0 + rr.randn(1)) * times + 3 * rr.randn(1))
    X = rr.randn(n_samples, n_features)
    y = np.dot(X, coef.T) + rr.randn(n_samples, n_tasks)
    X_train = X[:-20]
    y_train = y[:-20]
    X_test = X[-20:]
    y_test = y[-20:]

    print("Fitting Elastic Net model...")
    ll = ElasticNet(alpha=0.45)
    ll.fit(X_train, y_train)
    print("R2 score: {0}".format(r2_score(y_test, ll.predict(X_test))))

    print("Fitting Multitask Elastic Net model...")
    ml = MultiTaskElasticNet(alpha=0.45)
    ml.fit(X_train, y_train)
    print("R2 score: {0}".format(r2_score(y_test, ml.predict(X_test))))

    print("Plotting predictions...")
    plt.scatter(X[:, 1], y[:, 1])
    plt.scatter(X[:, 1], ll.predict(X)[:, 1], color="blue")
    plt.scatter(X[:, 1], ml.predict(X)[:, 1], color="red")
    plt.show()
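The point of this comparison: MultiTaskElasticNet's mixed L1/L2 penalty selects the same feature support for every task, while independent ElasticNet fits can select different features per task; a minimal sketch (sizes and alpha are illustrative):

import numpy as np
from sklearn.linear_model import ElasticNet, MultiTaskElasticNet

rng = np.random.RandomState(0)
X = rng.randn(100, 30)
W = np.zeros((30, 4))
W[:5] = rng.randn(5, 4)             # the same 5 features matter for all 4 tasks
Y = X @ W + 0.1 * rng.randn(100, 4)
en = ElasticNet(alpha=0.5).fit(X, Y)
mt = MultiTaskElasticNet(alpha=0.5).fit(X, Y)
print((en.coef_ != 0).sum(axis=1))  # per-task support sizes may differ
print((mt.coef_ != 0).sum(axis=1))  # identical support for every task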
Example #15
    print "测试集得分:", multiTaskLassoCV.score(test_X, test_Y)
    print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred)
    print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred))
    print "测试集R2:", r2_score(test_Y, test_Y_pred)

    tss, rss, ess, r2 = xss(Y, multiTaskLassoCV.predict(X))
    print "TSS(Total Sum of Squares): ", tss
    print "RSS(Residual Sum of Squares): ", rss
    print "ESS(Explained Sum of Squares): ", ess
    print "R^2: ", r2

    print "\n**********测试MultiTaskElasticNet类**********"
    # 在初始化MultiTaskElasticNet类时, 指定超参数α和ρ, 默认值分别是1.0和0.5.
    multiTaskElasticNet = MultiTaskElasticNet(alpha=0.01, l1_ratio=0.7)
    # 拟合训练集
    multiTaskElasticNet.fit(train_X, train_Y)
    # 打印模型的系数
    print "系数:", multiTaskElasticNet.coef_
    print "截距:", multiTaskElasticNet.intercept_
    print '训练集R2: ', r2_score(train_Y, multiTaskElasticNet.predict(train_X))

    # For linear regression models, test-set mean squared error (MSE) or root
    # mean squared error (RMSE) is typically used to judge model quality.
    test_Y_pred = multiTaskElasticNet.predict(test_X)
    print("test-set score:", multiTaskElasticNet.score(test_X, test_Y))
    print("test-set MSE:", mean_squared_error(test_Y, test_Y_pred))
    print("test-set RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)))
    print("test-set R2:", r2_score(test_Y, test_Y_pred))

    tss, rss, ess, r2 = xss(Y, multiTaskElasticNet.predict(X))
    print "TSS(Total Sum of Squares): ", tss