def main():
    """Train a MultiTaskElasticNet on the training file and write a submission.

    Reads the module-level configuration (``train_file``, ``test_file``,
    ``full_train_file``, ``submission_file``, ``feature_names``,
    ``ques_status``, ``is_full_train_set``) and publishes ``fea``, ``y`` and
    ``probs`` as module globals, exactly as before.

    Every ``print`` uses the single-argument call form and the row count is
    computed with ``//`` so the function behaves the same on Python 2 and
    parses on Python 3.
    """
    global fea, y, probs

    start = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): `rho` is the keyword this script was written against;
    # newer scikit-learn releases name it `l1_ratio` -- confirm the installed
    # version before changing it.
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)

    # The targets must be an array-like of numbers: each status string is
    # replaced by its position in `ques_status`, otherwise fit() raises
    # ValueError. Statuses that are not listed are skipped, as before.
    print("Collecting statuses")
    y = [index
         for element in data["OpenStatus"]
         for index, status in enumerate(ques_status)
         if element == status]

    print("Fitting")
    mten.fit(fea, y)

    # Original note: make sure you have the up to date version of sklearn;
    # v0.12 has the predict_proba method
    # (http://scikit-learn.org/0.11/install.html). This script only calls
    # predict().
    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples, n_classes]; `//` keeps the row count
    # an integer under Python 3 as well.
    probs = np.resize(probs, (len(probs) // 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def test_enet_float_precision():
    """Check that float32 and float64 fits agree for ElasticNet variants.

    For every (normalize, fit_intercept) combination the plain ElasticNet,
    the precomputed-Gram ElasticNet and the MultiTaskElasticNet are fit on
    float64 and float32 copies of the same data. The fitted coefficients must
    keep the input dtype and match across dtypes to 4 decimals.
    """
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)

                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert clf.coef_.dtype == dtype

                # test precompute Gram array
                Gram = X.T.dot(X)
                clf_precompute = ElasticNet(alpha=0.5, max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X, y)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5, max_iter=100, fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                # Check the multi-task estimator itself; this assertion used
                # to re-test `clf`, leaving the multi-task dtype unverified.
                assert clf_multioutput.coef_.dtype == dtype

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)
def main():
    """Train a MultiTaskElasticNet on the training file and write a submission.

    Reads the module-level configuration (``train_file``, ``test_file``,
    ``full_train_file``, ``submission_file``, ``feature_names``,
    ``ques_status``, ``is_full_train_set``) and publishes ``fea``, ``y`` and
    ``probs`` as module globals, exactly as before.

    Every ``print`` uses the single-argument call form and the row count is
    computed with ``//`` so the function behaves the same on Python 2 and
    parses on Python 3.
    """
    global fea, y, probs

    start = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): `rho` is the keyword this script was written against;
    # newer scikit-learn releases name it `l1_ratio` -- confirm the installed
    # version before changing it.
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)

    # The targets must be an array-like of numbers: each status string is
    # replaced by its position in `ques_status`, otherwise fit() raises
    # ValueError. Statuses that are not listed are skipped, as before.
    print("Collecting statuses")
    y = [index
         for element in data["OpenStatus"]
         for index, status in enumerate(ques_status)
         if element == status]

    print("Fitting")
    mten.fit(fea, y)

    # Original note: make sure you have the up to date version of sklearn;
    # v0.12 has the predict_proba method
    # (http://scikit-learn.org/0.11/install.html). This script only calls
    # predict().
    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples, n_classes]; `//` keeps the row count
    # an integer under Python 3 as well.
    probs = np.resize(probs, (len(probs) // 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
class MultiTaskElasticNetImpl:
    """Thin adapter that forwards fit/predict to a wrapped `Op` estimator."""

    def __init__(self, **hyperparams):
        # Keep the raw keyword arguments and build the wrapped estimator
        # from them.
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped estimator and return this wrapper.

        `y` is forwarded only when the caller supplied one.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped estimator's predictions for `X`."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)
def mtelastic_model(self, X_train, y_train, X_test, y_test):
    """Fit a multi-task elastic net and print its train/test metrics.

    Prints, in order: the estimator's own ``score`` on the train split and on
    the test split, then the mean squared errors, then the R^2 values.
    Nothing is returned.
    """
    # Multi-task Elastic-Net Regression Model
    regressor = MultiTaskElasticNet(alpha=.1918)
    regressor.fit(X_train, y_train)

    train_pred = regressor.predict(X_train)
    test_pred = regressor.predict(X_test)

    # The model can be scored either with sklearn's .score or with the
    # MSE / R^2 helpers; both are reported.
    for inputs, targets in ((X_train, y_train), (X_test, y_test)):
        print(regressor.score(inputs, targets))

    mse_train = mean_squared_error(y_train, train_pred)
    mse_test = mean_squared_error(y_test, test_pred)
    print('MSE train: %.6f, MSE test: %.6f' % (mse_train, mse_test))

    r2_train = r2_score(y_train, train_pred)
    r2_test = r2_score(y_test, test_pred)
    print('R^2 train: %.6f, R^2 test: %.6f' % (r2_train, r2_test))
def train_metaregressor(stack_path, train, labels, run_sequence, scale_data, models, predict_mode_all, full = True, verbose = False):
    """Train the level-2 linear metaregressor(s) on stacked model outputs.

    Args:
        stack_path, run_sequence, scale_data, verbose: accepted for interface
            compatibility; not used by this function.
        train: frame of level-1 predictions (``.values`` and ``.shape`` are
            read).
        labels: frame of targets (``.values`` is read).
        models: the level-1 models; only ``len(models)`` is used.
        predict_mode_all: when true, fit one MultiTaskElasticNet on every
            target at once; otherwise fit one ElasticNet per entry of
            ``TRAIN_COLS``.
        full: selects the "_30" or "_8" suffix used in the banner text.

    Returns:
        list: the fitted estimator(s) -- a single multi-task model, or one
        independent model per target in ``TRAIN_COLS`` order.
    """
    model_suffix = "_30" if full else "_8"

    print("".join(["\n", "=" * 50, "".join(["\nTraining Metaregressor", model_suffix, " (Level 2)\n"]), "=" * 50, "\n"]))

    print('Training linear metaregressors for %d models and %d total independent variables.\n' % (len(models), train.shape[1]))

    reg_models, rmse = [], []

    if predict_mode_all:
        print("// MODE: All-in-One Pass //\n")
        model = MultiTaskElasticNet(random_state = 42, max_iter = 1000, l1_ratio = 1.0, alpha = 0.1)
        model.fit(train.values, labels.values)
        rmse = [np.sqrt(mean_squared_error(y_true = labels.values, y_pred = model.predict(train.values)))]
        reg_models.append(model)
    else:
        print("// MODE: One-at-a-Time //\n")
        # NOTE(review): the column stride is fixed at 30 even when
        # full=False ("_8") -- confirm against the layout of `train`.
        stride = 30
        # iterate and build a model over all dependent variables
        for f in range(len(TRAIN_COLS)):
            # get the list of values to predict, column-wise
            predict_me = labels.values[:, f]

            # the independent variables are every `stride`-th column starting
            # at f: one column per level-1 model for this target
            columns = np.arange(f, (stride * len(models)) + f, stride)
            train_me = train.values[:, columns]

            # A fresh estimator per target: refitting one shared instance
            # made every entry of reg_models point at the last fit.
            model = ElasticNet(random_state = 42, max_iter = 1000, l1_ratio = 1.0, alpha = 0.1)
            model.fit(train_me, predict_me)
            reg_models.append(model)

            score = np.sqrt(mean_squared_error(y_true = predict_me, y_pred = model.predict(train_me)))
            rmse.append(score)
            print("Metaregressor #%d of %d trained for feature '%s'; RMSE was: %.5f" % ((f + 1), len(TRAIN_COLS), TRAIN_COLS[f], score))

    print("\nAll metaregressors trained; average RMSE: %.5f" % np.mean(rmse))

    print("".join(["\n", "=" * 50, "".join(["\nMetaregressor", model_suffix, " Training Complete\n"]), "=" * 50, "\n"]))

    return reg_models
def predict(
    self,
    forecast_length: int,
    future_regressor=[],
    just_point_forecast: bool = False,
):
    """Generates forecast data immediately following dates of index supplied to .fit()

    Features are extracted with tsfresh from rolling windows of every
    training series, optionally reduced by ``self.feature_selection``, and a
    regressor chosen by ``self.regression_model`` is fit on them. The
    forecast is then produced one step at a time, appending each prediction
    to the history before extracting features for the next step.

    Args:
        forecast_length (int): Number of periods of data to forecast ahead
        future_regressor: accepted for interface compatibility; this method
            does not read it
        just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

    Returns:
        Either a PredictionObject of forecasts and metadata, or
        if just_point_forecast == True, a dataframe of point forecasts

    Raises:
        ImportError: if tsfresh is not available.
    """
    if not _has_tsfresh:
        raise ImportError("Package tsfresh is required")
    predictStartTime = datetime.datetime.now()
    # `extract_features` and `EfficientFCParameters` are taken from module
    # scope; only these two helpers are imported locally.
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame
    from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

    max_timeshift = self.max_timeshift
    regression_model = self.regression_model
    feature_selection = self.feature_selection

    sktraindata = self.df_train.copy()

    # Build the training design matrix: one block of tsfresh features per
    # series, prefixed with the series position so names stay unique.
    X = pd.DataFrame()
    y = pd.DataFrame()
    counter = 0
    for column in sktraindata.columns:
        df_shift, current_y = make_forecasting_frame(
            sktraindata[column],
            kind="time_series",
            max_timeshift=max_timeshift,
            rolling_direction=1,
        )
        current_X = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=tsfresh_impute,
            show_warnings=False,
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=1,
        )
        current_X.rename(columns=lambda x: str(counter) + '_' + x, inplace=True)

        X = pd.concat([X, current_X], axis=1)
        y = pd.concat([y, current_y], axis=1)
        counter += 1

    # drop constant features
    X = X.loc[:, X.apply(pd.Series.nunique) != 1]
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(0)
    y = y.ffill().bfill()

    # Feature selection. Each selector is only used for its boolean support
    # mask so that X stays a DataFrame with its original column names: the
    # forecasting loop below selects features with `x_dat[X.columns]`, and
    # `.iloc` is applied to X right after this section.
    if feature_selection == 'Variance':
        from sklearn.feature_selection import VarianceThreshold

        sel = VarianceThreshold(threshold=(0.15))
        sel.fit(X)
        X = X.loc[:, sel.get_support()]
    if feature_selection == 'Percentile':
        from sklearn.feature_selection import SelectPercentile, chi2

        sel = SelectPercentile(chi2, percentile=20)
        sel.fit(X, y[y.columns[0]])
        X = X.loc[:, sel.get_support()]
    if feature_selection == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.feature_selection import SelectFromModel

        clf = DecisionTreeRegressor()
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = X.loc[:, model.get_support()]
    if feature_selection == 'Lasso':
        from sklearn.linear_model import MultiTaskLasso
        from sklearn.feature_selection import SelectFromModel

        clf = MultiTaskLasso(max_iter=2000)
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = X.loc[:, model.get_support()]

    # Drop first line
    X = X.iloc[1:, ]
    y = y.iloc[1:]

    y = y.ffill().bfill()

    index = self.create_forecast_index(forecast_length=forecast_length)

    if regression_model == 'ElasticNet':
        from sklearn.linear_model import MultiTaskElasticNet

        regr = MultiTaskElasticNet(alpha=1.0, random_state=self.random_seed)
    elif regression_model == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor

        regr = DecisionTreeRegressor(random_state=self.random_seed)
    elif regression_model == 'MLP':
        from sklearn.neural_network import MLPRegressor

        # relu/tanh lbfgs/adam layer_sizes (100) (10)
        regr = MLPRegressor(
            hidden_layer_sizes=(10, 25, 10),
            verbose=self.verbose_bool,
            max_iter=200,
            activation='tanh',
            solver='lbfgs',
            random_state=self.random_seed,
        )
    elif regression_model == 'KNN':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.neighbors import KNeighborsRegressor

        # KNeighborsRegressor has no `random_state` parameter; passing one
        # raised TypeError as soon as this branch was selected.
        regr = MultiOutputRegressor(KNeighborsRegressor())
    elif regression_model == 'Adaboost':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import AdaBoostRegressor

        regr = MultiOutputRegressor(AdaBoostRegressor(n_estimators=200))
    else:
        regression_model = 'RandomForest'
        from sklearn.ensemble import RandomForestRegressor

        regr = RandomForestRegressor(
            random_state=self.random_seed,
            n_estimators=1000,
            verbose=self.verbose,
        )

    regr.fit(X, y)

    combined_index = self.df_train.index.append(index)
    forecast = pd.DataFrame()
    # Positional column names so appended predictions line up with history.
    sktraindata.columns = [x for x in range(len(sktraindata.columns))]

    for _ in range(forecast_length):
        x_dat = pd.DataFrame()
        counter = 0
        for column in sktraindata.columns:
            df_shift, current_y = make_forecasting_frame(
                sktraindata.tail(max_timeshift)[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                n_jobs=1,
                default_fc_parameters=EfficientFCParameters(),
            )
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x, inplace=True)

            x_dat = pd.concat([x_dat, current_X], axis=1)
            counter += 1

        # Keep exactly the features the regressor was trained on.
        x_dat = x_dat[X.columns]
        rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

        forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
        # Feed the prediction back in as the newest observation.
        sktraindata = pd.concat([sktraindata, rfPred], axis=0, ignore_index=True)
        sktraindata.index = combined_index[:len(sktraindata.index)]

    forecast.columns = self.column_names
    forecast.index = index

    if just_point_forecast:
        return forecast
    else:
        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train,
            forecast,
            prediction_interval=self.prediction_interval,
        )

        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )

        return prediction
model_name=f'best_model_batch{ind}.h5') ]) all_predictions.append(model.predict(X_test)) model = create_model() model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=1) all_predictions.append(model.predict(X_test)) kf = KFold(n_splits=5, random_state=2019, shuffle=True) for ind, (tr, val) in enumerate(kf.split(X_train)): X_tr = X_train[tr] y_tr = y_train[tr] X_vl = X_train[val] y_vl = y_train[val] model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5) model.fit(X_tr, y_tr) all_predictions.append(model.predict(X_test)) model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5) model.fit(X_train, y_train) all_predictions.append(model.predict(X_test)) test_preds = np.array([ np.array([rankdata(c) for c in p.T]).T for p in all_predictions ]).mean(axis=0) max_val = test_preds.max() + 1 test_preds = test_preds / max_val + 1e-12 submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv')) submission[targets] = test_preds submission.to_csv("submission.csv", index=False)
class model:
    """Multi-task elastic-net forecaster over optionally expanded predictors.

    ``params`` is a dict read for the keys ``'NONLIN_TYPE'`` (``'POLY'`` adds
    polynomial terms of order ``params['ORDER']``) and ``'STANDARDIZE'``
    (z-score predictors and targets). ``X`` must contain a ``'time'`` column.
    Constructing the object fits the model immediately.

    Both flags are honoured independently: when a step is switched off it is
    skipped (identity statistics stand in for the mean/std), instead of
    leaving attributes unset as before.
    """

    def __init__(self, params, X, Y):
        self.params = params
        self.original_predictors = list(X)

        if params['NONLIN_TYPE'] == 'POLY':
            # add non linear terms
            self.X = self.add_nonlinear_terms(X)
        else:
            # No expansion requested: use the predictors as given.
            self.X = X
        self.Y = Y

        if params['STANDARDIZE']:
            self.standardize()
        else:
            # Identity statistics keep every later (x - mean) / std and
            # y * std + mean expression valid without special cases.
            self.X_mean = pd.Series(0.0, index=self.X.columns)
            self.X_std = pd.Series(1.0, index=self.X.columns)
            self.Y_mean = pd.Series(0.0, index=self.Y.columns)
            self.Y_std = pd.Series(1.0, index=self.Y.columns)

        self.predictor_names = list(self.X)
        self.target_names = list(Y)
        self.Y_final = self.Y.iloc[-1, :]
        # Read the last time stamp from the raw predictors. forecast() adds
        # one to it and then expands/standardizes the row itself, so the
        # value must not come from the already transformed design matrix.
        self.time = X.iloc[-1, X.columns.get_loc('time')]
        self.date = self.X.index[-1]
        self.make_model()

    def add_nonlinear_terms(self, X):
        """Return ``X`` extended with polynomial terms of the configured order."""
        df, var_names = add_polynomial_terms(X, list(X), self.params['ORDER'])
        return(df)

    def standardize(self):
        """Z-score ``self.X`` and ``self.Y`` and remember the statistics used."""
        self.X_mean = self.X.mean()
        self.Y_mean = self.Y.mean()
        self.X_std = self.X.std()
        self.Y_std = self.Y.std()
        self.X = (self.X - self.X_mean) / self.X_std
        self.Y = (self.Y - self.Y_mean) / self.Y_std

    def _transform_predictors(self, X):
        """Apply the configured expansion and standardization to raw predictors."""
        X1 = X
        if self.params['NONLIN_TYPE'] == 'POLY':
            X1 = self.add_nonlinear_terms(X1)
        if self.params['STANDARDIZE']:
            X1 = (X1 - self.X_mean) / self.X_std
        return X1

    def make_model(self):
        """Grid-search ``alpha`` and fit the final model on all training data.

        Sets ``self.model`` and ``self.predicted`` (in-sample predictions on
        the original target scale).
        """
        max_iter = 1000
        tol = 0.015
        l1_ratio = 0.8  # we want a relatively sparse model
        elastic = MultiTaskElasticNet(fit_intercept=True, max_iter=max_iter, tol=tol, l1_ratio=l1_ratio)
        # Note that we are assuming that errors are independent of each other
        # GIVEN THE PREDICTORS; otherwise cross validation won't be applicable.
        # We will perform a grid search to find best parameters
        print('################ Find hyper-parameter values#######################')
        search = GridSearchCV(estimator=elastic, param_grid={'alpha': np.logspace(-5, 2, 8)}, scoring='neg_mean_squared_error', n_jobs=1, refit=True, cv=10)
        search.fit(self.X, self.Y)
        # Now create a final elastic net model using the optimal hyper parameters
        print('################ Build final model ##############################')
        optimal_alpha = search.best_params_['alpha']
        self.model = MultiTaskElasticNet(fit_intercept=True, alpha=optimal_alpha, l1_ratio=l1_ratio, max_iter=max_iter, tol=tol)
        self.model.fit(self.X.values, self.Y.values)
        self.predicted = pd.DataFrame(index=self.Y.index, columns=self.Y.columns, data=self.model.predict(self.X.values))
        self.predicted = self.predicted * self.Y_std + self.Y_mean

    def predict(self, X, plot=False, Y_True=None, plot_list=None):
        """Predict targets for raw predictors ``X`` on the original scale.

        Args:
            X: raw predictors (same columns as the training ``X``).
            plot: if True, plot true vs predicted values; requires ``Y_True``.
            Y_True: true targets; when given, its index and columns label the
                result. When omitted, ``X``'s index and the training target
                names are used.
            plot_list: target columns to plot; defaults to every column.

        Returns:
            pandas.DataFrame of predictions.

        Raises:
            ValueError: if ``plot`` is True but ``Y_True`` is None.
        """
        if plot and Y_True is None:
            raise ValueError("Y_True is required when plot=True")

        X1 = self._transform_predictors(X)
        Y1 = self.model.predict(X1.values)

        # Y_True used to be dereferenced even when left at its default None.
        if Y_True is None:
            index, columns = X1.index, list(self.target_names)
        else:
            index, columns = Y_True.index, list(Y_True)
        dfY1 = pd.DataFrame(index=index, columns=columns, data=Y1)
        dfY1 = dfY1 * self.Y_std + self.Y_mean

        if plot:
            if plot_list is None:
                plot_list = columns
            X_ax = Y_True.index
            label_true = [l + '_True' for l in plot_list]
            label_pred = [l + '_Pred' for l in plot_list]
            plt.figure(figsize=(6, 4))
            plt.plot(X_ax, Y_True[plot_list], label=label_true)
            plt.plot(X_ax, dfY1[plot_list], label=label_pred)
            plt.legend(loc='best')
            plt.show()
        return(dfY1)

    def forecast(self):
        """Advance one period and return ``(pred, date)``.

        ``pred`` is the model output for the next period in the model's
        (possibly standardized) target space; it also becomes the new
        ``self.Y_final``. ``self.time`` and ``self.date`` are advanced.
        """
        # destandardize Y, this is needed to calculate the non linear term
        Xp = self.Y_final * self.Y_std + self.Y_mean
        Xp['time'] = self.time + 1
        dfp = pd.DataFrame(index=[self.date], columns=self.original_predictors, data=Xp.values.reshape(1, -1))
        # add the non linear terms and standardize, then predict
        dfp = self._transform_predictors(dfp)
        pred = self.model.predict(dfp.values)

        self.time = self.time + 1
        self.date = self.date + MonthEnd(1)
        df = pd.DataFrame(index=[self.date], columns=self.target_names, data=pred)
        self.Y_final = df
        return(pred, self.date)

    def multistep_forecast(self, steps):
        """Run ``forecast`` ``steps`` times; return the results on the original scale."""
        df = pd.DataFrame(columns=self.target_names)
        for i in range(steps):
            pred, date = self.forecast()
            print(pred.shape)
            df.loc[date, :] = np.multiply(pred, self.Y_std.values.reshape(1, -1)) + self.Y_mean.values.reshape(1, -1)
        return df

    def plot_coeffs(self):
        """Bar-plot the coefficients of the last target whose magnitude exceeds 1e-4."""
        C = self.model.coef_[-1, :]
        indexes = np.where(np.abs(C) > 0.0001)  # significant predictors
        C_sig = C[indexes[0]]
        preds_sig = [self.predictor_names[int(i)] for i in indexes[0]]
        f, ax = plt.subplots()
        f.set_size_inches((10, 2))
        ax.bar(range(len(C_sig)), C_sig)
        ax.set_xticks(range(len(C_sig)))
        ax.set_xticklabels(labels=preds_sig)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    def variable_importance(self, orig_var_names, labels):
        """Plot the relative error increase when each original variable is zeroed.

        For every name in ``orig_var_names`` all design-matrix columns whose
        name contains it are set to zero and the summed squared error against
        the training targets is recorded; the results are normalized by the
        largest value and plotted in increasing order using ``labels``.
        """
        all_preds = list(self.X)  # all predictors
        imp = []
        for v in orig_var_names:
            # NOTE(review): substring match -- a name that is a prefix of
            # another variable's name will also zero that variable's columns.
            v1 = [ap for ap in all_preds if v in ap]
            print(v1)
            X1 = self.X.copy()
            X1[v1] = 0
            Y1 = self.model.predict(X1)
            imp.append(np.sum((self.Y.values - Y1)**2))
        indexes = np.argsort(np.array(imp))
        preds1 = [labels[i] for i in indexes]
        imps1 = [imp[i] for i in indexes]
        imps1 = imps1 / np.max(imps1)
        # plot importance
        f, ax = plt.subplots()
        ax.barh(range(len(imp)), imps1)
        ax.set_yticks(range(len(imp)))
        ax.set_yticklabels(labels=preds1)
        ax.set_xlabel(xlabel='Importance', fontsize=12)
        plt.tight_layout()
        plt.show()
def test(test_id, dir, strength_scale, n_samples, num_features,
         num_instruments, num_treatments, num_outcomes):
    """Run one DeepGMM experiment against several benchmark estimators.

    Generates data with `get_data`, fits DeepGMM plus the direct polynomial,
    direct neural-net, two-stage (plain and polynomial) and Deep IV
    benchmarks, saves diagnostic figures into `dir`, and scores every
    estimator against the true response `tau_fn`.

    Returns:
        (on_p_dist, on_p_grid): two lists of scores, one on freshly drawn
        treatments and one on a regular grid, each ordered as
        [best, final, avg, deep_iv, 2sls_poly, 2sls, direct_poly, direct_nn].
        `num_outcomes` is accepted but not used.
    """
    def tau_fn(x, p):
        # True response; the many alternative forms tried previously (abs,
        # sin, step, sigmoid, cubic, ...) are no longer listed here.
        return (-1.5 * x + .9 * (x**2)) * p

    iv_strength = strength_scale * np.random.uniform(
        1., 1.1, size=(num_instruments, 1))

    degree_benchmarks = 3

    # Network parameters
    hidden_layers = [1000, 1000, 1000]

    # Generate data
    data_x, data_z, data_treatment, data_y = get_data(
        n_samples, num_instruments, iv_strength, tau_fn, num_features)
    # The features act both as additional instruments and as additional
    # treatments.
    data_z = np.concatenate((data_z, data_x), axis=1)
    data_p = np.concatenate((data_treatment, data_x), axis=1)
    num_instruments = num_features + num_instruments
    num_treatments = num_features + num_treatments
    for array in (data_p, data_z, data_y):
        print(array.shape)

    if num_instruments >= 2:
        def _panel(position, xs, ys, **scatter_kwargs):
            plt.subplot(1, 4, position)
            plt.scatter(xs, ys, **scatter_kwargs)
            plt.legend()

        plt.figure()
        _panel(1, data_z[:, 0], data_p[:, 0], label='p vs z1')
        _panel(2, data_z[:, 1], data_p[:, 0], label='p vs z2')
        _panel(3, data_p[:, 0], data_y)
        _panel(4, data_p[:, 1], data_y)
        plt.savefig(os.path.join(dir, 'data_{}.png'.format(test_id)))

    # We reset the whole graph
    dgmm = DeepGMM(
        n_critics=70,
        num_steps=200,
        store_step=5,
        learning_rate_modeler=0.01,
        learning_rate_critics=0.01,
        critics_jitter=True,
        dissimilarity_eta=0.0,
        cluster_type='kmeans',
        critic_type='Gaussian',
        critics_precision=None,
        min_cluster_size=200,  # num_trees=5,
        eta_hedge=0.16,
        bootstrap_hedge=False,
        l1_reg_weight_modeler=0.0,
        l2_reg_weight_modeler=0.0,
        dnn_layers=hidden_layers,
        dnn_poly_degree=1,
        log_summary=False,
        summary_dir='./graphs_monte')
    dgmm.fit(data_z, data_p, data_y)

    # Evaluation grid spanning the 10th-90th percentile of the treatments.
    test_min = np.percentile(data_p, 10)
    test_max = np.percentile(data_p, 90)
    grid_axis = np.linspace(test_min, test_max, 100)
    test_grid = np.array(
        list(itertools.product(grid_axis, repeat=num_treatments)))
    print(test_grid.shape)

    # Fresh sample, clipped to the same range as the grid.
    test_data_x, _, test_data_treatment, _ = get_data(
        5 * n_samples, num_instruments, iv_strength, tau_fn, num_features)
    test_data_p = np.concatenate((test_data_treatment, test_data_x), axis=1)
    print(test_data_p.shape)
    above_min = np.all((test_data_p > test_min), axis=1)
    below_max = np.all((test_data_p < test_max), axis=1)
    clip_edges = (above_min & below_max).flatten()
    test_data_p = test_data_p[clip_edges, :]
    test_data_treatment = test_data_treatment[clip_edges, :]
    test_data_x = test_data_x[clip_edges, :]
    print(test_data_p.shape)

    best_fn_grid = dgmm.predict(test_grid, model='best')
    final_fn_grid = dgmm.predict(test_grid, model='final')
    avg_fn_grid = dgmm.predict(test_grid, model='avg')
    best_fn_dist = dgmm.predict(test_data_p, model='best')
    final_fn_dist = dgmm.predict(test_data_p, model='final')
    avg_fn_dist = dgmm.predict(test_data_p, model='avg')

    ##################################
    # Benchmarks
    ##################################
    from sklearn.linear_model import LinearRegression, MultiTaskElasticNet, ElasticNet
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor

    # Direct regression of y on p: polynomial, then neural network.
    direct_poly = Pipeline([
        ('poly', PolynomialFeatures(degree=degree_benchmarks)),
        ('linear', LinearRegression()),
    ])
    direct_poly.fit(data_p, data_y.flatten())
    direct_poly_fn_grid = direct_poly.predict(test_grid)
    direct_poly_fn_dist = direct_poly.predict(test_data_p)

    direct_nn = MLPRegressor(hidden_layer_sizes=hidden_layers)
    direct_nn.fit(data_p, data_y.flatten())
    direct_nn_fn_grid = direct_nn.predict(test_grid)
    direct_nn_fn_dist = direct_nn.predict(test_data_p)

    # Two-stage benchmark on polynomial features.
    plf = PolynomialFeatures(degree=degree_benchmarks)
    sls_poly_first = MultiTaskElasticNet()
    sls_poly_first.fit(plf.fit_transform(data_z), plf.fit_transform(data_p))
    sls_poly_second = ElasticNet()
    sls_poly_second.fit(
        sls_poly_first.predict(plf.fit_transform(data_z)), data_y)
    sls_poly_fn_grid = sls_poly_second.predict(plf.fit_transform(test_grid))
    sls_poly_fn_dist = sls_poly_second.predict(plf.fit_transform(test_data_p))

    # Plain two-stage benchmark.
    sls_first = LinearRegression()
    sls_first.fit(data_z, data_p)
    sls_second = LinearRegression()
    sls_second.fit(sls_first.predict(data_z), data_y)
    sls_fn_grid = sls_second.predict(test_grid)
    sls_fn_dist = sls_second.predict(test_data_p)

    ######
    # Deep IV
    #####
    # We reset the whole graph
    with tf.name_scope("DeepIV"):
        deep_iv = deep_iv_fit(data_x, data_z, data_treatment, data_y,
                              epochs=10, hidden=hidden_layers)
        deep_iv_fn_grid = deep_iv.predict([test_grid[:, 1], test_grid[:, 0]])
        deep_iv_fn_dist = deep_iv.predict([test_data_x, test_data_treatment])

    def _save_surface(stem, values):
        plt.figure()
        plot_3d(test_grid, values)
        plt.savefig(os.path.join(dir, '{}_{}.png'.format(stem, test_id)))

    _save_surface(
        'true', tau_fn(test_grid[:, [1]], test_grid[:, [0]]).flatten())
    print(avg_fn_grid.shape)
    estimated_surfaces = (
        ('avg_fn', avg_fn_grid),
        ('best_fn', best_fn_grid),
        ('final_fn', final_fn_grid),
        ('deep_iv', deep_iv_fn_grid),
        ('sls_poly', sls_poly_fn_grid),
        ('sls', sls_fn_grid),
        ('direct_poly', direct_poly_fn_grid),
        ('direct_nn', direct_nn_fn_grid),
    )
    for stem, surface in estimated_surfaces:
        _save_surface(stem, surface.flatten())

    def mse_test(y_true, y_pred):
        # One minus mean squared error over the variance of the truth.
        truth = y_true.flatten()
        residual = y_pred.flatten() - truth
        return 1 - np.mean(residual**2) / np.var(truth)

    # Same estimator order for both score lists.
    dist_predictions = (best_fn_dist, final_fn_dist, avg_fn_dist,
                        deep_iv_fn_dist, sls_poly_fn_dist, sls_fn_dist,
                        direct_poly_fn_dist, direct_nn_fn_dist)
    grid_predictions = (best_fn_grid, final_fn_grid, avg_fn_grid,
                        deep_iv_fn_grid, sls_poly_fn_grid, sls_fn_grid,
                        direct_poly_fn_grid, direct_nn_fn_grid)

    truth_on_dist = tau_fn(test_data_x, test_data_treatment)
    on_p_dist = [mse_test(truth_on_dist, pred) for pred in dist_predictions]

    truth_on_grid = tau_fn(test_grid[:, [1]], test_grid[:, [0]])
    on_p_grid = [mse_test(truth_on_grid, pred) for pred in grid_predictions]

    return on_p_dist, on_p_grid
# Single-argument print call: same output as the Python 2 statement form,
# and valid syntax on Python 3 as well.
print("MultiTaskLasso %s" % (mtl.score(features_test, labels_test),))

######################################################################

# This part is used to calculate the Multi-Task Elastic-net's score when the
# hyper-parameter is optimal.

# load necessary libs
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only provide the original module path.
    from sklearn.cross_validation import train_test_split

# split the dataset to get the necessary sub-datasets
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)

# pre-process: dimensional reduction (SVD)
# The projection is learned on the training split only and then applied to
# both splits. Fitting a second SVD on the test split would place the test
# features in a different basis from the one the regressor is trained on.
svd1 = TruncatedSVD(n_components=9, random_state=1).fit(features_train)
features_train = svd1.transform(features_train)
features_test = svd1.transform(features_test)

# do regression
mte = MultiTaskElasticNet(alpha=0.000000001, l1_ratio=0.01, random_state=1)
mte.fit(features_train, labels_train)
print("MultiTaskElasticNet %s" % (mte.score(features_test, labels_test),))
##########################################################################

# All of the code ends here.
# Thank you!
def _make_tau_fn(func):
    """Return the true treatment-response function selected by ``func``.

    Supported names: 'abs', '2dpoly', 'sigmoid', 'sin', 'step', '3dpoly',
    'linear' and 'rand_pw'.

    Raises:
        ValueError: if ``func`` is not one of the supported names.
    """
    if func == 'rand_pw':
        # Drawn once so that every later evaluation uses the same function
        # (presumably a random piecewise-linear one, judging by the helper's
        # name -- confirm against generate_random_pw_linear).
        pw_linear = generate_random_pw_linear()

        def tau_fn(x):
            # pw_linear is applied element by element; the input's shape is
            # restored afterwards.
            return np.reshape(
                np.array([pw_linear(x_i) for x_i in x.flatten()]), x.shape)
        return tau_fn

    known = {
        'abs': lambda x: np.abs(x),
        '2dpoly': lambda x: -1.5 * x + .9 * (x**2),
        'sigmoid': lambda x: 2 / (1 + np.exp(-2 * x)),
        'sin': lambda x: np.sin(x),
        'step': lambda x: 1. * (x < 0) + 2.5 * (x >= 0),
        '3dpoly': lambda x: -1.5 * x + .9 * (x**2) + x**3,
        'linear': lambda x: x,
    }
    if func not in known:
        raise ValueError(
            "Unknown func {!r}; expected one of {}".format(
                func, sorted(list(known) + ['rand_pw'])))
    return known[func]


def test(test_id, dir, strength_scale=.5, n_samples=4000, num_instruments=2, num_treatments=1, num_outcomes=1, num_steps=100, jitter=True, n_critics=50, func='abs', radius=50, dgp_two=False):
    """Run one Monte Carlo experiment comparing DeepGMM against benchmarks.

    Generates data with ``get_data``, fits a ``DeepGMM`` model plus several
    benchmark estimators (direct polynomial regression, direct MLP,
    polynomial 2SLS, linear 2SLS and DeepIV), saves diagnostic plots, and
    scores every estimator against the true function ``tau_fn``.

    Files written into ``dir``: ``params_<test_id>.txt``,
    ``data_<test_id>.png`` (only when ``num_instruments >= 2``),
    ``deep_gmm_<test_id>.png`` and ``benchmarks_<test_id>.png``.

    Args:
        test_id: experiment id; used as the numpy seed, as the DeepGMM
            ``random_seed`` and in the output file names.
        dir: existing directory that receives the output files.
        strength_scale: instrument strength passed on to ``get_data``.
        n_samples: training sample size (the test draw uses 5x as many).
        num_instruments: number of instruments passed to ``get_data``.
        num_treatments: dimension of the evaluation grid. NOTE(review): the
            grid is reshaped to a single column before prediction, so only
            ``num_treatments == 1`` looks supported -- confirm.
        num_outcomes: not used in this function.
        num_steps: DeepGMM ``num_steps``.
        jitter: DeepGMM ``critics_jitter``.
        n_critics: DeepGMM ``n_critics``.
        func: name of the true function, see ``_make_tau_fn``.
        radius: DeepGMM ``min_cluster_size``.
        dgp_two: forwarded to ``get_data``.

    Returns:
        ``(on_p_dist, on_p_grid)``: two lists of scores, the first evaluated
        on freshly drawn treatments, the second on an evenly spaced grid.
        Both are ordered as [best, final, avg, deep_iv, 2sls_poly, 2sls,
        direct_poly, direct_nn].  Each score is ``1 - MSE / Var(y_true)``
        (an R^2-style score, higher is better) despite the ``mse`` naming.

    Raises:
        ValueError: if ``func`` is not a supported function name.
    """
    # Snapshot the arguments before any other local exists, so the printed
    # and the saved parameters are identical (and the saved ones do not
    # include the open file handle).
    params_msg = "Parameters: {}".format(locals())
    print(params_msg)
    with open(os.path.join(dir, "params_{}.txt".format(test_id)), 'w') as f:
        f.write(params_msg)

    # Seed first: 'rand_pw' and get_data draw from numpy's global generator.
    np.random.seed(test_id)
    tau_fn = _make_tau_fn(func)

    iv_strength = strength_scale
    degree_benchmarks = 3

    # Network parameters
    hidden_layers = [1000, 1000, 1000]

    # Generate data
    # NOTE(review): presumably z = instruments, p = treatment, y = outcome
    # -- confirm against get_data.
    data_z, data_p, data_y = get_data(
        n_samples, num_instruments, iv_strength, tau_fn, dgp_two)
    print(data_p.shape)
    print(data_z.shape)
    print(data_y.shape)

    if num_instruments >= 2:
        plt.figure()
        plt.subplot(1, 3, 1)
        plt.scatter(data_z[:, 0], data_p, label='p vs z1')
        plt.legend()
        plt.subplot(1, 3, 2)
        plt.scatter(data_z[:, 1], data_p, label='p vs z2')
        plt.legend()
        plt.subplot(1, 3, 3)
        plt.scatter(data_p, data_y, label='y vs p')
        plt.legend()
        plt.savefig(os.path.join(dir, 'data_{}.png'.format(test_id)))

    # We reset the whole graph
    dgmm = DeepGMM(n_critics=n_critics, num_steps=num_steps, store_step=5,
                   learning_rate_modeler=0.007, learning_rate_critics=0.007,
                   critics_jitter=jitter, dissimilarity_eta=0.0,
                   cluster_type='kmeans', critic_type='Gaussian',
                   critics_precision=None,
                   min_cluster_size=radius,  # num_trees=5,
                   eta_hedge=0.11, bootstrap_hedge=False,
                   l1_reg_weight_modeler=0.0, l2_reg_weight_modeler=0.0,
                   dnn_layers=hidden_layers, dnn_poly_degree=1,
                   log_summary=False, summary_dir='./graphs_monte',
                   display_step=20, random_seed=test_id)
    # The instrument columns are handed to the model in a random order.
    inst_inds = np.arange(num_instruments)
    np.random.shuffle(inst_inds)
    dgmm.fit(data_z[:, inst_inds], data_p, data_y)

    # Evaluate only between the 10th and 90th percentile of the observed
    # treatments.
    test_min = np.percentile(data_p, 10)
    test_max = np.percentile(data_p, 90)
    test_grid = np.array(list(itertools.product(
        np.linspace(test_min, test_max, 100), repeat=num_treatments)))
    print(test_grid.shape)

    _, test_data_p, _ = get_data(
        5 * n_samples, num_instruments, iv_strength, tau_fn, dgp_two)
    print(test_data_p.shape)
    clip_edges = ((test_data_p > test_min) & (
        test_data_p < test_max)).flatten()
    test_data_p = test_data_p[clip_edges, :]

    # Loop-invariant inputs and ground truth, computed once and reused by
    # every prediction, plot and score below.
    grid_col = test_grid.reshape(-1, 1)
    true_fn_grid = tau_fn(test_grid)
    true_fn_dist = tau_fn(test_data_p)

    best_fn_grid = dgmm.predict(grid_col, model='best')
    final_fn_grid = dgmm.predict(grid_col, model='final')
    avg_fn_grid = dgmm.predict(grid_col, model='avg')
    best_fn_dist = dgmm.predict(test_data_p, model='best')
    final_fn_dist = dgmm.predict(test_data_p, model='final')
    avg_fn_dist = dgmm.predict(test_data_p, model='avg')

    def plot_deep_gmm():
        # The three DeepGMM variants against the truth, on the current axes.
        plt.plot(test_grid, avg_fn_grid, label='AvgANN y=g(p)')
        plt.plot(test_grid, best_fn_grid, label='BestANN y=g(p)')
        plt.plot(test_grid, final_fn_grid, label='FinalANN y=g(p)')
        plt.plot(test_grid, true_fn_grid, label='true y=g(p)')
        plt.xlabel('Treatment')
        plt.ylabel('Outcome')
        plt.legend()

    ########################
    # Plot alone
    ########################
    plt.figure(figsize=(10, 10))
    plot_deep_gmm()
    plt.savefig(os.path.join(dir, 'deep_gmm_{}.png'.format(test_id)))

    ##################################
    # Benchmarks
    ##################################
    from sklearn.linear_model import LinearRegression, MultiTaskElasticNet, ElasticNet
    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.pipeline import Pipeline
    from sklearn.neural_network import MLPRegressor

    # Direct regressions of y on p (the instruments are not used).
    direct_poly = Pipeline([
        ('poly', PolynomialFeatures(degree=degree_benchmarks)),
        ('linear', LinearRegression())])
    direct_poly.fit(data_p, data_y.flatten())
    direct_poly_fn_grid = direct_poly.predict(grid_col)
    direct_poly_fn_dist = direct_poly.predict(test_data_p)

    direct_nn = MLPRegressor(hidden_layer_sizes=hidden_layers)
    direct_nn.fit(data_p, data_y.flatten())
    direct_nn_fn_grid = direct_nn.predict(grid_col)
    direct_nn_fn_dist = direct_nn.predict(test_data_p)

    # Polynomial 2SLS: the first stage maps polynomial features of z to
    # polynomial features of p; the second stage maps those to y.
    plf = PolynomialFeatures(degree=degree_benchmarks)
    z_poly = plf.fit_transform(data_z)
    sls_poly_first = MultiTaskElasticNet()
    sls_poly_first.fit(z_poly, plf.fit_transform(data_p))
    sls_poly_second = ElasticNet()
    sls_poly_second.fit(sls_poly_first.predict(z_poly), data_y)
    sls_poly_fn_grid = sls_poly_second.predict(plf.fit_transform(grid_col))
    sls_poly_fn_dist = sls_poly_second.predict(plf.fit_transform(test_data_p))

    # Linear 2SLS.
    sls_first = LinearRegression()
    sls_first.fit(data_z, data_p)
    sls_second = LinearRegression()
    sls_second.fit(sls_first.predict(data_z), data_y)
    sls_fn_grid = sls_second.predict(grid_col)
    sls_fn_dist = sls_second.predict(test_data_p)

    ######
    # Deep IV
    #####
    # We reset the whole graph
    with tf.name_scope("DeepIV"):
        deep_iv = deep_iv_fit(data_z, data_p, data_y,
                              epochs=100, hidden=hidden_layers)
        deep_iv_fn_grid = deep_iv.predict(grid_col)
        deep_iv_fn_dist = deep_iv.predict(test_data_p)

    def plot_against_truth(position, predictions, label):
        # One benchmark panel: the estimator's curve and the true function.
        plt.subplot(1, 7, position)
        plt.plot(test_grid, predictions, label=label)
        plt.plot(test_grid, true_fn_grid, label='true y=g(p)')
        plt.xlabel('Treatment')
        plt.ylabel('Outcome')
        plt.legend()

    plt.figure(figsize=(40, 10))
    plt.subplot(1, 7, 1)
    plot_deep_gmm()
    plot_against_truth(2, deep_iv_fn_grid, 'DeepIV')
    plot_against_truth(3, sls_poly_fn_grid, '2SLS_poly')
    plot_against_truth(4, sls_fn_grid, '2SLS')
    plot_against_truth(5, direct_poly_fn_grid, 'Direct poly')
    plot_against_truth(6, direct_nn_fn_grid, 'Direct ANN')
    plt.subplot(1, 7, 7)
    plt.scatter(data_p, data_y, color='blue', label='Data')
    plt.plot(test_grid, true_fn_grid, color='red', label='true y=g(p)')
    plt.xlabel('Treatment')
    plt.ylabel('Outcome')
    plt.legend()
    plt.savefig(os.path.join(dir, 'benchmarks_{}.png'.format(test_id)))

    def mse_test(y_true, y_pred):
        # 1 - MSE / Var(y_true): an R^2-style score, not a raw MSE.
        return 1 - np.mean((y_pred.flatten() - y_true.flatten())**2) / np.var(y_true.flatten())

    # Result order: best, final, avg, deep_iv, 2sls_poly, 2sls, direct_poly,
    # direct_nn.
    on_p_dist = [mse_test(true_fn_dist, pred) for pred in (
        best_fn_dist, final_fn_dist, avg_fn_dist, deep_iv_fn_dist,
        sls_poly_fn_dist, sls_fn_dist, direct_poly_fn_dist,
        direct_nn_fn_dist)]
    on_p_grid = [mse_test(true_fn_grid, pred) for pred in (
        best_fn_grid, final_fn_grid, avg_fn_grid, deep_iv_fn_grid,
        sls_poly_fn_grid, sls_fn_grid, direct_poly_fn_grid,
        direct_nn_fn_grid)]

    return on_p_dist, on_p_grid
# Synthetic multi-task regression demo: only the first `rel_f` features carry
# signal, and their coefficients vary sinusoidally across the tasks.
# NOTE(review): `rr` is presumably a numpy RandomState created earlier in the
# file -- confirm.  The order of the random draws below is kept as in the
# original so results stay reproducible.
n_samples = 100
n_features = 40
n_tasks = 12
rel_f = 7

coef = np.zeros((n_tasks, n_features))
times = np.linspace(0, 2 * np.pi, n_tasks)
for k in range(rel_f):
    _freq = 1.0 + rr.randn(1)   # drawn first
    _phase = 3 * rr.randn(1)    # drawn second
    coef[:, k] = np.sin(_freq * times + _phase)

X = rr.randn(n_samples, n_features)
_noise = rr.randn(n_samples, n_tasks)
y = X.dot(coef.T) + _noise

# Hold out the last 20 samples for evaluation.
X_train, X_test = X[:-20], X[-20:]
y_train, y_test = y[:-20], y[-20:]

print("Fitting Elastic Net model...")
ll = ElasticNet(alpha=0.45)
ll.fit(X_train, y_train)
_ll_test_pred = ll.predict(X_test)
print("R2 score: {0}".format(r2_score(y_test, _ll_test_pred)))

print("Fitting Multitask Elastic Net model...")
ml = MultiTaskElasticNet(alpha=0.45)
ml.fit(X_train, y_train)
_ml_test_pred = ml.predict(X_test)
print("R2 score: {0}".format(r2_score(y_test, _ml_test_pred)))

print("Plotting predictions...")
# Feature 1 against task 1: the data first, then each model's predictions.
_feature_1 = X[:, 1]
plt.scatter(_feature_1, y[:, 1])
for _model, _color in ((ll, "blue"), (ml, "red")):
    plt.scatter(_feature_1, _model.predict(X)[:, 1], color=_color)
plt.show()
#this part is used to calculate the Multi-Task Elastic-net's score when the hyper-parameter is optimal #load necessary libs from sklearn.feature_selection import SelectKBest from sklearn.decomposition import TruncatedSVD from sklearn.linear_model import MultiTaskElasticNet from sklearn.cross_validation import train_test_split #splite dataset to get necessary sub-dataset features_train, features_test, labels_train, labels_test = train_test_split(features_sc,label_scm,test_size=0.33,random_state=42) #pre-process: dimensional reduction(SVD) svd1 = TruncatedSVD(n_components=9,random_state=1).fit(features_train) features_train = svd1.transform(features_train) svd2 = TruncatedSVD(n_components=9,random_state=1).fit(features_test) features_test = svd2.transform(features_test) #do regression mte = MultiTaskElasticNet(alpha=0.000000001,l1_ratio=0.01,random_state=1) mte.fit(features_train,labels_train) print "MultiTaskElasticNet",mte.score(features_test,labels_test) ########################################################################## #All of the codes end. #Thank you!
print "测试集得分:", multiTaskLassoCV.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, multiTaskLassoCV.predict(X)) print "TSS(Total Sum of Squares): ", tss print "RSS(Residual Sum of Squares): ", rss print "ESS(Explained Sum of Squares): ", ess print "R^2: ", r2 print "\n**********测试MultiTaskElasticNet类**********" # 在初始化MultiTaskElasticNet类时, 指定超参数α和ρ, 默认值分别是1.0和0.5. multiTaskElasticNet = MultiTaskElasticNet(alpha=0.01, l1_ratio=0.7) # 拟合训练集 multiTaskElasticNet.fit(train_X, train_Y) # 打印模型的系数 print "系数:", multiTaskElasticNet.coef_ print "截距:", multiTaskElasticNet.intercept_ print '训练集R2: ', r2_score(train_Y, multiTaskElasticNet.predict(train_X)) # 对于线性回归模型, 一般使用均方误差(Mean Squared Error,MSE)或者 # 均方根误差(Root Mean Squared Error,RMSE)在测试集上的表现来评该价模型的好坏. test_Y_pred = multiTaskElasticNet.predict(test_X) print "测试集得分:", multiTaskElasticNet.score(test_X, test_Y) print "测试集MSE:", mean_squared_error(test_Y, test_Y_pred) print "测试集RMSE:", np.sqrt(mean_squared_error(test_Y, test_Y_pred)) print "测试集R2:", r2_score(test_Y, test_Y_pred) tss, rss, ess, r2 = xss(Y, multiTaskElasticNet.predict(X)) print "TSS(Total Sum of Squares): ", tss