def main():
    """Fit a MultiTaskElasticNet on the training file and write a submission.

    Steps, in order:

    1. Load ``train_file`` and extract the ``feature_names`` features.
    2. Encode every ``OpenStatus`` value as its index in ``ques_status``.
    3. Fit the model, predict on ``test_file`` and reshape the predictions
       to five columns.
    4. When ``is_full_train_set`` is 0, pass the predictions through
       ``cu.cap_and_update_priors`` using the priors of ``train_file`` and
       ``full_train_file``.
    5. Write the result to ``submission_file``.

    Side effects: rebinds the module globals ``fea``, ``y`` and ``probs``,
    prints progress messages and writes the submission file.

    All ``print`` calls take a single argument, so the function behaves the
    same under Python 2 and Python 3.
    """
    global fea, y, probs

    start = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): `rho` is kept exactly as written; later scikit-learn
    # releases appear to call this keyword `l1_ratio` -- confirm against the
    # installed version before upgrading.
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)

    # Must be array type object. Strings must be converted to
    # integer values, otherwise fit method raises ValueError.
    y = []
    print("Collecting statuses")
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print("Fitting")
    mten.fit(fea, y)

    # Make sure you have the up to date version of sklearn; v0.12 has the
    # predict_proba method; http://scikit-learn.org/0.11/install.html

    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    # Floor division keeps the row count an integer on Python 3 as well;
    # a float in the shape tuple makes np.resize fail there.
    probs = np.resize(probs, (len(probs) // 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)

    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
def test_enet_float_precision():
    """Check that float32 and float64 fits agree to four decimals.

    For every combination of ``normalize`` and ``fit_intercept`` the test
    fits ``ElasticNet`` (with and without a precomputed Gram matrix) and
    ``MultiTaskElasticNet`` on float64 and float32 casts of the same data.
    It asserts that the fitted coefficients keep the input dtype and that
    the two precisions produce almost equal coefficients and intercepts.
    """
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                # Cast from the untouched dataset on every pass. Rebinding
                # X/y here would make later float64 runs start from data
                # that had already been rounded to float32.
                X_cast = dtype(X)
                y_cast = dtype(y)

                clf = ElasticNet(alpha=0.5, max_iter=100, precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)
                ignore_warnings(clf.fit)(X_cast, y_cast)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert clf.coef_.dtype == dtype

                # test precompute Gram array
                Gram = X_cast.T.dot(X_cast)
                clf_precompute = ElasticNet(alpha=0.5, max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X_cast, y_cast)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y_cast[:, np.newaxis],
                                     y_cast[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5, max_iter=100, fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X_cast, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                # Check the multi-task estimator itself; the single-task
                # one was already checked above.
                assert clf_multioutput.coef_.dtype == dtype

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)
def main():
    """Run the train / predict / write-submission pipeline once.

    The training frame is read from ``train_file``; its ``OpenStatus``
    column is turned into integer labels (position of the status inside
    ``ques_status``) and a ``MultiTaskElasticNet`` is fitted on the
    extracted features. Predictions for ``test_file`` are reshaped to five
    columns, optionally re-weighted with ``cu.cap_and_update_priors`` when
    ``is_full_train_set`` is 0, and written to ``submission_file``.

    The module globals ``fea``, ``y`` and ``probs`` are rebound so they can
    be inspected after the run. Progress is reported with single-argument
    ``print`` calls, which work identically on Python 2 and Python 3.
    """
    global fea, y, probs

    started_at = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): the `rho` keyword is preserved from the original call;
    # newer scikit-learn versions seem to expect `l1_ratio` -- verify.
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)

    # Must be array type object. Strings must be converted to
    # integer values, otherwise fit method raises ValueError.
    print("Collecting statuses")
    y = []
    for element in data["OpenStatus"]:
        for index, status in enumerate(ques_status):
            if element == status:
                y.append(index)

    print("Fitting")
    mten.fit(fea, y)

    # Make sure you have the up to date version of sklearn; v0.12 has the
    # predict_proba method; http://scikit-learn.org/0.11/install.html

    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # shape of probs is [n_samples]
    # convert probs to shape [n_samples,n_classes]
    # `//` instead of `/`: true division would hand np.resize a float
    # dimension on Python 3.
    n_rows = len(probs) // 5
    probs = np.resize(probs, (n_rows, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)

    elapsed = time.time() - started_at
    print("completed in %0.4f seconds" % elapsed)
class MultiTaskElasticNetImpl:
    """Thin adapter that forwards ``fit`` and ``predict`` to an ``Op`` instance."""

    def __init__(self, **hyperparams):
        """Store the keyword hyperparameters and build the wrapped ``Op``."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model and return ``self``.

        ``y`` is only forwarded when it is not ``None``.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's prediction for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)
def test_convergence_warnings():
    """Check when MultiTaskElasticNet emits a ConvergenceWarning.

    A fit limited to one iteration with ``tol=0`` must warn; a fit with
    ``max_iter=1000`` must finish without emitting any warning at all.
    """
    # Imported locally so the test does not depend on a module-level import.
    import warnings

    random_state = np.random.RandomState(0)
    X = random_state.standard_normal((1000, 500))
    y = random_state.standard_normal((1000, 3))

    # check that the model fails to converge
    with pytest.warns(ConvergenceWarning):
        MultiTaskElasticNet(max_iter=1, tol=0).fit(X, y)

    # check that the model converges w/o warnings
    # `pytest.warns(None)` is deprecated and rejected by recent pytest
    # releases, so record warnings with the standard library instead.
    # "always" makes sure nothing is hidden by an already-triggered filter.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        MultiTaskElasticNet(max_iter=1000).fit(X, y)
    assert not record
def test_multi_task_lasso_and_enet():
    """Fit both multi-task estimators on a duplicated target.

    With the same target in both columns, each estimator must reach a small
    positive dual gap and learn identical coefficient rows. A final
    single-iteration elastic net must emit a 'did not converge' warning.
    """
    X, y, X_test, y_test = build_dataset()
    Y = np.c_[y, y]

    for estimator_cls in (MultiTaskLasso, MultiTaskElasticNet):
        fitted = estimator_cls(alpha=1, tol=1e-8).fit(X, Y)
        assert 0 < fitted.dual_gap_ < 1e-5
        assert_array_almost_equal(fitted.coef_[0], fitted.coef_[1])

    single_pass = MultiTaskElasticNet(alpha=1.0, tol=1e-8, max_iter=1)
    assert_warns_message(ConvergenceWarning, 'did not converge',
                         single_pass.fit, X, Y)
def train_base_model(X_train, X_valid, Y_train, Y_valid, norms_X, norms_Y,
                     model='rf'):
    """Fit one of three regressors and report train/validation errors.

    ``model`` selects the regressor: ``'rf'`` (random forest), ``'elastic'``
    (multi-task elastic net) or ``'knn'`` (k-nearest neighbours). Any other
    value raises ``ValueError``. After fitting, inputs, targets and
    reconstructions are passed through ``_correct_data`` before the errors
    are computed.

    Returns the six corrected arrays followed by
    ``np.array([train_mae, train_mse, val_mae, val_mse])``.
    """
    # Factories keep construction lazy: only the selected regressor is built.
    candidates = (
        ('rf', lambda: RandomForestRegressor(max_features=0.3,
                                             n_estimators=200, n_jobs=3)),
        ('elastic', lambda: MultiTaskElasticNet(alpha=0.003, l1_ratio=0.7)),
        ('knn', lambda: KNeighborsRegressor(2, weights='distance')),
    )
    for name, build in candidates:
        if model == name:
            predictor = build()
            break
    else:
        raise ValueError('{} is not a valid model!'.format(model))

    predictor.fit(X_train, Y_train)
    recon_train = predictor.predict(X_train)
    recon_valid = predictor.predict(X_valid)

    corrected = _correct_data(X_train, X_valid, Y_train, Y_valid,
                              recon_train, recon_valid, norms_X, norms_Y)
    X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid = corrected

    train_residual = Y_train - recon_train
    valid_residual = Y_valid - recon_valid
    errors = np.array([
        np.average(np.absolute(train_residual)),
        np.average(np.square(train_residual)),
        np.average(np.absolute(valid_residual)),
        np.average(np.square(valid_residual)),
    ])
    return X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid, errors
def train_base_model(X_train, X_valid, Y_train, Y_valid, norms_X, norms_Y,
                     model='rf'):
    """Train the requested base regressor and score its reconstructions.

    Parameters
    ----------
    X_train, X_valid, Y_train, Y_valid
        Training / validation inputs and targets handed to the regressor.
    norms_X, norms_Y
        Forwarded unchanged to ``_correct_data``.
    model : {'rf', 'elastic', 'knn'}
        Which regressor to build; anything else raises ``ValueError``.

    Returns
    -------
    tuple
        The six arrays returned by ``_correct_data`` followed by an array
        holding train MAE, train MSE, validation MAE and validation MSE.
    """

    def _build_predictor():
        # random regression forest
        if model == 'rf':
            return RandomForestRegressor(max_features=0.3, n_estimators=200,
                                         n_jobs=3)
        # elastic net (note: the elastic net results are not included in the
        # manuscript - it did not perform as well as the random regression
        # forest)
        if model == 'elastic':
            return MultiTaskElasticNet(alpha=0.003, l1_ratio=0.7)
        # k-nearest neighbours
        if model == 'knn':
            return KNeighborsRegressor(2, weights='distance')
        raise ValueError('{} is not a valid model!'.format(model))

    predictor = _build_predictor()
    predictor.fit(X_train, Y_train)

    recon_train = predictor.predict(X_train)
    recon_valid = predictor.predict(X_valid)

    (X_train, X_valid, Y_train, Y_valid,
     recon_train, recon_valid) = _correct_data(
        X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid,
        norms_X, norms_Y)

    metrics = []
    for truth, recon in ((Y_train, recon_train), (Y_valid, recon_valid)):
        metrics.append(np.average(np.absolute(truth - recon)))
        metrics.append(np.average(np.square(truth - recon)))

    return (X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid,
            np.array(metrics))
def getModel(self, _params):
    """Build a MultiTaskElasticNet from the entries of ``_params``.

    ``_params`` must provide every key listed below; a missing key raises
    ``KeyError``.
    """
    wanted = ('alpha', 'l1_ratio', 'fit_intercept', 'normalize', 'copy_X',
              'selection')
    kwargs = {key: _params[key] for key in wanted}
    return MultiTaskElasticNet(**kwargs)
def test_random_descent():
    """Check that cyclic and random coordinate selection agree.

    The models are run to tight tolerance (``tol=1e-8``) on dense, transposed,
    sparse and multi-output inputs, and the fitted coefficients and
    intercepts of both selection modes are compared. An invalid ``selection``
    value must make ``fit`` raise ``ValueError``.
    """

    def _compare(estimator_cls, make_input, target):
        # `make_input` is called once per fit so each estimator receives its
        # own freshly built input object.
        cyclic = estimator_cls(selection='cyclic', tol=1e-8)
        cyclic.fit(make_input(), target)
        randomized = estimator_cls(selection='random', tol=1e-8,
                                   random_state=42)
        randomized.fit(make_input(), target)
        assert_array_almost_equal(cyclic.coef_, randomized.coef_)
        assert_almost_equal(cyclic.intercept_, randomized.intercept_)

    X, y, _, _ = build_dataset(n_samples=50, n_features=20)

    # This uses the coordinate descent algo using the gram trick.
    _compare(ElasticNet, lambda: X, y)

    # This uses the descent algo without the gram trick
    _compare(ElasticNet, lambda: X.T, y[:20])

    # Sparse Case
    _compare(ElasticNet, lambda: sparse.csr_matrix(X), y)

    # Multioutput case.
    new_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
    _compare(MultiTaskElasticNet, lambda: X, new_y)

    # Raise error when selection is not in cyclic or random.
    invalid = ElasticNet(selection='invalid')
    assert_raises(ValueError, invalid.fit, X, y)
def test_model_multi_task_elasticnet(self):
    """Convert a fitted two-target MultiTaskElasticNet and dump the result."""
    model, X = fit_regression_model(MultiTaskElasticNet(), n_targets=2)

    # One float input whose second dimension equals the number of columns
    # in X.
    n_features = X.shape[1]
    initial_types = [("input", FloatTensorType([None, n_features]))]

    model_onnx = convert_sklearn(model, "multi-task elasticnet",
                                 initial_types)
    self.assertIsNotNone(model_onnx)

    dump_data_and_model(X, model, model_onnx, verbose=False,
                        basename="SklearnMultiTaskElasticNet-Dec4")
def train_metaregressor(stack_path, train, labels, run_sequence, scale_data,
                        models, predict_mode_all, full=True, verbose=False):
    """Train the level-2 linear metaregressor(s) and return them in a list.

    Parameters
    ----------
    train, labels
        Objects exposing ``.values`` (2-D arrays) and, for ``train``,
        ``.shape``; the independent and dependent variables.
    models
        Level-1 models; only ``len(models)`` is used, to select columns.
    predict_mode_all : bool
        True: fit one ``MultiTaskElasticNet`` on all targets at once.
        False: fit one ``ElasticNet`` per entry of ``TRAIN_COLS``.
    full : bool
        Only affects the suffix (``"_30"`` / ``"_8"``) in printed banners.
    stack_path, run_sequence, scale_data, verbose
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    list
        One fitted estimator in all-in-one mode, otherwise one
        independently fitted estimator per target column.

    Notes
    -----
    Each per-target regressor is a fresh ``ElasticNet``. Refitting a single
    shared instance would leave every list entry pointing at the same
    object, which only remembers the last target it was fitted on.
    """
    model_suffix = "_30" if full else "_8"
    rule = "=" * 50
    print("".join(["\n", rule, "\nTraining Metaregressor", model_suffix,
                   " (Level 2)\n", rule, "\n"]))

    # Hyperparameters shared by every metaregressor.
    model_kwargs = dict(random_state=42, max_iter=1000, l1_ratio=1.0,
                        alpha=0.1)

    print('Training linear metaregressors for %d models and %d total '
          'independent variables.\n' % (len(models), train.shape[1]))

    train_values = train.values
    label_values = labels.values
    reg_models, rmse = [], []

    if predict_mode_all:
        print("// MODE: All-in-One Pass //\n")
        model = MultiTaskElasticNet(**model_kwargs)
        model.fit(train_values, label_values)
        rmse = [np.sqrt(mean_squared_error(
            y_true=label_values, y_pred=model.predict(train_values)))]
        reg_models.append(model)
    else:
        print("// MODE: One-at-a-Time //\n")
        # NOTE(review): the column stride is hard-coded to 30 even when
        # `full` is False -- presumably the number of outputs per level-1
        # model; confirm it should not follow len(TRAIN_COLS).
        stride = 30
        # iterate and build a model over all dependent variables
        for f in range(len(TRAIN_COLS)):
            # get the list of values to predict, column-wise
            predict_me = label_values[:, f]

            # build the list of independent variables: column f of every
            # level-1 model's block of predictions
            train_me = np.hstack([
                train_values[:, i].reshape(-1, 1)
                for i in range(f, stride * len(models) + f, stride)
            ])

            # fit a dedicated estimator and store it in reg_models
            model = ElasticNet(**model_kwargs)
            model.fit(train_me, predict_me)
            reg_models.append(model)

            score = np.sqrt(mean_squared_error(
                y_true=predict_me, y_pred=model.predict(train_me)))
            rmse.append(score)
            print("Metaregressor #%d of %d trained for feature '%s'; "
                  "RMSE was: %.5f"
                  % ((f + 1), len(TRAIN_COLS), TRAIN_COLS[f], score))

    print("\nAll metaregressors trained; average RMSE: %.5f" % np.mean(rmse))
    print("".join(["\n", rule, "\nMetaregressor", model_suffix,
                   " Training Complete\n", rule, "\n"]))

    return reg_models
def get_hyperparameters_model():
    """Return the search configuration for the multi-task elastic net.

    The result maps ``'multi_task_elastic_net'`` to a dict holding a default
    ``MultiTaskElasticNet`` under ``'model'`` and an empty
    ``'param_distributions'`` dict.
    """
    return {
        'multi_task_elastic_net': {
            'model': MultiTaskElasticNet(),
            'param_distributions': {},
        }
    }
def make_model(self):
    """Grid-search ``alpha`` and fit the final multi-task elastic net.

    Reads ``self.X``, ``self.Y``, ``self.Y_std`` and ``self.Y_mean``; sets
    ``self.model`` (the final fitted estimator) and ``self.predicted`` (its
    in-sample predictions multiplied by ``Y_std`` and shifted by ``Y_mean``).
    """
    # Settings shared by the search estimator and the final model.
    # l1_ratio of 0.8: we want a relatively sparse model.
    fixed = dict(fit_intercept=True, max_iter=1000, tol=0.015, l1_ratio=0.8)
    elastic = MultiTaskElasticNet(**fixed)

    # Note that we are assuming that errors are independent of each other
    # GIVEN THE PREDICTORS; otherwise cross validation won't be applicable.
    # A grid search is used to find the best alpha.
    print(
        '################ Find hyper-parameter values#######################'
    )
    alpha_grid = {'alpha': np.logspace(-5, 2, 8)}
    search = GridSearchCV(estimator=elastic,
                          param_grid=alpha_grid,
                          scoring='neg_mean_squared_error',
                          n_jobs=1,
                          refit=True,
                          cv=10)
    search.fit(self.X, self.Y)

    # Now create a final elastic net model using the optimal alpha.
    print(
        '################ Build final model ##############################'
    )
    best_alpha = search.best_params_['alpha']
    self.model = MultiTaskElasticNet(alpha=best_alpha, **fixed)
    self.model.fit(self.X.values, self.Y.values)

    in_sample = self.model.predict(self.X.values)
    self.predicted = pd.DataFrame(index=self.Y.index,
                                  columns=self.Y.columns,
                                  data=in_sample)
    self.predicted = self.predicted * self.Y_std + self.Y_mean
def make_dictionary(X, n_components=20, alpha=5., write_dir='/tmp/',
                    contrasts=None, method='multitask', l1_ratio=.5,
                    n_subjects=13):
    """Create dictionary + encoding.

    Parameters
    ----------
    X : 2-D array
        Data to encode; it is indexed by column and ``X.T`` is handed to the
        scikit-learn encoders.
    n_components : int
        Number of dictionary components.
    alpha : float
        Regularisation strength passed to the selected encoder.
    write_dir : str
        Directory used for the ``Memory`` cache and for ``dictionary.npz``.
    contrasts : sequence, optional
        Stored alongside the dictionary in ``dictionary.npz``. ``None``
        (the default) is saved as an empty list.
    method : {'online', 'sparse', 'multitask'}
        Encoding strategy.
    l1_ratio : float
        Only used by the ``'multitask'`` method.
    n_subjects : int
        Only used by the ``'multitask'`` method, to group the columns of
        ``X`` that belong to the same voxel.

    Returns
    -------
    dictionary, components

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported values. The check runs
        before any work is done or any file is written.
    """
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.linear_model import MultiTaskElasticNet

    known_methods = ('online', 'sparse', 'multitask')
    if method not in known_methods:
        raise ValueError('unknown method %r; expected one of %s'
                         % (method, ', '.join(known_methods)))
    if contrasts is None:
        contrasts = []

    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    dictionary_path = os.path.join(write_dir, 'dictionary.npz')
    np.savez(dictionary_path, loadings=dictionary, contrasts=contrasts)

    if method == 'online':
        components, dictionary = dict_learning_online(
            X.T, n_components, alpha=alpha, dict_init=dictionary,
            batch_size=200, method='cd', return_code=True, shuffle=True,
            n_jobs=1, positive_code=True)
        # The dictionary was refined, so overwrite the saved copy.
        np.savez(dictionary_path, loadings=dictionary, contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(
            X.T, dictionary, alpha=alpha, max_iter=10, n_jobs=1,
            check_input=True, verbose=0, positive=True)
    else:  # method == 'multitask'
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            # Columns i, i + n_voxels, ... are fitted jointly as the tasks
            # of one multi-task problem.
            voxel_columns = slice(i, i + n_subjects * n_voxels, n_voxels)
            x = X[:, voxel_columns]
            components[voxel_columns] = clf.fit(dictionary.T, x).coef_

    return dictionary, components
def mtelastic_model(self, X_train, y_train, X_test, y_test):
    """Fit a multi-task elastic net and print its train/test scores.

    Prints, in order: the estimator's ``score`` on the training and test
    sets, then the mean squared errors, then the R^2 values.
    """
    # Multi-task Elastic-Net Regression Model
    regressor = MultiTaskElasticNet(alpha=.1918)
    regressor.fit(X_train, y_train)

    pred_train = regressor.predict(X_train)
    pred_test = regressor.predict(X_test)

    # The model can be scored either with sklearn's .score or with the
    # MSE / R^2 from the Machine Learning Book; both are printed.
    for features, target in ((X_train, y_train), (X_test, y_test)):
        print(regressor.score(features, target))

    mse_train = mean_squared_error(y_train, pred_train)
    mse_test = mean_squared_error(y_test, pred_test)
    print('MSE train: %.6f, MSE test: %.6f' % (mse_train, mse_test))

    r2_train = r2_score(y_train, pred_train)
    r2_test = r2_score(y_test, pred_test)
    print('R^2 train: %.6f, R^2 test: %.6f' % (r2_train, r2_test))
def get_regressors_multitask(nmodels='all'):
    """Return one or all of the multi-task linear regressors.

    Parameters
    ----------
    nmodels : 'all', int or str
        ``'all'`` returns every regressor. ``1`` (or ``'1'``) selects
        ``MultiTaskElasticNet`` and ``2`` (or ``'2'``) selects
        ``MultiTaskLasso``.

    Returns
    -------
    list
        A list of estimator instances; it has a single element when one
        regressor is selected.

    Raises
    ------
    ValueError
        If ``nmodels`` is neither ``'all'`` nor a valid regressor number.
    """
    # 1. MultiTaskElasticNet
    lr1 = MultiTaskElasticNet()
    # 2. MultiTaskLasso
    lr2 = MultiTaskLasso()

    if nmodels == 'all':
        return [lr1, lr2]

    # Look the estimator up by number rather than returning the *name*
    # 'lr<N>' as a string, which callers could not fit.
    by_number = {1: lr1, 2: lr2}
    try:
        number = int(nmodels)
    except (TypeError, ValueError):
        number = None
    if number not in by_number:
        raise ValueError("nmodels must be 'all', 1 or 2; got %r" % (nmodels,))
    return [by_number[number]]
def train_ElasticNet(trainx, trainy, testx=None, testy=None, results=None):
    """Fit an elastic-net model suited to the outcome type and record scores.

    The module-level ``Y`` object selects the model:

    * ``Y.multiclass``: ``SGDClassifier`` with an elastic-net penalty;
      records the test balanced accuracy.
    * ``Y.multioutcome``: ``MultiTaskElasticNet``; records R^2 and mean
      absolute error per outcome column.
    * otherwise: ``ElasticNet``; records R^2 and mean absolute error.

    Scores are appended to ``results[<metric>][<outcome name>]``, so
    ``results`` must already hold those lists. ``testx`` and ``testy`` are
    used in every branch despite defaulting to ``None``.

    Returns
    -------
    results, net, best_output, output_names
        ``best_output`` is ``[trainp, trainy, testp, testy]`` and
        ``output_names`` labels its entries.
    """
    print('\nTraining ElasticNet...')
    if Y.multiclass:
        net = SGDClassifier(penalty='elasticnet', l1_ratio=.5,
                            random_state=seed)
        net.fit(trainx, trainy)
        testp = net.predict(testx)
        # Needed for `best_output` below; without it this branch raised
        # UnboundLocalError when building the return value.
        trainp = net.predict(trainx)
        t_bacc = balanced_accuracy_score(testy, testp)
        outcome = Y.outcome_names[0]
        results['test_balanced_accuracy'][outcome].append(t_bacc)
    elif Y.multioutcome:
        net = MultiTaskElasticNet(random_state=seed)
        net.fit(trainx, trainy)
        testp = net.predict(testx)
        trainp = net.predict(trainx)
        for i, outcome in enumerate(Y.outcome_names):
            t_r2 = r2_score(testy[:, i], testp[:, i])
            t_mae = mean_absolute_error(testy[:, i], testp[:, i])
            results['test_r2_sklearn'][outcome].append(t_r2)
            results['test_mean_absolute_error'][outcome].append(t_mae)
    else:
        net = ElasticNet(random_state=seed)
        net.fit(trainx, trainy)
        testp = net.predict(testx)
        trainp = net.predict(trainx)
        t_r2 = r2_score(testy, testp)
        t_mae = mean_absolute_error(testy, testp)
        outcome = Y.outcome_names[0]
        results['test_r2_sklearn'][outcome].append(t_r2)
        results['test_mean_absolute_error'][outcome].append(t_mae)

    best_output = [trainp, trainy, testp, testy]
    output_names = ['trainp', 'trainy', 'testp', 'testy']
    return results, net, best_output, output_names
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """Fit a multi-task elastic net, optionally tuning it by cross validation.

    Unless ``skip_cross_validation`` is true, ``alpha`` and ``l1_ratio`` are
    replaced by the values selected by a 5-fold ``MultiTaskElasticNetCV``.
    The final model is then fitted on the full training data.

    Assumes features are scaled/normalized and that ``train_labels`` has
    more than one column.

    Returns ``(model, {'alpha': ..., 'l1_ratio': ...})`` with the
    parameters actually used.
    """
    # Solver settings shared by the CV search and the final fit.
    solver_settings = dict(normalize=False, max_iter=10000, tol=0.0005)
    chosen = {'alpha': alpha, 'l1_ratio': l1_ratio}

    if not skip_cross_validation:
        # use 5 fold cross validation
        l1_ratio_grid = [
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ]
        searcher = MultiTaskElasticNetCV(l1_ratio=l1_ratio_grid,
                                         cv=5,
                                         n_alphas=num_alphas,
                                         n_jobs=num_jobs,
                                         **solver_settings)
        searcher.fit(train_features, train_labels)
        chosen = {'alpha': searcher.alpha_, 'l1_ratio': searcher.l1_ratio_}

    model = MultiTaskElasticNet(alpha=chosen['alpha'],
                                l1_ratio=chosen['l1_ratio'],
                                **solver_settings)
    model.fit(train_features, train_labels)
    return (model, chosen)
def regressor_creator(indata, outdata):
    """Return an unfitted MultiTaskElasticNet capped at 3000 iterations.

    ``indata`` and ``outdata`` are accepted but not used.
    """
    regressor = MultiTaskElasticNet(max_iter=3000)
    return regressor
# This part calculates the Multi-Task Elastic-Net's score when the
# hyper-parameters are optimal.

# Load necessary libs.
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import train_test_split

# Split the dataset to get the necessary sub-datasets.
features_train, features_test, labels_train, labels_test = train_test_split(
    features_sc, label_scm, test_size=0.33, random_state=42)

# Pre-process: dimensional reduction (SVD).
# The projection is learned on the training split only and then applied to
# both splits. Fitting a second SVD on the test split would put the two
# feature sets in different bases and let test data shape its own features.
svd1 = TruncatedSVD(n_components=9, random_state=1).fit(features_train)
features_train = svd1.transform(features_train)
features_test = svd1.transform(features_test)

# Do regression.
mte = MultiTaskElasticNet(alpha=0.000000001, l1_ratio=0.01, random_state=1)
mte.fit(features_train, labels_train)
# Single-argument print: same output on Python 2 and Python 3.
print("MultiTaskElasticNet %s" % mte.score(features_test, labels_test))
##########################################################################
# All of the codes end.
# Thank you!
def GetAllModelsForComparison(X_train, Y_train):
    """Return a dict mapping class names to default-constructed instances.

    ``X_train`` and ``Y_train`` are accepted but not used.

    Every key appears exactly once. Repeated keys in a dict literal are
    silently collapsed by Python (the last value wins, in the position of
    the first occurrence), so listing each name once yields the same keys in
    the same order without constructing throwaway objects.

    Classes deliberately left out (they were disabled in the original
    listing): Huber, RandomizedLasso, RandomizedLogisticRegression,
    BallTree, DistanceMetric, KDTree, LSHForest, GaussianProcess, ABCMeta,
    BaseDiscreteNB, BaseNB, BaseEnsemble, VotingClassifier,
    OneVsOneClassifier, OneVsRestClassifier, OutputCodeClassifier,
    ClassifierChain, MultiOutputClassifier, MultiOutputEstimator,
    MultiOutputRegressor.
    """
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        'Parallel': Parallel(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor(),
    }
    return models
def test_check_estimator():
    """Check that ``check_estimator`` rejects deliberately broken estimators.

    Each section feeds ``check_estimator`` an estimator with one specific
    flaw and asserts on the exception type and message it produces. The
    final calls confirm that real estimators pass without error.
    """
    # tests that the estimator actually fails on "bad" estimators.
    # not a complete test of all checks, which are very extensive.

    # check that we have a set_params and can clone
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "TypeError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(
            ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (doesn't accept dicts in input)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    # Raw string: `\(` is a regex escape, not a Python string escape. The
    # resulting pattern is unchanged, but the literal no longer relies on an
    # invalid escape sequence.
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg,
                        check_estimator, NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except Exception:
        # Failures are expected here and only the captured output matters.
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        pass
    finally:
        sys.stdout = old_stdout
    assert_true(msg in string_buffer.getvalue())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)
    check_estimator(AdaBoostClassifier())
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())
def __init__(
        self,
        species: str,
        reprocess: Optional[bool] = False,
        gene_selection_method: Optional[Literal['deg', 'lasso', 'elastic-net']] = 'deg',
        model_cache_dir: Optional[str] = None,
        alpha: Optional[Union[float, Sequence[float]]] = 1e-2,
        learning_rate: Optional[float] = 1e-3,
        equal_weight: Optional[bool] = True,
        train_split: Optional[float] = 0.8,
        n_jobs: Optional[int] = 15,
        remove_correlated: Optional[Literal['both', 'ct', 'region']] = None,
        normalize: Optional[bool] = False,
        dim_reduction: Optional[str] = None,
        n_components: Optional[int] = None):
    """Load one species' data, select genes, and build per-cluster averages.

    If a preprocessed pickle exists for ``species`` and ``reprocess`` is
    false, ``self.data``, ``self.ct_axis_mask`` and ``self.r_axis_mask``
    are restored from it and nothing else is computed (in particular
    ``n_var``/``n_subregions``/``n_clusters``/``n_obs`` are not set).
    Otherwise the ``.h5ad`` file is read, genes are selected for the
    region and cell-type axes, expression is averaged per cluster, and
    the result is pickled for next time.

    Args:
        species: Species name; used to build the input/output file names
            and its first letter (upper-cased) prefixes every cluster label.
        reprocess: Ignore any existing preprocessed pickle and recompute.
        gene_selection_method: ``'deg'`` delegates to ``self._deg_select``;
            ``'lasso'``/``'elastic-net'`` fit a multi-task linear model per
            label type and keep genes with a non-zero coefficient.
        model_cache_dir: Directory for cached fitted models. ``None``
            disables model caching (nothing is read or written).
        alpha: Regularisation strength passed to the linear model.
        learning_rate: Stored on ``self.learning_rate``.
        equal_weight: Scale each observation by the square root of its
            label's relative frequency before fitting.
        train_split: Unused by this method; kept for interface
            compatibility.
        n_jobs: Passed to ``torch.set_num_threads``.
        remove_correlated: If given, forwarded to
            ``self._remove_r_ct_correlated``.
        normalize: Divide ``self.data`` by its per-column mean.
        dim_reduction: ``'pca'``, ``'umap'`` or ``'tsne'``; replaces the
            expression matrix by the corresponding embedding.
        n_components: Number of components for the dimensionality
            reduction steps.
    """
    super().__init__()
    torch.set_num_threads(n_jobs)
    filename = f'{species}_ex_colors'
    self.learning_rate = learning_rate
    self.device = 'cpu'

    preprocessed_path = f'withcolors_preprocessed/{filename}.pickle'

    # Use saved data if possible
    if not reprocess and os.path.exists(preprocessed_path):
        # NOTE: pickle must only be used on trusted, locally produced files.
        with open(preprocessed_path, mode='rb') as file:
            data_dict = pickle.load(file)
        self.data = data_dict['data']
        self.ct_axis_mask = data_dict['ct_axis_mask']
        self.r_axis_mask = data_dict['r_axis_mask']
        # No need to do anything else
        return

    species_data = sc.read(f'withcolors/{filename}.h5ad')

    if dim_reduction is not None:
        sc.pp.pca(species_data, n_comps=n_components)
        sc.pp.highly_variable_genes(species_data)
        sc.pp.neighbors(species_data, n_pcs=n_components)
        if dim_reduction == 'pca':
            sc.tl.pca(species_data, n_comps=n_components)
        elif dim_reduction == 'umap':
            sc.tl.umap(species_data, n_components=n_components)
        elif dim_reduction == 'tsne':
            sc.tl.tsne(species_data, n_pcs=n_components)
        # Replace the expression matrix by the low-dimensional embedding
        species_data = AnnData(species_data.obsm[f'X_{dim_reduction}'],
                               obs=species_data.obs)
        species_data.var.index = pd.Index([
            f'{dim_reduction}{x}'
            for x in range(len(species_data.var.index))
        ])

    # Label each observation with its subregion and species
    species_data.obs['clusters'] = species_data.obs['clusters'].apply(
        lambda s: species[0].upper() + '_' + s)
    species_data.obs['subregion'] = species_data.obs['clusters'].apply(
        lambda s: s.split('.')[0])

    self.n_var = len(species_data.var.index)
    self.n_subregions = len(np.unique(species_data.obs['subregion']))
    self.n_clusters = len(np.unique(species_data.obs['clusters']))
    self.n_obs = len(species_data.obs.index)

    if gene_selection_method == 'deg':
        self._deg_select(dim_reduction, species_data)
    elif gene_selection_method in ['lasso', 'elastic-net']:
        use_model_cache = model_cache_dir is not None
        for label in ['subregion', 'clusters']:
            if equal_weight:
                # Relative frequency of each label value
                label_to_count = species_data.obs[label].value_counts(
                    normalize=True).to_dict()
                # Map each observation to its label's appearance frequency
                w = species_data.obs[label].map(label_to_count)
                # Scale every row by sqrt(frequency). This equals
                # multiplying by diag(sqrt(w)) without materialising an
                # (n_obs x n_obs) matrix.
                row_scale = np.asarray(np.sqrt(w))
                transcriptomes = (row_scale[:, np.newaxis]
                                  * species_data.X.toarray())
            else:
                transcriptomes = species_data.X.toarray()

            model_file = f'{model_cache_dir}/{gene_selection_method}/' \
                         f'{species[0].upper()}_normalized-{equal_weight}_{label}_a-{alpha}.pt'
            if use_model_cache and os.path.exists(model_file):
                with open(model_file, 'rb') as file:
                    model = pickle.load(file)
            else:
                # Create one-hot encoding of labels
                num_labels = (self.n_subregions
                              if label == 'subregion' else self.n_clusters)
                label_to_id = {
                    r: i
                    for i, r in enumerate(
                        np.unique(species_data.obs[label]))
                }
                labels = species_data.obs[label].map(label_to_id)
                labels_expanded = np.zeros((self.n_obs, num_labels))
                labels_expanded[np.arange(self.n_obs), labels] = 1

                if gene_selection_method == 'lasso':
                    model = MultiTaskLasso(alpha=alpha, max_iter=10000)
                else:
                    model = MultiTaskElasticNet(alpha=alpha, max_iter=10000)
                model.fit(transcriptomes, labels_expanded)

                # Only persist the model when caching was requested;
                # with model_cache_dir=None the path would be 'None/...'.
                if use_model_cache:
                    os.makedirs(os.path.dirname(model_file), exist_ok=True)
                    with open(model_file, 'wb') as file:
                        pickle.dump(model, file, protocol=5)

            # A gene is kept if it has a non-zero coefficient for at
            # least one output (subregion or cluster).
            gene_is_selected = (model.coef_ != 0).any(axis=0)
            if label == 'subregion':
                self.r_axis_mask = gene_is_selected
            else:
                self.ct_axis_mask = gene_is_selected

    print(
        f'Before removing correlated genes, found {self.r_axis_mask.sum()} region genes '
        f'and {self.ct_axis_mask.sum()} cell type genes.')
    if remove_correlated is not None:
        self._remove_r_ct_correlated(remove_correlated, species_data)
        print(
            f'After removing correlated genes, found {self.r_axis_mask.sum()} region genes '
            f'and {self.ct_axis_mask.sum()} cell type genes.')

    # Average transcriptomes within each cell type and put into data frame
    # with cell types as rows and genes as cols
    ct_names = np.unique(species_data.obs['clusters'])
    ct_avg_data = [
        species_data[species_data.obs['clusters'] == ct].X.mean(axis=0)
        for ct in ct_names
    ]
    self.data = pd.concat([
        pd.DataFrame(data.reshape((1, -1)),
                     columns=species_data.var.index,
                     index=[cluster_name])
        for data, cluster_name in zip(ct_avg_data, ct_names)
    ])

    # Divide each column (gene) by its mean across cell types
    if normalize:
        self.data = self.data.div(self.data.mean(axis=0).to_numpy(), axis=1)  # noqa

    # Save data
    data_dict = {
        'data': self.data,
        'ct_axis_mask': self.ct_axis_mask,
        'r_axis_mask': self.r_axis_mask
    }
    os.makedirs(os.path.dirname(preprocessed_path), exist_ok=True)
    with open(preprocessed_path, mode='wb') as file:
        pickle.dump(data_dict, file)
def test_check_estimator():
    """Exercise ``check_estimator`` against deliberately broken estimators.

    Only a sample of the (very extensive) checks is covered. Every
    section asserts on the exception type and message produced for one
    misbehaving estimator; the closing sections confirm that well-behaved
    estimators go through without error.
    """
    # a class (instead of an instance) and a plain object are both rejected
    pattern = "Passing a class was deprecated"
    assert_raises_regex(TypeError, pattern, check_estimator, object)
    pattern = "object has no attribute '_get_tags'"
    assert_raises_regex(AttributeError, pattern, check_estimator, object())

    # values returned by get_params must match what set_params received
    pattern = "get_params result does not match what was passed to set_params"
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        ModifiesAnotherValue())

    # a fit method is required
    pattern = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, pattern, check_estimator,
                        BaseEstimator())

    # fit has to validate its input
    pattern = "ValueError not raised"
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        BaseBadClassifier())

    # sample_weight in fit must accept a pandas.Series (needs pandas)
    try:
        from pandas import Series  # noqa
        pattern = ("Estimator NoSampleWeightPandasSeriesType raises error if "
                   "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(ValueError, pattern, check_estimator,
                            NoSampleWeightPandasSeriesType())
    except ImportError:
        pass

    # predict has to validate its input (doesn't accept dicts in input)
    pattern = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        NoCheckinPredict())

    # estimator state must stay untouched at
    # transform/predict/predict_proba time
    pattern = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        ChangesDict())

    # `fit` may only change private attributes
    # (starting with an _ or ending with a _)
    pattern = ('Estimator ChangesWrongAttribute should not change or mutate '
               'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        ChangesWrongAttribute())
    check_estimator(ChangesUnderscoreAttribute())

    # `fit` must not add any public attribute
    pattern = (r'Estimator adds public attribute\(s\) during the fit method.'
               ' Estimators are only allowed to add private attributes'
               ' either started with _ or ended'
               ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        SetsWrongAttribute())

    # predictions must be invariant when applied to a subset
    pattern = ("{method} of {name} is not invariant when applied "
               "to a subset.").format(method='predict',
                                      name=NotInvariantPredict.__name__)
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        NotInvariantPredict())

    # sparse matrix input handling: the check prints to stdout instead of
    # raising (so the original traceback is kept), hence stdout is swapped
    # for a buffer while the check runs and inspected afterwards.
    pattern = "Estimator {} doesn't seem to fail gracefully on sparse data".format(
        NoSparseClassifier.__name__)
    saved_stdout = sys.stdout
    captured = StringIO()
    sys.stdout = captured
    try:
        check_estimator(NoSparseClassifier())
    except Exception:
        pass
    finally:
        sys.stdout = saved_stdout
    assert pattern in captured.getvalue()

    # large sparse indices on an estimator that does not support them
    pattern = ("Estimator LargeSparseNotSupportedClassifier doesn't seem to "
               r'support \S{3}_64 matrix, and is not failing gracefully.*')
    assert_raises_regex(AssertionError, pattern, check_estimator,
                        LargeSparseNotSupportedClassifier())

    # a binary-only estimator without the tag does error
    pattern = 'Only 2 classes are supported'
    assert_raises_regex(ValueError, pattern, check_estimator,
                        UntaggedBinaryClassifier())

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # real estimators pass
    check_estimator(LogisticRegression())
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet())

    # a binary-only estimator carrying the tag passes
    check_estimator(TaggedBinaryClassifier())

    # regressor with the requires_positive_y estimator tag
    pattern = 'negative y values not supported!'
    assert_raises_regex(ValueError, pattern, check_estimator,
                        RequiresPositiveYRegressor())
def regressor_creator(indata, outdata):
    """Return a new ``MultiTaskElasticNet`` built with default parameters.

    Both ``indata`` and ``outdata`` are accepted but not used here.
    """
    regressor = MultiTaskElasticNet()
    return regressor
def train_linear_model(X, y, random_state=1, test_size=0.2,
                       regularization_type='elasticnet', k_fold=5,
                       max_iter=1000000, tol=0.0001, l1_ratio=None):
    """
    Train a regularized linear model, choosing hyperparameters by
    cross-validation.

    The data are split into train/test sets, the training descriptors are
    standardized, a ``*CV`` estimator selects the regularization
    hyperparameters, and a final model is refit with them. The held-out
    split is scaled with the training statistics and used for evaluation.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes; must be
            two-dimensional because ``y.shape[1]`` selects between
            single-output and multi-task models.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model
            evaluation.
        regularization_type (str): 'lasso', 'ridge' or 'elasticnet'
            (lasso and ridge support a single output column only).
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization (used by the lasso
            cross-validation; also recorded in the hyperparameters).
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.
            Defaults to ``[.1, .5, .7, .9, .95, 1]``.

    Returns:
        sklearn.linear_model.LinearModel: fitted model.
        mu: Mean value of descriptors used in training (per column).
        s: Std dev of descriptors used in training (per column).
        relative_prediction_error: 1.96 * sqrt(MSE / n_test) of the
            predicted/actual ratio against 1, one value per output.
        Rsq (float): ``score`` of the model on the held-out split.
        hyperparameters (dict): settings used, including the selected
            'alpha' and 'l1_ratio'.

    Raises:
        NotImplementedError: if the combination of ``regularization_type``
            and number of output columns is not supported.
    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]

    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }

    # NOTE: `normalize` is intentionally not passed to the estimators
    # below. False was always the default, and the parameter no longer
    # exists in recent scikit-learn releases.
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True, alphas=None,
                                    cv=k_fold, l1_ratio=l1_ratio,
                                    max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  l1_ratio=l1_ratio_opt, alpha=alpha_opt,
                                  max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True,
                                                    cv=k_fold,
                                                    l1_ratio=l1_ratio,
                                                    max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    # Scale the held-out descriptors with the *training* statistics
    X_test_scaled = (X_test - mu) / s
    y_pred = linear_model.predict(X_test_scaled)
    Rsq = linear_model.score(X_test_scaled, y_test)

    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [predicted / actual
                         for predicted, actual in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96 * np.sqrt(
        mean_squared_error(np.ones(y_pred.shape), pred_actual_ratio,
                           multioutput='raw_values') / y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
model_name=f'best_model_batch{ind}.h5') ]) all_predictions.append(model.predict(X_test)) model = create_model() model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=1) all_predictions.append(model.predict(X_test)) kf = KFold(n_splits=5, random_state=2019, shuffle=True) for ind, (tr, val) in enumerate(kf.split(X_train)): X_tr = X_train[tr] y_tr = y_train[tr] X_vl = X_train[val] y_vl = y_train[val] model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5) model.fit(X_tr, y_tr) all_predictions.append(model.predict(X_test)) model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5) model.fit(X_train, y_train) all_predictions.append(model.predict(X_test)) test_preds = np.array([ np.array([rankdata(c) for c in p.T]).T for p in all_predictions ]).mean(axis=0) max_val = test_preds.max() + 1 test_preds = test_preds / max_val + 1e-12 submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv')) submission[targets] = test_preds submission.to_csv("submission.csv", index=False)
def test_check_estimator():
    """Exercise ``check_estimator`` against deliberately broken estimators.

    Only a sample of the (very extensive) checks is covered. Each
    ``raises`` block pins the exception type and message produced for one
    misbehaving estimator; bare ``check_estimator`` calls are estimators
    that are expected to pass.
    """
    # a class (instead of an instance) is rejected
    expected = "Passing a class was deprecated"
    with raises(TypeError, match=expected):
        check_estimator(object)

    # the "default_constructible" test must flag mutable parameters
    check_estimator(HasImmutableParameters())  # should pass
    expected = ("Parameter 'p' of estimator 'HasMutableParameters' is of type "
                "object which is not allowed")
    with raises(AssertionError, match=expected):
        check_estimator(HasMutableParameters())

    # values returned by get_params must match what set_params received
    expected = "get_params result does not match what was passed to set_params"
    with raises(AssertionError, match=expected):
        check_estimator(ModifiesValueInsteadOfRaisingError())

    with warnings.catch_warnings(record=True) as records:
        check_estimator(RaisesErrorInSetParams())
    recorded_categories = [rec.category for rec in records]
    assert UserWarning in recorded_categories

    with raises(AssertionError, match=expected):
        check_estimator(ModifiesAnotherValue())

    # a fit method is required
    expected = "object has no attribute 'fit'"
    with raises(AttributeError, match=expected):
        check_estimator(BaseEstimator())

    # fit has to validate its input
    expected = "Did not raise"
    with raises(AssertionError, match=expected):
        check_estimator(BaseBadClassifier())

    # sample_weight in fit must accept a pandas.Series (needs pandas)
    try:
        from pandas import Series  # noqa
        expected = ("Estimator NoSampleWeightPandasSeriesType raises error if "
                    "'sample_weight' parameter is of type pandas.Series")
        with raises(ValueError, match=expected):
            check_estimator(NoSampleWeightPandasSeriesType())
    except ImportError:
        pass

    # predict has to validate its input (doesn't accept dicts in input)
    expected = "Estimator NoCheckinPredict doesn't check for NaN and inf in predict"
    with raises(AssertionError, match=expected):
        check_estimator(NoCheckinPredict())

    # estimator state must stay untouched at
    # transform/predict/predict_proba time
    expected = "Estimator changes __dict__ during predict"
    with raises(AssertionError, match=expected):
        check_estimator(ChangesDict())

    # `fit` may only change private attributes
    # (starting with an _ or ending with a _)
    expected = ("Estimator ChangesWrongAttribute should not change or mutate "
                "the parameter wrong_attribute from 0 to 1 during fit.")
    with raises(AssertionError, match=expected):
        check_estimator(ChangesWrongAttribute())
    check_estimator(ChangesUnderscoreAttribute())

    # `fit` must not add any public attribute
    expected = (r"Estimator adds public attribute\(s\) during the fit method."
                " Estimators are only allowed to add private attributes"
                " either started with _ or ended"
                " with _ but wrong_attribute added")
    with raises(AssertionError, match=expected):
        check_estimator(SetsWrongAttribute())

    # sample order invariance (the message text, including the missing
    # space between the two literals, is reproduced verbatim)
    expected = ("{method} of {name} is not invariant when applied to a dataset"
                "with different sample order.").format(
                    method="predict", name=NotInvariantSampleOrder.__name__)
    with raises(AssertionError, match=expected):
        check_estimator(NotInvariantSampleOrder())

    # invariance when applied to a subset
    expected = "{method} of {name} is not invariant when applied to a subset.".format(
        method="predict", name=NotInvariantPredict.__name__)
    with raises(AssertionError, match=expected):
        check_estimator(NotInvariantPredict())

    # sparse matrix input handling
    expected = "Estimator {} doesn't seem to fail gracefully on sparse data".format(
        NoSparseClassifier.__name__)
    with raises(AssertionError, match=expected):
        check_estimator(NoSparseClassifier())

    # large sparse indices on an estimator that does not support them
    expected = ("Estimator LargeSparseNotSupportedClassifier doesn't seem to "
                r"support \S{3}_64 matrix, and is not failing gracefully.*")
    with raises(AssertionError, match=expected):
        check_estimator(LargeSparseNotSupportedClassifier())

    # a binary-only estimator without the tag does error
    expected = "Only 2 classes are supported"
    with raises(ValueError, match=expected):
        check_estimator(UntaggedBinaryClassifier())

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # real estimators pass
    check_estimator(LogisticRegression())
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet())

    # a binary-only estimator carrying the tag passes
    check_estimator(TaggedBinaryClassifier())

    # regressor with the requires_positive_y estimator tag
    expected = "negative y values not supported!"
    with raises(ValueError, match=expected):
        check_estimator(RequiresPositiveYRegressor())

    # a classifier with the poor_score tag does not raise
    check_estimator(PoorScoreLogisticRegression())
print_cost=True, random_state=42) nn = nn.fit(X_train, Y_train) t1 = dt() print('\nRuntime (s):', t1 - t0, '\n') long_seq = np.arange(0, n_iters, 1) short_seq = np.arange(0, n_iters, 1000) plt.figure() plt.plot(long_seq, nn.costs) plt.plot(short_seq, nn.costs[::1000]) plt.figure() plt.plot(long_seq, nn.metric) plt.plot(short_seq, nn.metric[::1000]) h_test = nn.predict(X_test) print("NN Test RMSE: ", np.sqrt(np.mean((h_test - Y_test)**2))) print("NN Test my R2: ", cust_r2(Y_test, h_test)) #print("NN Test Accuracy: ", np.mean(one_hot_decode(h_test) == one_hot_decode(Y_test_oh))) #mod = LinearRegression().fit(X_train, Y_train.reshape(-1, )) mod = MultiTaskElasticNet(l1_ratio=0.00001).fit(X_train, Y_train) #print(wb['bias0'], wb['Weight0'].reshape(-1, )) #print(glm.intercept_, glm.coef_.reshape(-1, )) print("GLM Test my R2: ", cust_r2(Y_test, mod.predict(X_test))) print("GLM Test RMSE: ", np.sqrt(np.mean((mod.predict(X_test) - Y_test)**2))) samples = nn.draw_predictive_samples(X_test, n_samples=10000, n_outputs=1) plt.figure() plt.hist(samples[0, :], bins=30)
def predict(
    self,
    forecast_length: int,
    future_regressor=[],
    just_point_forecast: bool = False,
):
    """Generates forecast data immediately following dates of index supplied to .fit()

    Features are extracted with tsfresh from rolling windows of every
    training series, a multi-output regressor is fit on them, and the
    forecast is then produced one step at a time, feeding each prediction
    back in as the newest observation.

    Args:
        forecast_length (int): Number of periods of data to forecast ahead
        future_regressor: additional regressor. Not used by this method
            (and never mutated, so the shared default list is harmless).
        just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

    Returns:
        Either a PredictionObject of forecasts and metadata, or
        if just_point_forecast == True, a dataframe of point forecasts

    Raises:
        ImportError: if tsfresh is not available.
    """
    if not _has_tsfresh:
        raise ImportError("Package tsfresh is required")
    predictStartTime = datetime.datetime.now()
    # NOTE(review): `extract_features` and `EfficientFCParameters` are not
    # imported here; presumably they are imported at module level when
    # tsfresh is available - confirm.
    from tsfresh.utilities.dataframe_functions import make_forecasting_frame
    from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

    max_timeshift = self.max_timeshift
    regression_model = self.regression_model
    feature_selection = self.feature_selection

    sktraindata = self.df_train.copy()

    # Build the training design matrix: one block of tsfresh features per
    # series, each column prefixed with the series' position.
    X = pd.DataFrame()
    y = pd.DataFrame()
    for counter, column in enumerate(sktraindata.columns):
        df_shift, current_y = make_forecasting_frame(
            sktraindata[column],
            kind="time_series",
            max_timeshift=max_timeshift,
            rolling_direction=1,
        )
        current_X = extract_features(
            df_shift,
            column_id="id",
            column_sort="time",
            column_value="value",
            impute_function=tsfresh_impute,
            show_warnings=False,
            default_fc_parameters=EfficientFCParameters(),
            n_jobs=1,
        )
        current_X.rename(columns=lambda x: str(counter) + '_' + x, inplace=True)
        X = pd.concat([X, current_X], axis=1)
        y = pd.concat([y, current_y], axis=1)

    # drop constant features
    X = X.loc[:, X.apply(pd.Series.nunique) != 1]
    X = X.replace([np.inf, -np.inf], np.nan)
    X = X.fillna(0)
    y = y.ffill().bfill()

    # Feature selection. Every branch keeps X as a DataFrame with its
    # original column names (via the selector's boolean support mask),
    # because the forecasting loop below re-selects `X.columns` from
    # freshly extracted features.
    if feature_selection == 'Variance':
        from sklearn.feature_selection import VarianceThreshold

        sel = VarianceThreshold(threshold=(0.15))
        sel.fit(X)
        X = X.loc[:, sel.get_support()]
    if feature_selection == 'Percentile':
        from sklearn.feature_selection import SelectPercentile, chi2

        sel = SelectPercentile(chi2, percentile=20)
        sel.fit(X, y[y.columns[0]])
        X = X.loc[:, sel.get_support()]
    if feature_selection == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor
        from sklearn.feature_selection import SelectFromModel

        clf = DecisionTreeRegressor()
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = X.loc[:, model.get_support()]
    if feature_selection == 'Lasso':
        from sklearn.linear_model import MultiTaskLasso
        from sklearn.feature_selection import SelectFromModel

        clf = MultiTaskLasso(max_iter=2000)
        clf = clf.fit(X, y)
        model = SelectFromModel(clf, prefit=True)
        X = X.loc[:, model.get_support()]

    # Drop first line
    X = X.iloc[1:]
    y = y.iloc[1:]

    index = self.create_forecast_index(forecast_length=forecast_length)

    if regression_model == 'ElasticNet':
        from sklearn.linear_model import MultiTaskElasticNet

        regr = MultiTaskElasticNet(alpha=1.0, random_state=self.random_seed)
    elif regression_model == 'DecisionTree':
        from sklearn.tree import DecisionTreeRegressor

        regr = DecisionTreeRegressor(random_state=self.random_seed)
    elif regression_model == 'MLP':
        from sklearn.neural_network import MLPRegressor

        # relu/tanh lbfgs/adam layer_sizes (100) (10)
        regr = MLPRegressor(
            hidden_layer_sizes=(10, 25, 10),
            verbose=self.verbose_bool,
            max_iter=200,
            activation='tanh',
            solver='lbfgs',
            random_state=self.random_seed,
        )
    elif regression_model == 'KNN':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.neighbors import KNeighborsRegressor

        # KNeighborsRegressor is deterministic and takes no random_state
        regr = MultiOutputRegressor(KNeighborsRegressor())
    elif regression_model == 'Adaboost':
        from sklearn.multioutput import MultiOutputRegressor
        from sklearn.ensemble import AdaBoostRegressor

        regr = MultiOutputRegressor(AdaBoostRegressor(
            n_estimators=200))  # , random_state=self.random_seed))
    else:
        # any other value falls back to a random forest
        from sklearn.ensemble import RandomForestRegressor

        regr = RandomForestRegressor(random_state=self.random_seed,
                                     n_estimators=1000,
                                     verbose=self.verbose)

    regr.fit(X, y)

    combined_index = self.df_train.index.append(index)
    forecast = pd.DataFrame()
    # Positional column names so that appended prediction rows (whose
    # columns are 0..n-1) line up with the history.
    sktraindata.columns = [x for x in range(len(sktraindata.columns))]

    for _ in range(forecast_length):
        x_dat = pd.DataFrame()
        for counter, column in enumerate(sktraindata.columns):
            df_shift, current_y = make_forecasting_frame(
                sktraindata.tail(max_timeshift)[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                n_jobs=1,
                default_fc_parameters=EfficientFCParameters(),
            )
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(columns=lambda x: str(counter) + '_' + x, inplace=True)
            x_dat = pd.concat([x_dat, current_X], axis=1)

        # Keep exactly the features the regressor was trained on
        x_dat = x_dat[X.columns]
        rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

        forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
        # Feed the prediction back in as the newest observation
        sktraindata = pd.concat([sktraindata, rfPred], axis=0, ignore_index=True)
        sktraindata.index = combined_index[:len(sktraindata.index)]

    forecast.columns = self.column_names
    forecast.index = index

    if just_point_forecast:
        return forecast
    else:
        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train, forecast,
            prediction_interval=self.prediction_interval)

        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )
        return prediction
class model:
    """Multi-target elastic-net regressor with recursive forecasting.

    The predictors ``X`` may be expanded with polynomial terms
    (``params['NONLIN_TYPE'] == 'POLY'``, order ``params['ORDER']``) and both
    ``X`` and ``Y`` may be z-scored (``params['STANDARDIZE']``).  A
    ``MultiTaskElasticNet`` is then fitted, with ``alpha`` chosen by grid
    search.

    ``X`` must contain a ``'time'`` column.  Forecasting feeds the previous
    prediction back in as predictors, so it assumes the columns of ``X`` are
    the target columns followed by ``'time'`` (values are assigned by
    position) -- NOTE(review): confirm against the caller.
    """

    def __init__(self, params, X, Y):
        """Store the data, apply the configured transforms and fit the model.

        Args:
            params: mapping with keys ``'NONLIN_TYPE'``, ``'STANDARDIZE'`` and
                (for polynomial terms) ``'ORDER'``.
            X: predictor DataFrame containing a ``'time'`` column.
            Y: target DataFrame sharing the index of ``X``.
        """
        self.params = params
        self.original_predictors = list(X)
        if params['NONLIN_TYPE'] == 'POLY':
            self.X = self.add_nonlinear_terms(X)
        else:
            # Previously self.X was left unset on this path, so every
            # non-polynomial configuration failed with AttributeError.
            self.X = X
        self.Y = Y
        if params['STANDARDIZE']:
            self.standardize()
        self.predictor_names = list(self.X)
        self.target_names = list(Y)
        # Last observed targets (in model space, i.e. standardised when
        # STANDARDIZE is on); seed for the recursive forecast.
        self.Y_final = self.Y.iloc[-1, :]
        # Read the last time stamp from the *raw* predictors: forecast()
        # increments it and standardises afterwards, so taking it from the
        # already standardised self.X would standardise it twice.
        self.time = X.iloc[-1, X.columns.get_loc('time')]
        self.date = self.X.index[-1]
        self.make_model()

    def add_nonlinear_terms(self, X):
        """Return ``X`` expanded by ``add_polynomial_terms`` up to ``params['ORDER']``."""
        df, _var_names = add_polynomial_terms(X, list(X), self.params['ORDER'])
        return df

    def standardize(self):
        """Z-score ``self.X`` and ``self.Y`` in place, remembering mean and std."""
        self.X_mean = self.X.mean()
        self.Y_mean = self.Y.mean()
        self.X_std = self.X.std()
        self.Y_std = self.Y.std()
        self.X = (self.X - self.X_mean) / self.X_std
        self.Y = (self.Y - self.Y_mean) / self.Y_std

    def _transform_predictors(self, X):
        """Apply to raw predictors the same transforms used at fit time."""
        X1 = X
        if self.params['NONLIN_TYPE'] == 'POLY':
            X1 = self.add_nonlinear_terms(X1)
        if self.params['STANDARDIZE']:
            X1 = (X1 - self.X_mean) / self.X_std
        return X1

    def _destandardize_targets(self, Y):
        """Map model-space targets back to original units (copy if not standardised)."""
        if self.params['STANDARDIZE']:
            return Y * self.Y_std + self.Y_mean
        return Y.copy()

    def make_model(self):
        """Grid-search ``alpha``, fit the final model and store in-sample predictions.

        Sets ``self.model`` (fitted ``MultiTaskElasticNet``) and
        ``self.predicted`` (in-sample predictions in original units).
        """
        max_iter = 1000
        tol = 0.015
        l1_ratio = 0.8  # we want a relatively sparse model
        elastic = MultiTaskElasticNet(fit_intercept=True, max_iter=max_iter,
                                      tol=tol, l1_ratio=l1_ratio)
        # Note that we are assuming that errors are independent of each other
        # GIVEN THE PREDICTORS; otherwise cross validation won't be applicable.
        print('################ Find hyper-parameter values#######################')
        search = GridSearchCV(estimator=elastic,
                              param_grid={'alpha': np.logspace(-5, 2, 8)},
                              scoring='neg_mean_squared_error',
                              n_jobs=1, refit=True, cv=10)
        search.fit(self.X, self.Y)
        # Now create a final elastic net model using the optimal hyper parameters.
        print('################ Build final model ##############################')
        optimal_alpha = search.best_params_['alpha']
        self.model = MultiTaskElasticNet(fit_intercept=True, alpha=optimal_alpha,
                                         l1_ratio=l1_ratio, max_iter=max_iter,
                                         tol=tol)
        self.model.fit(self.X.values, self.Y.values)
        fitted = pd.DataFrame(index=self.Y.index, columns=self.Y.columns,
                              data=self.model.predict(self.X.values))
        # Only undo the scaling when it was applied (Y_std / Y_mean do not
        # exist otherwise).
        self.predicted = self._destandardize_targets(fitted)

    def predict(self, X, plot=False, Y_True=None, plot_list=None):
        """Predict targets for raw predictors ``X`` in original units.

        Args:
            X: raw predictor DataFrame (same columns as at construction).
            plot: when true, plot true versus predicted values.
            Y_True: true targets; supplies the index/columns of the result
                and is required when ``plot`` is true.
            plot_list: target columns to plot; defaults to all of ``Y_True``.

        Returns:
            DataFrame of predictions.

        Raises:
            ValueError: if ``plot`` is true but ``Y_True`` is missing.
        """
        X1 = self._transform_predictors(X)
        Y1 = self.model.predict(X1.values)
        if Y_True is not None:
            out_index, out_columns = Y_True.index, list(Y_True)
        else:
            # Y_True is optional; fall back to the input index and the
            # training target names instead of dereferencing None.
            out_index, out_columns = X.index, self.target_names
        dfY1 = pd.DataFrame(index=out_index, columns=out_columns, data=Y1)
        dfY1 = self._destandardize_targets(dfY1)
        if plot:
            if Y_True is None:
                raise ValueError('Y_True is required when plot=True')
            if plot_list is None:
                plot_list = list(Y_True)
            X_ax = Y_True.index
            label_true = [l + '_True' for l in plot_list]
            label_pred = [l + '_Pred' for l in plot_list]
            plt.figure(figsize=(6, 4))
            plt.plot(X_ax, Y_True[plot_list], label=label_true)
            plt.plot(X_ax, dfY1[plot_list], label=label_pred)
            plt.legend(loc='best')
            plt.show()
        return dfY1

    def forecast(self):
        """Advance one step (one month end) and predict the next targets.

        The last targets are brought back to original units, ``'time'`` is
        appended, and the row is pushed through the fit-time transforms.

        Returns:
            ``(pred, date)``: ``pred`` is the model-space prediction as
            returned by ``self.model.predict`` for a single row, ``date``
            the new time stamp.
        """
        # De-standardise first: the non-linear terms must be computed from
        # values in original units.
        Xp = self._destandardize_targets(self.Y_final)
        Xp['time'] = self.time + 1
        # Values are assigned to the predictor columns by position.
        dfp = pd.DataFrame(index=[self.date], columns=self.original_predictors,
                           data=Xp.values.reshape(1, -1))
        dfp = self._transform_predictors(dfp)
        pred = self.model.predict(dfp.values)
        self.time = self.time + 1
        self.date = self.date + MonthEnd(1)
        self.Y_final = pd.DataFrame(index=[self.date], columns=self.target_names,
                                    data=pred)
        return pred, self.date

    def multistep_forecast(self, steps):
        """Run ``forecast`` ``steps`` times; return predictions in original units.

        Returns:
            DataFrame indexed by forecast date with one column per target.
        """
        df = pd.DataFrame(columns=self.target_names)
        for _ in range(steps):
            pred, date = self.forecast()
            print(pred.shape)
            values = pred
            if self.params['STANDARDIZE']:
                values = pred * self.Y_std.values + self.Y_mean.values
            # Flatten the single-row prediction so the new row is assigned a
            # plain 1-D vector of length n_targets.
            df.loc[date, :] = np.ravel(values)
        return df

    def plot_coeffs(self):
        """Bar-plot the coefficients of the last row of ``coef_`` with |c| > 1e-4."""
        C = self.model.coef_[-1, :]
        significant = np.flatnonzero(np.abs(C) > 0.0001)
        C_sig = C[significant]
        preds_sig = [self.predictor_names[int(i)] for i in significant]
        f, ax = plt.subplots()
        f.set_size_inches((10, 2))
        ax.bar(range(len(C_sig)), C_sig)
        ax.set_xticks(range(len(C_sig)))
        ax.set_xticklabels(labels=preds_sig)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    def variable_importance(self, orig_var_names, labels):
        """Plot importance of each original variable by zeroing its columns.

        For every name in ``orig_var_names`` all predictor columns whose name
        contains it (substring match) are set to zero and the summed squared
        error against ``self.Y`` is recorded; errors are scaled by their
        maximum and plotted in ascending order.

        Args:
            orig_var_names: names of the original variables.
            labels: display labels, parallel to ``orig_var_names``.
        """
        all_preds = list(self.X)
        imp = []
        for v in orig_var_names:
            v1 = [ap for ap in all_preds if v in ap]
            print(v1)
            X1 = self.X.copy()
            X1[v1] = 0
            # The model was fitted on .values, so predict on .values too.
            Y1 = self.model.predict(X1.values)
            imp.append(np.sum((self.Y.values - Y1) ** 2))
        order = np.argsort(np.array(imp))
        preds1 = [labels[i] for i in order]
        imps1 = [imp[i] for i in order]
        imps1 = imps1 / np.max(imps1)
        f, ax = plt.subplots()
        ax.barh(range(len(imp)), imps1)
        ax.set_yticks(range(len(imp)))
        ax.set_yticklabels(labels=preds1)
        ax.set_xlabel(xlabel='Importance', fontsize=12)
        plt.tight_layout()
        plt.show()