Beispiel #1
0
def main():
    """Train a MultiTaskElasticNet on the training file and write a submission.

    Steps: read ``train_file``, extract ``feature_names`` features, encode every
    ``OpenStatus`` value as its index in ``ques_status``, fit the model, predict
    on ``test_file`` and hand the result to ``cu.write_submission``.  When
    ``is_full_train_set`` is 0 the predictions are first passed through
    ``cu.cap_and_update_priors``.

    Side effects: publishes ``fea``, ``y`` and ``probs`` as module globals,
    prints progress messages and writes ``submission_file``.
    """
    global fea, y, probs

    start = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): ``rho`` looks like the pre-``l1_ratio`` keyword of early
    # scikit-learn releases -- confirm against the installed version.
    mten = MultiTaskElasticNet(alpha=0.1,
                               rho=0.5,
                               fit_intercept=True,
                               normalize=False,
                               copy_X=True,
                               max_iter=1000,
                               tol=0.0001,
                               warm_start=False)

    print("Collecting statuses")

    # Targets must be numeric: every status string is replaced by its position
    # in ``ques_status``; statuses that are not listed there are skipped.
    y = [
        index
        for element in data["OpenStatus"]
        for index, status in enumerate(ques_status)
        if element == status
    ]

    print("Fitting")
    mten.fit(fea, y)

    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # Regroup the flat prediction vector into rows of 5 values (one per class).
    # Floor division keeps the row count an integer on Python 2 and Python 3.
    probs = np.resize(probs, (len(probs) // 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
Beispiel #2
0
def test_enet_float_precision():
    """Check that ElasticNet variants give matching results in float32/float64.

    For every ``normalize`` / ``fit_intercept`` combination the plain,
    Gram-precomputed and multi-task estimators are fitted on float64 and
    float32 copies of the data.  The fitted coefficients must keep the input
    dtype, and the float32 results must agree with the float64 ones to four
    decimals.
    """
    # Generate dataset
    X, y, X_test, y_test = build_dataset(n_samples=20, n_features=10)
    # Here we have a small number of iterations, and thus the
    # ElasticNet might not converge. This is to speed up tests

    for normalize in [True, False]:
        for fit_intercept in [True, False]:
            coef = {}
            intercept = {}
            for dtype in [np.float64, np.float32]:
                clf = ElasticNet(alpha=0.5,
                                 max_iter=100,
                                 precompute=False,
                                 fit_intercept=fit_intercept,
                                 normalize=normalize)

                # X and y are rebound on purpose: later iterations start from
                # the previously converted arrays.
                X = dtype(X)
                y = dtype(y)
                ignore_warnings(clf.fit)(X, y)

                coef[('simple', dtype)] = clf.coef_
                intercept[('simple', dtype)] = clf.intercept_

                assert clf.coef_.dtype == dtype

                # test precompute Gram array
                Gram = X.T.dot(X)
                clf_precompute = ElasticNet(alpha=0.5,
                                            max_iter=100,
                                            precompute=Gram,
                                            fit_intercept=fit_intercept,
                                            normalize=normalize)
                ignore_warnings(clf_precompute.fit)(X, y)
                assert_array_almost_equal(clf.coef_, clf_precompute.coef_)
                assert_array_almost_equal(clf.intercept_,
                                          clf_precompute.intercept_)

                # test multi task enet
                multi_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
                clf_multioutput = MultiTaskElasticNet(
                    alpha=0.5,
                    max_iter=100,
                    fit_intercept=fit_intercept,
                    normalize=normalize)
                clf_multioutput.fit(X, multi_y)
                coef[('multi', dtype)] = clf_multioutput.coef_
                intercept[('multi', dtype)] = clf_multioutput.intercept_
                # Check the multi-task estimator itself; the single-task
                # estimator's dtype was already asserted above.
                assert clf_multioutput.coef_.dtype == dtype

            for v in ['simple', 'multi']:
                assert_array_almost_equal(coef[(v, np.float32)],
                                          coef[(v, np.float64)],
                                          decimal=4)
                assert_array_almost_equal(intercept[(v, np.float32)],
                                          intercept[(v, np.float64)],
                                          decimal=4)
def main():
    """Train a MultiTaskElasticNet on the training file and write a submission.

    Reads ``train_file``, extracts ``feature_names`` features, maps each
    ``OpenStatus`` value to its index in ``ques_status``, fits the model,
    predicts on ``test_file`` and writes the predictions with
    ``cu.write_submission``.  If ``is_full_train_set`` is 0 the predictions go
    through ``cu.cap_and_update_priors`` before being written.

    Side effects: sets the module globals ``fea``, ``y`` and ``probs``, prints
    progress messages and writes ``submission_file``.
    """
    global fea, y, probs

    start = time.time()

    print("Reading train data and its features from: " + train_file)
    data = cu.get_dataframe(train_file)
    fea = features.extract_features(feature_names, data)

    # NOTE(review): ``rho`` looks like the pre-``l1_ratio`` keyword of early
    # scikit-learn releases -- confirm against the installed version.
    mten = MultiTaskElasticNet(alpha=0.1, rho=0.5, fit_intercept=True,
                               normalize=False, copy_X=True, max_iter=1000,
                               tol=0.0001, warm_start=False)

    print("Collecting statuses")

    # Targets must be numeric, so each status string becomes its index in
    # ``ques_status``; statuses missing from that list are dropped.
    y = []
    for element in data["OpenStatus"]:
        y.extend(index
                 for index, status in enumerate(ques_status)
                 if element == status)

    print("Fitting")
    mten.fit(fea, y)

    print("Reading test data and features")
    test_data = cu.get_dataframe(test_file)
    test_fea = features.extract_features(feature_names, test_data)

    print("Making predictions")
    probs = mten.predict(test_fea)
    # Regroup the flat prediction vector into rows of 5 values (one per class).
    # Floor division keeps the row count an integer on Python 2 and Python 3.
    probs = np.resize(probs, (len(probs) // 5, 5))

    if is_full_train_set == 0:
        print("Calculating priors and updating posteriors")
        new_priors = cu.get_priors(full_train_file)
        old_priors = cu.get_priors(train_file)
        probs = cu.cap_and_update_priors(old_priors, probs, new_priors, 0.001)

    print("writing submission to " + submission_file)
    cu.write_submission(submission_file, probs)
    finish = time.time()
    print("completed in %0.4f seconds" % (finish - start))
Beispiel #4
0
class MultiTaskElasticNetImpl:
    """Thin adapter that forwards ``fit`` / ``predict`` to a wrapped ``Op``."""

    def __init__(self, **hyperparams):
        """Store the hyperparameters and build the wrapped ``Op`` from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model; ``y`` is only forwarded when it is given.

        Returns ``self`` so calls can be chained.
        """
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        wrapped = self._wrapped_model
        return wrapped.predict(X)
Beispiel #5
0
def test_convergence_warnings():
    """MultiTaskElasticNet warns when it cannot converge and is silent otherwise."""
    # Local import: only this test needs the stdlib ``warnings`` module.
    import warnings

    random_state = np.random.RandomState(0)
    X = random_state.standard_normal((1000, 500))
    y = random_state.standard_normal((1000, 3))

    # check that the model fails to converge
    with pytest.warns(ConvergenceWarning):
        MultiTaskElasticNet(max_iter=1, tol=0).fit(X, y)

    # check that the model converges w/o warnings
    # ``pytest.warns(None)`` is deprecated, so record warnings with the
    # standard library instead; "always" makes sure nothing is filtered out.
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        MultiTaskElasticNet(max_iter=1000).fit(X, y)

    assert not record
Beispiel #6
0
def test_multi_task_lasso_and_enet():
    """Multi-task Lasso / ElasticNet converge and treat duplicated targets alike.

    Both estimators are fitted on a target matrix made of two identical
    columns: the dual gap must be small and positive, and the coefficient rows
    of the two tasks must match.  A single-iteration fit must emit a
    ConvergenceWarning containing 'did not converge'.
    """
    X, y, _, _ = build_dataset()
    Y = np.c_[y, y]

    for estimator_cls in (MultiTaskLasso, MultiTaskElasticNet):
        fitted = estimator_cls(alpha=1, tol=1e-8).fit(X, Y)
        assert 0 < fitted.dual_gap_ < 1e-5
        first_task, second_task = fitted.coef_[0], fitted.coef_[1]
        assert_array_almost_equal(first_task, second_task)

    single_step = MultiTaskElasticNet(alpha=1.0, tol=1e-8, max_iter=1)
    assert_warns_message(ConvergenceWarning, 'did not converge',
                         single_step.fit, X, Y)
Beispiel #7
0
def train_base_model(X_train,
                     X_valid,
                     Y_train,
                     Y_valid,
                     norms_X,
                     norms_Y,
                     model='rf'):
    """Fit one of the base regressors and report its reconstruction errors.

    ``model`` selects the regressor: 'rf' (random forest), 'elastic'
    (multi-task elastic net) or 'knn' (k-nearest neighbours); anything else
    raises ``ValueError``.  After fitting on the training pair, predictions
    for both splits are passed together with the inputs through
    ``_correct_data``.

    Returns the six arrays returned by ``_correct_data`` followed by a NumPy
    array ``[train_mae, train_mse, val_mae, val_mse]``.
    """
    if model not in ('rf', 'elastic', 'knn'):
        raise ValueError('{} is not a valid model!'.format(model))

    # Factories are lazy so only the requested regressor is constructed.
    builders = {
        'rf': lambda: RandomForestRegressor(max_features=0.3,
                                            n_estimators=200,
                                            n_jobs=3),
        'elastic': lambda: MultiTaskElasticNet(alpha=0.003, l1_ratio=0.7),
        'knn': lambda: KNeighborsRegressor(2, weights='distance'),
    }
    predictor = builders[model]()
    predictor.fit(X_train, Y_train)

    recon_train = predictor.predict(X_train)
    recon_valid = predictor.predict(X_valid)

    corrected = _correct_data(X_train, X_valid, Y_train, Y_valid, recon_train,
                              recon_valid, norms_X, norms_Y)
    X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid = corrected

    train_residual = Y_train - recon_train
    valid_residual = Y_valid - recon_valid
    errors = np.array([
        np.average(np.absolute(train_residual)),
        np.average(np.square(train_residual)),
        np.average(np.absolute(valid_residual)),
        np.average(np.square(valid_residual)),
    ])

    return X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid, errors
Beispiel #8
0
def train_base_model(X_train, X_valid, Y_train, Y_valid, norms_X, norms_Y, model='rf'):
    """Train the requested base regressor and measure train/validation error.

    Supported ``model`` values are 'rf' (random regression forest), 'elastic'
    (multi-task elastic net) and 'knn' (k-nearest neighbours); any other value
    raises ``ValueError``.  Predictions and inputs are post-processed by
    ``_correct_data`` before the errors are computed.

    Returns the six arrays produced by ``_correct_data`` plus a NumPy array
    holding ``[train_mae, train_mse, val_mae, val_mse]``.
    """
    def _mae_and_mse(truth, reconstruction):
        # Mean absolute error and mean squared error over all entries.
        mae = np.average(np.absolute(truth - reconstruction))
        mse = np.average(np.square(truth - reconstruction))
        return mae, mse

    predictor = None
    if model == 'knn':
        predictor = KNeighborsRegressor(2, weights='distance')
    elif model == 'elastic':
        predictor = MultiTaskElasticNet(alpha=0.003, l1_ratio=0.7)
    elif model == 'rf':
        predictor = RandomForestRegressor(max_features=0.3, n_estimators=200, n_jobs=3)
    if predictor is None:
        raise ValueError('{} is not a valid model!'.format(model))

    predictor.fit(X_train, Y_train)
    recon_train = predictor.predict(X_train)
    recon_valid = predictor.predict(X_valid)

    (X_train, X_valid, Y_train, Y_valid,
     recon_train, recon_valid) = _correct_data(X_train, X_valid, Y_train, Y_valid,
                                               recon_train, recon_valid,
                                               norms_X, norms_Y)

    train_mae, train_mse = _mae_and_mse(Y_train, recon_train)
    val_mae, val_mse = _mae_and_mse(Y_valid, recon_valid)
    scores = np.array([train_mae, train_mse, val_mae, val_mse])

    return X_train, X_valid, Y_train, Y_valid, recon_train, recon_valid, scores
Beispiel #9
0
 def getModel(self, _params):
     """Build a MultiTaskElasticNet from the entries of ``_params``.

     ``_params`` must provide 'alpha', 'l1_ratio', 'fit_intercept',
     'normalize', 'copy_X' and 'selection'; a missing key raises ``KeyError``.
     """
     wanted = ('alpha', 'l1_ratio', 'fit_intercept', 'normalize', 'copy_X',
               'selection')
     settings = {}
     for key in wanted:
         settings[key] = _params[key]
     return MultiTaskElasticNet(**settings)
Beispiel #10
0
def test_random_descent():
    """Random and cyclic coordinate selection must reach the same solution.

    The models are run to a tight tolerance so both strategies fully
    converge, and the comparison is repeated for several problem shapes.
    """
    X, y, _, _ = build_dataset(n_samples=50, n_features=20)

    def assert_same_solution(estimator_cls, make_data, target):
        # Fit one cyclic and one random-selection model, then compare them.
        # ``make_data`` is called once per fit so each model gets its own input.
        cyclic = estimator_cls(selection='cyclic', tol=1e-8)
        cyclic.fit(make_data(), target)
        randomized = estimator_cls(selection='random', tol=1e-8,
                                   random_state=42)
        randomized.fit(make_data(), target)
        assert_array_almost_equal(cyclic.coef_, randomized.coef_)
        assert_almost_equal(cyclic.intercept_, randomized.intercept_)

    # Coordinate descent using the gram trick.
    assert_same_solution(ElasticNet, lambda: X, y)

    # Coordinate descent without the gram trick (transposed problem).
    assert_same_solution(ElasticNet, lambda: X.T, y[:20])

    # Sparse input.
    assert_same_solution(ElasticNet, lambda: sparse.csr_matrix(X), y)

    # Multi-output target: the same column stacked twice.
    stacked_y = np.hstack((y[:, np.newaxis], y[:, np.newaxis]))
    assert_same_solution(MultiTaskElasticNet, lambda: X, stacked_y)

    # Any selection other than 'cyclic' or 'random' must be rejected at fit.
    bad_selection = ElasticNet(selection='invalid')
    assert_raises(ValueError, bad_selection.fit, X, y)
Beispiel #11
0
 def test_model_multi_task_elasticnet(self):
     """A fitted two-target MultiTaskElasticNet converts to ONNX and is dumped."""
     fitted, inputs = fit_regression_model(MultiTaskElasticNet(), n_targets=2)
     # Single float input; first dimension left as None.
     input_spec = [("input", FloatTensorType([None, inputs.shape[1]]))]
     onnx_model = convert_sklearn(fitted, "multi-task elasticnet", input_spec)
     self.assertIsNotNone(onnx_model)
     dump_data_and_model(
         inputs,
         fitted,
         onnx_model,
         verbose=False,
         basename="SklearnMultiTaskElasticNet-Dec4",
     )
def train_metaregressor(stack_path, train, labels, run_sequence, scale_data, models, predict_mode_all, full = True, verbose = False):
    """Train the level-2 linear metaregressor(s) on stacked model outputs.

    Parameters used by this function:
        train: table of stacked predictions, accessed through ``.values`` and
            ``.shape``.
        labels: table of targets, accessed through ``.values``.
        models: sequence of base models; only its length is used.
        predict_mode_all: if true, a single MultiTaskElasticNet is fitted on
            all targets at once; otherwise one ElasticNet is fitted per entry
            of ``TRAIN_COLS``.
        full: only selects the "_30" / "_8" suffix shown in the banners.

    ``stack_path``, ``run_sequence``, ``scale_data`` and ``verbose`` are
    accepted but not used here.

    Returns:
        list of fitted estimators -- one element in all-in-one mode, one
        independently fitted estimator per target otherwise.
    """
    model_suffix = "_30" if full else "_8"
    rule = "=" * 50

    print("\n" + rule + "\nTraining Metaregressor" + model_suffix + " (Level 2)\n" + rule + "\n")

    def make_model():
        # A fresh estimator on every call, so fitted models never share state.
        estimator_cls = MultiTaskElasticNet if predict_mode_all else ElasticNet
        return estimator_cls(random_state = 42, max_iter = 1000, l1_ratio = 1.0, alpha = 0.1)

    print('Training linear metaregressors for %d models and %d total independent variables.\n' % (len(models), train.shape[1]))

    train_values = train.values
    label_values = labels.values

    reg_models, rmse = [], []
    if predict_mode_all:
        print("// MODE: All-in-One Pass //\n")
        model = make_model()
        model.fit(train_values, label_values)
        rmse = [np.sqrt(mean_squared_error(y_true = label_values, y_pred = model.predict(train_values)))]
        reg_models.append(model)
    else:
        print("// MODE: One-at-a-Time //\n")
        # Build one model per dependent variable.
        for f, column_name in enumerate(TRAIN_COLS):
            # Values to predict for this target, column-wise.
            predict_me = label_values[:, f]
            # Independent variables: this target's column from every base
            # model.  The stride of 30 is hard-coded -- presumably each base
            # model contributes 30 consecutive columns; verify against caller.
            columns = list(range(f, (30 * len(models)) + f, 30))
            train_me = train_values[:, columns]
            # Refitting one shared instance would leave every list entry
            # pointing at the last fit, so each target gets its own estimator.
            model = make_model()
            model.fit(train_me, predict_me)
            reg_models.append(model)
            score = np.sqrt(mean_squared_error(y_true = predict_me, y_pred = model.predict(train_me)))
            rmse.append(score)
            print("Metaregressor #%d of %d trained for feature '%s'; RMSE was: %.5f" %
                ((f + 1), len(TRAIN_COLS), column_name, score))

    print("\nAll metaregressors trained; average RMSE: %.5f" % np.mean(rmse))

    print("\n" + rule + "\nMetaregressor" + model_suffix + " Training Complete\n" + rule + "\n")

    return reg_models
def get_hyperparameters_model():
    """Return the search configuration for a default MultiTaskElasticNet.

    The result maps 'multi_task_elastic_net' to a dict holding the estimator
    under 'model' and an empty dict under 'param_distributions'.
    """
    entry = {
        'model': MultiTaskElasticNet(),
        'param_distributions': {},
    }
    return {'multi_task_elastic_net': entry}
Beispiel #14
0
    def make_model(self):
        """Grid-search ``alpha`` and fit the final multi-task elastic net.

        A 10-fold grid search over eight log-spaced ``alpha`` values picks the
        regularisation strength on ``self.X`` / ``self.Y``.  A new model with
        that ``alpha`` is then fitted on the raw values and stored in
        ``self.model``.  Its in-sample predictions, multiplied by
        ``self.Y_std`` and shifted by ``self.Y_mean``, are stored in
        ``self.predicted``.
        """
        # Settings shared by the search estimator and the final model;
        # an l1_ratio of 0.8 is used because a relatively sparse model is wanted.
        shared = dict(fit_intercept=True, max_iter=1000, tol=0.015,
                      l1_ratio=0.8)
        candidate = MultiTaskElasticNet(**shared)

        # Cross-validation assumes the errors are independent of each other
        # given the predictors.
        print(
            '################ Find hyper-parameter values#######################'
        )
        alpha_grid = {'alpha': np.logspace(-5, 2, 8)}
        search = GridSearchCV(estimator=candidate,
                              param_grid=alpha_grid,
                              scoring='neg_mean_squared_error',
                              n_jobs=1,
                              refit=True,
                              cv=10)
        search.fit(self.X, self.Y)

        # Rebuild the estimator with the best alpha found by the search.
        print(
            '################ Build final model ##############################'
        )
        best_alpha = search.best_params_['alpha']
        self.model = MultiTaskElasticNet(alpha=best_alpha, **shared)
        self.model.fit(self.X.values, self.Y.values)

        fitted_values = self.model.predict(self.X.values)
        frame = pd.DataFrame(index=self.Y.index,
                             columns=self.Y.columns,
                             data=fitted_values)
        self.predicted = frame
        self.predicted = self.predicted * self.Y_std + self.Y_mean
def make_dictionary(X,
                    n_components=20,
                    alpha=5.,
                    write_dir='/tmp/',
                    contrasts=[],
                    method='multitask',
                    l1_ratio=.5,
                    n_subjects=13):
    """Create a dictionary and the matching encoding of ``X``.

    An initial dictionary is computed (and cached under ``write_dir``) by
    ``initial_dictionary`` and saved with ``contrasts`` to
    ``<write_dir>/dictionary.npz``.  The encoding then depends on ``method``:

    * 'online': ``dict_learning_online`` refines the dictionary and returns
      the code; the refined dictionary overwrites the saved file.
    * 'sparse': ``sparse_encode`` encodes ``X.T`` with the initial dictionary.
    * 'multitask': a MultiTaskElasticNet (``alpha``, ``l1_ratio``) is fitted
      once per voxel index, on the columns of ``X`` taken every ``n_voxels``
      positions, where ``n_voxels = X.shape[1] // n_subjects``.

    Returns:
        ``(dictionary, components)``.

    Raises:
        ValueError: if ``method`` is not one of the three supported values
            (raised before anything is computed or written).
    """
    from sklearn.decomposition import dict_learning_online, sparse_encode
    from sklearn.linear_model import MultiTaskElasticNet

    # Fail early with a clear message instead of an UnboundLocalError on
    # ``components`` at the end of the function.
    if method not in ('online', 'sparse', 'multitask'):
        raise ValueError(
            "method must be 'online', 'sparse' or 'multitask', got %r" %
            (method,))

    mem = Memory(write_dir, verbose=0)
    dictionary = mem.cache(initial_dictionary)(n_components, X)
    np.savez(os.path.join(write_dir, 'dictionary.npz'),
             loadings=dictionary,
             contrasts=contrasts)
    if method == 'online':
        components, dictionary = dict_learning_online(X.T,
                                                      n_components,
                                                      alpha=alpha,
                                                      dict_init=dictionary,
                                                      batch_size=200,
                                                      method='cd',
                                                      return_code=True,
                                                      shuffle=True,
                                                      n_jobs=1,
                                                      positive_code=True)
        # Persist the refined dictionary in place of the initial one.
        np.savez(os.path.join(write_dir, 'dictionary.npz'),
                 loadings=dictionary,
                 contrasts=contrasts)
    elif method == 'sparse':
        components = sparse_encode(X.T,
                                   dictionary,
                                   alpha=alpha,
                                   max_iter=10,
                                   n_jobs=1,
                                   check_input=True,
                                   verbose=0,
                                   positive=True)
    else:  # method == 'multitask'
        # too many hard-typed parameters !!!
        n_voxels = X.shape[1] // n_subjects
        components = np.zeros((X.shape[1], n_components))
        clf = MultiTaskElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        for i in range(n_voxels):
            # Columns i, i + n_voxels, i + 2 * n_voxels, ... are fitted
            # jointly, and their coefficients fill the matching rows.
            x = X[:, i:i + n_subjects * n_voxels:n_voxels]
            components[i: i + n_subjects * n_voxels: n_voxels] =\
                clf.fit(dictionary.T, x).coef_
    return dictionary, components
Beispiel #16
0
    def mtelastic_model(self, X_train, y_train, X_test, y_test):
        """Fit a multi-task elastic net (alpha=.1918) and print its scores.

        Prints, in order: the estimator's own ``score`` on the training and
        test data, then the mean squared error and the R^2 for both splits.
        Nothing is returned.
        """
        regressor = MultiTaskElasticNet(alpha=.1918)
        regressor.fit(X_train, y_train)

        fitted_train = regressor.predict(X_train)
        fitted_test = regressor.predict(X_test)

        # Scores as reported by the estimator itself.
        for features_split, target_split in ((X_train, y_train),
                                             (X_test, y_test)):
            print(regressor.score(features_split, target_split))

        # The same quality measures computed from the explicit predictions.
        mse_train = mean_squared_error(y_train, fitted_train)
        mse_test = mean_squared_error(y_test, fitted_test)
        print('MSE train: %.6f, MSE test: %.6f' % (mse_train, mse_test))

        r2_train = r2_score(y_train, fitted_train)
        r2_test = r2_score(y_test, fitted_test)
        print('R^2 train: %.6f, R^2 test: %.6f' % (r2_train, r2_test))
def get_regressors_multitask(nmodels='all'):
    """Return one or all of the multi-task linear regressors.

    The regressors are numbered 1 (MultiTaskElasticNet) and
    2 (MultiTaskLasso).

    Args:
        nmodels: 'all' for both regressors, or the number of a single
            regressor (``1`` / ``2``, also accepted as a string).

    Returns:
        list of unfitted estimator instances.

    Raises:
        ValueError: if ``nmodels`` is neither 'all' nor a valid number.
    """
    # Position in this tuple + 1 is the regressor's number.
    factories = (MultiTaskElasticNet, MultiTaskLasso)

    if nmodels == 'all':
        return [factory() for factory in factories]

    try:
        number = int(nmodels)
    except (TypeError, ValueError):
        number = 0  # falls through to the error below
    if not 1 <= number <= len(factories):
        raise ValueError(
            "nmodels must be 'all' or a number between 1 and %d, got %r" %
            (len(factories), nmodels))

    # Return the estimator itself rather than the string 'lr<N>'.
    return [factories[number - 1]()]
Beispiel #18
0
    def train_ElasticNet(trainx, trainy, testx=None, testy=None, results=None):
        """Train an elastic-net model matching the outcome type and score it.

        The model is chosen from ``Y`` (taken from the enclosing scope):
        an SGD classifier with elastic-net penalty when ``Y.multiclass``, a
        MultiTaskElasticNet when ``Y.multioutcome``, a plain ElasticNet
        otherwise.  ``seed`` (also from the enclosing scope) is the random
        state.  Test metrics are appended to the per-outcome lists in
        ``results``.

        Returns:
            ``(results, net, best_output, output_names)`` where
            ``best_output`` is ``[trainp, trainy, testp, testy]``.
        """
        print('\nTraining ElasticNet...')

        if Y.multiclass:
            net = SGDClassifier(penalty='elasticnet',
                                l1_ratio=.5,
                                random_state=seed)
            net.fit(trainx, trainy)
            testp = net.predict(testx)
            # Training predictions are part of the returned output; without
            # them this branch ended in an UnboundLocalError.
            trainp = net.predict(trainx)
            t_bacc = balanced_accuracy_score(testy, testp)

            outcome = Y.outcome_names[0]
            results['test_balanced_accuracy'][outcome].append(t_bacc)

        elif Y.multioutcome:
            net = MultiTaskElasticNet(random_state=seed)
            net.fit(trainx, trainy)
            testp = net.predict(testx)
            trainp = net.predict(trainx)

            # One R^2 / MAE pair per outcome column.
            for i, outcome in enumerate(Y.outcome_names):
                t_r2 = r2_score(testy[:, i], testp[:, i])
                t_mae = mean_absolute_error(testy[:, i], testp[:, i])

                results['test_r2_sklearn'][outcome].append(t_r2)
                results['test_mean_absolute_error'][outcome].append(t_mae)

        else:
            net = ElasticNet(random_state=seed)
            net.fit(trainx, trainy)
            testp = net.predict(testx)
            trainp = net.predict(trainx)
            t_r2 = r2_score(testy, testp)
            t_mae = mean_absolute_error(testy, testp)

            outcome = Y.outcome_names[0]
            results['test_r2_sklearn'][outcome].append(t_r2)
            results['test_mean_absolute_error'][outcome].append(t_mae)

        best_output = [trainp, trainy, testp, testy]
        output_names = ['trainp', 'trainy', 'testp', 'testy']

        return results, net, best_output, output_names
Beispiel #19
0
def train_multi_elasticnet(train_features, train_labels, num_alphas,
                           skip_cross_validation, alpha, l1_ratio, num_jobs):
    """Fit a multi-task elastic net, optionally tuning it by cross validation.

    Unless ``skip_cross_validation`` is true, a 5-fold MultiTaskElasticNetCV
    over a fixed grid of ``l1_ratio`` values and ``num_alphas`` alphas selects
    the hyperparameters; otherwise the supplied ``alpha`` and ``l1_ratio`` are
    used unchanged.  Features are assumed to be scaled/normalized already and
    ``train_labels`` is assumed to have more than one column.

    Returns a tuple ``(model, params)`` where ``params`` is a dict with the
    'alpha' and 'l1_ratio' used by the final fitted model.
    """
    iteration_cap = 10000
    tolerance = 0.0005
    chosen = {'alpha': alpha, 'l1_ratio': l1_ratio}

    if not skip_cross_validation:
        # 5-fold cross validation over this l1_ratio grid.
        l1_grid = [
            0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.925, 0.95, 0.975, 0.99, 0.999,
            0.9999
        ]
        searcher = MultiTaskElasticNetCV(l1_ratio=l1_grid,
                                         max_iter=iteration_cap,
                                         cv=5,
                                         n_alphas=num_alphas,
                                         n_jobs=num_jobs,
                                         normalize=False,
                                         tol=tolerance)
        searcher.fit(train_features, train_labels)
        chosen = {'alpha': searcher.alpha_, 'l1_ratio': searcher.l1_ratio_}

    final_model = MultiTaskElasticNet(alpha=chosen['alpha'],
                                      l1_ratio=chosen['l1_ratio'],
                                      normalize=False,
                                      max_iter=iteration_cap,
                                      tol=tolerance)
    final_model.fit(train_features, train_labels)

    return (final_model, chosen)
Beispiel #20
0
 def regressor_creator(indata, outdata):
     """Return an unfitted MultiTaskElasticNet capped at 3000 iterations.

     ``indata`` and ``outdata`` are accepted but not used.
     """
     regressor = MultiTaskElasticNet(max_iter=3000)
     return regressor
# This part computes the Multi-Task Elastic-Net's score with the optimal hyper-parameters.

#load necessary libs 
from sklearn.feature_selection import SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import MultiTaskElasticNet
from sklearn.cross_validation import train_test_split

# Split the dataset into training and test subsets.
features_train, features_test, labels_train, labels_test = train_test_split(features_sc,label_scm,test_size=0.33,random_state=42)

# Pre-processing: dimensionality reduction (SVD).
# The projection is learned on the training split only and then applied to
# both splits, so train and test features live in the same reduced space.
# Fitting a second SVD on the test split would project it onto different
# components than the ones the regressor was trained on.
svd1 = TruncatedSVD(n_components=9,random_state=1).fit(features_train)
features_train = svd1.transform(features_train)
features_test = svd1.transform(features_test)

# Fit the regressor and report its score on the held-out split.
mte = MultiTaskElasticNet(alpha=0.000000001,l1_ratio=0.01,random_state=1)
mte.fit(features_train,labels_train)
# Single-argument print works as a statement (Python 2) and a function (Python 3).
print("MultiTaskElasticNet %s" % mte.score(features_test,labels_test))
##########################################################################

#All of the codes end. 
#Thank you!




Beispiel #22
0
def GetAllModelsForComparison(X_train, Y_train):
    """Return a dict mapping class names to default-constructed instances.

    ``X_train`` and ``Y_train`` are accepted but not used.  Every key appears
    exactly once: repeated keys in a dict literal silently overwrite each
    other, so the repeats were dropped while keeping each key at the position
    of its first occurrence.  Entries that are commented out are kept as a
    record of what is intentionally excluded.

    NOTE(review): judging by their names, several entries (mixins, base
    classes, loss objects, ``Parallel``, ``StandardScaler``,
    ``LabelBinarizer``) are not predictive models -- confirm they belong here.
    """
    models = {
        'ARDRegression': ARDRegression(),
        'BayesianRidge': BayesianRidge(),
        'ElasticNet': ElasticNet(),
        'ElasticNetCV': ElasticNetCV(),
        'Hinge': Hinge(),
        #'Huber': Huber(),
        'HuberRegressor': HuberRegressor(),
        'Lars': Lars(),
        'LarsCV': LarsCV(),
        'Lasso': Lasso(),
        'LassoCV': LassoCV(),
        'LassoLars': LassoLars(),
        'LassoLarsCV': LassoLarsCV(),
        'LinearRegression': LinearRegression(),
        'Log': Log(),
        'LogisticRegression': LogisticRegression(),
        'LogisticRegressionCV': LogisticRegressionCV(),
        'ModifiedHuber': ModifiedHuber(),
        'MultiTaskElasticNet': MultiTaskElasticNet(),
        'MultiTaskElasticNetCV': MultiTaskElasticNetCV(),
        'MultiTaskLasso': MultiTaskLasso(),
        'MultiTaskLassoCV': MultiTaskLassoCV(),
        'OrthogonalMatchingPursuit': OrthogonalMatchingPursuit(),
        'OrthogonalMatchingPursuitCV': OrthogonalMatchingPursuitCV(),
        'PassiveAggressiveClassifier': PassiveAggressiveClassifier(),
        'PassiveAggressiveRegressor': PassiveAggressiveRegressor(),
        'Perceptron': Perceptron(),
        'RANSACRegressor': RANSACRegressor(),
        #'RandomizedLasso': RandomizedLasso(),
        #'RandomizedLogisticRegression': RandomizedLogisticRegression(),
        'Ridge': Ridge(),
        'RidgeCV': RidgeCV(),
        'RidgeClassifier': RidgeClassifier(),
        'SGDClassifier': SGDClassifier(),
        'SGDRegressor': SGDRegressor(),
        'SquaredLoss': SquaredLoss(),
        'TheilSenRegressor': TheilSenRegressor(),
        'BaseEstimator': BaseEstimator(),
        'ClassifierMixin': ClassifierMixin(),
        'LinearClassifierMixin': LinearClassifierMixin(),
        'LinearDiscriminantAnalysis': LinearDiscriminantAnalysis(),
        'QuadraticDiscriminantAnalysis': QuadraticDiscriminantAnalysis(),
        'StandardScaler': StandardScaler(),
        'TransformerMixin': TransformerMixin(),
        'KernelRidge': KernelRidge(),
        'RegressorMixin': RegressorMixin(),
        'LinearSVC': LinearSVC(),
        'LinearSVR': LinearSVR(),
        'NuSVC': NuSVC(),
        'NuSVR': NuSVR(),
        'OneClassSVM': OneClassSVM(),
        'SVC': SVC(),
        'SVR': SVR(),
        #'BallTree': BallTree(),
        #'DistanceMetric': DistanceMetric(),
        #'KDTree': KDTree(),
        'KNeighborsClassifier': KNeighborsClassifier(),
        'KNeighborsRegressor': KNeighborsRegressor(),
        'KernelDensity': KernelDensity(),
        #'LSHForest': LSHForest(),
        'LocalOutlierFactor': LocalOutlierFactor(),
        'NearestCentroid': NearestCentroid(),
        'NearestNeighbors': NearestNeighbors(),
        'RadiusNeighborsClassifier': RadiusNeighborsClassifier(),
        'RadiusNeighborsRegressor': RadiusNeighborsRegressor(),
        #'GaussianProcess': GaussianProcess(),
        'GaussianProcessRegressor': GaussianProcessRegressor(),
        'GaussianProcessClassifier': GaussianProcessClassifier(),
        'CCA': CCA(),
        'PLSCanonical': PLSCanonical(),
        'PLSRegression': PLSRegression(),
        'PLSSVD': PLSSVD(),
        #'ABCMeta': ABCMeta(),
        #'BaseDiscreteNB': BaseDiscreteNB(),
        #'BaseNB': BaseNB(),
        'BernoulliNB': BernoulliNB(),
        'GaussianNB': GaussianNB(),
        'LabelBinarizer': LabelBinarizer(),
        'MultinomialNB': MultinomialNB(),
        'DecisionTreeClassifier': DecisionTreeClassifier(),
        'DecisionTreeRegressor': DecisionTreeRegressor(),
        'ExtraTreeClassifier': ExtraTreeClassifier(),
        'AdaBoostClassifier': AdaBoostClassifier(),
        'AdaBoostRegressor': AdaBoostRegressor(),
        'BaggingClassifier': BaggingClassifier(),
        'BaggingRegressor': BaggingRegressor(),
        #'BaseEnsemble': BaseEnsemble(),
        'ExtraTreesClassifier': ExtraTreesClassifier(),
        'ExtraTreesRegressor': ExtraTreesRegressor(),
        'GradientBoostingClassifier': GradientBoostingClassifier(),
        'GradientBoostingRegressor': GradientBoostingRegressor(),
        'IsolationForest': IsolationForest(),
        'RandomForestClassifier': RandomForestClassifier(),
        'RandomForestRegressor': RandomForestRegressor(),
        'RandomTreesEmbedding': RandomTreesEmbedding(),
        #'VotingClassifier': VotingClassifier(),
        'MetaEstimatorMixin': MetaEstimatorMixin(),
        #'OneVsOneClassifier': OneVsOneClassifier(),
        #'OneVsRestClassifier': OneVsRestClassifier(),
        #'OutputCodeClassifier': OutputCodeClassifier(),
        'Parallel': Parallel(),
        #'ClassifierChain': ClassifierChain(),
        #'MultiOutputClassifier': MultiOutputClassifier(),
        #'MultiOutputEstimator': MultiOutputEstimator(),
        #'MultiOutputRegressor': MultiOutputRegressor(),
        'LabelPropagation': LabelPropagation(),
        'LabelSpreading': LabelSpreading(),
        'IsotonicRegression': IsotonicRegression(),
        'BernoulliRBM': BernoulliRBM(),
        'MLPClassifier': MLPClassifier(),
        'MLPRegressor': MLPRegressor()
    }
    return models
def test_check_estimator():
    """Verify that ``check_estimator`` fails on "bad" estimators.

    Not a complete test of all checks, which are very extensive: each broken
    estimator below is expected to trip one specific check, and real
    estimators are expected to pass at the end. Most cases are exercised
    with both the class and an instance.
    """
    # check that the estimator implements get_params
    msg = "it does not implement a 'get_params' methods"
    assert_raises_regex(TypeError, msg, check_estimator, object)
    assert_raises_regex(TypeError, msg, check_estimator, object())
    # check that we have a fit method
    msg = "object has no attribute 'fit'"
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator)
    assert_raises_regex(AttributeError, msg, check_estimator, BaseEstimator())
    # check that fit does input validation
    msg = "TypeError not raised"
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        BaseBadClassifier())
    # check that sample_weights in fit accepts pandas.Series type
    # (skipped silently when pandas is not installed)
    try:
        from pandas import Series  # noqa
        msg = ("Estimator NoSampleWeightPandasSeriesType raises error if "
               "'sample_weight' parameter is of type pandas.Series")
        assert_raises_regex(
            ValueError, msg, check_estimator, NoSampleWeightPandasSeriesType)
    except ImportError:
        pass
    # check that predict does input validation (rejects NaN and inf)
    msg = "Estimator doesn't check for NaN and inf in predict"
    assert_raises_regex(AssertionError, msg, check_estimator, NoCheckinPredict)
    assert_raises_regex(AssertionError, msg, check_estimator,
                        NoCheckinPredict())
    # check that estimator state does not change
    # at transform/predict/predict_proba time
    msg = 'Estimator changes __dict__ during predict'
    assert_raises_regex(AssertionError, msg, check_estimator, ChangesDict)
    # check that `fit` only changes attributes that
    # are private (start with an _ or end with a _).
    msg = ('Estimator ChangesWrongAttribute should not change or mutate  '
           'the parameter wrong_attribute from 0 to 1 during fit.')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, ChangesWrongAttribute)
    check_estimator(ChangesUnderscoreAttribute)
    # check that `fit` doesn't add any public attribute
    # (raw string: the pattern contains regex escapes for the parentheses)
    msg = (r'Estimator adds public attribute\(s\) during the fit method.'
           ' Estimators are only allowed to add private attributes'
           ' either started with _ or ended'
           ' with _ but wrong_attribute added')
    assert_raises_regex(AssertionError, msg,
                        check_estimator, SetsWrongAttribute)
    # check for invariant method
    name = NotInvariantPredict.__name__
    method = 'predict'
    msg = ("{method} of {name} is not invariant when applied "
           "to a subset.").format(method=method, name=name)
    assert_raises_regex(AssertionError, msg,
                        check_estimator, NotInvariantPredict)
    # check for sparse matrix input handling
    name = NoSparseClassifier.__name__
    msg = "Estimator %s doesn't seem to fail gracefully on sparse data" % name
    # the check for sparse input handling prints to the stdout,
    # instead of raising an error, so as not to remove the original traceback.
    # that means we need to jump through some hoops to catch it.
    old_stdout = sys.stdout
    string_buffer = StringIO()
    sys.stdout = string_buffer
    try:
        check_estimator(NoSparseClassifier)
    except Exception:
        # Any failure of the estimator itself is irrelevant here; only the
        # printed message is asserted on. KeyboardInterrupt and SystemExit
        # are deliberately allowed to propagate.
        pass
    finally:
        sys.stdout = old_stdout
    assert_true(msg in string_buffer.getvalue())

    # doesn't error on actual estimator
    check_estimator(AdaBoostClassifier)
    check_estimator(AdaBoostClassifier())
    check_estimator(MultiTaskElasticNet)
    check_estimator(MultiTaskElasticNet())
Beispiel #24
0
    def __init__(
            self,
            species: str,
            reprocess: Optional[bool] = False,
            gene_selection_method: Optional[Literal['deg', 'lasso',
                                                    'elastic-net']] = 'deg',
            model_cache_dir: Optional[str] = None,
            alpha: Optional[Union[float, Sequence[float]]] = 1e-2,
            learning_rate: Optional[float] = 1e-3,
            equal_weight: Optional[bool] = True,
            train_split: Optional[float] = 0.8,
            n_jobs: Optional[int] = 15,
            remove_correlated: Optional[Literal['both', 'ct',
                                                'region']] = None,
            normalize: Optional[bool] = False,
            dim_reduction: Optional[str] = None,
            n_components: Optional[int] = None):
        """Load one species' data, select genes and average per cell type.

        Reads ``withcolors/{species}_ex_colors.h5ad``, builds boolean gene
        masks for the region axis (``self.r_axis_mask``) and the cell-type
        axis (``self.ct_axis_mask``), averages expression within each cluster
        into ``self.data`` (rows are clusters, columns are variables), and
        caches the result under ``withcolors_preprocessed/``.

        Args:
            species: Species name; used in the file names and its first
                letter (upper-cased) is prefixed to every cluster label.
            reprocess: When False and a preprocessed pickle exists, load it
                and return immediately. In that case only ``data``,
                ``ct_axis_mask`` and ``r_axis_mask`` are set (besides
                ``learning_rate`` and ``device``).
            gene_selection_method: ``'deg'`` delegates to
                ``self._deg_select``; ``'lasso'`` / ``'elastic-net'`` fit a
                multi-task linear model per label type and keep variables
                with at least one non-zero coefficient.
            model_cache_dir: Directory in which fitted linear models are
                pickled and looked up. When None, models are fitted every
                time and not written to disk.
            alpha: Regularisation strength passed to the linear model.
            learning_rate: Stored on ``self.learning_rate``; not otherwise
                used in this method.
            equal_weight: When True, each observation is scaled by the
                square root of its label's relative frequency before fitting.
            train_split: Not used in this method.
            n_jobs: Number of threads handed to ``torch.set_num_threads``.
            remove_correlated: When not None, forwarded to
                ``self._remove_r_ct_correlated``.
            normalize: When True, divide every column of ``self.data`` by its
                mean over the clusters.
            dim_reduction: ``'pca'``, ``'umap'`` or ``'tsne'``; when given,
                the expression matrix is replaced by that embedding.
            n_components: Number of components for the dimensionality
                reduction.
        """
        super().__init__()
        torch.set_num_threads(n_jobs)

        filename = f'{species}_ex_colors'
        preprocessed_path = f'withcolors_preprocessed/{filename}.pickle'

        self.learning_rate = learning_rate
        self.device = 'cpu'

        # Use saved data if possible
        if not reprocess and os.path.exists(preprocessed_path):
            # NOTE: pickle can execute arbitrary code on load; this file must
            # only ever be one that this class wrote itself.
            with open(preprocessed_path, mode='rb') as file:
                data_dict = pickle.load(file)
            self.data = data_dict['data']
            self.ct_axis_mask = data_dict['ct_axis_mask']
            self.r_axis_mask = data_dict['r_axis_mask']
            # No need to do anything else
            return

        species_data = sc.read(f'withcolors/{filename}.h5ad')
        if dim_reduction is not None:
            sc.pp.pca(species_data, n_comps=n_components)
            sc.pp.highly_variable_genes(species_data)
            sc.pp.neighbors(species_data, n_pcs=n_components)
            if dim_reduction == 'pca':
                sc.tl.pca(species_data, n_comps=n_components)
            elif dim_reduction == 'umap':
                sc.tl.umap(species_data, n_components=n_components)
            elif dim_reduction == 'tsne':
                sc.tl.tsne(species_data, n_pcs=n_components)
            # Replace the expression matrix by the embedding and name the
            # new variables '<method>0', '<method>1', ...
            species_data = AnnData(species_data.obsm[f'X_{dim_reduction}'],
                                   obs=species_data.obs)
            species_data.var.index = pd.Index([
                f'{dim_reduction}{x}'
                for x in range(len(species_data.var.index))
            ])

        # Label each observation with its subregion and species
        species_data.obs['clusters'] = species_data.obs['clusters'].apply(
            lambda s: species[0].upper() + '_' + s)
        species_data.obs['subregion'] = species_data.obs['clusters'].apply(
            lambda s: s.split('.')[0])
        self.n_var = len(species_data.var.index)
        self.n_subregions = len(np.unique(species_data.obs['subregion']))
        self.n_clusters = len(np.unique(species_data.obs['clusters']))
        self.n_obs = len(species_data.obs.index)

        if gene_selection_method == 'deg':
            self._deg_select(dim_reduction, species_data)
        elif gene_selection_method in ['lasso', 'elastic-net']:
            for label in ['subregion', 'clusters']:
                if equal_weight:
                    # get count of number of occurrences of each label
                    label_to_count = species_data.obs[label].value_counts(
                        normalize=True).to_dict()
                    # Map each observation to its appropriate label appearance frequency
                    w = species_data.obs[label].map(label_to_count)
                    # Scale every observation (row) by the square root of its
                    # weight. Equivalent to left-multiplying by diag(sqrt(w))
                    # without building an n_obs x n_obs matrix.
                    row_scale = np.sqrt(np.asarray(w, dtype=float))
                    transcriptomes = row_scale[:, None] * species_data.X.toarray()
                else:
                    transcriptomes = species_data.X.toarray()

                # Models are only cached when a cache directory was given;
                # otherwise the path would point into a literal 'None/' dir.
                model_file = None
                if model_cache_dir is not None:
                    model_file = f'{model_cache_dir}/{gene_selection_method}/' \
                                 f'{species[0].upper()}_normalized-{equal_weight}_{label}_a-{alpha}.pt'

                if model_file is not None and os.path.exists(model_file):
                    # NOTE: unpickling is only safe for files written by this
                    # class; never point model_cache_dir at untrusted data.
                    with open(model_file, 'rb') as file:
                        model = pickle.load(file)
                else:
                    # Create one-hot encoding of labels
                    num_labels = self.n_subregions if label == 'subregion' else self.n_clusters
                    label_to_id = {
                        r: i
                        for i, r in enumerate(
                            np.unique(species_data.obs[label]))
                    }
                    labels = species_data.obs[label].map(label_to_id)
                    labels_expanded = np.zeros((self.n_obs, num_labels))
                    labels_expanded[np.arange(self.n_obs), labels] = 1
                    if gene_selection_method == 'lasso':
                        model = MultiTaskLasso(alpha=alpha, max_iter=10000)
                    else:
                        model = MultiTaskElasticNet(alpha=alpha,
                                                    max_iter=10000)
                    model.fit(transcriptomes, labels_expanded)
                    if model_file is not None:
                        os.makedirs(os.path.dirname(model_file),
                                    exist_ok=True)
                        with open(model_file, 'wb') as file:
                            pickle.dump(model, file, protocol=5)

                # A variable is kept when it has a non-zero coefficient for
                # at least one label.
                selected = (model.coef_ != 0).any(axis=0)
                if label == 'subregion':
                    self.r_axis_mask = selected
                else:
                    self.ct_axis_mask = selected
        print(
            f'Before removing correlated genes, found {self.r_axis_mask.sum()} region genes '
            f'and {self.ct_axis_mask.sum()} cell type genes.')

        if remove_correlated is not None:
            self._remove_r_ct_correlated(remove_correlated, species_data)
            print(
                f'After removing correlated genes, found {self.r_axis_mask.sum()} region genes '
                f'and {self.ct_axis_mask.sum()} cell type genes.')

        # Average transcriptomes within each cell type and put into data frame with cell types as rows and genes as cols
        ct_names = np.unique(species_data.obs['clusters'])
        ct_avg_data = [
            species_data[species_data.obs['clusters'] == ct].X.mean(axis=0)
            for ct in ct_names
        ]
        self.data = pd.concat([
            pd.DataFrame(avg.reshape((1, -1)),
                         columns=species_data.var.index,
                         index=[cluster_name])
            for avg, cluster_name in zip(ct_avg_data, ct_names)
        ])
        # Optionally divide every column (gene) by its mean over the cell
        # types; rows stay cell types and columns stay genes.
        if normalize:
            self.data = self.data.div(self.data.mean(axis=0).to_numpy(),
                                      axis=1)  # noqa

        # Save data
        data_dict = {
            'data': self.data,
            'ct_axis_mask': self.ct_axis_mask,
            'r_axis_mask': self.r_axis_mask
        }
        # Make sure the output directory exists so the finished work is not
        # lost to a FileNotFoundError at the very end.
        os.makedirs(os.path.dirname(preprocessed_path), exist_ok=True)
        with open(preprocessed_path, mode='wb') as file:
            pickle.dump(data_dict, file)
Beispiel #25
0
def test_check_estimator():
    """Exercise ``check_estimator`` against deliberately broken estimators.

    Only a sample of the (very extensive) checks is covered: every bad
    estimator below is expected to trip one specific check, and a handful of
    well-behaved estimators are expected to pass.
    """
    # A class (rather than an instance) is rejected, and a plain object
    # lacks the attributes check_estimator relies on.
    assert_raises_regex(TypeError, "Passing a class was deprecated",
                        check_estimator, object)
    assert_raises_regex(AttributeError, "object has no attribute '_get_tags'",
                        check_estimator, object())

    # get_params has to report exactly what set_params was given
    params_mismatch = (
        "get_params result does not match what was passed to set_params")
    assert_raises_regex(AssertionError, params_mismatch, check_estimator,
                        ModifiesValueInsteadOfRaisingError())
    assert_warns(UserWarning, check_estimator, RaisesErrorInSetParams())
    assert_raises_regex(AssertionError, params_mismatch, check_estimator,
                        ModifiesAnotherValue())

    # an estimator needs a fit method
    assert_raises_regex(AttributeError, "object has no attribute 'fit'",
                        check_estimator, BaseEstimator())

    # fit has to validate its input
    assert_raises_regex(AssertionError, "ValueError not raised",
                        check_estimator, BaseBadClassifier())

    # sample_weight passed to fit may be a pandas.Series
    # (skipped silently when pandas is unavailable)
    try:
        from pandas import Series  # noqa
        assert_raises_regex(
            ValueError,
            ("Estimator NoSampleWeightPandasSeriesType raises error if "
             "'sample_weight' parameter is of type pandas.Series"),
            check_estimator,
            NoSampleWeightPandasSeriesType())
    except ImportError:
        pass

    # predict has to validate its input (NaN and inf)
    assert_raises_regex(AssertionError,
                        "Estimator doesn't check for NaN and inf in predict",
                        check_estimator, NoCheckinPredict())

    # transform/predict/predict_proba must leave the estimator state alone
    assert_raises_regex(AssertionError,
                        'Estimator changes __dict__ during predict',
                        check_estimator, ChangesDict())

    # fit may only touch private attributes
    # (leading or trailing underscore)
    assert_raises_regex(
        AssertionError,
        ('Estimator ChangesWrongAttribute should not change or mutate  '
         'the parameter wrong_attribute from 0 to 1 during fit.'),
        check_estimator,
        ChangesWrongAttribute())
    check_estimator(ChangesUnderscoreAttribute())

    # fit must not create new public attributes
    assert_raises_regex(
        AssertionError,
        (r'Estimator adds public attribute\(s\) during the fit method.'
         ' Estimators are only allowed to add private attributes'
         ' either started with _ or ended'
         ' with _ but wrong_attribute added'),
        check_estimator,
        SetsWrongAttribute())

    # predictions on a subset must agree with predictions on the full data
    not_invariant = ("{method} of {name} is not invariant when applied "
                     "to a subset.").format(
                         method='predict', name=NotInvariantPredict.__name__)
    assert_raises_regex(AssertionError, not_invariant, check_estimator,
                        NotInvariantPredict())

    # Sparse input handling: the check prints to stdout instead of raising
    # (so that the original traceback is kept), hence stdout is swapped for
    # an in-memory buffer while the check runs.
    sparse_complaint = (
        "Estimator %s doesn't seem to fail gracefully on sparse data"
        % NoSparseClassifier.__name__)
    captured = StringIO()
    saved_stdout = sys.stdout
    sys.stdout = captured
    try:
        check_estimator(NoSparseClassifier())
    except Exception:
        pass
    finally:
        sys.stdout = saved_stdout
    assert sparse_complaint in captured.getvalue()

    # Large indices test on bad estimator
    assert_raises_regex(
        AssertionError,
        ("Estimator LargeSparseNotSupportedClassifier doesn't seem to "
         r"support \S{3}_64 matrix, and is not failing gracefully.*"),
        check_estimator,
        LargeSparseNotSupportedClassifier())

    # a binary-only estimator without the tag must fail
    assert_raises_regex(ValueError, 'Only 2 classes are supported',
                        check_estimator, UntaggedBinaryClassifier())

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # real estimators pass
    for good_estimator in (LogisticRegression(),
                           LogisticRegression(C=0.01),
                           MultiTaskElasticNet()):
        check_estimator(good_estimator)

    # a binary-only estimator carrying the tag passes
    check_estimator(TaggedBinaryClassifier())

    # regressor with the requires_positive_y estimator tag
    assert_raises_regex(ValueError, 'negative y values not supported!',
                        check_estimator, RequiresPositiveYRegressor())
Beispiel #26
0
def regressor_creator(indata, outdata):
    """Return a new, unfitted ``MultiTaskElasticNet`` with default settings.

    ``indata`` and ``outdata`` are accepted but not used.
    """
    regressor = MultiTaskElasticNet()
    return regressor
Beispiel #27
0
def train_linear_model(X, y, random_state=1, test_size=0.2,
                       regularization_type='elasticnet', k_fold=5,
                       max_iter=1000000, tol=0.0001,
                       l1_ratio=None):
    """
    Function to train linear model with regularization and cross-validation.

    The data is split into train and test sets, the descriptors are
    standardized with the training mean and standard deviation, the
    regularization strength is chosen by k-fold cross-validation and the
    model is refit with the selected hyperparameters.

    Args:
        X (pandas.DataFrame): dataframe of descriptors.
        y (pandas.DataFrame): dataframe of cycle lifetimes; must be 2-D
            since ``y.shape[1]`` selects single- vs multi-output fitting.
        random_state (int): seed for train/test split.
        test_size (float): proportion of the dataset reserved for model evaluation.
        regularization_type (str): 'lasso', 'ridge' or 'elasticnet' (with cv).
            'lasso' and 'ridge' require a single output column; 'elasticnet'
            with more than one column performs multitask regression.
        k_fold (int): k in k-fold cross-validation.
        max_iter (int): maximum number of iterations for model fitting.
        tol (float): tolerance for optimization (used by the lasso CV search).
        l1_ratio ([float]): list of lasso to ridge ratios for elasticnet.
            Defaults to [.1, .5, .7, .9, .95, 1].

    Returns:
        linear_model: fitted scikit-learn model.
        mu: per-column mean of the descriptors used in training.
        s: per-column std dev of the descriptors used in training.
        relative_prediction_error (numpy.ndarray): per-output value of
            1.96 * sqrt(MSE(1, prediction / actual) / n_test).
        Rsq (float): ``linear_model.score`` on the standardized test set.
        hyperparameters (dict): settings used, including the selected
            'alpha' and 'l1_ratio'.

    Raises:
        NotImplementedError: for any other combination of
            ``regularization_type`` and number of output columns.

    """
    if l1_ratio is None:
        l1_ratio = [.1, .5, .7, .9, .95, 1]
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=test_size, random_state=random_state)

    # Standardize (training) data after train/test split; the test set is
    # scaled with the training statistics so no test information leaks in.
    mu = np.mean(X_train, axis=0)
    s = np.std(X_train, axis=0)
    X_scaled = (X_train - mu) / s
    X_test_scaled = (X_test - mu) / s
    hyperparameters = {'random_state': random_state,
                       'test_size': test_size,
                       'k_fold': k_fold,
                       'tol': tol,
                       'max_iter': max_iter
                       }
    # NOTE: the `normalize` keyword is intentionally not passed to any of the
    # estimators below. False was its default, and the keyword no longer
    # exists in recent scikit-learn releases.
    if regularization_type == 'lasso' and y.shape[1] == 1:
        lassocv = LassoCV(fit_intercept=True, alphas=None, tol=tol,
                          cv=k_fold, max_iter=max_iter)
        lassocv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = lassocv.alpha_
        linear_model = Lasso(fit_intercept=True, alpha=alpha_opt,
                             max_iter=max_iter)
        linear_model.fit(X_scaled, y_train.values)
        hyperparameters['l1_ratio'] = 1

    elif regularization_type == 'ridge' and y.shape[1] == 1:
        # NOTE(review): alphas=None is passed explicitly here; confirm the
        # installed scikit-learn accepts it for RidgeCV.
        ridgecv = RidgeCV(fit_intercept=True, alphas=None, cv=k_fold)
        ridgecv.fit(X_scaled, y_train.values.ravel())
        # Set optimal alpha and refit model
        alpha_opt = ridgecv.alpha_
        linear_model = Ridge(fit_intercept=True, alpha=alpha_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = 0

    elif regularization_type == 'elasticnet' and y.shape[1] == 1:
        elasticnetcv = ElasticNetCV(fit_intercept=True,
                                    alphas=None, cv=k_fold,
                                    l1_ratio=l1_ratio, max_iter=max_iter)
        elasticnetcv.fit(X_scaled, y_train.values.ravel())

        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = elasticnetcv.alpha_
        l1_ratio_opt = elasticnetcv.l1_ratio_
        linear_model = ElasticNet(fit_intercept=True,
                                  l1_ratio=l1_ratio_opt,
                                  alpha=alpha_opt, max_iter=max_iter)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt

    # If more than 1 outcome present, perform multitask regression
    elif regularization_type == 'elasticnet' and y.shape[1] > 1:
        multi_elasticnet_CV = MultiTaskElasticNetCV(fit_intercept=True, cv=k_fold,
                                                    l1_ratio=l1_ratio, max_iter=max_iter)
        multi_elasticnet_CV.fit(X_scaled, y_train)
        # Set optimal alpha and l1_ratio. Refit model
        alpha_opt = multi_elasticnet_CV.alpha_
        l1_ratio_opt = multi_elasticnet_CV.l1_ratio_
        linear_model = MultiTaskElasticNet(fit_intercept=True,
                                           max_iter=max_iter)
        linear_model.set_params(alpha=alpha_opt, l1_ratio=l1_ratio_opt)
        linear_model.fit(X_scaled, y_train)
        hyperparameters['l1_ratio'] = l1_ratio_opt
    else:
        raise NotImplementedError

    y_pred = linear_model.predict(X_test_scaled)
    Rsq = linear_model.score(X_test_scaled, y_test)
    # Compute 95% confidence interval
    # Multioutput = 'raw_values' provides prediction error per output
    pred_actual_ratio = [pred / actual
                         for pred, actual in zip(y_pred, np.array(y_test))]
    relative_prediction_error = 1.96*np.sqrt(mean_squared_error(np.ones(y_pred.shape),
                                                                pred_actual_ratio,
                                                                multioutput='raw_values')/y_pred.shape[0])
    hyperparameters['alpha'] = alpha_opt
    return linear_model, mu, s, relative_prediction_error, Rsq, hyperparameters
Beispiel #28
0
                                      model_name=f'best_model_batch{ind}.h5')
              ])
    all_predictions.append(model.predict(X_test))

# Fit one more model from create_model() on the full training set and add
# its test predictions to the ensemble.
model = create_model()
model.fit(X_train, y_train, epochs=33, batch_size=32, verbose=1)
all_predictions.append(model.predict(X_test))

# Add one MultiTaskElasticNet per fold of a shuffled 5-fold split, each
# trained on that fold's training indices only.
kf = KFold(n_splits=5, random_state=2019, shuffle=True)
for ind, (tr, val) in enumerate(kf.split(X_train)):
    X_tr = X_train[tr]
    y_tr = y_train[tr]
    # The validation part of the fold is sliced but not used below.
    X_vl = X_train[val]
    y_vl = y_train[val]

    model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
    model.fit(X_tr, y_tr)
    all_predictions.append(model.predict(X_test))

# ... plus one MultiTaskElasticNet trained on the full training set.
model = MultiTaskElasticNet(alpha=0.001, random_state=42, l1_ratio=0.5)
model.fit(X_train, y_train)
all_predictions.append(model.predict(X_test))

# Rank-average the ensemble: within every prediction matrix each column is
# replaced by its ranks, then the rank matrices are averaged over all models.
test_preds = np.array([
    np.array([rankdata(c) for c in p.T]).T for p in all_predictions
]).mean(axis=0)
# Rescale by (max + 1) so every value is below 1, and add a tiny offset.
max_val = test_preds.max() + 1
test_preds = test_preds / max_val + 1e-12
# Fill the target columns of the sample submission and write it out.
submission = pd.read_csv(path_join(data_dir, 'sample_submission.csv'))
submission[targets] = test_preds
submission.to_csv("submission.csv", index=False)
Beispiel #29
0
def test_check_estimator():
    """Exercise ``check_estimator`` against deliberately broken estimators.

    This samples the (very extensive) set of checks: every bad estimator
    below must make ``check_estimator`` fail with a specific error, while
    the well-behaved estimators must pass.
    """

    def expect_failure(exc_type, pattern, estimator_factory):
        # The estimator is instantiated inside the ``raises`` context so the
        # order of operations matches a hand-written ``with`` block.
        with raises(exc_type, match=pattern):
            check_estimator(estimator_factory())

    # passing a class instead of an instance is rejected
    with raises(TypeError, match="Passing a class was deprecated"):
        check_estimator(object)

    # the "default_constructible" test has to catch mutable parameters
    check_estimator(HasImmutableParameters())  # should pass
    expect_failure(
        AssertionError,
        ("Parameter 'p' of estimator 'HasMutableParameters' is of type "
         "object which is not allowed"),
        HasMutableParameters,
    )

    # get_params has to report exactly what set_params was given
    params_mismatch = (
        "get_params result does not match what was passed to set_params")
    expect_failure(AssertionError, params_mismatch,
                   ModifiesValueInsteadOfRaisingError)
    with warnings.catch_warnings(record=True) as caught:
        check_estimator(RaisesErrorInSetParams())
    assert UserWarning in [warning.category for warning in caught]

    expect_failure(AssertionError, params_mismatch, ModifiesAnotherValue)

    # an estimator needs a fit method
    expect_failure(AttributeError, "object has no attribute 'fit'",
                   BaseEstimator)

    # fit has to validate its input
    expect_failure(AssertionError, "Did not raise", BaseBadClassifier)

    # sample_weight passed to fit may be a pandas.Series
    # (skipped silently when pandas is unavailable)
    try:
        from pandas import Series  # noqa

        expect_failure(
            ValueError,
            ("Estimator NoSampleWeightPandasSeriesType raises error if "
             "'sample_weight' parameter is of type pandas.Series"),
            NoSampleWeightPandasSeriesType,
        )
    except ImportError:
        pass

    # predict has to validate its input (NaN and inf)
    expect_failure(
        AssertionError,
        "Estimator NoCheckinPredict doesn't check for NaN and inf in predict",
        NoCheckinPredict,
    )

    # transform/predict/predict_proba must leave the estimator state alone
    expect_failure(AssertionError,
                   "Estimator changes __dict__ during predict",
                   ChangesDict)

    # fit may only touch private attributes
    # (leading or trailing underscore)
    expect_failure(
        AssertionError,
        ("Estimator ChangesWrongAttribute should not change or mutate  "
         "the parameter wrong_attribute from 0 to 1 during fit."),
        ChangesWrongAttribute,
    )
    check_estimator(ChangesUnderscoreAttribute())

    # fit must not create new public attributes
    expect_failure(
        AssertionError,
        (r"Estimator adds public attribute\(s\) during the fit method."
         " Estimators are only allowed to add private attributes"
         " either started with _ or ended"
         " with _ but wrong_attribute added"),
        SetsWrongAttribute,
    )

    # predictions must not depend on the order of the samples
    expect_failure(
        AssertionError,
        ("{method} of {name} is not invariant when applied to a dataset"
         "with different sample order.").format(
             method="predict", name=NotInvariantSampleOrder.__name__),
        NotInvariantSampleOrder,
    )

    # predictions on a subset must agree with predictions on the full data
    expect_failure(
        AssertionError,
        ("{method} of {name} is not invariant when applied to a subset."
         ).format(method="predict", name=NotInvariantPredict.__name__),
        NotInvariantPredict,
    )

    # sparse matrix input has to be handled gracefully
    expect_failure(
        AssertionError,
        ("Estimator %s doesn't seem to fail gracefully on sparse data"
         % NoSparseClassifier.__name__),
        NoSparseClassifier,
    )

    # Large indices test on bad estimator
    expect_failure(
        AssertionError,
        ("Estimator LargeSparseNotSupportedClassifier doesn't seem to "
         r"support \S{3}_64 matrix, and is not failing gracefully.*"),
        LargeSparseNotSupportedClassifier,
    )

    # a binary-only estimator without the tag must fail
    expect_failure(ValueError, "Only 2 classes are supported",
                   UntaggedBinaryClassifier)

    # non-regression test for estimators transforming to sparse data
    check_estimator(SparseTransformer())

    # real estimators pass
    check_estimator(LogisticRegression())
    check_estimator(LogisticRegression(C=0.01))
    check_estimator(MultiTaskElasticNet())

    # a binary-only estimator carrying the tag passes
    check_estimator(TaggedBinaryClassifier())

    # regressor with the requires_positive_y estimator tag
    expect_failure(ValueError, "negative y values not supported!",
                   RequiresPositiveYRegressor)

    # a classifier with the poor_score tag does not raise
    check_estimator(PoorScoreLogisticRegression())
                   print_cost=True,
                   random_state=42)
# Train the network (constructed above, outside this excerpt) and report the
# elapsed time; t0 is taken before this point.
nn = nn.fit(X_train, Y_train)
t1 = dt()
print('\nRuntime (s):', t1 - t0, '\n') 
# Plot the recorded cost and metric curves, once at full resolution and once
# using every 1000th entry.
# NOTE(review): assumes nn.costs / nn.metric hold one value per iteration
# (n_iters entries) - confirm against the network class.
long_seq = np.arange(0, n_iters, 1)
short_seq = np.arange(0, n_iters, 1000)
plt.figure()
plt.plot(long_seq, nn.costs)
plt.plot(short_seq, nn.costs[::1000])
plt.figure()
plt.plot(long_seq, nn.metric)
plt.plot(short_seq, nn.metric[::1000])

# Evaluate the network on the held-out test set.
h_test = nn.predict(X_test)
print("NN Test RMSE: ", np.sqrt(np.mean((h_test - Y_test)**2)))
print("NN Test my R2: ", cust_r2(Y_test, h_test))
#print("NN Test Accuracy: ", np.mean(one_hot_decode(h_test) == one_hot_decode(Y_test_oh)))

# Linear baseline: a multi-task elastic net with a very small l1_ratio, i.e.
# a penalty that is almost entirely L2.
#mod = LinearRegression().fit(X_train, Y_train.reshape(-1, ))
mod = MultiTaskElasticNet(l1_ratio=0.00001).fit(X_train, Y_train)

#print(wb['bias0'], wb['Weight0'].reshape(-1, ))
#print(glm.intercept_, glm.coef_.reshape(-1, ))
print("GLM Test my R2: ", cust_r2(Y_test, mod.predict(X_test)))
print("GLM Test RMSE: ", np.sqrt(np.mean((mod.predict(X_test) - Y_test)**2)))


# Draw predictive samples from the network and plot a histogram of the first
# row of the result.
# NOTE(review): presumably row 0 corresponds to the first test point - confirm
# the layout returned by draw_predictive_samples.
samples = nn.draw_predictive_samples(X_test, n_samples=10000, n_outputs=1)
plt.figure()
plt.hist(samples[0, :], bins=30)
Beispiel #31
0
    def _rolling_tsfresh_features(self, frame, max_timeshift):
        """Build the tsfresh feature matrix and targets for every column of ``frame``.

        Each column is turned into a rolling forecasting frame with
        ``make_forecasting_frame`` and passed through ``extract_features``
        using ``EfficientFCParameters``. The previous target value is added
        as ``feature_last_value`` and all feature names are prefixed with
        the positional index of the source column so that they stay unique.

        Args:
            frame (pandas.DataFrame): one series per column
            max_timeshift (int): rolling window length passed to tsfresh

        Returns:
            tuple of (features, targets) DataFrames, concatenated column-wise
        """
        from tsfresh.utilities.dataframe_functions import make_forecasting_frame
        from tsfresh.utilities.dataframe_functions import impute as tsfresh_impute

        # NOTE(review): extract_features and EfficientFCParameters are not
        # imported in this method; they are assumed to be available at module
        # level (next to the _has_tsfresh flag) -- confirm.
        features = pd.DataFrame()
        targets = pd.DataFrame()
        for counter, column in enumerate(frame.columns):
            df_shift, current_y = make_forecasting_frame(
                frame[column],
                kind="time_series",
                max_timeshift=max_timeshift,
                rolling_direction=1,
            )
            current_X = extract_features(
                df_shift,
                column_id="id",
                column_sort="time",
                column_value="value",
                impute_function=tsfresh_impute,
                show_warnings=False,
                default_fc_parameters=EfficientFCParameters(),
                n_jobs=1,
            )
            current_X["feature_last_value"] = current_y.shift(1)
            current_X.rename(
                columns=lambda name: str(counter) + '_' + name, inplace=True
            )

            features = pd.concat([features, current_X], axis=1)
            targets = pd.concat([targets, current_y], axis=1)
        return features, targets

    def predict(
        self,
        forecast_length: int,
        future_regressor=None,
        just_point_forecast: bool = False,
    ):
        """Generates forecast data immediately following dates of index supplied to .fit()

        Features are extracted from ``self.df_train`` with tsfresh, optionally
        reduced by ``self.feature_selection``, and a multi-output regressor
        chosen by ``self.regression_model`` is fitted on them. The forecast is
        then produced one step at a time: each prediction is appended to the
        history and the features for the next step are recomputed from the
        last ``self.max_timeshift`` rows.

        Args:
            forecast_length (int): Number of periods of data to forecast ahead
            future_regressor: additional regressor; accepted for interface
                compatibility but not used by this method
            just_point_forecast (bool): If True, return a pandas.DataFrame of just point forecasts

        Returns:
            Either a PredictionObject of forecasts and metadata, or
            if just_point_forecast == True, a dataframe of point forecasts

        Raises:
            ImportError: if tsfresh is not installed
        """
        if not _has_tsfresh:
            raise ImportError("Package tsfresh is required")
        predictStartTime = datetime.datetime.now()

        max_timeshift = self.max_timeshift
        regression_model = self.regression_model
        feature_selection = self.feature_selection

        sktraindata = self.df_train.copy()

        X, y = self._rolling_tsfresh_features(sktraindata, max_timeshift)

        # drop constant features
        X = X.loc[:, X.apply(pd.Series.nunique) != 1]
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(0)
        y = y.ffill().bfill()

        # Every selector keeps X as a labelled DataFrame (via get_support):
        # the forecasting loop below re-selects the same features by name
        # with ``x_dat[X.columns]``, which needs the original column labels.
        if feature_selection == 'Variance':
            from sklearn.feature_selection import VarianceThreshold

            selector = VarianceThreshold(threshold=(0.15))
            selector.fit(X)
            X = X.loc[:, selector.get_support()]
        elif feature_selection == 'Percentile':
            from sklearn.feature_selection import SelectPercentile, chi2

            selector = SelectPercentile(chi2, percentile=20)
            selector.fit(X, y[y.columns[0]])
            X = X.loc[:, selector.get_support()]
        elif feature_selection == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor
            from sklearn.feature_selection import SelectFromModel

            clf = DecisionTreeRegressor()
            clf = clf.fit(X, y)
            selector = SelectFromModel(clf, prefit=True)
            X = X.loc[:, selector.get_support()]
        elif feature_selection == 'Lasso':
            from sklearn.linear_model import MultiTaskLasso
            from sklearn.feature_selection import SelectFromModel

            clf = MultiTaskLasso(max_iter=2000)
            clf = clf.fit(X, y)
            selector = SelectFromModel(clf, prefit=True)
            X = X.loc[:, selector.get_support()]

        # Drop first line: its "feature_last_value" comes from shift(1) and
        # therefore had no real predecessor.
        X = X.iloc[1:, ]
        y = y.iloc[1:]

        index = self.create_forecast_index(forecast_length=forecast_length)

        if regression_model == 'ElasticNet':
            from sklearn.linear_model import MultiTaskElasticNet

            regr = MultiTaskElasticNet(alpha=1.0,
                                       random_state=self.random_seed)
        elif regression_model == 'DecisionTree':
            from sklearn.tree import DecisionTreeRegressor

            regr = DecisionTreeRegressor(random_state=self.random_seed)
        elif regression_model == 'MLP':
            from sklearn.neural_network import MLPRegressor

            # relu/tanh lbfgs/adam layer_sizes (100) (10)
            regr = MLPRegressor(
                hidden_layer_sizes=(10, 25, 10),
                verbose=self.verbose_bool,
                max_iter=200,
                activation='tanh',
                solver='lbfgs',
                random_state=self.random_seed,
            )
        elif regression_model == 'KNN':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.neighbors import KNeighborsRegressor

            # KNeighborsRegressor is deterministic and takes no random_state;
            # passing one raises TypeError.
            regr = MultiOutputRegressor(KNeighborsRegressor())
        elif regression_model == 'Adaboost':
            from sklearn.multioutput import MultiOutputRegressor
            from sklearn.ensemble import AdaBoostRegressor

            regr = MultiOutputRegressor(AdaBoostRegressor(
                n_estimators=200))  # , random_state=self.random_seed))
        else:
            # Any other value falls back to a random forest.
            from sklearn.ensemble import RandomForestRegressor

            regr = RandomForestRegressor(random_state=self.random_seed,
                                         n_estimators=1000,
                                         verbose=self.verbose)

        regr.fit(X, y)

        combined_index = self.df_train.index.append(index)
        forecast = pd.DataFrame()
        # Positional column labels so that the predictions (which come back
        # as 0..n-1) can be appended to the history below.
        sktraindata.columns = list(range(len(sktraindata.columns)))

        for _ in range(forecast_length):
            x_dat, _unused_targets = self._rolling_tsfresh_features(
                sktraindata.tail(max_timeshift), max_timeshift
            )

            # Keep exactly the features (and order) the regressor was fit on.
            x_dat = x_dat[X.columns]
            rfPred = pd.DataFrame(regr.predict(x_dat.tail(1).values))

            forecast = pd.concat([forecast, rfPred], axis=0, ignore_index=True)
            sktraindata = pd.concat([sktraindata, rfPred],
                                    axis=0,
                                    ignore_index=True)
            sktraindata.index = combined_index[:len(sktraindata.index)]

        forecast.columns = self.column_names
        forecast.index = index

        if just_point_forecast:
            return forecast

        upper_forecast, lower_forecast = Point_to_Probability(
            self.df_train,
            forecast,
            prediction_interval=self.prediction_interval)

        predict_runtime = datetime.datetime.now() - predictStartTime
        prediction = PredictionObject(
            model_name=self.name,
            forecast_length=forecast_length,
            forecast_index=forecast.index,
            forecast_columns=forecast.columns,
            lower_forecast=lower_forecast,
            forecast=forecast,
            upper_forecast=upper_forecast,
            prediction_interval=self.prediction_interval,
            predict_runtime=predict_runtime,
            fit_runtime=self.fit_runtime,
            model_parameters=self.get_params(),
        )
        return prediction
Beispiel #32
0
class model:
    """Multi-task elastic-net model with recursive one-step-ahead forecasting.

    The predictors ``X`` are optionally expanded with polynomial terms
    (``params['NONLIN_TYPE'] == 'POLY'``, order ``params['ORDER']``) and
    optionally standardized (``params['STANDARDIZE']``). The regularization
    strength is chosen by a 10-fold grid search and the final
    ``MultiTaskElasticNet`` is stored in ``self.model``.

    ``X`` must contain a ``'time'`` column. ``forecast`` additionally assumes
    that the columns of ``X`` are the target columns of ``Y`` followed by
    ``'time'`` (it rebuilds a predictor row from the last targets plus the
    next time value) -- verify against the caller.
    """

    def __init__(self, params, X, Y):
        self.params = params
        self.original_predictors = list(X)
        self.X = self._prepare_predictors(X)
        self.Y = Y
        if params['STANDARDIZE']:
            self.standardize()
        else:
            # Identity statistics let every later (de)standardization formula
            # run unchanged when standardization is switched off.
            self._set_identity_scaling()
        self.predictor_names = list(self.X)
        self.target_names = list(Y)
        self.Y_final = self.Y.iloc[-1, :]
        # Read the last time value from the raw input, not from self.X:
        # forecast() adds 1 to it and standardizes the result afterwards, so
        # it must be on the original scale.
        self.time = X['time'].iloc[-1]
        self.date = self.X.index[-1]
        self.make_model()

    def _prepare_predictors(self, X):
        """Return the design matrix for ``X``: polynomial terms are added in
        'POLY' mode, otherwise a copy of ``X`` is returned."""
        if self.params['NONLIN_TYPE'] == 'POLY':
            return self.add_nonlinear_terms(X)
        return X.copy()

    def _set_identity_scaling(self):
        """Store zero means and unit standard deviations for X and Y."""
        self.X_mean = pd.Series(0.0, index=self.X.columns)
        self.X_std = pd.Series(1.0, index=self.X.columns)
        self.Y_mean = pd.Series(0.0, index=self.Y.columns)
        self.Y_std = pd.Series(1.0, index=self.Y.columns)

    def add_nonlinear_terms(self, X):
        """Return ``X`` expanded with polynomial terms of order ``params['ORDER']``."""
        df, var_names = add_polynomial_terms(X, list(X), self.params['ORDER'])
        return df

    def standardize(self):
        """Center and scale ``self.X`` and ``self.Y`` column-wise, keeping the
        statistics in ``X_mean``/``X_std``/``Y_mean``/``Y_std``.

        A zero standard deviation (constant column) is replaced by 1 so the
        column becomes all zeros instead of NaN.
        """
        self.X_mean = self.X.mean()
        self.Y_mean = self.Y.mean()
        self.X_std = self.X.std().replace(0, 1.0)
        self.Y_std = self.Y.std().replace(0, 1.0)
        self.X = (self.X - self.X_mean) / self.X_std
        self.Y = (self.Y - self.Y_mean) / self.Y_std

    def make_model(self):
        """Grid-search ``alpha``, fit the final model and store in-sample
        predictions (on the original target scale) in ``self.predicted``."""
        max_iter = 1000
        tol = 0.015
        l1_ratio = 0.8  # we want a relatively sparse model
        elastic = MultiTaskElasticNet(fit_intercept=True, max_iter=max_iter,
                                      tol=tol, l1_ratio=l1_ratio)

        # Note that we are assuming that errors are independent of each other
        # GIVEN THE PREDICTORS; otherwise cross validation won't be applicable.
        print('################ Find hyper-parameter values#######################')
        search = GridSearchCV(estimator=elastic,
                              param_grid={'alpha': np.logspace(-5, 2, 8)},
                              scoring='neg_mean_squared_error',
                              n_jobs=1, refit=True, cv=10)
        search.fit(self.X, self.Y)

        # Now create a final elastic net model using the optimal hyper parameters
        print('################ Build final model ##############################')
        optimal_alpha = search.best_params_['alpha']
        self.model = MultiTaskElasticNet(fit_intercept=True, alpha=optimal_alpha,
                                         l1_ratio=l1_ratio, max_iter=max_iter,
                                         tol=tol)
        self.model.fit(self.X.values, self.Y.values)
        self.predicted = pd.DataFrame(index=self.Y.index,
                                      columns=self.Y.columns,
                                      data=self.model.predict(self.X.values))
        self.predicted = self.predicted * self.Y_std + self.Y_mean

    def predict(self, X, plot=False, Y_True=None, plot_list=None):
        """Predict targets for raw predictors ``X`` on the original scale.

        Args:
            X: DataFrame of raw (unexpanded, unstandardized) predictors.
            plot: if True, plot true vs. predicted values; needs ``Y_True``.
            Y_True: optional DataFrame of true targets. When given, its index
                and columns label the result; otherwise ``X.index`` and the
                training target names are used.
            plot_list: target columns to plot; defaults to all columns of
                ``Y_True``.

        Returns:
            DataFrame of predictions.

        Raises:
            ValueError: if ``plot`` is True but ``Y_True`` is None.
        """
        X1 = self._prepare_predictors(X)
        X1 = (X1 - self.X_mean) / self.X_std  # standardized
        Y1 = self.model.predict(X1.values)

        if Y_True is not None:
            out_index, out_columns = Y_True.index, list(Y_True)
        else:
            out_index, out_columns = X.index, self.target_names
        dfY1 = pd.DataFrame(index=out_index, columns=out_columns, data=Y1)
        dfY1 = dfY1 * self.Y_std + self.Y_mean

        if plot:
            if Y_True is None:
                raise ValueError("Y_True is required when plot=True")
            if plot_list is None:
                plot_list = list(Y_True)
            X_ax = Y_True.index
            label_true = [l + '_True' for l in plot_list]
            label_pred = [l + '_Pred' for l in plot_list]
            plt.figure(figsize=(6, 4))
            plt.plot(X_ax, Y_True[plot_list], label=label_true)
            plt.plot(X_ax, dfY1[plot_list], label=label_pred)
            plt.legend(loc='best')
            plt.show()
        return dfY1

    def forecast(self):
        """Advance the model by one step.

        The last (standardized) targets are destandardized, combined with the
        next time value into a raw predictor row, expanded/standardized like
        the training data and fed to the model. ``self.time``, ``self.date``
        (advanced by one month end) and ``self.Y_final`` are updated.

        Returns:
            tuple ``(pred, date)`` where ``pred`` is the model output in
            standardized target units and ``date`` is the new forecast date.
        """
        # destandardize Y, this is needed to calculate the non linear term
        Xp = self.Y_final * self.Y_std + self.Y_mean
        Xp['time'] = self.time + 1
        dfp = pd.DataFrame(index=[self.date],
                           columns=self.original_predictors,
                           data=Xp.values.reshape(1, -1))
        dfp = self._prepare_predictors(dfp)
        dfp = (dfp - self.X_mean) / self.X_std  # standardize, then predict
        pred = self.model.predict(dfp.values)

        self.time = self.time + 1
        self.date = self.date + MonthEnd(1)
        self.Y_final = pd.DataFrame(index=[self.date],
                                    columns=self.target_names, data=pred)
        return (pred, self.date)

    def multistep_forecast(self, steps):
        """Run ``forecast`` ``steps`` times and return the destandardized
        predictions as a DataFrame indexed by forecast date."""
        df = pd.DataFrame(columns=self.target_names)
        for i in range(steps):
            pred, date = self.forecast()
            print(pred.shape)
            destandardized = (np.multiply(pred, self.Y_std.values.reshape(1, -1))
                              + self.Y_mean.values.reshape(1, -1))
            # A row assignment needs a 1-D vector, not a (1, n) matrix.
            df.loc[date, :] = destandardized.ravel()
        return df

    def plot_coeffs(self):
        """Bar-plot the coefficients of the last target whose absolute value
        exceeds 0.0001."""
        C = self.model.coef_[-1, :]
        indexes = np.where(np.abs(C) > 0.0001)

        # significant predictors
        C_sig = C[indexes[0]]
        preds_sig = [self.predictor_names[int(i)] for i in indexes[0]]

        f, ax = plt.subplots()
        f.set_size_inches((10, 2))
        ax.bar(range(len(C_sig)), C_sig)
        ax.set_xticks(range(len(C_sig)))
        ax.set_xticklabels(labels=preds_sig)
        plt.xticks(rotation=90)
        plt.tight_layout()
        plt.show()

    def variable_importance(self, orig_var_names, labels):
        """Plot a relative importance for each original variable.

        For every name in ``orig_var_names`` all design-matrix columns whose
        name contains it (substring match) are zeroed, and the resulting sum
        of squared errors against ``self.Y`` is taken as the importance.
        Values are scaled by the maximum and shown sorted, labelled with the
        matching entry of ``labels``.
        """
        all_preds = list(self.X)  # all predictors
        imp = []
        for v in orig_var_names:
            v1 = [ap for ap in all_preds if v in ap]
            print(v1)
            X1 = self.X.copy()
            X1[v1] = 0
            # The model was fitted on plain arrays, so predict on one too.
            Y1 = self.model.predict(X1.values)
            imp.append(np.sum((self.Y.values - Y1) ** 2))
        indexes = np.argsort(np.array(imp))
        preds1 = [labels[i] for i in indexes]
        imps1 = np.array([imp[i] for i in indexes])
        imps1 = imps1 / np.max(imps1)
        # plot importance
        f, ax = plt.subplots()
        ax.barh(range(len(imp)), imps1)
        ax.set_yticks(range(len(imp)))
        ax.set_yticklabels(labels=preds1)
        ax.set_xlabel(xlabel='Importance', fontsize=12)
        plt.tight_layout()
        plt.show()