def train_skll_model(model_name, df_train, experiment_id, csvdir, figdir):

    # instantiate the given SKLL learner
    learner = Learner(model_name)

    # get the features, IDs, and labels from the given data frame
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
    features = df_train[feature_columns].to_dict(orient='records')
    ids = df_train['spkitemid'].tolist()
    labels = df_train['sc1'].tolist()

    # create a FeatureSet and train the model
    fs = FeatureSet('train', ids=ids, labels=labels, features=features)

    # if it's a regression model, then our grid objective should be
    # pearson and otherwise it should be micro-averaged F1
    if model_name in ["AdaBoostRegressor", "DecisionTreeRegressor", "ElasticNet",
                      "GradientBoostingRegressor", "KNeighborsRegressor", "Lasso",
                      "LinearRegression", "RandomForestRegressor", "Ridge",
                      "SGDRegressor", "LinearSVR", "SVR"]:
        objective = 'pearson'
    else:
        objective = 'f1_score_micro'

    learner.train(fs, grid_search=True, grid_objective=objective, grid_jobs=1)

    # TODO: compute betas for linear SKLL models?

    # save the SKLL model to disk with the given model name prefix
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    # return the SKLL learner object
    return learner
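# A minimal usage sketch for train_skll_model, not part of the original
# module: the toy data frame, experiment ID, and /tmp paths below are
# made-up assumptions for illustration, and the module-level numpy/pandas
# imports (np, pd) are assumed to be available.
if __name__ == '__main__':
    rng = np.random.RandomState(42)
    df_toy = pd.DataFrame({'spkitemid': ['resp{}'.format(i) for i in range(100)],
                           'feat1': rng.rand(100),
                           'feat2': rng.rand(100)})
    # make the score a noisy linear combination of the two features
    df_toy['sc1'] = 2 * df_toy['feat1'] + df_toy['feat2'] + 0.1 * rng.rand(100)

    # 'Ridge' is in the regressor list above, so the grid objective
    # used for tuning will be 'pearson'
    toy_learner = train_skll_model('Ridge', df_toy, 'toy_experiment',
                                   '/tmp', '/tmp')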
def create_fake_skll_learner(df_coefficients):
    """
    Create a fake SKLL linear regression learner object
    using the coefficients in the given data frame.

    Parameters
    ----------
    df_coefficients : pandas DataFrame
        Data frame containing the linear coefficients
        we want to create the fake SKLL model with.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object with the
        specified coefficients.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        # exclude NA coefficients; note that `coefficient == np.nan`
        # is always False, so we must use `np.isnan()` here
        elif np.isnan(coefficient):
            logger.warning("No coefficient was estimated for "
                           "{}. This is likely due to exact "
                           "collinearity in the model. This "
                           "feature will not be used for model "
                           "building".format(feature))
        else:
            coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept

    return learner
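# A quick illustrative check of create_fake_skll_learner, not part of the
# original module; the feature names and coefficient values below are made up.
if __name__ == '__main__':
    df_fake_coefs = pd.DataFrame({'feature': ['Intercept', 'feat1', 'feat2'],
                                  'coefficient': [1.5, 2.0, -0.5]})
    fake_learner = create_fake_skll_learner(df_fake_coefs)

    # the resulting learner should implement
    # 1.5 + 2.0 * feat1 - 0.5 * feat2, so this prints [2.5]
    pred_fs = FeatureSet('pred', ids=['1'],
                         features=[{'feat1': 1.0, 'feat2': 2.0}])
    print(fake_learner.predict(pred_fs))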
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric;
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support;
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
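# For context, a plausible sketch of what a probability-requiring metric
# like fake_prob_metric could look like; this is an assumption, not the
# actual contents of custom_metrics.py. The key point is that the scorer
# requests predict_proba output, which LinearSVC cannot provide.
from sklearn.metrics import average_precision_score, make_scorer

def fake_prob_metric(y_true, y_probs):
    # consumes class probabilities rather than hard label predictions
    return average_precision_score(y_true, y_probs)

fake_prob_metric_scorer = make_scorer(fake_prob_metric, needs_proba=True)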
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""

    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded;
    # this should work since there should be no conflict
    # between the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
def test_custom_metric_api_experiment():
    """Test API with custom metrics"""

    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")
    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
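# For context, plausible sketches of the two custom metrics used above;
# these definitions are assumptions, not the actual contents of
# custom_metrics.py and custom_metrics2.py.
from sklearn.metrics import fbeta_score

def f075_macro(y_true, y_pred):
    # macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average='macro')

def f06_micro(y_true, y_pred):
    # micro-averaged F-beta score with beta = 0.6
    return fbeta_score(y_true, y_pred, beta=0.6, average='micro')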
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file,
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now set up another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs,
                                          grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter should be
    # identical since when we defined one_minus_precision we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship,
    # except that our custom metric score is negated due to the
    # keyword argument we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
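# For reference, a minimal sketch of how a lower-is-better metric such as
# one_minus_precision can be declared with scikit-learn's make_scorer; the
# definition below is an assumption, not the actual contents of
# custom_metrics.py. With greater_is_better=False, scikit-learn negates the
# scorer's output internally, which is exactly why grid_score1 above equals
# -(1 - precision) rather than (1 - precision).
from sklearn.metrics import make_scorer, precision_score

def one_minus_precision(y_true, y_pred):
    # lower values are better for this metric
    return 1 - precision_score(y_true, y_pred)

one_minus_precision_scorer = make_scorer(one_minus_precision,
                                         greater_is_better=False)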
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':

        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(
            lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns
                             if c not in ['sc1', 'sumfeature', 'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] +
                               [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = (df_weights['coefficient']
                                   .multiply(df_train[feature_columns].std(),
                                             axis='index') /
                                   df_train['sc1'].std())

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(
            lambda row: row['coefficient'] * (1 - RT)
            if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = ((df_betas['coefficient'] * df_train['sc1'].std())
                   .divide(df_train[feature_columns].std(), axis='index'))

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model on this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that sklearn's 'alpha' differs from this lambda, so we
        # normalize by the number of responses to match the sklearn
        # objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso) : First do
    # feature selection using lasso regression and positive-only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that sklearn's 'alpha' differs from this lambda, so we
        # normalize by the number of responses to match the sklearn
        # objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features;
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that sklearn's 'alpha' differs from this lambda, so we
        # normalize by the number of responses to match the sklearn
        # objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)),
                                 ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)),
                   index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = (df_betas.multiply(df_train[used_features].std(), axis='index') /
                df_train['sc1'].std())
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with the main model fit metrics and save to a file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
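# A worked example of the lambda-to-alpha normalization used by the
# fixed-lambda Lasso models above (illustrative numbers only; sqrt and
# log10 are the module-level imports from math): scikit-learn's Lasso
# minimizes (1 / (2 * n)) * ||y - Xw||^2 + alpha * ||w||_1, so a penalty
# of p_lambda applied to the un-averaged objective corresponds to
# alpha = p_lambda / n.
if __name__ == '__main__':
    n_responses = 1000  # assumed number of training responses
    n_features = 10     # assumed number of feature columns

    p_lambda = sqrt(n_responses * log10(n_features))  # sqrt(1000 * 1) ~= 31.62
    p_alpha = p_lambda / n_responses                  # ~= 0.0316
    print(p_lambda, p_alpha)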