Example 1
def train_skll_model(model_name, df_train, experiment_id, csvdir, figdir):

    # instantiate the given SKLL learner
    learner = Learner(model_name)

    # get the features, IDs, and labels from the given data frame
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
    features = df_train[feature_columns].to_dict(orient='records')
    ids = df_train['spkitemid'].tolist()
    labels = df_train['sc1'].tolist()

    # create a FeatureSet and train the model
    fs = FeatureSet('train', ids=ids, labels=labels, features=features)

    # if it's a regression model, then our grid objective should be
    # pearson and otherwise it should be micro-averaged F1
    if model_name in ["AdaBoostRegressor", "DecisionTreeRegressor", "ElasticNet",
                      "GradientBoostingRegressor", "KNeighborsRegressor", "Lasso",
                      "LinearRegression", "RandomForestRegressor", "Ridge",
                      "SGDRegressor", "LinearSVR", "SVR"]:
        objective = 'pearson'
    else:
        objective = 'f1_score_micro'

    learner.train(fs, grid_search=True, grid_objective=objective, grid_jobs=1)

    # TODO: compute betas for linear SKLL models?

    # save the SKLL model to disk with the given model name prefix
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    # return the SKLL learner object
    return learner
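
A minimal usage sketch for this helper, assuming a training frame laid out the way the function expects (an `spkitemid` ID column, an `sc1` score column, and one column per feature); the frame contents, experiment ID, and output paths below are purely illustrative.

import numpy as np
import pandas as pd

# build a small synthetic training frame with two hypothetical features
rng = np.random.RandomState(42)
df_train = pd.DataFrame({'spkitemid': ['resp{}'.format(i) for i in range(50)],
                         'grammar': rng.rand(50),
                         'fluency': rng.rand(50)})
df_train['sc1'] = 2 * df_train['grammar'] + df_train['fluency'] + rng.rand(50)

# train a SKLL regressor; the fitted model is saved to <csvdir>/example_01.model
learner = train_skll_model('RandomForestRegressor', df_train,
                           'example_01', '/tmp/output', '/tmp/figure')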
Example 2
def create_fake_skll_learner(df_coefficients):

    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.

    Parameters
    ----------
    df_coefficients : pandas DataFrame
        Data frame containing the linear coefficients
        we want to create the fake SKLL model with.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object with
        the specified coefficients.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
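
As an illustration, a hypothetical coefficients frame with a `feature` column and a `coefficient` column (and the intercept stored under the name `Intercept`) can be turned into a learner that reproduces the given linear model.

import pandas as pd

# hypothetical coefficients, with the intercept as its own row
df_coefficients = pd.DataFrame([('Intercept', 1.5),
                                ('grammar', 0.8),
                                ('fluency', 0.3)],
                               columns=['feature', 'coefficient'])

fake_learner = create_fake_skll_learner(df_coefficients)

# fake_learner now predicts 1.5 + 0.8 * grammar + 0.3 * fluency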
Example 3
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
Example 4
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""

    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded
    # this should work as there should be no conflict between
    # the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
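
The test comments say the registered metric simply returns 1 regardless of its inputs; a hypothetical sketch of what the `dummy_metric` function inside `kappa.py` might look like, assuming SKLL's convention that a custom metric is a plain function of the true labels and the predictions.

# hypothetical contents of kappa.py
def dummy_metric(y_true, y_pred):
    """Dummy metric that ignores its inputs and always returns 1."""
    return 1.0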
Example 5
def test_custom_metric_api_experiment():
    """Test API with custom metrics"""

    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
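
For context, the two registered names suggest macro-averaged F0.75 and micro-averaged F0.6 scores; a hypothetical sketch of what the two metric files might contain, again assuming a custom metric is a plain function of the true labels and the predictions (the actual files are not shown in this excerpt).

from sklearn.metrics import fbeta_score


# hypothetical contents of custom_metrics.py
def f075_macro(y_true, y_pred):
    """Macro-averaged F-beta score with beta = 0.75."""
    return fbeta_score(y_true, y_pred, beta=0.75, average='macro')


# hypothetical contents of custom_metrics2.py
def f06_micro(y_true, y_pred):
    """Micro-averaged F-beta score with beta = 0.6."""
    return fbeta_score(y_true, y_pred, beta=0.6, average='micro')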
Example 6
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now set up another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs, grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter values
    # should be identical since when we defined one_minus_precision
    # we set the `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship
    # except that our custom metric should be negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
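
The comments above state that `one_minus_precision` was defined with `greater_is_better` set to `False`; a hypothetical sketch of such a metric, assuming the lower-is-better flag is signaled through a default keyword argument in the function signature (the actual definition is not shown in this excerpt).

from sklearn.metrics import precision_score


# hypothetical contents of the relevant entry in custom_metrics.py
def one_minus_precision(y_true, y_pred, greater_is_better=False):
    """Lower-is-better metric: one minus the binary precision score."""
    return 1 - precision_score(y_true, y_pred, average='binary')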
Example 7
def create_fake_skll_learner(df_coefficients):
    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
Example 8
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the experiment `output` directory.
    figdir : str
        Path to the experiment `figure` directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
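
All of the fixed-lambda branches above convert the preset lambda into scikit-learn's `alpha` with the same `p_alpha = p_lambda / len(df_train)` step; a short sketch of the reasoning behind that conversion, assuming the usual penalized least-squares formulations (the exact derivation is only hinted at in the comments).

from math import log10, sqrt

# scikit-learn's Lasso minimizes  (1 / (2 * n)) * ||y - Xw||^2 + alpha * ||w||_1,
# while the preset-lambda formulation penalizes the unscaled residual sum of
# squares:                         (1 / 2) * ||y - Xw||^2 + lambda * ||w||_1.
# Dividing the second objective by n shows they coincide when alpha = lambda / n.

n_examples, n_features = 500, 12   # hypothetical training-set dimensions
p_lambda = sqrt(n_examples * log10(n_features))
p_alpha = p_lambda / n_examples    # value to pass as model_kwargs={'alpha': p_alpha}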
Example 9
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
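
For reference, the standardized and relative coefficients written out near the end of both versions of this function follow the usual definitions; a small self-contained sketch with hypothetical feature names and statistics.

import pandas as pd

# hypothetical raw coefficients and training-set standard deviations
coef = pd.Series({'grammar': 0.8, 'fluency': 0.3})
feature_std = pd.Series({'grammar': 0.5, 'fluency': 0.2})
score_std = 1.1

# standardized beta: raw coefficient scaled by feature std over score std
standardized = coef * feature_std / score_std

# relative beta: each feature's share of the total absolute standardized weight
relative = standardized / standardized.abs().sum()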