Example #1
def test_custom_metric_api_experiment():
    """Test API with custom metrics"""

    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
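The test above assumes that custom_metrics.py and custom_metrics2.py define metric functions named f075_macro and f06_micro. A minimal sketch of what such a file might contain (an assumption based on the metric names, not the actual fixture):

from sklearn.metrics import fbeta_score

def f075_macro(y_true, y_pred):
    # hypothetical definition: macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average="macro")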
Example #2
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
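The contrast between the two learners comes down to scikit-learn's estimator API: LinearSVC exposes no predict_proba method, while SVC(probability=True) does. A quick stand-alone check (not part of the test) illustrates this:

from sklearn.svm import LinearSVC, SVC

print(hasattr(LinearSVC(), "predict_proba"))             # False
print(hasattr(SVC(probability=True), "predict_proba"))   # True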
Example #3
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""

    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded
    # this should work as there should be no conflict between
    # the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
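The dummy metric loaded from kappa.py only needs to be a function that ignores its inputs and returns 1; a plausible sketch (assumed, not the actual fixture):

def dummy_metric(y_true, y_pred):
    # always return 1 regardless of the predictions
    return 1.0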
Example #4
def create_fake_skll_learner(df_coefficients):

    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.

    Parameters
    ----------
    df_coefficients : pandas DataFrame
        Data frame containing the linear coefficients
        we want to create the fake SKLL model with.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object
        with the specified coefficients.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
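A hypothetical call showing the expected input (column values are illustrative): a two-column data frame of feature names and coefficients, with the intercept stored under the feature name 'Intercept'.

import pandas as pd

df_coefficients = pd.DataFrame({'feature': ['Intercept', 'FEATURE_1', 'FEATURE_2'],
                                'coefficient': [1.0, 0.5, -0.25]})
fake_learner = create_fake_skll_learner(df_coefficients)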
Example #5
def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)

    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #6
def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int,
                        default=50)
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)

    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #7
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept['_intercept_'], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept['_intercept_']]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #8
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler="resolve",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("model_file", help="model file to load")
    parser.add_argument("--k", help="number of top features to print (0 for all)", type=int, default=50)
    parser.add_argument(
        "--sign",
        choices=["positive", "negative", "all"],
        default="all",
        help="show only positive, only negative, " + "or all weights",
    )
    parser.add_argument("--version", action="version", version="%(prog)s {0}".format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " + "%(message)s"))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == "positive":
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == "negative":
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if "_intercept_" in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept["_intercept_"], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept["_intercept_"]]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept["_intercept_"]))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #9
    def setUpClass(cls):

        # create a dummy train and test feature set
        X, y = make_classification(n_samples=525,
                                   n_features=10,
                                   n_classes=5,
                                   n_informative=8,
                                   random_state=123)
        X_train, y_train = X[:500], y[:500]
        X_test = X[500:]

        train_ids = list(range(1, len(X_train) + 1))
        train_features = [
            dict(
                zip([
                    'FEATURE_{}'.format(i + 1) for i in range(X_train.shape[1])
                ], x)) for x in X_train
        ]
        train_labels = list(y_train)

        test_ids = list(range(1, len(X_test) + 1))
        test_features = [
            dict(
                zip([
                    'FEATURE_{}'.format(i + 1) for i in range(X_test.shape[1])
                ], x)) for x in X_test
        ]

        cls.train_fs = FeatureSet('train',
                                  ids=train_ids,
                                  features=train_features,
                                  labels=train_labels)
        cls.test_fs = FeatureSet('test', ids=test_ids, features=test_features)

        # train some test SKLL learners that we will use in our tests
        cls.linearsvc = Learner('LinearSVC')
        _ = cls.linearsvc.train(cls.train_fs, grid_search=False)

        cls.svc = Learner('SVC')
        _ = cls.svc.train(cls.train_fs, grid_search=False)

        cls.svc_with_probs = Learner('SVC', probability=True)
        _ = cls.svc_with_probs.train(cls.train_fs, grid_search=False)
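A test method in the same class might then use these fixtures to generate predictions for the unlabeled test set, e.g. (a sketch assuming SKLL's standard Learner.predict API):

predictions = self.svc_with_probs.predict(self.test_fs)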
Example #10
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #11
def train_skll_model(model_name, df_train, experiment_id, csvdir, figdir):
    """Train the named SKLL learner on the given data frame and save it to disk."""
    # instantiate the given SKLL learner
    learner = Learner(model_name)

    # get the features, IDs, and labels from the given data frame
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
    features = df_train[feature_columns].to_dict(orient='records')
    ids = df_train['spkitemid'].tolist()
    labels = df_train['sc1'].tolist()

    # create a FeatureSet and train the model
    fs = FeatureSet('train', ids=ids, labels=labels, features=features)

    # if it's a regression model, then our grid objective should be
    # pearson and otherwise it should be accuracy
    if model_name in ["AdaBoostRegressor", "DecisionTreeRegressor", "ElasticNet",
                      "GradientBoostingRegressor", "KNeighborsRegressor", "Lasso",
                      "LinearRegression", "RandomForestRegressor", "Ridge",
                      "SGDRegressor", "LinearSVR", "SVR"]:
        objective = 'pearson'
    else:
        objective = 'f1_score_micro'

    learner.train(fs, grid_search=True, grid_objective=objective, grid_jobs=1)

    # TODO: compute betas for linear SKLL models?

    # save the SKLL model to disk with the given model name prefix
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    # return the SKLL learner object
    return learner
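A hypothetical call (data, experiment ID, and paths are illustrative; a real data frame needs enough rows for the grid search's cross-validation): df_train must contain a 'spkitemid' ID column, an 'sc1' score column, and one column per feature.

import pandas as pd

df_train = pd.DataFrame({'spkitemid': ['resp_{}'.format(i) for i in range(100)],
                         'sc1': [float(i % 5) for i in range(100)],
                         'FEATURE_1': [i / 100 for i in range(100)],
                         'FEATURE_2': [(100 - i) / 100 for i in range(100)]})
learner = train_skll_model('LinearRegression', df_train,
                           'my_experiment', '/tmp/output', '/tmp/figures')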
Example #12
def create_fake_skll_learner(df_coefficients):
    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
Example #13
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
Example #14
def update_model(model_file):
    ''' Read in the model file and save it again'''

    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
Example #15
    def __init__(self, model_path, threshold=None, positive_class=1):
        '''
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive class, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_class: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_class: int
        '''
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_class
        self.threshold = threshold
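Hypothetical usage, assuming the enclosing class is named Predictor and the model path is illustrative:

predictor = Predictor('/path/to/some.model', threshold=0.6, positive_class=1)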
Example #16
    def __init__(self, model_path, threshold=None, positive_class=1):
        '''
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive class, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_class: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_class: int
        '''
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_class
        self.threshold = threshold
Example #17
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now setup another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs, grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter should be
    # identical since when we defined one_minus_precision we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship
    # except that our custom metric should be negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
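The negation checked in the assertions above mirrors how scikit-learn treats lower-is-better metrics: a scorer built with make_scorer(..., greater_is_better=False) returns the negated metric value, so a smaller raw value yields a larger score. A plain-sklearn sketch of such a metric (an assumption about how one_minus_precision might be defined, not the actual fixture):

from sklearn.metrics import make_scorer, precision_score

def one_minus_precision(y_true, y_pred):
    # lower is better: 0 means perfect precision
    return 1 - precision_score(y_true, y_pred)

# greater_is_better=False makes sklearn negate the value internally
one_minus_precision_scorer = make_scorer(one_minus_precision, greater_is_better=False)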
Example #18
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int, default=50)
    group.add_argument("--sort_by_labels", '-s', action='store_true',
                       default=False, help="order the features by classes")
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
        (isinstance(model, LogisticRegression) and
            len(learner.label_list) > 2) or
        (isinstance(model, SVC) and
            model.kernel == 'linear')):
        multiclass = True
    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of intercepts but
            # sometimes that array is of length 1 so we don't need to print that
            # as an array/list. First, let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercept(s); arrays with more than one
            # intercept are printed as a list of formatted values
            if intercept_is_array:
                intercept_list = ["{:.12f}".format(i) for i in model_intercepts]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(), key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))
Example #19
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as a string
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file
    # we append it to output anyway as long as
    # it is in the input file

    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique "
                         "and re-run the tool.".format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: {}'.format(feats_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features + [id_column])
    if extra_features:
        logger.warning('The following extraneous features will be ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered, feature_name, 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. No analysis "
                         "will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] - train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = df_features_preprocessed[feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir, '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] - train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert(len(df_predictions) == len(df_predictions_with_metadata))
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
Example #20
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
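    # (standardized coefficient = raw coefficient * sd(feature) / sd(score);
    # the relative value is each standardized coefficient's share of the
    # total absolute standardized weight)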
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
Example #21
0
    def validate_config(cls, config, context='rsmtool'):
        """
        Validate configuration file.

        Ensure that all required fields are specified, add default values
        for all unspecified fields, and ensure that all specified
        fields are valid.

        Parameters
        ----------
        config : dict
            The configuration dictionary to validate.
        context : str, optional
            Context of the tool in which we are validating.
            Possible values are ::

                {'rsmtool', 'rsmeval',
                 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

            Defaults to 'rsmtool'.

        Returns
        -------
        new_config : dict
            The validated configuration dictionary with default values
            filled in for any unspecified optional fields.

        Raises
        ------
        ValueError
            If a required field is missing, an unrecognized field is
            specified, or a field value is invalid.
        """
        # make a copy of the given parameter dictionary
        new_config = deepcopy(config)

        # 1. Check to make sure all required fields are specified
        required_fields = CHECK_FIELDS[context]['required']

        for field in required_fields:
            if field not in new_config:
                raise ValueError("The config file must "
                                 "specify '{}'".format(field))

        # 2. Add default values for unspecified optional fields
        # for given RSMTool context
        defaults = DEFAULTS

        for field in defaults:
            if field not in new_config:
                new_config[field] = defaults[field]

        # 3. Check to make sure no unrecognized fields are specified
        for field in new_config:
            if field not in defaults and field not in required_fields:
                raise ValueError("Unrecognized field '{}'"
                                 " in json file".format(field))

        # 4. Check to make sure that the ID fields that will be
        # used as part of filenames are formatted correctly
        # i.e., they do not contain any spaces and are at most 200 characters long
        id_field = ID_FIELDS[context]
        id_field_values = {id_field: new_config[id_field]}

        for id_field, id_field_value in id_field_values.items():
            if len(id_field_value) > 200:
                raise ValueError("{} is too long (must be "
                                 "<=200 characters)".format(id_field))

            if re.search(r'\s', id_field_value):
                raise ValueError("{} cannot contain any "
                                 "spaces".format(id_field))

        # 5. Check that the feature file and feature subset/subset file are not
        # specified together
        msg = ("You cannot specify BOTH \"features\" and \"{}\". "
               "Please refer to the \"Selecting Feature Columns\" "
               "section in the documentation for more details.")
        if new_config['features'] and new_config['feature_subset_file']:
            msg = msg.format("feature_subset_file")
            raise ValueError(msg)
        if new_config['features'] and new_config['feature_subset']:
            msg = msg.format("feature_subset")
            raise ValueError(msg)

        # 6. Check for fields that require feature_subset_file and try
        # to use the default feature file
        if (new_config['feature_subset']
                and not new_config['feature_subset_file']):

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = Path(default_feature_subset_file).name
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You requested feature subsets but did not "
                                "specify any feature file. "
                                "The tool will use the default "
                                "feature file {} available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to use feature subsets, you "
                                 "must specify a feature subset file")

        if new_config['sign'] and not new_config['feature_subset_file']:

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = Path(default_feature_subset_file).name
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You specified the expected sign of "
                                "correlation but did not specify a feature "
                                "subset file. The tool will use "
                                "the default feature subset file {} "
                                "available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to specify the expected sign of "
                                 " correlation for each feature, you must "
                                 "specify a feature subset file")

        # Use the default sign if we are using the default feature file
        # and sign has not been specified in the config file
        if HAS_RSMEXTRA:
            default_feature = default_feature_subset_file
            if (new_config['feature_subset_file'] == default_feature
                    and not new_config['sign']):
                new_config['sign'] = default_feature_sign

        # 7. Check for fields that must be specified together
        if (new_config['min_items_per_candidate']
                and not new_config['candidate_column']):
            raise ValueError("If you want to filter out candidates with "
                             "responses to less than X items, you need "
                             "to specify the name of the column which "
                             "contains candidate IDs.")

        # 8. Check that if "skll_objective" is specified, it's
        # one of the metrics that SKLL allows for AND that it is
        # specified for a SKLL model and _not_ a built-in
        # linear regression model
        if new_config['skll_objective']:
            if not is_skll_model(new_config['model']):
                warnings.warn(
                    "You specified a custom SKLL objective but also chose a "
                    "non-SKLL model. The objective will be ignored.")
            else:
                if new_config['skll_objective'] not in SCORERS:
                    raise ValueError(
                        "Invalid SKLL objective. Please refer to the SKLL "
                        "documentation and choose a valid tuning objective.")

        # 9. Check that if "skll_fixed_parameters" is specified,
        # it's specified for SKLL model and _not_ a built-in linear
        # regression model; we cannot check whether the parameters
        # are valid at parse time but SKLL will raise an error
        # at run time for any invalid parameters
        if new_config['skll_fixed_parameters']:
            if not is_skll_model(new_config['model']):
                warnings.warn(
                    "You specified custom SKLL fixed parameters but "
                    "also chose a non-SKLL model. The parameters will "
                    "be ignored.")

        # 10. Check that if we are running rsmtool to ask for
        # expected scores then the SKLL model type must actually
        # support probabilistic classification. If it's not a SKLL
        # model at all, we just treat it as a LinearRegression model
        # which is basically what they all are in the end.
        if context == 'rsmtool' and new_config['predict_expected_scores']:
            model_name = new_config['model']
            dummy_learner = Learner(model_name) if is_skll_model(
                model_name) else Learner('LinearRegression')
            if not hasattr(dummy_learner.model_type, 'predict_proba'):
                raise ValueError(
                    "{} does not support expected scores "
                    "since it is not a probablistic classifier.".format(
                        model_name))
            del dummy_learner

        # 11. Check the fields that require rsmextra
        if not HAS_RSMEXTRA:
            if new_config['special_sections']:
                raise ValueError("Special sections are only available to ETS"
                                 " users by installing the rsmextra package.")

        # 12. Raise a warning if we are specifying a feature file but also
        # telling the system to automatically select transformations
        if new_config['features'] and new_config['select_transformations']:
            # Show a warning unless a user passed a list of features.
            if not isinstance(new_config['features'], list):
                warnings.warn("You specified a feature file but also set "
                              "`select_transformations` to True. Any "
                              "transformations or signs specified in "
                              "the feature file will be overwritten by "
                              "the automatically selected transformations "
                              "and signs.")

        # 13. If we have `experiment_names`, check that the length of the list
        # matches the list of experiment_dirs.
        if context == 'rsmsummarize' and new_config['experiment_names']:
            if len(new_config['experiment_names']) != len(
                    new_config['experiment_dirs']):
                raise ValueError(
                    "The number of specified experiment names should be the same"
                    " as the number of specified experiment directories.")

        # 14. Check that if the user specified min_n_per_group, they also
        # specified subgroups. If they supplied a dictionary, make
        # sure the keys match
        if new_config['min_n_per_group']:
            # make sure we have subgroups
            if 'subgroups' not in new_config:
                raise ValueError("You must specify a list of subgroups in "
                                 "in the `subgroups` field if "
                                 "you want to use the `min_n_per_group` field")
            # if we got dictionary, make sure the keys match
            elif isinstance(new_config['min_n_per_group'], dict):
                if sorted(new_config['min_n_per_group'].keys()) != sorted(
                        new_config['subgroups']):
                    raise ValueError(
                        "The keys in `min_n_per_group` must "
                        "match the subgroups in `subgroups` field")
            # else convert to dictionary
            else:
                new_config['min_n_per_group'] = {
                    group: new_config['min_n_per_group']
                    for group in new_config['subgroups']
                }
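                # (for example, a hypothetical min_n_per_group of 5 with
                #  subgroups ['L1', 'gender'] would become
                #  {'L1': 5, 'gender': 5})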

        # 15. Clean up config dict to keep only context-specific fields
        context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                                   CHECK_FIELDS[context]['required'])

        new_config = {
            k: v
            for k, v in new_config.items() if k in context_relevant_fields
        }

        return new_config
Example #22
0
    def validate_config(self, context='rsmtool', inplace=True):
        """
        Ensure that all required fields are specified, add default values
        for all unspecified fields, and ensure that all specified
        fields are valid.

        Parameters
        ----------
        context : str, optional
            Context of the tool in which we are validating.
            Possible values are ::

                {'rsmtool', 'rsmeval',
                 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

            Defaults to 'rsmtool'.
        inplace : bool
            Maintain the state of the config object produced by
            this method.
            Defaults to True.

        Returns
        -------
        config_obj : Configuration
            A configuration object

        Raises
        ------
        ValueError
            If no configuration has been loaded or the configuration is invalid.
        """

        # Check to make sure a configuration file
        # or dictionary has been loaded.
        self._check_config_is_loaded()

        # Get the parameter dictionary
        new_config = self._config

        # 1. Check to make sure all required fields are specified
        required_fields = CHECK_FIELDS[context]['required']

        for field in required_fields:
            if field not in new_config:
                raise ValueError("The config file must "
                                 "specify '{}'".format(field))

        # 2. Add default values for unspecified optional fields
        # for given RSMTool context
        defaults = DEFAULTS

        for field in defaults:
            if field not in new_config:
                new_config[field] = defaults[field]

        # 3. Check to make sure no unrecognized fields are specified
        for field in new_config:
            if field not in defaults and field not in required_fields:
                raise ValueError("Unrecognized field '{}'"
                                 " in json file".format(field))

        # 4. Check to make sure that the ID fields that will be
        # used as part of filenames are formatted correctly
        id_fields = ['comparison_id', 'experiment_id', 'summary_id']
        id_field_values = {
            field: new_config[field]
            for field in new_config if field in id_fields
        }

        # we do not need to validate any IDs for `rsmpredict`
        self.check_id_fields(id_field_values)

        # 5. Check that the feature file and feature subset/subset file are not
        # specified together
        msg = ("You cannot specify BOTH \"features\" and \"{}\". "
               "Please refer to the \"Selecting Feature Columns\" "
               "section in the documentation for more details.")
        if new_config['features'] and new_config['feature_subset_file']:
            msg = msg.format("feature_subset_file")
            raise ValueError(msg)
        if new_config['features'] and new_config['feature_subset']:
            msg = msg.format("feature_subset")
            raise ValueError(msg)

        # 6. Check for fields that require feature_subset_file and try
        # to use the default feature file
        if (new_config['feature_subset']
                and not new_config['feature_subset_file']):

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = basename(default_feature_subset_file)
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You requested feature subsets but did not "
                                "specify any feature file. "
                                "The tool will use the default "
                                "feature file {} available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to use feature subsets, you "
                                 "must specify a feature subset file")

        if new_config['sign'] and not new_config['feature_subset_file']:

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = basename(default_feature_subset_file)
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You specified the expected sign of "
                                "correlation but did not specify a feature "
                                "subset file. The tool will use "
                                "the default feature subset file {} "
                                "available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to specify the expected sign of "
                                 " correlation for each feature, you must "
                                 "specify a feature subset file")

        # Use the default sign if we are using the default feature file
        # and sign has not been specified in the config file
        if HAS_RSMEXTRA:
            default_feature = default_feature_subset_file
            if (new_config['feature_subset_file'] == default_feature
                    and not new_config['sign']):
                new_config['sign'] = default_feature_sign

        # 7. Check for fields that must be specified together
        if (new_config['min_items_per_candidate']
                and not new_config['candidate_column']):
            raise ValueError("If you want to filter out candidates with "
                             "responses to less than X items, you need "
                             "to specify the name of the column which "
                             "contains candidate IDs.")

        # 8. Check that if "skll_objective" is specified, it's
        # one of the metrics that SKLL allows for AND that it is
        # specified for a SKLL model and _not_ a built-in
        # linear regression model
        if new_config['skll_objective']:
            if not is_skll_model(new_config['model']):
                logging.warning(
                    "You specified a custom SKLL objective but also chose a "
                    "non-SKLL model. The objective will be ignored.")
            else:
                if new_config['skll_objective'] not in SCORERS:
                    raise ValueError(
                        "Invalid SKLL objective. Please refer to the SKLL "
                        "documentation and choose a valid tuning objective.")

        # 9. Check that if we are running rsmtool to ask for
        # expected scores then the SKLL model type must actually
        # support probabilistic classification. If it's not a SKLL
        # model at all, we just treat it as a LinearRegression model
        # which is basically what they all are in the end.
        if context == 'rsmtool' and new_config['predict_expected_scores']:
            model_name = new_config['model']
            dummy_learner = Learner(model_name) if is_skll_model(
                model_name) else Learner('LinearRegression')
            if not hasattr(dummy_learner.model_type, 'predict_proba'):
                raise ValueError(
                    "{} does not support expected scores "
                    "since it is not a probablistic classifier.".format(
                        model_name))
            del dummy_learner

        # 10. Check the fields that require rsmextra
        if not HAS_RSMEXTRA:
            if new_config['special_sections']:
                raise ValueError("Special sections are only available to ETS"
                                 " users by installing the rsmextra package.")

        # 11. Raise a warning if we are specifying a feature file but also
        # telling the system to automatically select transformations
        if new_config['features'] and new_config['select_transformations']:
            logging.warning("You specified a feature file but also set "
                            "`select_transformations` to True. Any "
                            "transformations or signs specified in "
                            "the feature file will be overwritten by "
                            "the automatically selected transformations "
                            "and signs.")

        # 12. Clean up config dict to keep only context-specific fields
        context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                                   CHECK_FIELDS[context]['required'])

        new_config = {
            k: v
            for k, v in new_config.items() if k in context_relevant_fields
        }

        if inplace:
            self._config = new_config
        return Configuration(self._config, self._filepath)
Example #23
0
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)
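        # (each positive beta gives up a fraction RT of its value and that
        # mass is redistributed equally across the formerly negative betas,
        # so the total amount of positive beta mass is preserved)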

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
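        # (sklearn scales the squared-error term by 1 / (2 * n_samples),
        # hence alpha = lambda / n_samples)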
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # if the fitted intercept comes out as 0, flip the sign of the
        # intercept column and refit so that an intercept term is always
        # retained in the model
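        # (nnls() only allows non-negative coefficients, so a negative
        # intercept has to be modeled by negating the intercept column)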
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression with positive-only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # if the fitted intercept comes out as 0, flip the sign of the
        # intercept column and refit so that an intercept term is always
        # retained in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
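    # (standardized = coefficient * sd(feature) / sd(score); relative = share
    # of the total absolute standardized weight)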
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
Example #24
0
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'],
                                      configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(
            config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(
            config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError(
                'The directory {} does not contain '
                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'.format(
                experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as a string
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns
                           if column])
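    # (reading these columns as strings prevents, e.g., an ID like '007'
    # from being coerced to an integer and losing its leading zeros)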

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file
    # we append it to output anyway as long as
    # it is in the input file

    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input, [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError(
            "The data contains repeated response IDs in {}. Please make sure all response IDs are unique and re-run the tool."
            .format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: {}'.format(
            feats_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logging.warning(
            'The following extraneous features will be ignored: {}'.format(
                extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # non-numeric feature values, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError(
            "There are no responses left after "
            "filtering out non-numeric feature values. No analysis "
            "will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name][
            'train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name][
            'train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(
            feature_values,
            feature_name,
            feature_transformation,
            train_feature_mean,
            train_feature_sd,
            exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (
            df_features_preprocessed[feature_name] -
            train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = df_features_preprocessed[
            feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(
        join(experiment_output_dir,
             '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params[
        'train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
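    # (z-score the raw predictions using the training-set prediction mean/sd
    # and then map them onto the human (h1) score scale using its mean/sd)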
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min,
                                      trim_max)
    df_predictions['raw_trim_round'] = np.rint(
        df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min,
                                        trim_max)
    df_predictions['scale_trim_round'] = np.rint(
        df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(
            df_predictions, df_input[['spkitemid'] + columns_to_copy])
        assert (len(df_predictions) == len(df_predictions_with_metadata))
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(
            *splitext(output_file))
        logger.info(
            'Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
Example #25
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int,
                       default=50)
    group.add_argument("--sort_by_labels",
                       '-s',
                       action='store_true',
                       default=False,
                       help="order the features by classes")
    parser.add_argument(
        '--sign',
        choices=['positive', 'negative', 'all'],
        default='all',
        help='show only positive, only negative or all weights')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None
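    # (a slice upper bound of None later on prints all features when --k is 0)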

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
        (isinstance(model, LogisticRegression) and len(learner.label_list) > 2)
            or (isinstance(model, SVC) and model.kernel == 'linear')):
        multiclass = True
    weight_items = weights.items()
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of intercepts but
            # sometimes that array is of length 1 so we don't need to print that
            # as an array/list. First, let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercepts
            print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(),
                                    key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))