Example #1
def test_custom_metric_api_experiment():
    """Test API with custom metrics"""

    # register two different metrics from two files
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "f075_macro")
    custom_metrics_file2 = join(input_dir, "custom_metrics2.py")
    register_custom_metric(custom_metrics_file2, "f06_micro")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using one of the custom metrics
    # and evaluate it using the other one
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="f075_macro")
    results = learner.evaluate(
        test_fs,
        grid_objective="f075_macro",
        output_metrics=["balanced_accuracy", "f06_micro"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_f06_micro_value = test_output_metrics_dict["f06_micro"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9785, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    assert_almost_equal(test_f06_micro_value, 0.98, places=4)
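The test above assumes that custom_metrics.py and custom_metrics2.py define metric functions named f075_macro and f06_micro. A minimal sketch of what such a file might contain (an assumption based on the metric names, not the actual fixture):

from sklearn.metrics import fbeta_score

def f075_macro(y_true, y_pred):
    # hypothetical definition: macro-averaged F-beta score with beta = 0.75
    return fbeta_score(y_true, y_pred, beta=0.75, average="macro")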
Example #2
def test_api_with_custom_prob_metric():
    """Test API with custom probabilistic metric"""

    # register a custom metric from our file that requires probabilities
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file, "fake_prob_metric")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using this probabilistic metric
    # this should fail since LinearSVC doesn't support probabilities
    learner1 = Learner("LinearSVC")
    assert_raises_regex(AttributeError,
                        r"has no attribute 'predict_proba'",
                        learner1.train,
                        train_fs,
                        grid_objective="fake_prob_metric")

    # set up another learner with explicit probability support
    # this should work just fine with our custom metric
    learner2 = Learner("SVC", probability=True)
    grid_score, _ = learner2.train(train_fs, grid_objective="fake_prob_metric")
    ok_(grid_score > 0.95)
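The contrast between the two learners comes down to scikit-learn's estimator API: LinearSVC exposes no predict_proba method, while SVC(probability=True) does. A quick stand-alone check (not part of the test) illustrates this:

from sklearn.svm import LinearSVC, SVC

print(hasattr(LinearSVC(), "predict_proba"))             # False
print(hasattr(SVC(probability=True), "predict_proba"))   # True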
Example #3
def test_custom_metric_api_experiment_with_kappa_filename():
    """Test API with metric defined in a file named kappa"""

    # register a dummy metric that just returns 1 from
    # a file called 'kappa.py'
    input_dir = join(_my_dir, "other")
    custom_metrics_file = join(input_dir, "kappa.py")
    register_custom_metric(custom_metrics_file, "dummy_metric")

    # read in some train/test data
    train_file = join(input_dir, "examples_train.jsonlines")
    test_file = join(input_dir, "examples_test.jsonlines")

    train_fs = NDJReader.for_path(train_file).read()
    test_fs = NDJReader.for_path(test_file).read()

    # set up a learner to tune using our usual kappa metric
    # and evaluate it using the dummy metric we loaded
    # this should work as there should be no conflict between
    # the two "kappa" names
    learner = Learner("LogisticRegression")
    _ = learner.train(train_fs, grid_objective="unweighted_kappa")
    results = learner.evaluate(
        test_fs,
        grid_objective="unweighted_kappa",
        output_metrics=["balanced_accuracy", "dummy_metric"])
    test_objective_value = results[-2]
    test_output_metrics_dict = results[-1]
    test_accuracy_value = test_output_metrics_dict["balanced_accuracy"]
    test_dummy_metric_value = test_output_metrics_dict["dummy_metric"]

    # check that the values are as expected
    assert_almost_equal(test_objective_value, 0.9699, places=4)
    assert_almost_equal(test_accuracy_value, 0.9792, places=4)
    eq_(test_dummy_metric_value, 1.0)
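The dummy metric loaded from kappa.py only needs to be a function that ignores its inputs and returns 1; a plausible sketch (assumed, not the actual fixture):

def dummy_metric(y_true, y_pred):
    # always return 1 regardless of the predictions
    return 1.0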
Example #4
def create_fake_skll_learner(df_coefficients):

    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.

    Parameters
    ----------
    df_coefficients : pandas DataFrame
        Data frame containing the linear coefficients
        we want to create the fake SKLL model with.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object
        with the specified coefficients.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
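A hypothetical call showing the expected input (column values are illustrative): a two-column data frame of feature names and coefficients, with the intercept stored under the feature name 'Intercept'.

import pandas as pd

df_coefficients = pd.DataFrame({'feature': ['Intercept', 'FEATURE_1', 'FEATURE_2'],
                                'coefficient': [1.0, 0.5, -0.25]})
fake_learner = create_fake_skll_learner(df_coefficients)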
Example #5
def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)

    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #6
def main():
    '''
    Handles command line arguments and gets things started.
    '''
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int,
                        default=50)
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args()

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    weights = learner.model_params

    print("Number of nonzero features:", len(weights), file=sys.stderr)

    for feat, val in sorted(iteritems(weights), key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #7
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept['_intercept_'], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept['_intercept_']]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #8
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler="resolve",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument("model_file", help="model file to load")
    parser.add_argument("--k", help="number of top features to print (0 for all)", type=int, default=50)
    parser.add_argument(
        "--sign",
        choices=["positive", "negative", "all"],
        default="all",
        help="show only positive, only negative, " + "or all weights",
    )
    parser.add_argument("--version", action="version", version="%(prog)s {0}".format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=("%(asctime)s - %(name)s - %(levelname)s - " + "%(message)s"))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == "positive":
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == "negative":
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if "_intercept_" in intercept:
            # Some learners (e.g. LinearSVR) may return a list of intercepts
            if isinstance(intercept["_intercept_"], np.ndarray):
                intercept_list = ["%.12f" % i for i in intercept["_intercept_"]]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(intercept["_intercept_"]))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #9
    def setUpClass(cls):

        # create a dummy train and test feature set
        X, y = make_classification(n_samples=525,
                                   n_features=10,
                                   n_classes=5,
                                   n_informative=8,
                                   random_state=123)
        X_train, y_train = X[:500], y[:500]
        X_test = X[500:]

        train_ids = list(range(1, len(X_train) + 1))
        train_features = [
            dict(
                zip([
                    'FEATURE_{}'.format(i + 1) for i in range(X_train.shape[1])
                ], x)) for x in X_train
        ]
        train_labels = list(y_train)

        test_ids = list(range(1, len(X_test) + 1))
        test_features = [
            dict(
                zip([
                    'FEATURE_{}'.format(i + 1) for i in range(X_test.shape[1])
                ], x)) for x in X_test
        ]

        cls.train_fs = FeatureSet('train',
                                  ids=train_ids,
                                  features=train_features,
                                  labels=train_labels)
        cls.test_fs = FeatureSet('test', ids=test_ids, features=test_features)

        # train some test SKLL learners that we will use in our tests
        cls.linearsvc = Learner('LinearSVC')
        _ = cls.linearsvc.train(cls.train_fs, grid_search=False)

        cls.svc = Learner('SVC')
        _ = cls.svc.train(cls.train_fs, grid_search=False)

        cls.svc_with_probs = Learner('SVC', probability=True)
        _ = cls.svc_with_probs.train(cls.train_fs, grid_search=False)
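A test method in the same class might then use these fixtures to generate predictions for the unlabeled test set, e.g. (a sketch assuming SKLL's standard Learner.predict API):

predictions = self.svc_with_probs.predict(self.test_fs)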
Example #10
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    :param argv: List of arguments, as if specified on the command-line.
                 If None, ``sys.argv[1:]`` is used instead.
    :type argv: list of str
    """
    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    parser.add_argument('--k',
                        help='number of top features to print (0 for all)',
                        type=int, default=50)
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            print("intercept = {:.12f}".format(intercept['_intercept_']))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{:.12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
        print("{:.12f}\t{}".format(val, feat))
Example #11
def train_skll_model(model_name, df_train, experiment_id, csvdir, figdir):
    """Train the named SKLL learner on the given data frame and save it to disk."""
    # instantiate the given SKLL learner
    learner = Learner(model_name)

    # get the features, IDs, and labels from the given data frame
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]
    features = df_train[feature_columns].to_dict(orient='records')
    ids = df_train['spkitemid'].tolist()
    labels = df_train['sc1'].tolist()

    # create a FeatureSet and train the model
    fs = FeatureSet('train', ids=ids, labels=labels, features=features)

    # if it's a regression model, then our grid objective should be
    # pearson and otherwise it should be accuracy
    if model_name in ["AdaBoostRegressor", "DecisionTreeRegressor", "ElasticNet",
                      "GradientBoostingRegressor", "KNeighborsRegressor", "Lasso",
                      "LinearRegression", "RandomForestRegressor", "Ridge",
                      "SGDRegressor", "LinearSVR", "SVR"]:
        objective = 'pearson'
    else:
        objective = 'f1_score_micro'

    learner.train(fs, grid_search=True, grid_objective=objective, grid_jobs=1)

    # TODO: compute betas for linear SKLL models?

    # save the SKLL model to disk with the given model name prefix
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    # return the SKLL learner object
    return learner
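A hypothetical call (data, experiment ID, and paths are illustrative; a real data frame needs enough rows for the grid search's cross-validation): df_train must contain a 'spkitemid' ID column, an 'sc1' score column, and one column per feature.

import pandas as pd

df_train = pd.DataFrame({'spkitemid': ['resp_{}'.format(i) for i in range(100)],
                         'sc1': [float(i % 5) for i in range(100)],
                         'FEATURE_1': [i / 100 for i in range(100)],
                         'FEATURE_2': [(100 - i) / 100 for i in range(100)]})
learner = train_skll_model('LinearRegression', df_train,
                           'my_experiment', '/tmp/output', '/tmp/figures')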
Example #12
def create_fake_skll_learner(df_coefficients):
    """
    Create fake SKLL linear regression learner object
    using the coefficients in the given data frame.
    """

    # get the logger
    logger = logging.getLogger(__name__)

    # initialize a random number generator
    randgen = RandomState(1234567890)

    # iterate over the coefficients
    coefdict = {}
    for feature, coefficient in df_coefficients.itertuples(index=False):
        if feature == 'Intercept':
            intercept = coefficient
        else:
            # exclude NA coefficients
            if np.isnan(coefficient):
                logger.warning("No coefficient was estimated for "
                               "{}. This is likely due to exact "
                               "collinearity in the model. This "
                               "feature will not be used for model "
                               "building".format(feature))
            else:
                coefdict[feature] = coefficient

    learner = Learner('LinearRegression')
    num_features = len(coefdict)  # excluding the intercept
    fake_feature_values = randgen.rand(num_features)
    fake_features = [dict(zip(coefdict, fake_feature_values))]
    fake_fs = FeatureSet('fake', ids=['1'], labels=[1.0], features=fake_features)
    learner.train(fake_fs, grid_search=False)

    # now create its parameters from the coefficients from the built-in model
    learner.model.coef_ = learner.feat_vectorizer.transform(coefdict).toarray()[0]
    learner.model.intercept_ = intercept
    return learner
Example #13
def update_model(model_file):
    """Read in the model file and save it again."""
    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
Example #14
def update_model(model_file):
    ''' Read in the model file and save it again'''

    model_dir = dirname(model_file)

    # get the list of current files so that we can
    # remove them later to ensure there are no stranded
    # .npy files
    npy_files = glob.glob(join(model_dir, '*.npy'))

    # now load the SKLL model
    model = Learner.from_file(model_file)

    # delete the existing npy files. The model file will get overwritten,
    # but we do not know the exact number of current .npy files.
    for npy_file in npy_files:
        remove(npy_file)

    model.save(model_file)
Example #15
    def __init__(self, model_path, threshold=None, positive_class=1):
        '''
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive class, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_class: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_class: int
        '''
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_class
        self.threshold = threshold
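Hypothetical usage, assuming the enclosing class is named Predictor and the model path is illustrative:

predictor = Predictor('/path/to/some.model', threshold=0.6, positive_class=1)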
Example #16
    def __init__(self, model_path, threshold=None, positive_class=1):
        '''
        Initialize the predictor.

        :param model_path: Path to use when loading trained model.
        :type model_path: str
        :param threshold: If the model we're using is generating probabilities
                          of the positive class, return 1 if it meets/exceeds
                          the given threshold and 0 otherwise.
        :type threshold: float
        :param positive_class: If the model is only being used to predict the
                               probability of a particular class, this
                               specifies the index of the class we're
                               predicting. 1 = second class, which is default
                               for binary classification.
        :type positive_class: int
        '''
        self._learner = Learner.from_file(model_path)
        self._pos_index = positive_class
        self.threshold = threshold
Example #17
def test_api_with_inverted_custom_metric():
    """Test API with a lower-is-better custom metric"""

    # register a lower-is-better custom metric from our file
    # which is simply 1 minus the precision score
    input_dir = join(_my_dir, "other")
    custom_metrics_file1 = join(input_dir, "custom_metrics.py")
    register_custom_metric(custom_metrics_file1, "one_minus_precision")

    # create some classification data
    train_fs, _ = make_classification_data(num_examples=1000,
                                           num_features=10,
                                           num_labels=2)

    # set up a learner to tune using the lower-is-better custom metric
    learner1 = Learner("LogisticRegression")
    (grid_score1,
     grid_results_dict1) = learner1.train(train_fs,
                                          grid_objective="one_minus_precision")

    # now setup another learner that uses the complementary version
    # of our custom metric (regular precision) for grid search
    learner2 = Learner("LogisticRegression")
    (grid_score2,
     grid_results_dict2) = learner2.train(train_fs, grid_objective="precision")

    # for both learners, the ranking of the C hyperparameter should be
    # identical since when we defined one_minus_precision we set the
    # `greater_is_better` keyword argument to `False`
    assert_array_equal(grid_results_dict1['rank_test_score'],
                       grid_results_dict2['rank_test_score'])

    # furthermore, the final grid score and the mean scores for each
    # C hyperparameter value should follow the same 1-X relationship
    # except that our custom metric should be negated due to the
    # keyword argument that we set when we defined it
    assert_almost_equal(1 - grid_score2, -1 * grid_score1, places=6)
    assert_array_almost_equal(1 - grid_results_dict2['mean_test_score'],
                              -1 * grid_results_dict1['mean_test_score'],
                              decimal=6)
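The negation checked in the assertions above mirrors how scikit-learn treats lower-is-better metrics: a scorer built with make_scorer(..., greater_is_better=False) returns the negated metric value, so a smaller raw value yields a larger score. A plain-sklearn sketch of such a metric (an assumption about how one_minus_precision might be defined, not the actual fixture):

from sklearn.metrics import make_scorer, precision_score

def one_minus_precision(y_true, y_pred):
    # lower is better: 0 means perfect precision
    return 1 - precision_score(y_true, y_pred)

# greater_is_better=False makes sklearn negate the value internally
one_minus_precision_scorer = make_scorer(one_minus_precision, greater_is_better=False)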
Example #18
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    parser = argparse.ArgumentParser(description="Prints out the weights of a \
                                                  given model.",
                                     conflict_handler='resolve',
                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int, default=50)
    group.add_argument("--sort_by_labels", '-s', action='store_true',
                       default=False, help="order the features by classes")
    parser.add_argument('--sign',
                        choices=['positive', 'negative', 'all'],
                        default='all',
                        help='show only positive, only negative, ' +
                             'or all weights')
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - ' +
                                '%(message)s'))

    k = args.k if args.k > 0 else None

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
        (isinstance(model, LogisticRegression) and
            len(learner.label_list) > 2) or
        (isinstance(model, SVC) and
            model.kernel == 'linear')):
        multiclass = True
    weight_items = iteritems(weights)
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of intercepts but
            # sometimes that array is of length 1 so we don't need to print that
            # as an array/list. First, let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercept(s); arrays with more than one
            # intercept are printed as a list of formatted values
            if intercept_is_array:
                intercept_list = ["{:.12f}".format(i) for i in model_intercepts]
                print("intercept = {}".format(intercept_list))
            else:
                print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(), key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))
Example #19
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'], configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError('The directory {} does not contain '
                                    'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError('The directory {} does not contain any rsmtool models.'.format(experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError('{} does not contain a model for the experiment "{}". '
                                'The following experiments are contained in this '
                                'directory: {}'.format(experiment_output_dir,
                                                       experiment_id,
                                                       experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(experiment_output_dir,
                                                                     expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as a string
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns if column])

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file
    # we append it to output anyway as long as
    # it is in the input file

    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input,
                                      [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError("The data contains repeated response IDs in {}. "
                         "Please make sure all response IDs are unique "
                         "and re-run the tool.".format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: {}'.format(feats_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features + [id_column])
    if extra_features:
        logger.warning('The following extraneous features will be ignored: {}'.format(extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # weird features, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered, feature_name, 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError("There are no responses left after "
                         "filtering out non-numeric feature values. No analysis "
                         "will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name]['train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name]['train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(feature_values,
                                                                    feature_name,
                                                                    feature_transformation,
                                                                    train_feature_mean,
                                                                    train_feature_sd,
                                                                    exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (df_features_preprocessed[feature_name] - train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = df_features_preprocessed[feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info('Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(join(experiment_output_dir, '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params['train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
    scaled_predictions = (df_predictions['raw'] - train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min, trim_max)
    df_predictions['raw_trim_round'] = np.rint(df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min, trim_max)
    df_predictions['scale_trim_round'] = np.rint(df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(df_predictions,
                                                df_input[['spkitemid'] + columns_to_copy])
        assert(len(df_predictions) == len(df_predictions_with_metadata))
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(*splitext(output_file))
        logger.info('Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
Example #20
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):
    """
    Train one of the built-in linear regression models.

    Parameters
    ----------
    model_name : str
        Name of the built-in model to train.
    df_train : pandas DataFrame
        Data frame containing the features on which
        to train the model.
    experiment_id : str
        The experiment ID.
    csvdir : str
        Path to the `output` experiment output directory.
    figdir : str
        Path to the `figure` experiment output directory.

    Returns
    -------
    learner : skll Learner object
        SKLL LinearRegression Learner object containing
        the coefficients learned by training the built-in
        model.
    """
    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression and positive only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # check whether the intercept is set to 0 and if so then we need
        # to flip the sign and refit the model to ensure that it is always
        # kept in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is different from this lambda
        # so we need to normalize looking at the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
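    # (standardized coefficient = raw coefficient * sd(feature) / sd(score);
    # the relative value is each standardized coefficient's share of the
    # total absolute standardized weight)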
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

        # create a data frame with main model fit metrics and save to the file
        df_model_fit = model_fit_to_dataframe(fit)
        model_fit_file = join(csvdir, '{}_model_fit.csv'.format(experiment_id))
        df_model_fit.to_csv(model_fit_file, index=False)

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
Example #21
0
    def validate_config(cls, config, context='rsmtool'):
        """
        Validate configuration file.

        Ensure that all required fields are specified, add default values
        for all unspecified fields, and ensure that all specified
        fields are valid.

        Parameters
        ----------
        config : dict
            The configuration dictionary to validate.
        context : str, optional
            Context of the tool in which we are validating.
            Possible values are ::

                {'rsmtool', 'rsmeval',
                 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

            Defaults to 'rsmtool'.

        Returns
        -------
        new_config : dict
            The validated configuration dictionary with default values
            filled in for any unspecified optional fields.

        Raises
        ------
        ValueError
            If a required field is missing, an unrecognized field is
            specified, or a field value is invalid.
        """
        # make a copy of the given parameter dictionary
        new_config = deepcopy(config)

        # 1. Check to make sure all required fields are specified
        required_fields = CHECK_FIELDS[context]['required']

        for field in required_fields:
            if field not in new_config:
                raise ValueError("The config file must "
                                 "specify '{}'".format(field))

        # 2. Add default values for unspecified optional fields
        # for given RSMTool context
        defaults = DEFAULTS

        for field in defaults:
            if field not in new_config:
                new_config[field] = defaults[field]

        # 3. Check to make sure no unrecognized fields are specified
        for field in new_config:
            if field not in defaults and field not in required_fields:
                raise ValueError("Unrecognized field '{}'"
                                 " in json file".format(field))

        # 4. Check to make sure that the ID fields that will be
        # used as part of filenames are formatted correctly
        # i.e., they do not contain any spaces and are at most 200 characters long
        id_field = ID_FIELDS[context]
        id_field_values = {id_field: new_config[id_field]}

        for id_field, id_field_value in id_field_values.items():
            if len(id_field_value) > 200:
                raise ValueError("{} is too long (must be "
                                 "<=200 characters)".format(id_field))

            if re.search(r'\s', id_field_value):
                raise ValueError("{} cannot contain any "
                                 "spaces".format(id_field))

        # 5. Check that the feature file and feature subset/subset file are not
        # specified together
        msg = ("You cannot specify BOTH \"features\" and \"{}\". "
               "Please refer to the \"Selecting Feature Columns\" "
               "section in the documentation for more details.")
        if new_config['features'] and new_config['feature_subset_file']:
            msg = msg.format("feature_subset_file")
            raise ValueError(msg)
        if new_config['features'] and new_config['feature_subset']:
            msg = msg.format("feature_subset")
            raise ValueError(msg)

        # 6. Check for fields that require feature_subset_file and try
        # to use the default feature file
        if (new_config['feature_subset']
                and not new_config['feature_subset_file']):

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = Path(default_feature_subset_file).name
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You requested feature subsets but did not "
                                "specify any feature file. "
                                "The tool will use the default "
                                "feature file {} available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to use feature subsets, you "
                                 "must specify a feature subset file")

        if new_config['sign'] and not new_config['feature_subset_file']:

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = Path(default_feature_subset_file).name
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You specified the expected sign of "
                                "correlation but did not specify a feature "
                                "subset file. The tool will use "
                                "the default feature subset file {} "
                                "available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to specify the expected sign of "
                                 " correlation for each feature, you must "
                                 "specify a feature subset file")

        # Use the default sign if we are using the default feature file
        # and sign has not been specified in the config file
        if HAS_RSMEXTRA:
            default_feature = default_feature_subset_file
            if (new_config['feature_subset_file'] == default_feature
                    and not new_config['sign']):
                new_config['sign'] = default_feature_sign

        # 7. Check for fields that must be specified together
        if (new_config['min_items_per_candidate']
                and not new_config['candidate_column']):
            raise ValueError("If you want to filter out candidates with "
                             "responses to less than X items, you need "
                             "to specify the name of the column which "
                             "contains candidate IDs.")

        # 8. Check that if "skll_objective" is specified, it's
        # one of the metrics that SKLL allows for AND that it is
        # specified for a SKLL model and _not_ a built-in
        # linear regression model
        if new_config['skll_objective']:
            if not is_skll_model(new_config['model']):
                warnings.warn(
                    "You specified a custom SKLL objective but also chose a "
                    "non-SKLL model. The objective will be ignored.")
            else:
                if new_config['skll_objective'] not in SCORERS:
                    raise ValueError(
                        "Invalid SKLL objective. Please refer to the SKLL "
                        "documentation and choose a valid tuning objective.")

        # 9. Check that if "skll_fixed_parameters" is specified,
        # it's specified for SKLL model and _not_ a built-in linear
        # regression model; we cannot check whether the parameters
        # are valid at parse time but SKLL will raise an error
        # at run time for any invalid parameters
        if new_config['skll_fixed_parameters']:
            if not is_skll_model(new_config['model']):
                warnings.warn(
                    "You specified custom SKLL fixed parameters but "
                    "also chose a non-SKLL model. The parameters will "
                    "be ignored.")

        # 10. Check that if we are running rsmtool to ask for
        # expected scores then the SKLL model type must actually
        # support probabilistic classification. If it's not a SKLL
        # model at all, we just treat it as a LinearRegression model
        # which is basically what they all are in the end.
        if context == 'rsmtool' and new_config['predict_expected_scores']:
            model_name = new_config['model']
            dummy_learner = Learner(model_name) if is_skll_model(
                model_name) else Learner('LinearRegression')
            if not hasattr(dummy_learner.model_type, 'predict_proba'):
                raise ValueError(
                    "{} does not support expected scores "
                    "since it is not a probablistic classifier.".format(
                        model_name))
            del dummy_learner

        # 11. Check the fields that require rsmextra
        if not HAS_RSMEXTRA:
            if new_config['special_sections']:
                raise ValueError("Special sections are only available to ETS"
                                 " users by installing the rsmextra package.")

        # 12. Raise a warning if we are specifying a feature file but also
        # telling the system to automatically select transformations
        if new_config['features'] and new_config['select_transformations']:
            # Show a warning unless a user passed a list of features.
            if not isinstance(new_config['features'], list):
                warnings.warn("You specified a feature file but also set "
                              "`select_transformations` to True. Any "
                              "transformations or signs specified in "
                              "the feature file will be overwritten by "
                              "the automatically selected transformations "
                              "and signs.")

        # 13. If we have `experiment_names`, check that the length of the list
        # matches the list of experiment_dirs.
        if context == 'rsmsummarize' and new_config['experiment_names']:
            if len(new_config['experiment_names']) != len(
                    new_config['experiment_dirs']):
                raise ValueError(
                    "The number of specified experiment names should be the same"
                    " as the number of specified experiment directories.")

        # 14. Check that if the user specified min_n_per_group, they also
        # specified subgroups. If they supplied a dictionary, make
        # sure the keys match
        if new_config['min_n_per_group']:
            # make sure we have subgroups
            if 'subgroups' not in new_config:
                raise ValueError("You must specify a list of subgroups in "
                                 "in the `subgroups` field if "
                                 "you want to use the `min_n_per_group` field")
            # if we got dictionary, make sure the keys match
            elif isinstance(new_config['min_n_per_group'], dict):
                if sorted(new_config['min_n_per_group'].keys()) != sorted(
                        new_config['subgroups']):
                    raise ValueError(
                        "The keys in `min_n_per_group` must "
                        "match the subgroups in `subgroups` field")
            # else convert to dictionary
            else:
                new_config['min_n_per_group'] = {
                    group: new_config['min_n_per_group']
                    for group in new_config['subgroups']
                }
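                # (for example, a hypothetical min_n_per_group of 5 with
                #  subgroups ['L1', 'gender'] would become
                #  {'L1': 5, 'gender': 5})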

        # 15. Clean up config dict to keep only context-specific fields
        context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                                   CHECK_FIELDS[context]['required'])

        new_config = {
            k: v
            for k, v in new_config.items() if k in context_relevant_fields
        }

        return new_config
Example #22
0
    def validate_config(self, context='rsmtool', inplace=True):
        """
        Ensure that all required fields are specified, add default values
        for all unspecified fields, and ensure that all specified
        fields are valid.

        Parameters
        ----------
        context : str, optional
            Context of the tool in which we are validating.
            Possible values are ::

                {'rsmtool', 'rsmeval',
                 'rsmpredict', 'rsmcompare', 'rsmsummarize'}

            Defaults to 'rsmtool'.
        inplace : bool
            Maintain the state of the config object produced by
            this method.
            Defaults to True.

        Returns
        -------
        config_obj : Configuration
            A configuration object

        Raises
        ------
        ValueError
            If no configuration has been loaded or the configuration is invalid.
        """

        # Check to make sure a configuration file
        # or dictionary has been loaded.
        self._check_config_is_loaded()

        # Get the parameter dictionary
        new_config = self._config

        # 1. Check to make sure all required fields are specified
        required_fields = CHECK_FIELDS[context]['required']

        for field in required_fields:
            if field not in new_config:
                raise ValueError("The config file must "
                                 "specify '{}'".format(field))

        # 2. Add default values for unspecified optional fields
        # for given RSMTool context
        defaults = DEFAULTS

        for field in defaults:
            if field not in new_config:
                new_config[field] = defaults[field]

        # 3. Check to make sure no unrecognized fields are specified
        for field in new_config:
            if field not in defaults and field not in required_fields:
                raise ValueError("Unrecognized field '{}'"
                                 " in json file".format(field))

        # 4. Check to make sure that the ID fields that will be
        # used as part of filenames are formatted correctly
        id_fields = ['comparison_id', 'experiment_id', 'summary_id']
        id_field_values = {
            field: new_config[field]
            for field in new_config if field in id_fields
        }

        # we do not need to validate any IDs for `rsmpredict`
        self.check_id_fields(id_field_values)

        # 5. Check that the feature file and feature subset/subset file are not
        # specified together
        msg = ("You cannot specify BOTH \"features\" and \"{}\". "
               "Please refer to the \"Selecting Feature Columns\" "
               "section in the documentation for more details.")
        if new_config['features'] and new_config['feature_subset_file']:
            msg = msg.format("feature_subset_file")
            raise ValueError(msg)
        if new_config['features'] and new_config['feature_subset']:
            msg = msg.format("feature_subset")
            raise ValueError(msg)

        # 6. Check for fields that require feature_subset_file and try
        # to use the default feature file
        if (new_config['feature_subset']
                and not new_config['feature_subset_file']):

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = basename(default_feature_subset_file)
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You requested feature subsets but did not "
                                "specify any feature file. "
                                "The tool will use the default "
                                "feature file {} available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to use feature subsets, you "
                                 "must specify a feature subset file")

        if new_config['sign'] and not new_config['feature_subset_file']:

            # Check if we have the default subset file from rsmextra
            if HAS_RSMEXTRA:
                default_basename = basename(default_feature_subset_file)
                new_config['feature_subset_file'] = default_feature_subset_file
                logging.warning("You specified the expected sign of "
                                "correlation but did not specify a feature "
                                "subset file. The tool will use "
                                "the default feature subset file {} "
                                "available via "
                                "rsmextra".format(default_basename))
            else:
                raise ValueError("If you want to specify the expected sign of "
                                 " correlation for each feature, you must "
                                 "specify a feature subset file")

        # Use the default sign if we are using the default feature file
        # and sign has not been specified in the config file
        if HAS_RSMEXTRA:
            default_feature = default_feature_subset_file
            if (new_config['feature_subset_file'] == default_feature
                    and not new_config['sign']):
                new_config['sign'] = default_feature_sign

        # 7. Check for fields that must be specified together
        if (new_config['min_items_per_candidate']
                and not new_config['candidate_column']):
            raise ValueError("If you want to filter out candidates with "
                             "responses to less than X items, you need "
                             "to specify the name of the column which "
                             "contains candidate IDs.")

        # 8. Check that if "skll_objective" is specified, it's
        # one of the metrics that SKLL allows for AND that it is
        # specified for a SKLL model and _not_ a built-in
        # linear regression model
        if new_config['skll_objective']:
            if not is_skll_model(new_config['model']):
                logging.warning(
                    "You specified a custom SKLL objective but also chose a "
                    "non-SKLL model. The objective will be ignored.")
            else:
                if new_config['skll_objective'] not in SCORERS:
                    raise ValueError(
                        "Invalid SKLL objective. Please refer to the SKLL "
                        "documentation and choose a valid tuning objective.")

        # 9. Check that if we are running rsmtool to ask for
        # expected scores then the SKLL model type must actually
        # support probabilistic classification. If it's not a SKLL
        # model at all, we just treat it as a LinearRegression model
        # which is basically what they all are in the end.
        if context == 'rsmtool' and new_config['predict_expected_scores']:
            model_name = new_config['model']
            dummy_learner = Learner(model_name) if is_skll_model(
                model_name) else Learner('LinearRegression')
            if not hasattr(dummy_learner.model_type, 'predict_proba'):
                raise ValueError(
                    "{} does not support expected scores "
                    "since it is not a probablistic classifier.".format(
                        model_name))
            del dummy_learner

        # 10. Check the fields that require rsmextra
        if not HAS_RSMEXTRA:
            if new_config['special_sections']:
                raise ValueError("Special sections are only available to ETS"
                                 " users by installing the rsmextra package.")

        # 11. Raise a warning if we are specifying a feature file but also
        # telling the system to automatically select transformations
        if new_config['features'] and new_config['select_transformations']:
            logging.warning("You specified a feature file but also set "
                            "`select_transformations` to True. Any "
                            "transformations or signs specified in "
                            "the feature file will be overwritten by "
                            "the automatically selected transformations "
                            "and signs.")

        # 12. Clean up config dict to keep only context-specific fields
        context_relevant_fields = (CHECK_FIELDS[context]['optional'] +
                                   CHECK_FIELDS[context]['required'])

        new_config = {
            k: v
            for k, v in new_config.items() if k in context_relevant_fields
        }

        if inplace:
            self._config = new_config
        return Configuration(self._config, self._filepath)
Example #23
0
def train_builtin_model(model_name, df_train, experiment_id, csvdir, figdir):

    # get the columns that actually contain the feature values
    feature_columns = [c for c in df_train.columns if c not in ['spkitemid', 'sc1']]

    # LinearRegression (formerly empWt) : simple linear regression
    if model_name == 'LinearRegression':

        # get the feature columns
        X = df_train[feature_columns]

        # add the intercept
        X = sm.add_constant(X)

        # fit the model
        fit = sm.OLS(df_train['sc1'], X).fit()
        df_coef = ols_coefficients_to_dataframe(fit.params)
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # EqualWeightsLR (formerly eqWt) : all features get equal weight
    elif model_name == 'EqualWeightsLR':
        # we first compute a single feature that is simply the sum of all features
        df_train_eqwt = df_train.copy()
        df_train_eqwt['sumfeature'] = df_train_eqwt[feature_columns].apply(lambda row: np.sum(row), axis=1)

        # train a plain Linear Regression model
        X = df_train_eqwt['sumfeature']
        X = sm.add_constant(X)
        fit = sm.OLS(df_train_eqwt['sc1'], X).fit()

        # get the coefficient for the summed feature and the intercept
        coef = fit.params['sumfeature']
        const = fit.params['const']

        # now we need to assign this coefficient to all of the original
        # features and create a fake SKLL learner with these weights
        original_features = [c for c in df_train_eqwt.columns if c not in ['sc1',
                                                                           'sumfeature',
                                                                           'spkitemid']]
        coefs = pd.Series(dict([(origf, coef) for origf in original_features] + [('const', const)]))
        df_coef = ols_coefficients_to_dataframe(coefs)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # RebalancedLR (formerly empWtBalanced) : balanced empirical weights
    # by changing betas [adapted from http://bit.ly/UTP7gS]
    elif model_name == 'RebalancedLR':

        # train a plain Linear Regression model
        X = df_train[feature_columns]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_params = ols_coefficients_to_dataframe(fit.params)
        df_params = df_params.set_index('feature')

        # compute the betas for the non-intercept coefficients
        df_weights = df_params.loc[feature_columns]
        df_betas = df_weights.copy()
        df_betas['coefficient'] = df_weights['coefficient'].multiply(df_train[feature_columns].std(), axis='index') / df_train['sc1'].std()

        # replace each negative beta with delta and adjust
        # all the positive betas to account for this
        RT = 0.05
        df_positive_betas = df_betas[df_betas['coefficient'] > 0]
        df_negative_betas = df_betas[df_betas['coefficient'] < 0]
        delta = np.sum(df_positive_betas['coefficient']) * RT / len(df_negative_betas)
        df_betas['coefficient'] = df_betas.apply(lambda row: row['coefficient'] * (1-RT) if row['coefficient'] > 0 else delta, axis=1)
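        # (each positive beta gives up a fraction RT of its value and that
        # mass is redistributed equally across the formerly negative betas,
        # so the total amount of positive beta mass is preserved)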

        # rescale the adjusted betas to get the new coefficients
        df_coef = (df_betas['coefficient'] * df_train['sc1'].std()).divide(df_train[feature_columns].std(), axis='index')

        # add the intercept back to the new coefficients
        df_coef['Intercept'] = df_params.loc['Intercept'].coefficient
        df_coef = df_coef.sort_index().reset_index()
        df_coef.columns = ['feature', 'coefficient']

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used all the features
        used_features = feature_columns

    # LassoFixedLambdaThenLR (formerly empWtLasso) : First do feature
    # selection using lasso regression with a fixed lambda and then
    # use only those features to train a second linear regression
    elif model_name == 'LassoFixedLambdaThenLR':

        # train a Lasso Regression model with this featureset with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
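        # (sklearn scales the squared-error term by 1 / (2 * n_samples),
        # hence alpha = lambda / n_samples)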
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train a new vanilla linear regression with just the non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # get the coefficients data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # PositiveLassoCVThenLR (formerly empWtLassoBest) : First do feature
    # selection using lasso regression optimized for log likelihood using
    # cross validation and then use only those features to train a
    # second linear regression
    elif model_name == 'PositiveLassoCVThenLR':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert the model parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # NNLR (formerly empWtNNLS) : First do feature selection using
    # non-negative least squares (NNLS) and then use only its non-zero
    # features to train a regular linear regression. We do the regular
    # LR at the end since we want an LR object so that we have access
    # to R^2 and other useful statistics. There should be no difference
    # between the non-zero coefficients from NNLS and the coefficients
    # that end up coming out of the subsequent LR.
    elif model_name == 'NNLR':

        # add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # if the fitted intercept comes out as 0, flip the sign of the
        # intercept column and refit so that an intercept term is always
        # retained in the model
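        # (nnls() only allows non-negative coefficients, so a negative
        # intercept has to be modeled by negating the intercept column)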
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters to a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the non-zero features
        used_features = non_zero_features

    # LassoFixedLambdaThenNNLR (formerly empWtDropNegLasso): First do
    # feature selection using lasso regression with positive-only weights.
    # Then fit an NNLR (see above) on those features.
    elif model_name == 'LassoFixedLambdaThenNNLR':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
        p_alpha = p_lambda / len(df_train)
        l_lasso = Learner('Lasso', model_kwargs={'alpha': p_alpha, 'positive': True})
        l_lasso.train(fs_train, grid_search=False)

        # get the feature names that have the non-zero coefficients
        non_zero_features = list(l_lasso.model_params[0].keys())

        # now train an NNLS regression using these non-zero features
        # first add an intercept to the features manually
        X = df_train[feature_columns].values
        intercepts = np.ones((len(df_train), 1))
        X_plus_intercept = np.concatenate([intercepts, X], axis=1)
        y = df_train['sc1'].values

        # fit an NNLS model on this data
        coefs, rnorm = nnls(X_plus_intercept, y)

        # if the fitted intercept comes out as 0, flip the sign of the
        # intercept column and refit so that an intercept term is always
        # retained in the model
        if coefs[0] == 0:
            intercepts = -1 * np.ones((len(df_train), 1))
            X_plus_intercept = np.concatenate([intercepts, X], axis=1)
            coefs, rnorm = nnls(X_plus_intercept, y)

        # separate the intercept and feature coefficients
        intercept = coefs[0]
        coefficients = coefs[1:].tolist()

        # get the non-zero features from this model
        non_zero_features = []
        for feature, coefficient in zip(feature_columns, coefficients):
            if coefficient != 0:
                non_zero_features.append(feature)

        # now train a new linear regression with just these non-zero features
        X = df_train[non_zero_features]
        X = sm.add_constant(X)
        fit = sm.OLS(df_train['sc1'], X).fit()

        # convert this model's parameters into a data frame
        df_coef = ols_coefficients_to_dataframe(fit.params)

        # create fake SKLL learner with these coefficients
        learner = create_fake_skll_learner(df_coef)

        # we used only the positive features
        used_features = non_zero_features

    # LassoFixedLambda (formerly lassoWtLasso) : Lasso model with
    # a fixed lambda
    elif model_name == 'LassoFixedLambda':

        # train a Lasso Regression model with a preset lambda
        p_lambda = sqrt(len(df_train) * log10(len(feature_columns)))

        # create a SKLL FeatureSet instance from the given data frame
        fs_train = create_featureset_from_dataframe(df_train)

        # note that 'alpha' in sklearn is not the same as this lambda,
        # so we need to rescale it based on the sklearn objective equation
        alpha = p_lambda / len(df_train)
        learner = Learner('Lasso', model_kwargs={'alpha': alpha, 'positive': True})
        learner.train(fs_train, grid_search=False)

        # convert this model's parameters to a data frame
        df_coef = skll_learner_params_to_dataframe(learner)

        # there's no OLS fit object in this case
        fit = None

        # we used all the features
        used_features = feature_columns

    # PositiveLassoCV (formerly lassoWtLassoBest) : feature selection
    # using lasso regression optimized for log likelihood using cross
    # validation.
    elif model_name == 'PositiveLassoCV':

        # train a LassoCV outside of SKLL since it's not exposed there
        X = df_train[feature_columns].values
        y = df_train['sc1'].values
        clf = LassoCV(cv=10, positive=True, random_state=1234567890)
        model = clf.fit(X, y)

        # save the non-zero model coefficients and intercept to a data frame
        non_zero_features, non_zero_feature_values = [], []
        for feature, coefficient in zip(feature_columns, model.coef_):
            if coefficient != 0:
                non_zero_features.append(feature)
                non_zero_feature_values.append(coefficient)

        # initialize the coefficient data frame with just the intercept
        df_coef = pd.DataFrame([('Intercept', model.intercept_)])
        df_coef = df_coef.append(list(zip(non_zero_features,
                                          non_zero_feature_values)), ignore_index=True)
        df_coef.columns = ['feature', 'coefficient']

        # create a fake SKLL learner with these non-zero weights
        learner = create_fake_skll_learner(df_coef)

        # there's no OLS fit object in this case
        fit = None

        # we used only the non-zero features
        used_features = non_zero_features

    # save the raw coefficients to a file
    df_coef.to_csv(join(csvdir, '{}_coefficients.csv'.format(experiment_id)), index=False)

    # compute the standardized and relative coefficients (betas) for the
    # non-intercept features and save to a file
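    # (standardized = coefficient * sd(feature) / sd(score); relative = share
    # of the total absolute standardized weight)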
    df_betas = df_coef.set_index('feature').loc[used_features]
    df_betas = df_betas.multiply(df_train[used_features].std(), axis='index') / df_train['sc1'].std()
    df_betas.columns = ['standardized']
    df_betas['relative'] = df_betas / sum(abs(df_betas['standardized']))
    df_betas.reset_index(inplace=True)
    df_betas.to_csv(join(csvdir, '{}_betas.csv'.format(experiment_id)), index=False)

    # save the OLS fit object and its summary to files
    if fit:
        ols_file = join(csvdir, '{}.ols'.format(experiment_id))
        summary_file = join(csvdir, '{}_ols_summary.txt'.format(experiment_id))
        with open(ols_file, 'wb') as olsf, open(summary_file, 'w') as summf:
            pickle.dump(fit, olsf)
            summf.write(str(fit.summary()))

    # save the SKLL model to a file
    model_file = join(csvdir, '{}.model'.format(experiment_id))
    learner.save(model_file)

    return learner
Example #24
0
def compute_and_save_predictions(config_file, output_file, feats_file):
    """
    Generate predictions using the information in the config file
    and save them into the given output file.
    """

    logger = logging.getLogger(__name__)

    # read in the main config file
    config_obj = read_json_file(config_file)
    config_obj = check_main_config(config_obj, context='rsmpredict')

    # get the directory where the config file lives
    configpath = dirname(config_file)

    # get the input file containing the feature values
    # for which we want to generate the predictions
    input_features_file = locate_file(config_obj['input_features_file'],
                                      configpath)
    if not input_features_file:
        raise FileNotFoundError('Input file {} does not exist'.format(
            config_obj['input_features_file']))

    # get the experiment ID
    experiment_id = config_obj['experiment_id']

    # get the column name that will hold the ID
    id_column = config_obj['id_column']

    # get the column name for human score (if any)
    human_score_column = config_obj['human_score_column']

    # get the column name for second human score (if any)
    second_human_score_column = config_obj['second_human_score_column']

    # get the column name for subgroups (if any)
    subgroups = config_obj['subgroups']

    # get the column names for flag columns (if any)
    flag_column_dict = check_flag_column(config_obj)

    # get the name for the candidate_column (if any)
    candidate_column = config_obj['candidate_column']

    # get the directory of the experiment
    experiment_dir = locate_file(config_obj['experiment_dir'], configpath)
    if not experiment_dir:
        raise FileNotFoundError('The directory {} does not exist.'.format(
            config_obj['experiment_dir']))
    else:
        experiment_output_dir = normpath(join(experiment_dir, 'output'))
        if not exists(experiment_output_dir):
            raise FileNotFoundError(
                'The directory {} does not contain '
                'the output of an rsmtool experiment.'.format(experiment_dir))

    # find all the .model files in the experiment output directory
    model_files = glob.glob(join(experiment_output_dir, '*.model'))
    if not model_files:
        raise FileNotFoundError(
            'The directory {} does not contain any rsmtool models.'.format(
                experiment_output_dir))

    experiment_ids = [splitext(basename(mf))[0] for mf in model_files]
    if experiment_id not in experiment_ids:
        raise FileNotFoundError(
            '{} does not contain a model for the experiment "{}". '
            'The following experiments are contained in this '
            'directory: {}'.format(experiment_output_dir, experiment_id,
                                   experiment_ids))

    # check that the directory contains other required files
    required_file_types = ['feature', 'postprocessing_params']
    for file_type in required_file_types:
        expected_file_name = "{}_{}.csv".format(experiment_id, file_type)
        if not exists(join(experiment_output_dir, expected_file_name)):
            raise FileNotFoundError('{} does not contain the required file '
                                    '{} that was generated during the '
                                    'original model training'.format(
                                        experiment_output_dir,
                                        expected_file_name))

    # read in the given features but make sure that the
    # `id_column`, `candidate_column` and subgroups are read in as a string
    logger.info('Reading features from {}'.format(input_features_file))
    string_columns = [id_column, candidate_column] + subgroups
    converter_dict = dict([(column, str) for column in string_columns
                           if column])
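    # (reading these columns as strings prevents, e.g., an ID like '007'
    # from being coerced to an integer and losing its leading zeros)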

    df_input = pd.read_csv(input_features_file, converters=converter_dict)

    # make sure that the columns specified in the config file actually exist
    columns_to_check = [id_column] + subgroups + list(flag_column_dict.keys())

    # add subgroups and the flag columns to the list of columns
    # that will be added to the final file
    columns_to_copy = subgroups + list(flag_column_dict.keys())

    # human_score_column will be set to sc1 by default
    # we only raise an error if it's set to something else.
    # However, since we cannot distinguish whether the column was set
    # to sc1 by default or specified as such in the config file
    # we append it to output anyway as long as
    # it is in the input file

    if human_score_column != 'sc1' or 'sc1' in df_input.columns:
        columns_to_check.append(human_score_column)
        columns_to_copy.append('sc1')

    if candidate_column:
        columns_to_check.append(candidate_column)
        columns_to_copy.append('candidate')

    if second_human_score_column:
        columns_to_check.append(second_human_score_column)
        columns_to_copy.append('sc2')

    missing_columns = set(columns_to_check).difference(df_input.columns)
    if missing_columns:
        raise KeyError("Columns {} from the config file "
                       "do not exist in the data.".format(missing_columns))

    # rename all columns
    df_input = rename_default_columns(df_input, [],
                                      id_column,
                                      human_score_column,
                                      second_human_score_column,
                                      None,
                                      None,
                                      candidate_column=candidate_column)

    # check that the id_column contains unique values
    if df_input['spkitemid'].size != df_input['spkitemid'].unique().size:
        raise ValueError(
            "The data contains repeated response IDs in {}. Please make sure all response IDs are unique and re-run the tool."
            .format(id_column))

    # now we need to pre-process these features using
    # the parameters that are already stored in the
    # _features.csv file.
    df_feature_info = pd.read_csv(join(experiment_output_dir,
                                       '{}_feature.csv'.format(experiment_id)),
                                  index_col=0)
    required_features = df_feature_info.index.tolist()

    # ensure that all the features that are needed by the model
    # are present in the input file
    input_feature_columns = [c for c in df_input if c != id_column]
    missing_features = set(required_features).difference(input_feature_columns)
    if missing_features:
        raise KeyError('{} is missing the following features: {}'.format(
            feats_file, missing_features))
    extra_features = set(input_feature_columns).difference(required_features +
                                                           [id_column])
    if extra_features:
        logging.warning(
            'The following extraneous features will be ignored: {}'.format(
                extra_features))

    # keep the required features plus the id
    features_to_keep = ['spkitemid'] + required_features

    # check if we actually have the human scores for this data and add
    # sc1 to preprocessed features for consistency with other tools
    has_human_scores = 'sc1' in df_input
    if has_human_scores:
        features_to_keep.append('sc1')

    df_features = df_input[features_to_keep]

    # preprocess the feature values
    logger.info('Pre-processing input features')

    # first we need to filter out NaNs and any other
    # non-numeric feature values, the same way we did for rsmtool.
    df_filtered = df_features.copy()
    df_excluded = pd.DataFrame(columns=df_filtered.columns)

    for feature_name in required_features:
        newdf, newdf_excluded = filter_on_column(df_filtered,
                                                 feature_name,
                                                 'spkitemid',
                                                 exclude_zeros=False,
                                                 exclude_zero_sd=False)
        del df_filtered
        df_filtered = newdf
        df_excluded = pd.merge(df_excluded, newdf_excluded, how='outer')

    # make sure that the remaining data frame is not empty
    if len(df_filtered) == 0:
        raise ValueError(
            "There are no responses left after "
            "filtering out non-numeric feature values. No analysis "
            "will be run")

    df_features = df_filtered.copy()
    df_features_preprocessed = df_features.copy()
    for feature_name in required_features:

        feature_values = df_features[feature_name].values

        feature_transformation = df_feature_info.loc[feature_name]['transform']
        feature_weight = df_feature_info.loc[feature_name]['sign']

        train_feature_mean = df_feature_info.loc[feature_name]['train_mean']
        train_feature_sd = df_feature_info.loc[feature_name]['train_sd']

        train_transformed_mean = df_feature_info.loc[feature_name][
            'train_transformed_mean']
        train_transformed_sd = df_feature_info.loc[feature_name][
            'train_transformed_sd']

        # transform the feature values and remove outliers
        df_features_preprocessed[feature_name] = preprocess_feature(
            feature_values,
            feature_name,
            feature_transformation,
            train_feature_mean,
            train_feature_sd,
            exclude_zero_sd=False)

        # now standardize the feature values
        df_features_preprocessed[feature_name] = (
            df_features_preprocessed[feature_name] -
            train_transformed_mean) / train_transformed_sd

        # Multiply features by weight. Within the
        # current SR timeline, the mean of the transformed train
        # feature used to standardize test features has to be
        # computed before multiplying the train feature by the weight.
        df_features_preprocessed[feature_name] = df_features_preprocessed[
            feature_name] * feature_weight

    # save the pre-processed features to disk if we were asked to
    if feats_file:
        logger.info(
            'Saving pre-processed feature values to {}'.format(feats_file))

        # create any directories needed for the output file
        os.makedirs(dirname(feats_file), exist_ok=True)
        df_features_preprocessed.to_csv(feats_file, index=False)

    # now load the SKLL model to generate the predictions
    model = Learner.from_file(
        join(experiment_output_dir, '{}.model'.format(experiment_id)))

    # now generate the predictions for the features using this model
    logger.info('Generating predictions')
    df_predictions = predict_with_model(model, df_features_preprocessed)

    # read in the post-processing parameters from disk
    df_postproc_params = pd.read_csv(
        join(experiment_output_dir,
             '{}_postprocessing_params.csv'.format(experiment_id)))
    trim_min = df_postproc_params['trim_min'].values[0]
    trim_max = df_postproc_params['trim_max'].values[0]
    h1_mean = df_postproc_params['h1_mean'].values[0]
    h1_sd = df_postproc_params['h1_sd'].values[0]
    train_predictions_mean = df_postproc_params[
        'train_predictions_mean'].values[0]
    train_predictions_sd = df_postproc_params['train_predictions_sd'].values[0]

    # now scale the predictions
    logger.info('Rescaling predictions')
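    # (z-score the raw predictions using the training-set prediction mean/sd
    # and then map them onto the human (h1) score scale using its mean/sd)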
    scaled_predictions = (df_predictions['raw'] -
                          train_predictions_mean) / train_predictions_sd
    scaled_predictions = scaled_predictions * h1_sd + h1_mean
    df_predictions['scale'] = scaled_predictions

    # trim and round the predictions
    logger.info('Trimming and rounding predictions')
    df_predictions['raw_trim'] = trim(df_predictions['raw'], trim_min,
                                      trim_max)
    df_predictions['raw_trim_round'] = np.rint(
        df_predictions['raw_trim']).astype('int64')
    df_predictions['scale_trim'] = trim(df_predictions['scale'], trim_min,
                                        trim_max)
    df_predictions['scale_trim_round'] = np.rint(
        df_predictions['scale_trim']).astype('int64')

    # add back the columns that we were requested to copy if any
    if columns_to_copy:
        df_predictions_with_metadata = pd.merge(
            df_predictions, df_input[['spkitemid'] + columns_to_copy])
        assert (len(df_predictions) == len(df_predictions_with_metadata))
    else:
        df_predictions_with_metadata = df_predictions.copy()

    # create any directories needed for the output file
    os.makedirs(dirname(output_file), exist_ok=True)

    # save the predictions to disk
    logger.info('Saving predictions to {}'.format(output_file))
    df_predictions_with_metadata.to_csv(output_file, index=False)

    # save excluded responses to disk
    if not df_excluded.empty:
        excluded_output_file = '{}_excluded_responses{}'.format(
            *splitext(output_file))
        logger.info(
            'Saving excluded responses to {}'.format(excluded_output_file))
        df_excluded.to_csv(excluded_output_file, index=False)
Example #25
0
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """

    parser = argparse.ArgumentParser(
        description="Prints out the weights of a \
                                                  given model.",
        conflict_handler='resolve',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('model_file', help='model file to load')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--k',
                       help='number of top features to print (0 for all)',
                       type=int,
                       default=50)
    group.add_argument("--sort_by_labels",
                       '-s',
                       action='store_true',
                       default=False,
                       help="order the features by classes")
    parser.add_argument(
        '--sign',
        choices=['positive', 'negative', 'all'],
        default='all',
        help='show only positive, only negative or all weights')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))

    k = args.k if args.k > 0 else None
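    # (a slice upper bound of None later on prints all features when --k is 0)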

    learner = Learner.from_file(args.model_file)
    (weights, intercept) = learner.model_params

    multiclass = False
    model = learner._model
    if (isinstance(model, LinearSVC) or
        (isinstance(model, LogisticRegression) and len(learner.label_list) > 2)
            or (isinstance(model, SVC) and model.kernel == 'linear')):
        multiclass = True
    weight_items = weights.items()
    if args.sign == 'positive':
        weight_items = (x for x in weight_items if x[1] > 0)
    elif args.sign == 'negative':
        weight_items = (x for x in weight_items if x[1] < 0)

    if intercept is not None:
        # subclass of LinearModel
        if '_intercept_' in intercept:
            # Some learners (e.g. LinearSVR) may return an array of intercepts but
            # sometimes that array is of length 1 so we don't need to print that
            # as an array/list. First, let's normalize these cases.
            model_intercepts = intercept['_intercept_']
            intercept_is_array = isinstance(model_intercepts, np.ndarray)
            num_intercepts = len(model_intercepts) if intercept_is_array else 1
            if intercept_is_array and num_intercepts == 1:
                model_intercepts = model_intercepts[0]
                intercept_is_array = False

            # now print out the intercepts
            print("intercept = {:.12f}".format(model_intercepts))
        else:
            print("== intercept values ==")
            for (label, val) in intercept.items():
                print("{: .12f}\t{}".format(val, label))
        print()

    print("Number of nonzero features:", len(weights), file=sys.stderr)
    weight_by_class = defaultdict(dict)
    if multiclass and args.sort_by_labels:
        for label_feature, weight in weight_items:
            label, feature = label_feature.split()
            weight_by_class[label][feature] = weight
        for label in sorted(weight_by_class):
            for feat, val in sorted(weight_by_class[label].items(),
                                    key=lambda x: -abs(x[1])):
                print("{: .12f}\t{}\t{}".format(val, label, feat))
    else:
        for feat, val in sorted(weight_items, key=lambda x: -abs(x[1]))[:k]:
            print("{: .12f}\t{}".format(val, feat))