Example #1
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37483895,
                                         0.08816508,
                                         0.25379838,
                                         0.18337128,
                                         0.09982631] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([0.40195798,
                                         0.06702903,
                                         0.25816559,
                                         0.18185518,
                                         0.09099222] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
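These parametrized check_* helpers are typically driven by a nose-style test generator rather than called directly. A minimal sketch of such a driver (the learner names and the parameter grid here are assumptions for illustration, not part of the listing):

# Hypothetical nose-style driver for the check function above
def test_tree_models():
    for name in ['DecisionTreeRegressor', 'RandomForestRegressor']:
        for use_feature_hashing in [False, True]:
            for use_rescaling in [False, True]:
                yield (check_tree_models, name,
                       use_feature_hashing, use_rescaling)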
Example #2
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.37331461,
                                         0.08572699,
                                         0.2543484,
                                         0.1841172,
                                         0.1024928] if use_feature_hashing else
                                        [0.08931994,
                                         0.15545093,
                                         0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [0.40195655,
                                            0.06702161,
                                            0.25814858,
                                            0.18183947,
                                            0.09103379]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #3
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.33718443,
                                            0.07810721,
                                            0.25621769,
                                            0.19489766,
                                            0.13359301]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.204,
                                         0.172,
                                         0.178,
                                         0.212,
                                         0.234] if use_feature_hashing else
                                        [0.262,
                                         0.288,
                                         0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
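Example #4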
def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs,
         test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
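Example #5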
def check_tree_models(name,
                      use_feature_hashing=False,
                      use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([0.730811,
                                         0.001834,
                                         0.247603,
                                         0.015241,
                                         0.004511] if use_feature_hashing else
                                        [0.08926899,
                                         0.15585068,
                                         0.75488033])
    else:
        expected_feature_importances = ([0.733654,
                                         0.002528,
                                         0.245527,
                                         0.013664,
                                         0.004627] if use_feature_hashing else
                                        [0.07974267,
                                         0.16121895,
                                         0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
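Example #6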
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [0.749811,
                                            0.001373,
                                            0.23357,
                                            0.011691,
                                            0.003554]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([0.735756,
                                         0.001034,
                                         0.242734,
                                         0.015836,
                                         0.00464] if use_feature_hashing else
                                        [0.082621,
                                         0.166652,
                                         0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances, expected_feature_importances,
                    atol=1e-2, rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
Example #7
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # First, randomly generate two feature sets and then make sure they have
    # the same labels.
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names() +
                     train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense() ==
                      merged_fs.features[:, 2:4].todense())
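The shuffle above relies on NumPy fancy indexing: indexing an array with a list of positions returns the elements in that order, which is what keeps the ids, labels, and feature rows aligned. A self-contained illustration:

# Standalone illustration of the fancy indexing used for the shuffle
import numpy as np

ids = np.array(['EXAMPLE_1', 'EXAMPLE_2', 'EXAMPLE_3'])
shuffled_indices = [2, 0, 1]
print(ids[shuffled_indices])  # ['EXAMPLE_3' 'EXAMPLE_1' 'EXAMPLE_2']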
Example #8
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        (train_fs, test_fs,
         weightdict) = make_regression_data(num_examples=5000,
                                            num_features=10,
                                            use_feature_hashing=True,
                                            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. However, sometimes with
    # feature hashing, the ceiling is not exactly identical
    # so when that fails we want to check that the rounded
    # feature values are the same. One of those two equalities
    # _must_ be satisfied.

    # get the weights for this trained model
    learned_weights = learner.model_params[0]

    for feature_name in learned_weights:
        learned_w_ceil = math.ceil(learned_weights[feature_name])
        given_w_ceil = math.ceil(weightdict[feature_name])
        learned_w_round = round(learned_weights[feature_name], 0)
        given_w_round = round(weightdict[feature_name], 0)
        ceil_equal = learned_w_ceil == given_w_ceil
        round_equal = learned_w_round == given_w_round
        either_equal = ceil_equal or round_equal
        assert either_equal

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
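To see why the loop accepts either equality, note that two nearly identical weights can straddle an integer boundary, so the ceilings differ while the rounded values agree (or vice versa). A worked illustration with made-up values:

# Why the test accepts ceiling OR rounding equality (illustrative values)
import math

learned_w, given_w = 3.02, 2.97                  # nearly equal, straddling 3.0
print(math.ceil(learned_w), math.ceil(given_w))  # 4 3     -> ceilings differ
print(round(learned_w, 0), round(given_w, 0))    # 3.0 3.0 -> rounding agrees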
Example #9
def test_feature_merging_order_invariance():
    """
    Test whether featuresets with different orders of IDs can be merged
    """

    # First, randomly generate two feature sets and then make sure they have
    # the same labels.
    train_fs1, _, _ = make_regression_data()
    train_fs2, _, _ = make_regression_data(start_feature_num=3,
                                           random_state=87654321)
    train_fs2.labels = train_fs1.labels.copy()

    # make a shuffled copy of feature set 2
    shuffled_indices = list(range(len(train_fs2.ids)))
    np.random.seed(123456789)
    np.random.shuffle(shuffled_indices)
    train_fs2_ids_shuf = train_fs2.ids[shuffled_indices]
    train_fs2_labels_shuf = train_fs2.labels[shuffled_indices]
    train_fs2_features_shuf = train_fs2.features[shuffled_indices]
    train_fs2_shuf = FeatureSet("f2_shuf",
                                train_fs2_ids_shuf,
                                labels=train_fs2_labels_shuf,
                                features=train_fs2_features_shuf,
                                vectorizer=train_fs2.vectorizer)

    # merge feature set 1 with feature set 2 and its shuffled version
    merged_fs = train_fs1 + train_fs2
    merged_fs_shuf = train_fs1 + train_fs2_shuf

    # check that the two merged versions are the same
    feature_names = (train_fs1.vectorizer.get_feature_names()
                     + train_fs2.vectorizer.get_feature_names())
    assert_array_equal(merged_fs.vectorizer.get_feature_names(), feature_names)
    assert_array_equal(merged_fs_shuf.vectorizer.get_feature_names(),
                       feature_names)

    assert_array_equal(merged_fs.labels, train_fs1.labels)
    assert_array_equal(merged_fs.labels, train_fs2.labels)
    assert_array_equal(merged_fs.labels, merged_fs_shuf.labels)

    assert_array_equal(merged_fs.ids, train_fs1.ids)
    assert_array_equal(merged_fs.ids, train_fs2.ids)
    assert_array_equal(merged_fs.ids, merged_fs_shuf.ids)

    assert_array_equal(merged_fs.features[:, 0:2].todense(),
                       train_fs1.features.todense())
    assert_array_equal(merged_fs.features[:, 2:4].todense(),
                       train_fs2.features.todense())
    assert_array_equal(merged_fs.features.todense(),
                       merged_fs_shuf.features.todense())

    assert not np.all(merged_fs.features[:, 0:2].todense()
                      == merged_fs.features[:, 2:4].todense())
Example #10
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [
                0.33718443, 0.07810721, 0.25621769, 0.19489766, 0.13359301
            ]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([
            0.204, 0.172, 0.178, 0.212, 0.234
        ] if use_feature_hashing else [0.262, 0.288, 0.45])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #11
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([
            0.37331461, 0.08572699, 0.2543484, 0.1841172, 0.1024928
        ] if use_feature_hashing else [0.08931994, 0.15545093, 0.75522913])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        if use_feature_hashing:
            expected_feature_importances = [
                0.40195655, 0.06702161, 0.25814858, 0.18183947, 0.09103379
            ]
        else:
            expected_feature_importances = [0.07975691, 0.16122862, 0.75901447]
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    rtol=1e-2)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #12
def check_ensemble_models(name,
                          use_feature_hashing=False,
                          use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('AdaBoostRegressor'):
        if use_feature_hashing:
            expected_feature_importances = [
                0.749811, 0.001373, 0.23357, 0.011691, 0.003554
            ]
        else:
            expected_feature_importances = [0.10266744, 0.18681777, 0.71051479]
    else:
        expected_feature_importances = ([
            0.735756, 0.001034, 0.242734, 0.015836, 0.00464
        ] if use_feature_hashing else [0.082621, 0.166652, 0.750726])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
Example #13
def check_linear_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a featurehasher.
    if not use_feature_hashing:

        # get the weights for this trained model
        learned_weights = learner.model_params[0]

        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #14
def check_linear_models(name,
                        use_feature_hashing=False,
                        use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000, num_features=10, use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the weights are close to the weights
    # that we got from make_regression_data. Take the
    # ceiling before comparing since just comparing
    # the ceilings should be enough to make sure nothing
    # catastrophic happened. Note though that we cannot
    # test feature weights if we are using feature hashing
    # since model_params is not defined with a featurehasher.
    if not use_feature_hashing:

        # get the weights for this trained model
        learned_weights = learner.model_params[0]

        for feature_name in learned_weights:
            learned_w = math.ceil(learned_weights[feature_name])
            given_w = math.ceil(weightdict[feature_name])
            eq_(learned_w, given_w)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #15
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([
            0.37483895, 0.08816508, 0.25379838, 0.18337128, 0.09982631
        ] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033])
        expected_cor_range = [0.5, 0.6] if use_feature_hashing else [0.9, 1.0]
    else:
        expected_feature_importances = ([
            0.40195798, 0.06702903, 0.25816559, 0.18185518, 0.09099222
        ] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838])
        expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #16
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task == 'classification':
        learner = Learner('LogisticRegression')
        learner.train(train_fs)
    else:
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weight command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
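The manual stdout/stderr juggling above predates contextlib.redirect_stdout. On Python 3 the same capture can be written more compactly (a sketch, assuming pmw.main writes only to the standard streams):

# Equivalent capture with contextlib (sketch; pmw.main as above)
import contextlib
from io import StringIO

mystdout, mystderr = StringIO(), StringIO()
with contextlib.redirect_stdout(mystdout), \
        contextlib.redirect_stderr(mystderr):
    pmw.main(print_model_weights_cmd)
out, err = mystdout.getvalue(), mystderr.getvalue()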
Example #17
def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)
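Note that this helper makes no assertion itself: training with an invalid objective is expected to raise, so a driver typically wraps the call. A hypothetical nose-style wrapper (the learner/objective pairing and the exact exception type are assumptions):

# Hypothetical caller: the check is expected to raise for a bad objective
from nose.tools import raises

@raises(ValueError)
def test_invalid_regression_objective():
    # 'accuracy' is a classification metric, invalid for a regressor
    check_invalid_regr_grid_obj_func('LinearRegression', 'accuracy')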
Example #18
def check_tree_models(name, use_feature_hashing=False, use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, _ = make_regression_data(num_examples=5000,
                                                    num_features=10,
                                                    use_feature_hashing=True,
                                                    feature_bins=5)
    else:
        train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                    num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # make sure that the feature importances are as expected.
    if name.endswith('DecisionTreeRegressor'):
        expected_feature_importances = ([
            0.730811, 0.001834, 0.247603, 0.015241, 0.004511
        ] if use_feature_hashing else [0.08926899, 0.15585068, 0.75488033])
    else:
        expected_feature_importances = ([
            0.733654, 0.002528, 0.245527, 0.013664, 0.004627
        ] if use_feature_hashing else [0.07974267, 0.16121895, 0.75903838])

    feature_importances = learner.model.feature_importances_
    assert_allclose(feature_importances,
                    expected_feature_importances,
                    atol=1e-2,
                    rtol=0)

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated with pearson > 0.95
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
Example #19
def check_invalid_regr_grid_obj_func(learner_name, grid_objective_function):
    """
    Checks whether the grid objective function is valid for this regression
    learner
    """
    (train_fs, _, _) = make_regression_data()
    clf = Learner(learner_name)
    clf.train(train_fs, grid_objective=grid_objective_function)
Example #20
def check_rescaling(name, grid_search=False):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor
    # with and without using grid search
    if grid_search:
        learner.train(train_fs, grid_search=True, grid_objective='pearson')
        rescaled_learner.train(train_fs,
                               grid_search=True,
                               grid_objective='pearson')
    else:
        learner.train(train_fs, grid_search=False)
        rescaled_learner.train(train_fs, grid_search=False)

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that both sets of correlations are close to perfectly
    # correlated, since the only thing different is that one set has been
    # rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular test set
    # predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions.
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))
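The Rescaled* learners shift and scale raw predictions so that their distribution matches that of the training labels, which is exactly what the standard-deviation assertions above exercise. A minimal sketch of the idea (an assumption about the behavior, not SKLL's actual implementation):

# Sketch of prediction rescaling: standardize raw predictions, then map
# them onto the training-label distribution (assumed behavior)
import numpy as np

def rescale(preds, train_preds, train_labels):
    z = (preds - np.mean(train_preds)) / np.std(train_preds)
    return z * np.std(train_labels) + np.mean(train_labels)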
Example #21
def check_non_linear_models(name,
                            use_feature_hashing=False,
                            use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(
            num_examples=5000,
            num_features=10,
            use_feature_hashing=True,
            feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # Note that we cannot check the feature weights here
    # since `model_params()` is not defined for non-linear
    # kernels.

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
Example #22
def check_non_linear_models(name,
                            use_feature_hashing=False,
                            use_rescaling=False):

    # create a FeatureSet object with the data we want to use
    if use_feature_hashing:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=5000,
                                                             num_features=10,
                                                             use_feature_hashing=True,
                                                             feature_bins=5)
    else:
        train_fs, test_fs, weightdict = make_regression_data(num_examples=2000,
                                                             num_features=3)

    # create the learner
    if use_rescaling:
        name = 'Rescaled' + name
    learner = Learner(name)

    # train it with the training feature set we created
    # make sure to set the grid objective to pearson
    learner.train(train_fs, grid_objective='pearson')

    # Note that we cannot check the feature weights here
    # since `model_params()` is not defined for non-linear
    # kernels.

    # now generate the predictions on the test FeatureSet
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # check that the correlation lies in the expected range
    cor, _ = pearsonr(predictions, test_fs.labels)
    expected_cor_range = [0.7, 0.8] if use_feature_hashing else [0.9, 1.0]
    assert_greater(cor, expected_cor_range[0])
    assert_less(cor, expected_cor_range[1])
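Example #23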
def check_rescaling(name, grid_search=False):

    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # instantiate the given learner and its rescaled counterpart
    learner = Learner(name)
    rescaled_learner = Learner('Rescaled' + name)

    # train both the regular regressor and the rescaled regressor
    # with and without using grid search
    if grid_search:
        learner.train(train_fs, grid_search=True, grid_objective='pearson')
        rescaled_learner.train(train_fs, grid_search=True, grid_objective='pearson')
    else:
        learner.train(train_fs, grid_search=False)
        rescaled_learner.train(train_fs, grid_search=False)

    # now generate both sets of predictions on the test feature set
    predictions = learner.predict(test_fs)
    rescaled_predictions = rescaled_learner.predict(test_fs)

    # ... and on the training feature set
    train_predictions = learner.predict(train_fs)
    rescaled_train_predictions = rescaled_learner.predict(train_fs)

    # make sure that both sets of correlations are close to perfectly
    # correlated, since the only thing different is that one set has been
    # rescaled
    assert_almost_equal(pearsonr(predictions, rescaled_predictions)[0],
                        1.0,
                        places=3)

    # make sure that the standard deviation of the rescaled test set
    # predictions is higher than the standard deviation of the regular test set
    # predictions
    p_std = np.std(predictions)
    rescaled_p_std = np.std(rescaled_predictions)
    assert_greater(rescaled_p_std, p_std)

    # make sure that the standard deviation of the rescaled predictions
    # on the TRAINING set (not the TEST) is closer to the standard
    # deviation of the training set labels than the standard deviation
    # of the regular predictions.
    train_y_std = np.std(train_fs.labels)
    train_p_std = np.std(train_predictions)
    rescaled_train_p_std = np.std(rescaled_train_predictions)
    assert_less(abs(rescaled_train_p_std - train_y_std),
                abs(train_p_std - train_y_std))
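Example #24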
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs, output_metrics=['spearman',
                                                        'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'], 0.9846, places=4)
Example #25
def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate on the
    # testing data
    learner = Learner('AdaBoostRegressor', model_kwargs={'base_estimator':
                                                         base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
Example #26
def check_adaboost_regression(base_estimator):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train an AdaBoostRegressor on the training data and evaluate on the
    # testing data
    learner = Learner('AdaBoostRegressor',
                      model_kwargs={'base_estimator': base_estimator})
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.95)
Example #27
def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate on the
    # testing data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)
Example #28
def test_additional_metrics():
    """
    Test additional metrics in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    results = learner.evaluate(test_fs,
                               output_metrics=['spearman', 'kendall_tau'])

    # check that the values for the additional metrics are as expected
    additional_scores_dict = results[-1]
    assert_almost_equal(additional_scores_dict['spearman'], 0.9996, places=4)
    assert_almost_equal(additional_scores_dict['kendall_tau'],
                        0.9846,
                        places=4)
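Example #29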
def check_ransac_regression(base_estimator, pearson_value):
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                sd_noise=4,
                                                num_features=3)

    # train a RANSACRegressor on the training data and evaluate on the
    # testing data
    model_kwargs = {'base_estimator': base_estimator} if base_estimator else {}
    learner = Learner('RANSACRegressor', model_kwargs=model_kwargs)
    learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated and the value
    # of the correlation is as expected
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, pearson_value)
Example #30
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate on the
    # testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)
    # we don't want to see any convergence warnings during the grid search
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=True, grid_objective='pearson')

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
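Example #31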
def check_mlp_regression(use_rescaling=False):
    train_fs, test_fs, _ = make_regression_data(num_examples=500,
                                                sd_noise=4,
                                                num_features=5)

    # train an MLPRegressor on the training data and evaluate on the
    # testing data
    name = 'RescaledMLPRegressor' if use_rescaling else 'MLPRegressor'
    learner = Learner(name)
    # we don't want to see any convergence warnings during training
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ConvergenceWarning)
        learner.train(train_fs, grid_search=False)

    # now generate the predictions on the test set
    predictions = learner.predict(test_fs)

    # now make sure that the predictions are close to
    # the actual test FeatureSet labels that we generated
    # using make_regression_data. To do this, we just
    # make sure that they are correlated
    cor, _ = pearsonr(predictions, test_fs.labels)
    assert_greater(cor, 0.98)
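Example #32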
Example No. 33
def check_predict(model, use_feature_hashing=False):
    """
    This tests whether predict task runs and generates the same
    number of predictions as samples in the test set. The specified
    model indicates whether to generate random regression
    or classification data.
    """

    # create the random data for the given model
    if model._estimator_type == 'regressor':
        train_fs, test_fs, _ = \
            make_regression_data(use_feature_hashing=use_feature_hashing,
                                 feature_bins=5)
    # feature hashing will not work for Naive Bayes since it requires
    # non-negative feature values
    elif model.__name__ == 'MultinomialNB':
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=False,
                                     non_negative=True)
    else:
        train_fs, test_fs = \
            make_classification_data(use_feature_hashing=use_feature_hashing,
                                     feature_bins=25)

    # create the learner with the specified model
    learner = Learner(model.__name__)

    # now train the learner on the training data; when feature hashing is
    # specified, it is already baked into the feature sets created above
    learner.train(train_fs, grid_search=False)

    # now make predictions on the test set
    predictions = learner.predict(test_fs)

    # make sure we have the same number of outputs as the
    # number of test set samples
    eq_(len(predictions), test_fs.features.shape[0])
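A hedged sketch of how check_predict might be parameterized; the model classes listed here are illustrative choices, not the source's actual test matrix:

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def test_predict():
    for model in [LinearRegression, LogisticRegression, MultinomialNB]:
        for use_feature_hashing in [True, False]:
            # MultinomialNB needs non-negative features, so skip hashing
            if model.__name__ == 'MultinomialNB' and use_feature_hashing:
                continue
            yield check_predict, model, use_feature_hashing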
Example No. 34
def test_fancy_output():
    """
    Test the descriptive statistics output in the results file for a regressor
    """
    train_fs, test_fs, _ = make_regression_data(num_examples=2000,
                                                num_features=3)

    # train a regression model using the train feature set
    learner = Learner('LinearRegression')
    learner.train(train_fs, grid_objective='pearson')

    # evaluate the trained model using the test feature set
    resultdict = learner.evaluate(test_fs)
    actual_stats_from_api = dict(resultdict[2]['descriptive']['actual'])
    pred_stats_from_api = dict(resultdict[2]['descriptive']['predicted'])

    # write out the training and test feature set
    train_dir = join(_my_dir, 'train')
    test_dir = join(_my_dir, 'test')
    output_dir = join(_my_dir, 'output')

    train_writer = NDJWriter(join(train_dir, 'fancy_train.jsonlines'),
                             train_fs)
    train_writer.write()
    test_writer = NDJWriter(join(test_dir, 'fancy_test.jsonlines'), test_fs)
    test_writer.write()

    # now get the config file template, fill it in and run it
    # so that we can get a results file
    config_template_path = join(_my_dir, 'configs',
                                'test_regression_fancy_output.template.cfg')
    config_path = fill_in_config_paths_for_fancy_output(config_template_path)

    run_configuration(config_path, quiet=True)

    # read in the results file and get the descriptive statistics
    actual_stats_from_file = {}
    pred_stats_from_file = {}
    with open(join(output_dir, ('regression_fancy_output_train_fancy_train.'
                                'jsonlines_test_fancy_test.jsonlines'
                                '_LinearRegression.results')),
              'r') as resultf:

        result_output = resultf.read().strip().split('\n')
        for desc_stat_line in result_output[27:31]:
            desc_stat_line = desc_stat_line.strip()
            if not desc_stat_line:
                continue
            else:
                m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((actual)\),\s+(-?[0-9]+\.?[0-9]*)\s+'
                              r'\((predicted)\)', desc_stat_line)
                stat_type, actual_value, _, pred_value, _ = m.groups()
                actual_stats_from_file[stat_type.lower()] = float(actual_value)
                pred_stats_from_file[stat_type.lower()] = float(pred_value)

    for stat_type in actual_stats_from_api:

        assert_almost_equal(actual_stats_from_file[stat_type],
                            actual_stats_from_api[stat_type],
                            places=4)

        assert_almost_equal(pred_stats_from_file[stat_type],
                            pred_stats_from_api[stat_type],
                            places=4)
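To make the regular expression above concrete, this is the line shape it matches (a minimal sketch; the statistic values are made up):

import re

sample = 'Mean = 0.1234 (actual), 0.1187 (predicted)'
m = re.search(r'([A-Za-z]+)\s+=\s+(-?[0-9]+\.?[0-9]*)\s+'
              r'\((actual)\),\s+(-?[0-9]+\.?[0-9]*)\s+'
              r'\((predicted)\)', sample)
print(m.groups())  # ('Mean', '0.1234', 'actual', '0.1187', 'predicted')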
Example No. 35
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task == 'classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8, num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task in ('classification', 'multiclass_classification'):
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class

        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))

        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2], safe_float(fields[0])))

        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]

        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])

        assert_array_almost_equal(intercept, learner.model.intercept_)

    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]

        intercept_list = ast.literal_eval(lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))

        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]

        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
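The parsing branches above imply an output shape roughly like the following; this is reverse-engineered from the parser, not verified against the real print_model_weights output:

# binary classification (the parser skips the first line):
#     <header line>
#     <intercept>\t<label>
#     <weight>\t<label>\t<feature-name>    one line per feature
#
# regression (the parser splits the first line on '='):
#     intercept = <value>
#     <weight>\t<feature-name>             one line per feature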
Example No. 36
def check_print_model_weights(task='classification'):

    # create some simple classification or regression data
    if task in ('classification', 'classification_no_intercept'):
        train_fs, _ = make_classification_data(train_test_ratio=0.8)
    elif task == 'multiclass_classification':
        train_fs, _ = make_classification_data(train_test_ratio=0.8, num_labels=3)
    else:
        train_fs, _, _ = make_regression_data(num_features=4,
                                              train_test_ratio=0.8)

    # now train the appropriate model
    if task in ('classification', 'multiclass_classification'):
        learner = Learner('LogisticRegression')
        learner.train(train_fs, grid_objective='f1_score_micro')
    elif task == 'classification_no_intercept':
        learner = Learner('LogisticRegression')
        learner.train(train_fs,
                      grid_objective='f1_score_micro',
                      param_grid=[{'fit_intercept': [False]}])
    elif task == 'regression':
        learner = Learner('LinearRegression')
        learner.train(train_fs, grid_objective='pearson')
    else:
        learner = Learner('LinearSVR')
        learner.train(train_fs, grid_objective='pearson')

    # now save the model to disk
    model_file = join(_my_dir, 'output',
                      'test_print_model_weights.model')
    learner.save(model_file)

    # now call print_model_weights main() and capture the output
    print_model_weights_cmd = [model_file]
    err = ''
    try:
        old_stderr = sys.stderr
        old_stdout = sys.stdout
        sys.stderr = mystderr = StringIO()
        sys.stdout = mystdout = StringIO()
        pmw.main(print_model_weights_cmd)
        out = mystdout.getvalue()
        err = mystderr.getvalue()
    finally:
        sys.stderr = old_stderr
        sys.stdout = old_stdout
        print(err)

    # now parse the output of the print_model_weights command
    # and get the intercept and the feature values
    if task == 'classification':
        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = safe_float(lines_to_parse[0].split('\t')[0])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_[0])
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'multiclass_classification':
        # for multiple classes we get an intercept for each class
        # as well as a list of weights for each class

        lines_to_parse = [l for l in out.split('\n')[1:] if l]
        intercept = []
        for intercept_string in lines_to_parse[0:3]:
            intercept.append(safe_float(intercept_string.split('\t')[0]))

        feature_values = [[], [], []]
        for ltp in lines_to_parse[3:]:
            fields = ltp.split('\t')
            feature_values[int(fields[1])].append((fields[2], safe_float(fields[0])))

        for index, weights in enumerate(feature_values):
            feature_values[index] = [t[1] for t in sorted(weights)]

        for index, weights in enumerate(learner.model.coef_):
            assert_array_almost_equal(weights, feature_values[index])

        assert_array_almost_equal(intercept, learner.model.intercept_)
    elif task == 'classification_no_intercept':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[2], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_[0], feature_values)
    elif task == 'regression':
        lines_to_parse = [l for l in out.split('\n') if l]
        intercept = safe_float(lines_to_parse[0].split('=')[1])
        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]
        assert_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
    else:
        lines_to_parse = [l for l in out.split('\n') if l]

        intercept_list = ast.literal_eval(lines_to_parse[0].split('=')[1].strip())
        intercept = []
        for intercept_string in intercept_list:
            intercept.append(safe_float(intercept_string))

        feature_values = []
        for ltp in lines_to_parse[1:]:
            fields = ltp.split('\t')
            feature_values.append((fields[1], safe_float(fields[0])))
        feature_values = [t[1] for t in sorted(feature_values)]

        assert_array_almost_equal(intercept, learner.model.intercept_)
        assert_allclose(learner.model.coef_, feature_values)
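A sketch of a driver covering the branches above; 'linearsvr_regression' is a made-up placeholder for whatever task name routes to the LinearSVR else branch:

def test_print_model_weights():
    for task in ['classification',
                 'classification_no_intercept',
                 'multiclass_classification',
                 'regression',
                 'linearsvr_regression']:  # placeholder task name
        yield check_print_model_weights, task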
Example No. 38
def test_pipeline_attribute():

    # define the classifier and regressor feature dictionaries that we will
    # test on, along with their respective classes and targets
    cfeature_dicts = [{"f01": -2.87, "f02": 0.713, "f03": 2.86, "f04": 0.385, "f05": -0.989,
                       "f06": 0.380, "f07": -0.365, "f08": -0.224, "f09": 3.45, "f10": 0.622},
                      {"f01": 0.058, "f02": -1.14, "f03": 2.85, "f04": 1.41, "f05": 1.60,
                       "f06": 1.04, "f07": -0.669, "f08": -0.727, "f09": 1.82, "f10": 1.336},
                      {"f01": -1.80, "f02": 3.21, "f03": 0.79, "f04": -0.55, "f05": 0.059,
                       "f06": -5.66, "f07": -3.08, "f08": -0.95, "f09": 0.188, "f10": -1.24},
                      {"f01": 2.270, "f02": 2.271, "f03": 2.285, "f04": 2.951, "f05": 1.018,
                       "f06": -0.59, "f07": 0.432, "f08": 1.614, "f09": -0.69, "f10": -1.27},
                      {"f01": 2.98, "f02": 3.74, "f03": 1.96, "f04": 0.80, "f05": 0.425,
                       "f06": -0.76, "f07": 4.013, "f08": 3.119, "f09": 2.104, "f10": 0.195},
                      {"f01": 2.560, "f02": -2.05, "f03": 1.793, "f04": 0.955, "f05": 2.914,
                       "f06": 2.239, "f07": -1.41, "f08": -1.24, "f09": -4.44, "f10": 0.273},
                      {"f01": 1.86, "f02": -0.017, "f03": 1.337, "f04": -2.14, "f05": 2.255,
                       "f06": -1.21, "f07": -0.24, "f08": -0.66, "f09": -2.51, "f10": -1.06},
                      {"f01": -1.95, "f02": -1.81, "f03": 2.105, "f04": 0.976, "f05": -1.480,
                       "f06": 1.120, "f07": -1.22, "f08": 0.704, "f09": -3.66, "f10": -1.72},
                      {"f01": -1.54, "f02": -2.17, "f03": -4.18, "f04": 1.708, "f05": 0.514,
                       "f06": 0.354, "f07": -3.55, "f08": 2.285, "f09": -3.47, "f10": -0.79},
                      {"f01": 2.162, "f02": -0.71, "f03": -0.448, "f04": 0.326, "f05": 3.384,
                      "f06": -0.455, "f07": 1.253, "f08": 0.998, "f09": 3.193, "f10": 1.342}]
    classes = [1, 1, 0, 2, 1, 2, 0, 1, 2, 1]

    rfeature_dicts = [{'f1': 1.351, 'f2': -0.117, 'f3': 0.570, 'f4': 0.0619,
                       'f5': 1.569, 'f6': 0.805},
                      {'f1': -0.557, 'f2': -1.704, 'f3': 0.0913, 'f4': 0.767,
                       'f5': 1.281, 'f6': -0.803},
                      {'f1': 0.720, 'f2': -0.268, 'f3': 0.760, 'f4': 0.861,
                       'f5': -0.403, 'f6': 0.814},
                      {'f1': 1.737, 'f2': -0.228, 'f3': 1.340, 'f4': 2.031,
                       'f5': 2.170, 'f6': 1.498},
                      {'f1': 0.344, 'f2': 0.340, 'f3': 0.572, 'f4': -1.06,
                       'f5': 1.044, 'f6': 2.065},
                      {'f1': -0.489, 'f2': -0.420, 'f3': 0.428, 'f4': 0.707,
                       'f5': -1.306, 'f6': 0.0081},
                      {'f1': 0.805, 'f2': 0.570, 'f3': 1.351, 'f4': -0.117,
                       'f5': 0.0619, 'f6': 1.569},
                      {'f1': -1.083, 'f2': 0.0369, 'f3': -0.413, 'f4': 1.391,
                       'f5': 1.417, 'f6': -1.118},
                      {'f1': -1.945, 'f2': -0.332, 'f3': -1.393, 'f4': 0.952,
                       'f5': -0.816, 'f6': 1.417},
                      {'f1': 1.976, 'f2': -0.220, 'f3': -1.636, 'f4': 0.795,
                       'f5': -2.34, 'f6': -0.148}]
    targets = [96.057, -176.017, -182.32, -56.46, -50.14, -84.53, 241.71, -17.84,
               -47.09, 77.65]

    # create training featuresets that we will use to train our estimator
    function_args_dict = defaultdict(dict)
    for estimator_type in ['classifier', 'regressor']:
        for do_feature_hashing in [True, False]:
            if estimator_type == 'classifier':
                (train_fs, test_fs) = make_classification_data(num_examples=500,
                                                               num_features=10,
                                                               num_labels=3,
                                                               feature_bins=4,
                                                               non_negative=True,
                                                               use_feature_hashing=do_feature_hashing)
                labels = classes
                feature_dicts = cfeature_dicts
            else:
                (train_fs, test_fs, _) = make_regression_data(num_examples=500,
                                                              num_features=6,
                                                              feature_bins=4,
                                                              use_feature_hashing=do_feature_hashing)
                labels = targets
                feature_dicts = rfeature_dicts

            # if we are doing feature hashing, we need to transform our test
            # cases into the same space. If we are not, we don't need to
            # worry because we have manually ensured that the number of
            # features is the same for the non-hashing case (10 for
            # classification, and 6 for regression)
            test_fs = FeatureSet('test',
                                 ids=list(range(1, 11)),
                                 features=feature_dicts,
                                 labels=labels,
                                 vectorizer=train_fs.vectorizer if do_feature_hashing else None)
            function_args_dict[estimator_type][do_feature_hashing] = [train_fs,
                                                                      test_fs,
                                                                      feature_dicts,
                                                                      labels]
    function_args_dict = dict(function_args_dict)

    # now set up the test cases
    learners = ['LinearSVC', 'LogisticRegression',
                'MultinomialNB', 'SVC',
                'GradientBoostingClassifier', 'Lars',
                'LinearSVR', 'Ridge', 'SVR',
                'GradientBoostingRegressor']
    use_hashing = [True, False]
    min_feature_counts = [1, 2]
    samplers = [None, 'RBFSampler', 'SkewedChi2Sampler']
    scalers = ['none', 'with_mean', 'with_std', 'both']

    for (learner_name,
         do_feature_hashing,
         min_count,
         scaling_type,
         sampler_name) in product(learners,
                                  use_hashing,
                                  min_feature_counts,
                                  scalers,
                                  samplers):

        # skip the case for MultinomialNB with feature hashing
        # or feature sampling since it does not support those
        if learner_name == 'MultinomialNB':
            if do_feature_hashing or sampler_name is not None:
                continue

        # if we are using a SkewedChi2Sampler, we need to set some
        # parameters to make sure it works as expected
        if sampler_name == 'SkewedChi2Sampler':
            sampler_kwargs = {'skewedness': 15, 'n_components': 10}
        else:
            sampler_kwargs = {}

        # create a learner instance with the given parameters
        # and with pipeline attribute set to True
        learner = Learner(learner_name,
                          min_feature_count=min_count,
                          sampler=sampler_name,
                          sampler_kwargs=sampler_kwargs,
                          feature_scaling=scaling_type,
                          pipeline=True)

        yield (check_pipeline_attribute,
               learner_name,
               do_feature_hashing,
               min_count,
               scaling_type,
               sampler_name,
               learner,
               function_args_dict)
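check_pipeline_attribute itself is not shown in this excerpt. A minimal sketch of what such a checker might verify, assuming learner.pipeline exposes a fitted scikit-learn Pipeline (vectorizer included) after training; the comparison below glosses over label-encoding details for classifiers:

def check_pipeline_attribute(learner_name, do_feature_hashing, min_count,
                             scaling_type, sampler_name, learner,
                             function_args_dict):
    # look up the feature sets that match this configuration
    regressors = {'Lars', 'LinearSVR', 'Ridge', 'SVR',
                  'GradientBoostingRegressor'}
    estimator_type = ('regressor' if learner_name in regressors
                      else 'classifier')
    train_fs, test_fs, feature_dicts, labels = \
        function_args_dict[estimator_type][do_feature_hashing]

    # train, then check that the exposed pipeline reproduces the
    # learner's own predictions on the raw feature dictionaries
    learner.train(train_fs, grid_search=False)
    learner_preds = learner.predict(test_fs)
    pipeline_preds = learner.pipeline.predict(feature_dicts)
    assert_allclose(pipeline_preds, learner_preds)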