Example #1
    def test_combine_with_classifier_trained_with_filedatastream(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(data)

        result_1 = result_1.astype(np.int32)
        result_2 = result_2['PredictedLabel'].astype(np.int32)
        self.assertTrue(result_1.equals(result_2))
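
        # Hedged extension (not in the original test; the file name is
        # illustrative): the combined pipeline is an ordinary nimbusml
        # Pipeline, so it can be saved and reloaded to score raw data
        # in a single step.
        combined_pipeline.save_model('combined.model.bin')
        reloaded_pipeline = Pipeline()
        reloaded_pipeline.load_model('combined.model.bin')
        result_3 = reloaded_pipeline.predict(FileDataStream.read_csv(path))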
Example #2
    def test_combine_with_classifier_trained_with_y_arg(self):
        """
        Tests a sequence where the initial transform is computed
        using both X and y input args. Note, any steps after the
        initial transform will be operating on data where the X
        and y have been combined in to one dataset.
        """
        np.random.seed(0)

        df = get_dataset("infert").as_df()

        X = df.loc[:, df.columns != 'case']
        y = df['case']

        transform = OneHotVectorizer() << 'education_str'

        # Passing in both X and y
        df = transform.fit_transform(X, y, as_binary_data_stream=True)

        # NOTE: need to specify the label column here because the
        # feature and label data was joined in the last step.
        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=list(X.columns))
        predictor.fit(df)

        df = transform.transform(X, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(X)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
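
        # Hedged illustration of the docstring's point, assuming the same X
        # and y: when y is passed to fit_transform (here without
        # as_binary_data_stream, so a DataFrame comes back), the output also
        # carries the label column, i.e. X and y are joined.
        joined = (OneHotVectorizer() << 'education_str').fit_transform(X, y)
        assert 'case' in joined.columns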
Example #3
    def test_combined_models_support_decision_function(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.decision_function(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.decision_function(data)

        self.assertTrue(np.array_equal(result_1, result_2))
    def test_with_or_without_pipeline(self):
        # Bug 227810
        # data input (as a FileDataStream)
        path = get_dataset('infert').as_filepath()

        file_schema = 'sep=, col=education:TX:1 col=Features:R4:2-4,6-8 ' \
                      'col=case:R4:5 header=+'
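        # Schema syntax: col=<name>:<type>:<column-index>, where TX is a
        # text column, R4 is float32, sep=, sets the delimiter, and
        # header=+ indicates the file has a header row.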
        data = FileDataStream(path, schema=file_schema)

        # without pipeline -- fails
        m = LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        m.fit(data)
        scores1 = m.predict(data)

        # with pipeline -- works
        m = Pipeline([
            LogisticRegressionBinaryClassifier(feature=['Features'],
                                               label='case')
        ])
        m.fit(data)
        scores2 = m.predict(data)
        diff = np.abs(scores1.values.ravel() -
                      scores2[['PredictedLabel']].values.ravel())
        assert diff.sum() <= 2
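
    # generate_dataset_1() is a helper from the test module (not shown
    # here). The assertions below imply it returns (X, y) as DataFrames,
    # with the label frame exposing a 'y' column of learnable binary labels.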
    def test_pipeline_subclass_correctly_supports_decision_function(self):
        X, y = generate_dataset_1()

        pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X, y)
        orig_result = pipeline.decision_function(X)

        pipeline = CustomPipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X, y)
        new_result = pipeline.decision_function(X)

        self.assertTrue(np.array_equal(orig_result, new_result))
    def test_pipeline_subclass_can_override_predict(self):
        X, y = generate_dataset_1()

        pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X, y)
        result = pipeline.predict(X)['PredictedLabel']

        self.assertTrue(np.array_equal(result.values, y['y'].values))

        pipeline = CustomPipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X, y)

        self.assertEqual(pipeline.predict(X, test_return_value=3), 3)
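
    # CustomPipeline is defined elsewhere in the test module. A minimal
    # sketch consistent with the assertions above (the test_return_value
    # keyword is inferred from the call site, not confirmed) could be:
    #
    #     class CustomPipeline(Pipeline):
    #         def predict(self, X, test_return_value=None, **params):
    #             if test_return_value is not None:
    #                 return test_return_value
    #             return super(CustomPipeline, self).predict(X, **params)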
Example #7
    def test_combined_models_support_predict_proba_with_more_than_2_classes(
            self):
        path = get_dataset('infert').as_filepath()
        data = FileDataStream.read_csv(path)

        featurization_pipeline = Pipeline(
            [OneHotVectorizer(columns={'education': 'education'})])
        featurization_pipeline.fit(data)
        featurized_data = featurization_pipeline.transform(data)

        feature_cols = ['education', 'age']
        training_pipeline = Pipeline([
            DatasetTransformer(featurization_pipeline.model),
            OneVsRestClassifier(LogisticRegressionBinaryClassifier(),
                                feature=feature_cols,
                                label='induced')
        ])
        training_pipeline.fit(data, output_predictor_model=True)

        # PrefixColumnConcatenator re-assembles the one-hot output columns
        # whose names start with 'education.' back into a single vector
        # column named 'education', which the trained predictor expects.
        concat_pipeline = Pipeline(
            [PrefixColumnConcatenator({'education': 'education.'})])
        concat_pipeline.fit(featurized_data)

        predictor_pipeline = Pipeline()
        predictor_pipeline.load_model(training_pipeline.predictor_model)

        concat_and_predictor_pipeline = Pipeline.combine_models(
            concat_pipeline, predictor_pipeline)

        result = concat_and_predictor_pipeline.predict_proba(featurized_data)
        self.assertEqual(result.shape[1], 3)
Example #8
    def test_pass_predict_proba_binary_with_pipeline(self):
        assert_almost_equal(
            proba_sum(Pipeline(
                [LogisticRegressionBinaryClassifier(number_of_threads=1)])),
            38.0,
            decimal=3,
            err_msg=invalid_predict_proba_output)
    def test_ovr_accuracy(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(number_of_threads=1),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1,
                                       number_of_threads=1),
            GamBinaryClassifier(number_of_threads=1),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1,
                                      number_of_threads=1),
            FastLinearBinaryClassifier(number_of_threads=1),
            SgdBinaryClassifier(number_of_threads=1),
            # SymSgdBinaryClassifier(number_of_threads=1),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            metrics = accuracy(ovr)
            accu = metrics['Accuracy(micro-avg)'][0]
            # Algorithms will have a wide range of accuracies, so use a low
            # bar. This also checks Pipeline + OVA + clf.
            assert_greater(
                accu, 0.65,
                "{} accuracy is too low {}".format(clf.__class__, accu))
Example #10
    def test_metrics_evaluate_binary(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)
        metrics, _ = e.test(X_test, y_test)
        # TODO: debug fluctuations, and increase decimal precision on checks
        assert_almost_equal(metrics['AUC'][0],
                            0.980,
                            decimal=1,
                            err_msg="AUC should be %s" % 0.980)
        assert_almost_equal(metrics['Accuracy'][0],
                            0.632,
                            decimal=1,
                            err_msg="Accuracy should be %s" % 0.632)
        assert_almost_equal(metrics['Positive precision'][0],
                            1,
                            decimal=1,
                            err_msg="Positive precision should be %s" % 1)
        assert_almost_equal(metrics['Positive recall'][0],
                            0.125,
                            decimal=1,
                            err_msg="Positive recall should be %s" % 0.125)
        assert_almost_equal(metrics['Negative precision'][0],
                            0.611,
                            decimal=1,
                            err_msg="Negative precision should be %s" % 0.611)
        assert_almost_equal(metrics['Negative recall'][0],
                            1,
                            decimal=1,
                            err_msg="Negative recall should be %s" % 1)
        assert_almost_equal(metrics['Log-loss'][0],
                            0.686,
                            decimal=1,
                            err_msg="Log-loss should be %s" % 0.686)
        assert_almost_equal(metrics['Log-loss reduction'][0],
                            0.3005,
                            decimal=3,
                            err_msg="Log-loss reduction should be %s" % 0.3005)
        assert_almost_equal(
            metrics['Test-set entropy (prior Log-Loss/instance)'][0],
            0.981,
            decimal=1,
            err_msg="Test-set entropy (prior Log-Loss/instance) should be %s" %
            0.981)
        assert_almost_equal(metrics['F1 Score'][0],
                            0.222,
                            decimal=1,
                            err_msg="F1 Score should be %s" % 0.222)
        assert_almost_equal(metrics['AUPRC'][0],
                            0.966,
                            decimal=1,
                            err_msg="AUPRC should be %s" % 0.966)
    def test_ngramfeaturizer_syntax_dict(self):

        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        # The dictionary form of << names the output column ('outg') and
        # lists the input column(s) it is built from ('review').
        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << {
            'outg': ['review']
        }
        X = textt.fit_transform(X)

        assert X.shape == (25, 117)
        # columns ordering changed between 0.22 and 0.23
        assert 'review' in (X.columns[0], X.columns[-1])
        X = X.drop('review', axis=1)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        X_test = textt.transform(test_reviews)
        X_test = X_test.drop('review', axis=1)
        scores = mymodel.predict(X_test)

        # View the scores
        assert scores.shape == (10, )
    def test_ngramfeaturizer(self):

        train_reviews = pandas.DataFrame(data=dict(
            review=[
                "This is great", "I hate it", "Love it", "Do not like it",
                "Really like it", "I hate it", "I like it a lot",
                "I kind of hate it", "I do like it", "I really hate it",
                "It is very good", "I hate it a bunch", "I love it a bunch",
                "I hate it", "I like it very much", "I hate it very much.",
                "I really do love it", "I really do hate it", "Love it!",
                "Hate it!", "I love it", "I hate it", "I love it", "I hate it",
                "I love it"
            ],
            like=[
                True, False, True, False, True, False, True, False, True,
                False, True, False, True, False, True, False, True, False,
                True, False, True, False, True, False, True
            ]))

        test_reviews = pandas.DataFrame(data=dict(review=[
            "This is great", "I hate it", "Love it", "Really like it",
            "I hate it", "I like it a lot", "I love it", "I do like it",
            "I really hate it", "I love it"
        ]))

        y = train_reviews['like']
        X = train_reviews.loc[:, train_reviews.columns != 'like']

        textt = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
        X = textt.fit_transform(X)

        assert X.shape == (25, 116)

        mymodel = LogisticRegressionBinaryClassifier().fit(X, y, verbose=0)
        X_test = textt.transform(test_reviews)
        scores = mymodel.predict(textt.transform(test_reviews))

        # View the scores
        assert scores.shape == (10, )
        assert X_test.shape[0] == 10
Example #13
    def test_combine_with_classifier_trained_with_joined_X_and_y(self):
        np.random.seed(0)

        infert_df = get_dataset("infert").as_df()
        feature_cols = [c for c in infert_df.columns if c != 'case']

        transform = OneHotVectorizer() << 'education_str'
        df = transform.fit_transform(infert_df, as_binary_data_stream=True)

        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=feature_cols)
        predictor.fit(df)

        df = transform.transform(infert_df, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(infert_df)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
    def setUpClass(self):
        adult_path = get_dataset('uciadult_train').as_filepath()
        self.classification_data = FileDataStream.read_csv(adult_path)
        binary_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            LogisticRegressionBinaryClassifier(feature=['age', 'education'],
                                               label='label',
                                               number_of_threads=1)
        ])
        self.binary_model = binary_pipeline.fit(self.classification_data)
        self.binary_pfi = self.binary_model.permutation_feature_importance(
            self.classification_data)
        classifier_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearClassifier(feature=['age', 'education'],
                                 label='label',
                                 number_of_threads=1,
                                 shuffle=False)
        ])
        self.classifier_model = classifier_pipeline.fit(
            self.classification_data)
        self.classifier_pfi = self.classifier_model.permutation_feature_importance(
            self.classification_data)

        infert_path = get_dataset('infert').as_filepath()
        self.regression_data = FileDataStream.read_csv(infert_path)
        regressor_pipeline = Pipeline([
            OneHotVectorizer(columns=['education']),
            FastLinearRegressor(feature=['induced', 'education'],
                                label='age',
                                number_of_threads=1,
                                shuffle=False)
        ])
        self.regressor_model = regressor_pipeline.fit(self.regression_data)
        self.regressor_pfi = self.regressor_model.permutation_feature_importance(
            self.regression_data)

        ticket_path = get_dataset('gen_tickettrain').as_filepath()
        self.ranking_data = FileDataStream.read_csv(ticket_path)
        ranker_pipeline = Pipeline([
            ToKey(columns=['group']),
            LightGbmRanker(feature=['Class', 'dep_day', 'duration'],
                           label='rank',
                           group_id='group',
                           random_state=0,
                           number_of_threads=1)
        ])
        self.ranker_model = ranker_pipeline.fit(self.ranking_data)
        self.ranker_pfi = self.ranker_model.permutation_feature_importance(
            self.ranking_data)
Example #15
def train_data_type_single(fit_X_type="dataframe",
                           fit_Y_type=None,
                           predict_X_type=None):
    data = [[1, 2, 3], [2, 3, 4], [1, 2, 3], [2, 2, 2]]
    label = [1, 0, 1, 1]
    if fit_X_type == "sparse":
        model = LightGbmClassifier(minimum_example_count_per_leaf=1)
    else:
        model = LogisticRegressionBinaryClassifier()
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type)
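
# Hedged sketch of the transform_data() helper assumed above (the real one
# lives in the test utilities): map a type tag like 'dataframe' or 'sparse'
# to the corresponding container; None leaves the input unchanged.
#
#     def transform_data(data, data_type=None):
#         if data_type == 'dataframe':
#             return pandas.DataFrame(data)
#         if data_type == 'sparse':
#             return scipy.sparse.csr_matrix(data)
#         return data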
Example #16
def train_data_type_ppl(fit_X_type=None, fit_Y_type=None, predict_X_type=None):
    data = [[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [1.0, 2.0, 3.0], [2.0, 2.0, 2.0]]
    label = [1, 0, 1, 1]
    if fit_X_type == "sparse":
        model = Pipeline([Binner(), LightGbmClassifier(minimum_example_count_per_leaf=1)])
    else:
        model = Pipeline([Binner(), LogisticRegressionBinaryClassifier()])
    data_with_new_type = transform_data(data, fit_X_type)
    label_with_new_type = transform_data(label, fit_Y_type)
    model.fit(data_with_new_type, label_with_new_type)
    metrics, scores = model.test(
        data_with_new_type, label_with_new_type, output_scores=True)
    test_data_with_new_type = transform_data(data, predict_X_type)
    return model.predict(test_data_with_new_type), scores, metrics
Example #17
    def test_metrics_check_output_scores(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)
        metrics, scores = e.test(X_test, y_test, output_scores=False)
        assert len(scores) == 0
        metrics, scores = e.test(X_test, y_test, output_scores=True)
        assert len(scores) > 0
Example #18
    def test_pass_predict_proba_from_load_model(self):
        pipeline = Pipeline([LogisticRegressionBinaryClassifier()])
        pipeline.fit(X_train, y_train)
        probs1 = pipeline.predict_proba(X_test)
        sum1 = probs1.sum().sum()
        (fd, modelfilename) = tempfile.mkstemp(suffix='.model.bin')
        fl = os.fdopen(fd, 'w')
        fl.close()
        pipeline.save_model(modelfilename)

        pipeline2 = Pipeline()
        pipeline2.load_model(modelfilename)
        probs2 = pipeline2.predict_proba(X_test)
        sum2 = probs2.sum().sum()
        assert_equal(sum1, sum2,
                     "model probabilities don't match after loading model")
    def test_score_binary(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier(train_threads=1)
        e = Pipeline([lr])
        e.fit(X_train, y_train)
        metrics = e.score(X_test, y_test)
        print(metrics)
        assert_almost_equal(metrics,
                            0.9801136363636364,
                            decimal=5,
                            err_msg="AUC should be %s" % 0.9801136363636364)
    def test_failing_predict_proba_called_with_use_probabilites_false(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            check_predict_proba_when_trained_with_use_probabilites_false(
                self, ovr, clf)
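
    # Sketch of the check_* helper assumed above: an OVR model trained with
    # use_probabilities=False should raise when predict_proba is called.
    #
    #     def check_predict_proba_when_trained_with_use_probabilites_false(
    #             testcase, ovr, clf):
    #         df = get_dataset("iris").as_df()
    #         X, y = df.loc[:, df.columns != 'Label'], df['Label']
    #         ovr.fit(X, y)
    #         with testcase.assertRaises(Exception):
    #             ovr.predict_proba(X)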
Example #21
    def test_failing_decision_function_called_with_use_probabilites_true(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(min_split=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(min_split=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=True)
            check_decision_function_when_trained_with_use_probabilites_true(
                self, ovr, clf)
Example #22
    def test_metrics_evaluate_binary_sklearn(self):
        np.random.seed(0)
        df = get_dataset("iris").as_df()
        df.drop(['Species'], inplace=True, axis=1)
        df.Label = [1 if x == 1 else 0 for x in df.Label]
        X_train, X_test, y_train, y_test = \
            train_test_split(df.loc[:, df.columns != 'Label'], df['Label'])

        lr = LogisticRegressionBinaryClassifier()
        e = Pipeline([lr])
        e.fit(X_train, y_train, verbose=0)

        metrics, scores = e.test(X_test, y_test, output_scores=True)
        aucnimbusml = metrics['AUC']
        precision, recall, _ = precision_recall_curve(y_test,
                                                      scores['Probability'])
        aucskpr = auc(recall, precision)
        precision, recall, _ = precision_recall_curve(y_test, scores['Score'])
        aucsksc = auc(recall, precision)
        print(aucnimbusml, aucskpr, aucsksc)
        assert aucskpr == aucsksc
    def test_decision_function_produces_distribution_not_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf, use_probabilities=False)
            scoremean = decfun_average(ovr)
            assert_not_equal(
                scoremean, 1.0,
                '{} raw scores should not sum to 1.0 over 3 classes'.format(
                    clf.__class__))
    def test_predict_proba_produces_distribution_sum_to_1(self):
        clfs = [
            # TODO: BUG 231482, why doesn't FM work
            # FactorizationMachineBinaryClassifier(),
            LogisticRegressionBinaryClassifier(),
            FastForestBinaryClassifier(minimum_example_count_per_leaf=1),
            GamBinaryClassifier(),
            AveragedPerceptronBinaryClassifier(),
            FastTreesBinaryClassifier(minimum_example_count_per_leaf=1),
            LightGbmBinaryClassifier(),
            FastLinearBinaryClassifier(),
            SgdBinaryClassifier(),
            # TODO: why SymSgd probabilities do not sum to 1.0
            # SymSgdBinaryClassifier(),
        ]

        for clf in clfs:
            ovr = OneVsRestClassifier(classifier=clf)
            probmean = proba_average(ovr)
            assert_equal(
                probmean, 1.0,
                '{} probabilities {} do not sum to 1.0 over 3 classes'.format(
                    clf.__class__, probmean))
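
    # Sketch of the proba_average() helper assumed above: the mean over
    # rows of the summed class probabilities, which the test asserts is
    # exactly 1.0 when predict_proba returns a proper distribution over
    # the 3 classes.
    #
    #     def proba_average(ovr):
    #         df = get_dataset("iris").as_df()
    #         X, y = df.loc[:, df.columns != 'Label'], df['Label']
    #         ovr.fit(X, y)
    #         return ovr.predict_proba(X).sum(axis=1).mean()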
test_reviews = pandas.DataFrame(data=dict(
        review=[
            "This is great",
            "I hate it",
            "Love it",
            "Really like it",
            "I hate it",
            "I like it a lot",
            "I love it",
            "I do like it",
            "I really hate it",
            "I love it"]))

y = train_reviews['like']
X = train_reviews.loc[:, train_reviews.columns != 'like']

ngram = NGramFeaturizer(word_feature_extractor=n_gram()) << 'review'
X = ngram.fit_transform(X)

# view the transformed numerical values and column names
# print(X.head())

mymodel = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = ngram.transform(test_reviews)

scores = mymodel.predict(ngram.transform(test_reviews))

# view the scores
# print(scores.head())
# LogisticRegressionBinaryClassifier
import numpy as np
from nimbusml.datasets import get_dataset
from nimbusml.feature_extraction.categorical import OneHotVectorizer
from nimbusml.linear_model import LogisticRegressionBinaryClassifier
from sklearn.model_selection import train_test_split

# use the built-in data set 'infert' to create test and train data
#   Unnamed: 0  education   age  parity  induced  case  spontaneous  stratum  \
# 0           1        0.0  26.0     6.0      1.0   1.0          2.0      1.0
# 1           2        0.0  42.0     1.0      1.0   1.0          0.0      2.0
#   pooled.stratum education_str
# 0             3.0        0-5yrs
# 1             1.0        0-5yrs
np.random.seed(0)

df = get_dataset("infert").as_df()

# remove : and ' ' from column names, and encode categorical column
df.columns = [i.replace(': ', '') for i in df.columns]
df = (OneHotVectorizer() << 'education_str').fit_transform(df)

X_train, X_test, y_train, y_test = \
    train_test_split(df.loc[:, df.columns != 'case'], df['case'])

lr = LogisticRegressionBinaryClassifier().fit(X_train, y_train)
scores = lr.predict(X_test)

# evaluate the model
print('Accuracy:', np.mean(y_test == [i for i in scores]))
Example #27
from nimbusml import FileDataStream, Pipeline
from nimbusml.datasets import get_dataset
from nimbusml.linear_model import LogisticRegressionBinaryClassifier

# data input (as a FileDataStream)
path = get_dataset('uciadult_train').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    label  workclass     education  ... capital-loss hours-per-week
# 0      0    Private          11th  ...            0             40
# 1      0    Private       HS-grad  ...            0             50
# 2      1  Local-gov    Assoc-acdm  ...            0             40
# 3      1    Private  Some-college  ...            0             40
# 4      0          ?  Some-college  ...            0             30

# define the training pipeline with a linear model
lr_pipeline = Pipeline([LogisticRegressionBinaryClassifier(
    feature=['age', 'education-num', 'hours-per-week'], label='label')])

# train the model
lr_model = lr_pipeline.fit(data)

# For linear models, the contribution of a given feature is the feature
# value times the corresponding weight. Similarly, for Generalized Additive
# Models (GAM), the contribution of a feature is the shape function for
# that feature evaluated at the feature value.
lr_feature_contributions = lr_model.get_feature_contributions(data)

# Print predictions with feature contributions, which give a relative measure
# of how much each feature impacted the Score.
print("========== Feature Contributions for Linear Model ==========")
print(lr_feature_contributions.head())
#   label  ... PredictedLabel     Score ... FeatureContributions.hours-per-week
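
# Hedged follow-up: for a linear model each FeatureContributions.<name>
# value should track the feature's value times its learned weight; the
# weights themselves can be inspected via the pipeline's summary().
print(lr_model.summary())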
              'col=occupation:TX:4 col=relationship:TX:5 col=ethnicity:TX:6 ' \
              'col=sex:TX:7 col=native-country-region:TX:8 header+'
label_column = 'label'
learners = [
    FastForestBinaryClassifier(),
    FastForestRegressor(),
    FastTreesBinaryClassifier(),
    FastTreesRegressor(),
    FastTreesTweedieRegressor(),
    LightGbmRegressor(),
    LightGbmBinaryClassifier(),
    AveragedPerceptronBinaryClassifier(),
    FastLinearBinaryClassifier(),
    FastLinearClassifier(),
    FastLinearRegressor(),
    LogisticRegressionBinaryClassifier(),
    LogisticRegressionClassifier(),
    OnlineGradientDescentRegressor(),
    SgdBinaryClassifier(),
    # SymSgdBinaryClassifier(),
    OrdinaryLeastSquaresRegressor(),
    PoissonRegressionRegressor()
]

learners_not_supported = [
    NaiveBayesClassifier(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    KMeansPlusPlus(),
    # fix in nimbusml, needs to implement ICanGetSummaryAsIDataView
    FactorizationMachineBinaryClassifier(),
Example #29
pipeline = Pipeline([
    CharTokenizer(columns={'review_transform': 'review'}),
    NGramExtractor(ngram_length=3, all_lengths=False, columns={'ngrams': 'review_transform'}),
    ColumnDropper(columns=['review_transform', 'review'])
])
X = pipeline.fit_transform(X)

print(X.head())
#    ngrams.<␂>|T|h  ngrams.T|h|i  ngrams.h|i|s  ngrams.i|s|<␠>  ...  ngrams.i|t|!  ngrams.t|!|<␃>  ngrams.<␂>|H|a  ngrams.H|a|t
# 0             1.0           1.0           1.0             2.0  ...           0.0             0.0             0.0           0.0
# 1             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 2             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 3             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0
# 4             0.0           0.0           0.0             0.0  ...           0.0             0.0             0.0           0.0

model = LogisticRegressionBinaryClassifier().fit(X, y)

X_test = pipeline.transform(test_reviews)
result = model.predict(X_test)

print(result)
# 0     True
# 1    False
# 2     True
# 3     True
# 4    False
# 5     True
# 6     True
# 7     True
# 8    False
# 9     True
# data input (as a FileDataStream)
path = get_dataset('infert').as_filepath()

data = FileDataStream.read_csv(path)
print(data.head())
#    age  case education  induced  parity ... row_num  spontaneous  ...
# 0   26     1    0-5yrs        1       6 ...       1            2  ...
# 1   42     1    0-5yrs        1       1 ...       2            0  ...
# 2   39     1    0-5yrs        2       6 ...       3            0  ...
# 3   34     1    0-5yrs        2       4 ...       4            0  ...
# 4   35     1   6-11yrs        1       3 ...       5            1  ...

# define the training pipeline
pipeline = Pipeline([
    OneHotVectorizer(columns={'edu': 'education'}),
    LogisticRegressionBinaryClassifier(feature=['parity', 'edu'], label='case')
])

# train, predict, and evaluate
metrics, predictions = pipeline.fit(data).test(data, output_scores=True)

# print predictions
print(predictions.head())
#   PredictedLabel  Probability     Score
# 0               0     0.334679 -0.687098
# 1               0     0.334679 -0.687098
# 2               0     0.334679 -0.687098
# 3               0     0.334679 -0.687098
# 4               0     0.334679 -0.687098
# print evaluation metrics
print(metrics)
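
# For a binary classifier the metrics frame includes columns such as AUC,
# Accuracy, Positive/Negative precision and recall, Log-loss, Log-loss
# reduction, F1 Score, and AUPRC (see test_metrics_evaluate_binary above).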