Beispiel #1
0
    def test_combined_models_support_decision_function(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.decision_function(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.decision_function(data)

        self.assertTrue(np.array_equal(result_1, result_2))
    def test_sparse_vector_column_combined_with_single_value_columns(self):
        train_data = {'c0': [0, 1, 0, 3], 'c1': ['a', 'b', 'a', 'b']}
        train_df = pd.DataFrame(train_data).astype({'c0': np.float32})

        xf = OneHotVectorizer(columns={'c1': 'c1'})
        xf.fit(train_df)
        expected_result = xf.transform(train_df)
        self.assertTrue(type(expected_result) == pd.DataFrame)

        result = xf.transform(train_df, as_csr=True)
        self.assertEqual(result.nnz, 6)
        self.assertTrue(type(result) == csr_matrix)

        result = pd.DataFrame(result.todense(), columns=['c0', 'c1.a', 'c1.b'])

        self.assertTrue(result.equals(expected_result))
Beispiel #3
0
    def test_combine_with_classifier_trained_with_filedatastream(self):
        path = get_dataset('infert').as_filepath()

        data = FileDataStream.read_csv(path)

        transform = OneHotVectorizer(columns={'edu': 'education'})
        df = transform.fit_transform(data, as_binary_data_stream=True)

        feature_cols = [
            'parity', 'edu', 'age', 'induced', 'spontaneous', 'stratum',
            'pooled.stratum'
        ]
        predictor = LogisticRegressionBinaryClassifier(feature=feature_cols,
                                                       label='case')
        predictor.fit(df)

        data = FileDataStream.read_csv(path)
        df = transform.transform(data, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        data = FileDataStream.read_csv(path)
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(data)

        result_1 = result_1.astype(np.int32)
        result_2 = result_2['PredictedLabel'].astype(np.int32)
        self.assertTrue(result_1.equals(result_2))
Beispiel #4
0
    def test_combine_with_classifier_trained_with_y_arg(self):
        """
        Tests a sequence where the initial transform is computed
        using both X and y input args. Note, any steps after the
        initial transform will be operating on data where the X
        and y have been combined in to one dataset.
        """
        np.random.seed(0)

        df = get_dataset("infert").as_df()

        X = df.loc[:, df.columns != 'case']
        y = df['case']

        transform = OneHotVectorizer() << 'education_str'

        # Passing in both X and y
        df = transform.fit_transform(X, y, as_binary_data_stream=True)

        # NOTE: need to specify the label column here because the
        # feature and label data was joined in the last step.
        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=list(X.columns))
        predictor.fit(df)

        df = transform.transform(X, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(X)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
    def test_sparse_vector_column(self):
        train_data = {'c0': ['a', 'b', 'a', 'b'], 'c1': ['c', 'd', 'd', 'c']}
        train_df = pd.DataFrame(train_data)

        xf = OneHotVectorizer(columns={'c0': 'c0', 'c1': 'c1'})
        xf.fit(train_df)
        expected_result = xf.transform(train_df)
        self.assertTrue(type(expected_result) == pd.DataFrame)

        result = xf.transform(train_df, as_csr=True)
        self.assertEqual(result.nnz, 8)
        self.assertTrue(type(result) == csr_matrix)

        result = pd.DataFrame(result.todense(),
                              columns=['c0.a', 'c0.b', 'c1.c', 'c1.d'])

        self.assertTrue(result.equals(expected_result))
Beispiel #6
0
    def test_combine_transform_and_predictor(self):
        transform = OneHotVectorizer() << 'c0'
        df = transform.fit_transform(train_df, as_binary_data_stream=True)

        predictor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        predictor.fit(df)

        df = transform.transform(test_df, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(test_df)

        self.assertEqual(result_1[0], result_2.loc[0, 'Score'])
        self.assertEqual(result_1[1], result_2.loc[1, 'Score'])
Beispiel #7
0
    def test_combine_transform_and_transform(self):
        transform_1 = RangeFilter(min=0.0, max=4.5) << 'c2'
        df = transform_1.fit_transform(train_df)

        transform_2 = OneHotVectorizer() << 'c0'
        transform_2.fit(df)

        df = transform_1.transform(test_df)
        result_1 = transform_2.transform(df)

        combined_pipeline = Pipeline.combine_models(transform_1,
                                                    transform_2,
                                                    contains_predictor=False)
        result_2 = combined_pipeline.transform(test_df)

        self.assertTrue(result_1.equals(result_2))
Beispiel #8
0
    def test_fit_predictor_with_idv(self):
        train_data = {
            'c0': ['a', 'b', 'a', 'b'],
            'c1': [1, 2, 3, 4],
            'c2': [2, 3, 4, 5]
        }
        train_df = pd.DataFrame(train_data).astype({
            'c1': np.float64,
            'c2': np.float64
        })

        test_data = {
            'c0': ['a', 'b', 'b'],
            'c1': [1.5, 2.3, 3.7],
            'c2': [2.2, 4.9, 2.7]
        }
        test_df = pd.DataFrame(test_data).astype({
            'c1': np.float64,
            'c2': np.float64
        })

        # Fit a transform pipeline to the training data
        transform_pipeline = Pipeline([OneHotVectorizer() << 'c0'])
        transform_pipeline.fit(train_df)
        df = transform_pipeline.transform(train_df, as_binary_data_stream=True)

        # Fit a predictor pipeline given a transformed BinaryDataStream
        predictor = OnlineGradientDescentRegressor(label='c2',
                                                   feature=['c0', 'c1'])
        predictor_pipeline = Pipeline([predictor])
        predictor_pipeline.fit(df)

        # Perform a prediction given the test data using
        # the transform and predictor defined previously.
        df = transform_pipeline.transform(test_df, as_binary_data_stream=True)
        result_1 = predictor_pipeline.predict(df)

        # Create expected result
        xf = OneHotVectorizer() << 'c0'
        df = xf.fit_transform(train_df)
        predictor = OnlineGradientDescentRegressor(
            label='c2', feature=['c0.a', 'c0.b', 'c1'])
        predictor.fit(df)
        df = xf.transform(test_df)
        expected_result = predictor.predict(df)

        self.assertTrue(result_1.loc[:, 'Score'].equals(expected_result))
Beispiel #9
0
    def test_combine_with_classifier_trained_with_joined_X_and_y(self):
        np.random.seed(0)

        infert_df = get_dataset("infert").as_df()
        feature_cols = [c for c in infert_df.columns if c != 'case']

        transform = OneHotVectorizer() << 'education_str'
        df = transform.fit_transform(infert_df, as_binary_data_stream=True)

        predictor = LogisticRegressionBinaryClassifier(label='case',
                                                       feature=feature_cols)
        predictor.fit(df)

        df = transform.transform(infert_df, as_binary_data_stream=True)
        result_1 = predictor.predict(df)

        # Combine the models and perform a prediction
        combined_pipeline = Pipeline.combine_models(transform, predictor)
        result_2 = combined_pipeline.predict(infert_df)

        result_2 = result_2['PredictedLabel'].astype(np.float64)
        self.assertTrue(result_1.equals(result_2))
Beispiel #10
0
 def test_syntax4_passing(self):
     df, X, y = self.get_simple_df()
     vec = OneHotVectorizer() << {'edu1': ['education']}
     vec.fit(X)
     res = vec.transform(X)
     assert res.shape == (5, 5)