Ejemplo n.º 1
0
    def test_passing_in_a_single_transform_returns_new_pipeline(self):
        # Wrapping one fitted transform with Pipeline.combine_models must
        # give back a Pipeline whose transform() output has the row count,
        # column count and column-name prefixes asserted below.
        transform = OneHotVectorizer() << 'c0'
        transform.fit(train_df)

        combined_pipeline = Pipeline.combine_models(transform,
                                                    contains_predictor=False)
        # Verify the returned type up front; assertIsInstance reports the
        # actual type on failure instead of a bare "False is not true".
        self.assertIsInstance(combined_pipeline, Pipeline)

        result = combined_pipeline.transform(test_df)

        self.assertEqual(len(result), 3)
        self.assertEqual(len(result.columns), 4)
        # The first two output columns must come from the encoded 'c0'.
        for column_name in result.columns[:2]:
            self.assertTrue(column_name.startswith('c0.'), column_name)
Ejemplo n.º 2
0
    def test_combine_transform_and_transform(self):
        # Applies a RangeFilter and then a OneHotVectorizer by hand, and
        # checks that the pipeline built by Pipeline.combine_models from the
        # same two fitted transforms produces an equal frame.
        range_filter = RangeFilter(min=0.0, max=4.5) << 'c2'
        filtered_train = range_filter.fit_transform(train_df)

        one_hot = OneHotVectorizer() << 'c0'
        one_hot.fit(filtered_train)

        # Reference output: run the fitted transforms manually, in order.
        filtered_test = range_filter.transform(test_df)
        expected = one_hot.transform(filtered_test)

        # Same two transforms, chained through a combined pipeline.
        combined = Pipeline.combine_models(
            range_filter, one_hot, contains_predictor=False)
        actual = combined.transform(test_df)

        self.assertTrue(expected.equals(actual))
Ejemplo n.º 3
0
    def test_sparse_vector_column_combined_with_single_value_columns(self):
        # One-hot encodes 'c1' next to the float32 column 'c0' and checks
        # that the as_csr=True output, once densified, equals the default
        # DataFrame output.
        train_data = {'c0': [0, 1, 0, 3], 'c1': ['a', 'b', 'a', 'b']}
        train_df = pd.DataFrame(train_data).astype({'c0': np.float32})

        xf = OneHotVectorizer(columns={'c1': 'c1'})
        xf.fit(train_df)
        expected_result = xf.transform(train_df)
        self.assertIsInstance(expected_result, pd.DataFrame)

        result = xf.transform(train_df, as_csr=True)
        # Check the type before reading csr-specific attributes so that a
        # wrong return type fails as an assertion, not an AttributeError.
        self.assertIsInstance(result, csr_matrix)
        # 2 non-zero values in 'c0' plus one set indicator per row (4).
        self.assertEqual(result.nnz, 6)

        result = pd.DataFrame(result.todense(), columns=['c0', 'c1.a', 'c1.b'])

        self.assertTrue(result.equals(expected_result))
Ejemplo n.º 4
0
    def test_sparse_vector_column(self):
        # One-hot encodes two string columns and checks that the
        # as_csr=True output, once densified, equals the default DataFrame
        # output.
        train_data = {'c0': ['a', 'b', 'a', 'b'], 'c1': ['c', 'd', 'd', 'c']}
        train_df = pd.DataFrame(train_data)

        xf = OneHotVectorizer(columns={'c0': 'c0', 'c1': 'c1'})
        xf.fit(train_df)
        expected_result = xf.transform(train_df)
        self.assertIsInstance(expected_result, pd.DataFrame)

        result = xf.transform(train_df, as_csr=True)
        # Check the type before reading csr-specific attributes so that a
        # wrong return type fails as an assertion, not an AttributeError.
        self.assertIsInstance(result, csr_matrix)
        # 4 rows, each with one set indicator per encoded column (2).
        self.assertEqual(result.nnz, 8)

        result = pd.DataFrame(result.todense(),
                              columns=['c0.a', 'c0.b', 'c1.c', 'c1.d'])

        self.assertTrue(result.equals(expected_result))
Ejemplo n.º 5
0
    def test_fit_transform(self):
        # fit_transform() and fit() followed by transform() must produce
        # equal frames for the same input.

        # Input: the 'infert' dataset, read as a FileDataStream.
        infert_path = get_dataset('infert').as_filepath()
        stream = FileDataStream.read_csv(infert_path)

        # Column mapping handed to the vectorizer.
        column_map = {
            'edu': 'education',
            'in': 'induced',
            'sp': 'spontaneous'}
        vectorizer = OneHotVectorizer(columns=column_map)

        # One-step call first, then the explicit two-step equivalent.
        one_step = vectorizer.fit_transform(stream)
        fitted = vectorizer.fit(stream)
        two_step = fitted.transform(stream)

        assert_frame_equal(one_step, two_step)
Ejemplo n.º 6
0
 def test_syntax4_passing(self):
     # Configures a OneHotVectorizer through the `<<` operator with a
     # dict column spec, fits it on X and checks the transformed shape.
     _df, X, _y = self.get_simple_df()  # only X is needed here
     column_spec = {'edu1': ['education']}
     vec = OneHotVectorizer() << column_spec
     vec.fit(X)
     transformed = vec.transform(X)
     assert transformed.shape == (5, 5)