def test_fit(self):
    """Check that ``fit`` routes each column to the matching ``_fit_*`` helper.

    Setup:
        - Build two canned ``ColumnTransformInfo`` results: one for the
          continuous column (1 + 3 output dimensions) and one for the
          discrete column (2 output dimensions).
        - Create a ``DataTransformer`` and replace ``_fit_continuous`` and
          ``_fit_discrete`` with mocks returning those results.

    Input:
        - A ``pandas.DataFrame`` with a continuous column ``x`` and a
          discrete column ``y``.
        - ``discrete_columns`` naming only ``y``.

    Side Effects:
        - ``_fit_discrete`` and ``_fit_continuous`` are each called once.
        - ``output_dimensions`` is asserted to equal 6 (4 + 2 from the
          two mocked column results).
    """
    # Canned per-column results handed back by the mocked fitters.
    continuous_info = ColumnTransformInfo(
        column_name="x",
        column_type="continuous",
        transform=None,
        transform_aux=None,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=1 + 3,
    )
    discrete_info = ColumnTransformInfo(
        column_name="y",
        column_type="discrete",
        transform=None,
        transform_aux=None,
        output_info=[SpanInfo(2, 'softmax')],
        output_dimensions=2,
    )

    transformer = DataTransformer()
    transformer._fit_continuous = Mock(return_value=continuous_info)
    transformer._fit_discrete = Mock(return_value=discrete_info)

    table = pd.DataFrame({
        "x": np.random.random(size=100),
        "y": np.random.choice(["yes", "no"], size=100),
    })

    transformer.fit(table, discrete_columns=["y"])

    transformer._fit_discrete.assert_called_once()
    transformer._fit_continuous.assert_called_once()
    assert transformer.output_dimensions == 6
def test_fit(self):
    """Test ``fit`` on a table with one continuous and one discrete column.

    Setup:
        - Create a ``DataTransformer``.
        - Replace ``_fit_continuous`` with a mock returning a
          ``ColumnTransformInfo`` with 1 + 3 output dimensions.
        - Replace ``_fit_discrete`` with a mock returning a
          ``ColumnTransformInfo`` with 2 output dimensions.

    Input:
        - A ``pandas.DataFrame`` with a continuous column ``x`` and a
          discrete column ``y``.
        - A list with the name of the discrete column.

    Side Effects:
        - ``_fit_discrete`` and ``_fit_continuous`` should each be called once.
        - ``output_dimensions`` should equal 6 (4 + 2 from the mocked
          column results).
    """
    # Setup
    x_info = ColumnTransformInfo(
        column_name='x',
        column_type='continuous',
        transform=None,
        output_info=[SpanInfo(1, 'tanh'), SpanInfo(3, 'softmax')],
        output_dimensions=1 + 3,
    )
    y_info = ColumnTransformInfo(
        column_name='y',
        column_type='discrete',
        transform=None,
        output_info=[SpanInfo(2, 'softmax')],
        output_dimensions=2,
    )

    instance = DataTransformer()
    instance._fit_continuous = Mock(return_value=x_info)
    instance._fit_discrete = Mock(return_value=y_info)

    frame = pd.DataFrame({
        'x': np.random.random(size=100),
        'y': np.random.choice(['yes', 'no'], size=100),
    })

    # Run
    instance.fit(frame, discrete_columns=['y'])

    # Assert
    instance._fit_discrete.assert_called_once()
    instance._fit_continuous.assert_called_once()
    assert instance.output_dimensions == 6
def test___fit_continuous_(self, MockBGM):
    """Test ``_fit_continuous`` on a simple continuous column.

    A ``BayesianGaussianMixture`` (patched in as ``MockBGM``) will be created
    and fit with the ``raw_column_data``.

    Setup:
        - Give the mocked mixture instance a ``weights_`` attribute with two
          non-zero entries and one zero entry.
        - Create a ``DataTransformer`` with ``max_clusters=10`` and
          ``weight_threshold=0.005``.

    Input:
        - column_name = string
        - raw_column_data = numpy array of 100 normally distributed values
          with shape ``(100, 1)``.

    Output:
        - A ``ColumnTransformInfo`` where:
            - ``column_name`` matches the given name.
            - ``transform`` is the mocked mixture instance.
            - ``output_dimensions`` is 3 (1 + number of weights above the
              threshold).
            - ``output_info`` is a 1-dim ``tanh`` span followed by a 2-dim
              ``softmax`` span.

    Side Effects:
        - ``fit`` should be called once on the mixture instance.
    """
    # Setup
    bgm_instance = MockBGM.return_value
    bgm_instance.weights_ = np.array([10.0, 5.0, 0.0])  # 2 non-zero components

    max_clusters = 10
    transformer = DataTransformer(max_clusters, weight_threshold=0.005)

    # ``size=`` is required here: a positional tuple is taken as ``loc`` and
    # would produce only two values instead of a (100, 1) sample.
    raw_column_data = np.random.normal(size=(100, 1))

    # Run
    info = transformer._fit_continuous("column", raw_column_data)

    # Assert
    assert info.column_name == "column"
    assert info.transform == bgm_instance
    assert info.output_dimensions == 3
    assert info.output_info[0].dim == 1
    assert info.output_info[0].activation_fn == "tanh"
    assert info.output_info[1].dim == 2
    assert info.output_info[1].activation_fn == "softmax"
    bgm_instance.fit.assert_called_once()
def test___fit_continuous(self, MockBGM):
    """Test ``_fit_continuous`` on a simple continuous column.

    A ``BayesGMMTransformer`` (patched in as ``MockBGM``) will be created and
    fit with some ``data``.

    Setup:
        - Mock the ``BayesGMMTransformer`` with ``valid_component_indicator``
          as ``[True, False, True]``.
        - Initialize a ``DataTransformer``.

    Input:
        - A dataframe with a single column named ``column`` holding 100
          normally distributed float values.

    Output:
        - A ``ColumnTransformInfo`` object where:
            - ``column_name`` matches the column of the data.
            - ``transform`` is the ``BayesGMMTransformer`` instance.
            - ``output_dimensions`` is 3: a 1-dim ``tanh`` span plus a 2-dim
              ``softmax`` span, matching the two ``True`` entries of
              ``valid_component_indicator``.
            - ``output_info`` assigns the correct activation functions.

    Side Effects:
        - ``fit`` should be called once on the transformer instance.
    """
    # Setup
    bgm_instance = MockBGM.return_value
    bgm_instance.valid_component_indicator = [True, False, True]
    transformer = DataTransformer()

    # ``size=`` is required here: a positional tuple is taken as ``loc`` and
    # would produce a two-row frame instead of 100 rows.
    data = pd.DataFrame(np.random.normal(size=(100, 1)), columns=['column'])

    # Run
    info = transformer._fit_continuous(data)

    # Assert
    assert info.column_name == 'column'
    assert info.transform == bgm_instance
    assert info.output_dimensions == 3
    assert info.output_info[0].dim == 1
    assert info.output_info[0].activation_fn == 'tanh'
    assert info.output_info[1].dim == 2
    assert info.output_info[1].activation_fn == 'softmax'
    bgm_instance.fit.assert_called_once()