def _invalid_check(data):
    """Assert that the validator rejects ``data`` with a TypeError.

    ``self`` is not a parameter; it is resolved from the enclosing scope
    (presumably the surrounding test method).

    :param data: value passed to
        ``BaseDataLabeler._check_and_return_valid_data_format``
    """
    expected_message = (
        "Data must be imported using the "
        "data_readers, pd.DataFrames, "
        "np.ndarrays, or lists."
    )
    with self.assertRaisesRegex(TypeError, expected_message):
        BaseDataLabeler._check_and_return_valid_data_format(data)
def _valid_check(data):
    """Validate ``data`` in predict mode, failing the test if it is rejected.

    ``self`` is not a parameter; it is resolved from the enclosing scope
    (presumably the surrounding test method).

    :param data: value passed to
        ``BaseDataLabeler._check_and_return_valid_data_format``
    :return: the value returned by the validator for ``data``
    """
    print("\tChecking data format: {}".format(str(type(data))))
    # Only the validator call is guarded so the failure below always
    # refers to the validation itself.
    try:
        data = BaseDataLabeler._check_and_return_valid_data_format(
            data, fit_or_predict='predict')
    except Exception as e:
        # Any exception counts as a test failure here; include the cause so
        # the report shows why the accepted type was rejected.
        self.fail(
            "Exception raised on input of accepted types: "
            "{!r}".format(e))
    return data
def test_load_with_components(self, *mocks):
    """Check that ``load_with_components`` keeps the supplied components.

    A mocked preprocessor, model and postprocessor are passed in; the
    returned labeler must be a ``BaseDataLabeler`` whose class is named
    ``CustomDataLabeler`` and which exposes the very same objects.
    """

    def build_component(spec_cls, params):
        # Mock created with ``spec_cls`` as its spec, tagged with known
        # parameters so it can be recognised on the labeler afterwards.
        component = mock.Mock(spec=spec_cls)
        component._parameters = params
        return component

    preprocessor = build_component(
        data_processing.BaseDataPreprocessor, {"test": 1})
    postprocessor = build_component(
        data_processing.BaseDataPostprocessor, {"test": 2})
    model = build_component(BaseTrainableModel, {"test": 3})

    labeler = BaseDataLabeler.load_with_components(
        preprocessor=preprocessor,
        model=model,
        postprocessor=postprocessor,
    )

    self.assertIsInstance(labeler, BaseDataLabeler)
    self.assertEqual('CustomDataLabeler', labeler.__class__.__name__)

    # (attribute on the labeler, object passed in, parameters it carries)
    expectations = (
        ("preprocessor", preprocessor, {"test": 1}),
        ("model", model, {"test": 3}),
        ("postprocessor", postprocessor, {"test": 2}),
    )
    for attr_name, component, params in expectations:
        self.assertEqual(component, getattr(labeler, attr_name))
        self.assertEqual(params, getattr(labeler, attr_name)._parameters)
def test_save_labeler(self, mock_load_data_labeler, mock_open,
                      mock_load_model):
    """Check what ``save_to_disk`` writes and which components it calls.

    The labeler's model and processors are replaced with mocks, then
    ``save_to_disk('test')`` is expected to write the parameter JSON
    through the mocked ``open`` and to call ``save_to_disk('test')`` on
    each component.

    :param mock_load_data_labeler: injected mock, unused in the body
    :param mock_open: mock for ``open``; captures the written JSON
    :param mock_load_model: injected mock, unused in the body
    """
    # setup mocks
    mock_file = setup_save_mock_open(mock_open)
    try:
        base_data_labeler = BaseDataLabeler('fake_path')
        base_data_labeler._model = mock.Mock()
        base_data_labeler._preprocessor = mock.Mock()
        base_data_labeler._postprocessor = mock.Mock()

        base_data_labeler.save_to_disk('test')

        self.assertEqual(
            '{"model": {"class": "Mock"}, "preprocessor": {"class": "Mock"}, '
            '"postprocessor": {"class": "Mock"}}',
            mock_file.getvalue())
        mock_open.assert_called_with(
            'test/data_labeler_parameters.json', 'w')

        base_data_labeler._model.save_to_disk.assert_called_with('test')
        base_data_labeler._preprocessor.save_to_disk.assert_called_with(
            'test')
        base_data_labeler._postprocessor.save_to_disk.assert_called_with(
            'test')
    finally:
        # close mock, even when an assertion above fails.
        # NOTE(review): kept as StringIO.close(mock_file) rather than
        # mock_file.close(); presumably setup_save_mock_open overrides the
        # instance's close() -- confirm against that helper.
        StringIO.close(mock_file)
def _invalid_check(data):
    """Assert that the validator rejects ``data`` with a TypeError.

    ``self`` is not a parameter; it is resolved from the enclosing scope
    (presumably the surrounding test method).

    :param data: value passed to
        ``BaseDataLabeler._check_and_return_valid_data_format``
    """
    expected_message = (
        "Data must either be imported using "
        "the data_readers or pd.DataFrame."
    )
    with self.assertRaisesRegex(TypeError, expected_message):
        BaseDataLabeler._check_and_return_valid_data_format(data)
def test_check_and_return_valid_data_format(self):
    """Exercise the validator's error paths and its accepted inputs.

    Covers: a bad ``fit_or_predict`` value, an unsupported data type,
    2-D and 1-D data given as list / pandas / numpy containers in both
    ``fit`` and ``predict`` mode, unstructured label tuples, and
    ``dp.Data`` reader objects.
    """
    validate = BaseDataLabeler._check_and_return_valid_data_format

    def as_is(values):
        # Pass the plain Python list through untouched.
        return values

    def assert_conversions(raw, wrappers, mode, expected=None):
        # Wrap ``raw`` with each container in turn and compare the
        # validator's output for ``mode`` with the expected array
        # (``np.array(raw)`` when no explicit expectation is given).
        for wrap in wrappers:
            target = np.array(raw) if expected is None else expected
            converted = validate(wrap(raw), fit_or_predict=mode)
            self.assertTrue(np.array_equal(target, converted))

    # test incorrect fit_or_predict value
    bad_mode_message = ('`fit_or_predict` must equal '
                        '`fit` or `predict`')
    with self.assertRaisesRegex(ValueError, bad_mode_message):
        validate([], 'oops')

    # test incorrect data type
    bad_type_message = ("Data must be imported using the"
                        " data_readers, pd.DataFrames, "
                        "np.ndarrays, or lists.")
    with self.assertRaisesRegex(TypeError, bad_type_message):
        validate('oops')

    # test proper conversion of 2 dimensional structured data
    two_dim = [["this", "is"], ["two", "dimensions"]]
    two_dim_pred = np.array(["this", "is", "two", "dimensions"])
    two_dim_wrappers = (as_is, pd.DataFrame, np.array)
    # for fit
    assert_conversions(two_dim, two_dim_wrappers, 'fit')
    # for predict
    assert_conversions(
        two_dim, two_dim_wrappers, 'predict', expected=two_dim_pred)

    # test proper conversion of 1 dimensional data
    one_dim = ["this", "is", "one", "dimension"]
    one_dim_pred = np.array(one_dim)
    # for fit (pandas input is a Series here)
    assert_conversions(one_dim, (as_is, pd.Series, np.array), 'fit')
    # for predict (pandas input is a DataFrame here)
    assert_conversions(
        one_dim, (as_is, pd.DataFrame, np.array), 'predict',
        expected=one_dim_pred)

    # test proper conversion of unstructured labels
    labels = [
        [(0, 4, "UNKNOWN"), (4, 10, "ADDRESS")],
        [(0, 5, "SSN"), (5, 8, "UNKNOWN")],
    ]
    validated_labels = validate(labels)
    self.assertIsInstance(validated_labels, np.ndarray)
    self.assertEqual(len(validated_labels), 2)
    self.assertEqual(len(validated_labels[0]), 2)
    self.assertEqual(len(validated_labels[0][0]), 3)
    self.assertEqual(validated_labels[0][0][0], 0)
    self.assertEqual(validated_labels[0][1][1], 10)
    self.assertEqual(validated_labels[1][0][2], "SSN")

    # test proper conversion of data reader objects
    for reader_type in ["csv", "json", "parquet"]:
        reader = dp.Data(data=pd.DataFrame(two_dim), data_type=reader_type)
        validated = validate(reader)
        self.assertTrue(np.array_equal(np.array(two_dim), validated))
def setUpClass(cls) -> None:
    """Prepare the sample data and labeler stored on the class.

    Stores a 1-D array of sample strings as ``cls.data`` and a labeler
    loaded from the ``regex_model`` directory under
    ``default_labeler_dir`` as ``cls.data_labeler``.
    """
    samples = [
        '123 Fake St.', '1/2/2020', 'nice.', '4/3/22', 'abc', '333-44-2341',
    ]
    cls.data = np.array(samples).reshape((-1, ))

    model_dir = os.path.join(default_labeler_dir, 'regex_model')
    cls.data_labeler = BaseDataLabeler.load_from_disk(model_dir)
def setUpClass(cls) -> None:
    """Prepare the sample data and labeler stored on the class.

    Stores a 1-D array of sample strings as ``cls.data`` and a labeler
    loaded from the ``regex_model`` directory under
    ``default_labeler_dir`` as ``cls.data_labeler``.
    """
    sample_strings = [
        "123 Fake St.", "1/2/2020", "nice.", "4/3/22", "abc", "333-44-2341",
    ]
    cls.data = np.array(sample_strings).reshape((-1, ))

    labeler_path = os.path.join(default_labeler_dir, "regex_model")
    cls.data_labeler = BaseDataLabeler.load_from_disk(labeler_path)