def test_data_labeler_change_labels(self):
    """Fit a trainable unstructured labeler with an explicit label list.

    Checks the structure of the value returned by ``fit`` when the
    ``labels`` argument is supplied.
    """
    # two annotated sentences; the pair is repeated 50 times below
    samples = [
        [
            'this is my test sentence.',
            {
                'entities': [
                    (5, 7, 'ADDRESS'),
                    (11, 20, 'INTEGER_BIG'),
                    (20, 22, 'ADDRESS'),
                    (22, 24, 'INTEGER_BIG'),
                ]
            },
        ],
        [
            'How nice.',
            {
                'entities': [
                    (0, 2, 'ADDRESS'),
                    (4, 5, 'INTEGER_BIG'),
                    (6, 8, 'INTEGER_BIG'),
                ]
            },
        ],
    ]
    frame = pd.DataFrame(samples * 50)

    # trainable unstructured labeler, fitted with a replacement label set
    labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)
    new_labels = ['BACKGROUND', 'INTEGER_BIG', 'ADDRESS']
    fit_results = labeler.fit(x=frame[0], y=frame[1], labels=new_labels)

    self.assertEqual(1, len(fit_results))
    first_result = fit_results[0]
    self.assertEqual(3, len(first_result))

    first_item, second_item, third_item = first_result
    self.assertIsInstance(first_item, dict)
    self.assertIsInstance(second_item, float)
    self.assertIsInstance(third_item, dict)

    # per the original note: no background / pad entries, but the micro,
    # macro and weighted entries are included
    self.assertEqual(5, len(third_item))
def test_help(self, mock_stdout, *mock):
    """Check that ``help()`` prints the expected section headings.

    :param mock_stdout: object whose ``getvalue()`` yields the captured
        output (presumably a patched ``sys.stdout``; the patch decorator
        is outside this view)
    :param mock: additional injected mocks, unused here
    """
    labeler = dp.DataLabeler(labeler_type='structured')
    labeler.help()

    printed = mock_stdout.getvalue()
    expected_headings = (
        "Process Input Format",
        "Process Output Format:",
        "Parameters",
    )
    for heading in expected_headings:
        self.assertIn(heading, printed)
def test_equality(self, mock_open, mock_load_model):
    """Check ``==`` / ``!=`` between structured and unstructured labelers."""
    self._setup_mock_load_model(mock_load_model)

    # built in the same order as before: both structured, then both
    # unstructured
    structured_a = dp.DataLabeler(labeler_type='structured')
    structured_b = dp.DataLabeler(labeler_type='structured')
    unstructured_a = dp.DataLabeler(labeler_type='unstructured')
    unstructured_b = dp.DataLabeler(labeler_type='unstructured')

    # labelers of the same type compare equal
    for left, right in ((structured_a, structured_b),
                        (unstructured_a, unstructured_b)):
        self.assertEqual(left, right)

    # labelers of different types compare unequal
    self.assertNotEqual(structured_a, unstructured_a)

    # changing the labels on one labeler breaks equality with its twin
    structured_a.set_labels(['BACKGROUND', 'b', 'c'])
    self.assertNotEqual(structured_a, structured_b)
def test_multi_labelers(self, *mocks):
    """
    Run predict on several labelers one after another.

    The sequence is structured -> unstructured (with a hand-written label
    encoding) -> structured. The test makes no assertions; it passes as
    long as none of the calls raises.

    :return:
    """
    numeric_frame = pd.DataFrame([12, 2, 3, 4, 5]).astype(str)
    text_frame = pd.DataFrame(['atest', 'b', 'c'])
    data = dp.Data(data=numeric_frame, data_type='parquet')
    data2 = dp.Data(data=text_frame, data_type='csv')

    first_structured = dp.DataLabeler(labeler_type='structured')
    first_structured.predict(data)

    # several labels deliberately share an index (aliases)
    encoding = {
        'PAD': 0,
        'CITY': 1,  # same index as BACKGROUND
        'BACKGROUND': 1,
        'ADDRESS': 2,
        'BAN': 3,
        'CREDIT_CARD': 4,
        'EMAIL_ADDRESS': 5,
        'UUID': 6,
        'HASH_OR_KEY': 7,
        'IPV4': 8,
        'IPV6': 9,
        'MAC_ADDRESS': 10,
        'NAME': 11,  # same index as PERSON
        'PERSON': 11,
        'PHONE_NUMBER': 12,
        'SSN': 13,
        'URL': 14,
        'DATETIME': 15,
        'INTEGER_BIG': 16,  # same index as INTEGER
        'INTEGER': 16,
        'FLOAT': 17,
        'QUANTITY': 18,
        'ORDINAL': 19,
    }
    unstructured = dp.DataLabeler(labeler_type='unstructured')
    unstructured._label_encoding = encoding
    unstructured.predict(data)

    second_structured = dp.DataLabeler(labeler_type='structured')
    second_structured.predict(data2)
def test_load_data_labeler(self, *mocks):
    """Check ``dp.DataLabeler`` construction: error cases and valid types.

    Covers a missing ``labeler_type`` (TypeError), an unrecognised
    ``labeler_type`` (ValueError), and the two accepted values, each of
    which must yield a ``BaseDataLabeler`` instance.

    :param mocks: injected mocks (decorators are outside this view),
        unused here
    """
    # error if no labeler type is specified; the call is expected to
    # raise, so its result is not bound to a name
    with self.assertRaisesRegex(TypeError,
                                r'__new__\(\) missing 1 required positional'
                                r' argument: \'labeler_type\''):
        dp.DataLabeler()

    # error if the labeler type is not one of the allowed values
    with self.assertRaisesRegex(ValueError,
                                r'No DataLabeler class types matched the '
                                r'input, `fake_labeler`. Allowed types '
                                r'\[\'structured\', \'unstructured\'\].'):
        dp.DataLabeler(labeler_type='fake_labeler')

    # both supported types load and produce a BaseDataLabeler
    for labeler_type in ('structured', 'unstructured'):
        data_labeler = dp.DataLabeler(labeler_type=labeler_type)
        self.assertIsInstance(data_labeler, BaseDataLabeler)
def test_load_labeler(self, mock_open, mock_load_model):
    """Load the default structured labeler and check its processors."""
    # TODO: Mock exist functions
    self._setup_mock_load_model(mock_load_model)

    # load default
    labeler = dp.DataLabeler(labeler_type='structured')

    # NOTE(review): this compares two fixtures defined outside this block
    # and never looks at the loaded labeler -- confirm whether the
    # labeler's own label mapping was meant to be checked here.
    expected_mapping = struct_data_labeler_parameters['label_mapping']
    self.assertDictEqual(label_encoding['encoder'], expected_mapping)

    preprocessor = labeler.preprocessor
    self.assertIsInstance(preprocessor,
                          data_processing.BaseDataPreprocessor)
    postprocessor = labeler.postprocessor
    self.assertIsInstance(postprocessor,
                          data_processing.BaseDataPostprocessor)
def test_set_labels(self, mock_open, mock_load_model):
    """Check the label mapping ``set_labels`` forwards to the model.

    When the model's ``requires_zero_mapping`` is True the expected
    indices start at 1; when it is False they start at 0.
    """
    # setup: swap the labeler's model for a bare mock
    self._setup_mock_load_model(mock_load_model)
    labeler = dp.DataLabeler(labeler_type='structured')
    labeler._model = mock.Mock()
    labeler._model.set_label_mapping.return_value = None

    # (requires_zero_mapping, mapping expected to reach the model)
    scenarios = (
        (True, {'a': 1, 'b': 2}),
        (False, {'a': 0, 'b': 1}),
    )
    for needs_zero_mapping, expected_mapping in scenarios:
        labeler._model.requires_zero_mapping = needs_zero_mapping
        labeler.set_labels(['a', 'b'])
        labeler._model.set_label_mapping.assert_called_with(
            label_mapping=expected_mapping)
def test_fit_with_default_model(self):
    """Fit the default trainable unstructured labeler twice.

    The first fit uses the default arguments; the second passes
    ``validation_split=0``. Both return values are checked for structure,
    and the model's ``_epoch_id`` is expected to be 2 afterwards.
    """
    # two annotated sentences; the pair is repeated 50 times below
    samples = [
        [
            'this is my test sentence.',
            {
                'entities': [
                    (5, 7, 'ADDRESS'),
                    (11, 20, 'INTEGER_BIG'),
                    (20, 22, 'ADDRESS'),
                    (22, 24, 'INTEGER_BIG'),
                ]
            },
        ],
        [
            'How nice.',
            {
                'entities': [
                    (0, 2, 'ADDRESS'),
                    (4, 5, 'INTEGER_BIG'),
                    (6, 8, 'INTEGER_BIG'),
                ]
            },
        ],
    ]
    frame = pd.DataFrame(samples * 50)
    texts, annotations = frame[0], frame[1]

    labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)

    # first fit: default arguments
    fit_results = labeler.fit(x=texts, y=annotations)
    self.assertEqual(1, len(fit_results))
    first_result = fit_results[0]
    self.assertEqual(3, len(first_result))
    first_item, second_item, third_item = first_result
    self.assertIsInstance(first_item, dict)
    self.assertIsInstance(second_item, float)
    self.assertIsInstance(third_item, dict)
    # per the original note: no background / pad entries, but the micro,
    # macro and weighted entries are included
    self.assertEqual(21, len(third_item))

    # second fit: no validation data
    fit_results = labeler.fit(x=texts, y=annotations, validation_split=0)
    self.assertEqual(1, len(fit_results))
    first_result = fit_results[0]
    self.assertEqual(3, len(first_result))
    first_item, second_item, third_item = first_result
    self.assertIsInstance(first_item, dict)
    self.assertIsNone(second_item)  # no f1 since no validation
    self.assertListEqual(third_item, [])

    # validate epoch id
    self.assertEqual(2, labeler.model._epoch_id)