def test_unstructured_data_labeler_fit_predict_take_data_obj(self):
    """Fit and predict an unstructured labeler on ``dp.Data`` inputs.

    Three input layouts are exercised, each with a freshly constructed
    trainable unstructured labeler:

    1. every cell joined into one comma-separated string in a one-row frame,
    2. the same cells split across three rows,
    3. the joined string wrapped as a ``text`` data object.

    For each layout the test only asserts that ``fit`` and ``predict``
    return something other than ``None``.
    """

    def entities(cells, tags):
        """Return ``(start, end, label)`` spans for ``",".join(cells)``."""
        spans = []
        start = 0
        for idx, cell in enumerate(cells):
            end = start + len(cell)
            spans.append((start, end, tags[idx]))
            # Step over the comma that separates this cell from the next.
            start = end + 1
        return spans

    data_cells = [
        "123 Fake st",
        "1/1/2021",
        "blah",
        "555-55-5555",
        "*****@*****.**",
        "John Doe",
        "123-4567",
    ]
    label_cells = [
        "ADDRESS",
        "DATETIME",
        "UNKNOWN",
        "SSN",
        "EMAIL_ADDRESS",
        "PERSON",
        "PHONE_NUMBER",
    ]
    tabular_types = ["csv", "json", "parquet"]

    # Layout 1: one large string of data in a single-row frame.
    data_str = ",".join(data_cells)
    label_str = entities(data_cells, label_cells)
    for dt in tabular_types:
        data_obj = dp.Data(data=pd.DataFrame([data_str]), data_type=dt)
        labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=data_obj, y=[label_str]))
        self.assertIsNotNone(labeler.predict(data=data_obj))

    # Layout 2: the same cells broken up into three frame entries.
    chunks = [slice(None, 3), slice(3, 5), slice(5, None)]
    data_df = pd.DataFrame([",".join(data_cells[part]) for part in chunks])
    three_labels = [
        entities(data_cells[part], label_cells[part]) for part in chunks
    ]
    for dt in tabular_types:
        data_obj = dp.Data(data=data_df, data_type=dt)
        labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=data_obj, y=three_labels))
        self.assertIsNotNone(labeler.predict(data=data_obj))

    # Layout 3: the joined string as a text data object.
    text_obj = dp.Data(data=data_str, data_type="text")
    labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
    self.assertIsNotNone(labeler.fit(x=text_obj, y=[label_str]))
    self.assertIsNotNone(labeler.predict(data=text_obj))
def test_data_labeler_change_labels(self):
    """Fit a trainable unstructured labeler with a replacement label set.

    The two annotated sentences are repeated 50 times to form the training
    frame. The test then checks the structure of the value returned by
    ``fit``: one entry holding a ``dict``, a ``float`` and a ``dict`` whose
    key count is 5.
    """
    samples = [
        [
            'this is my test sentence.',
            [
                (5, 7, 'ADDRESS'),
                (11, 20, 'INTEGER'),
                (20, 22, 'ADDRESS'),
                (22, 24, 'INTEGER'),
            ],
        ],
        [
            'How nice.',
            [(0, 2, 'ADDRESS'), (4, 5, 'INTEGER'), (6, 8, 'INTEGER')],
        ],
    ]
    frame = pd.DataFrame(samples * 50)

    # Column 0 holds the text, column 1 the entity annotations.
    labeler = dp.DataLabeler(labeler_type='unstructured', trainable=True)
    history = labeler.fit(
        x=frame[0], y=frame[1], labels=['UNKNOWN', 'INTEGER', 'ADDRESS']
    )

    self.assertEqual(1, len(history))
    first_entry = history[0]
    self.assertEqual(3, len(first_entry))
    self.assertIsInstance(first_entry[0], dict)
    self.assertIsInstance(first_entry[1], float)
    self.assertIsInstance(first_entry[2], dict)

    # Original author's note: no bg, pad, but includes micro, macro, weighted.
    self.assertEqual(5, len(first_entry[2]))
def test_help(self, mock_stdout, *mock):
    """Check that ``help()`` output contains the expected section headings.

    ``mock_stdout`` is the captured-output object supplied to the test; its
    ``getvalue()`` text is searched for each heading.
    """
    labeler = dp.DataLabeler(labeler_type='structured')
    labeler.help()

    captured = mock_stdout.getvalue()
    expected_headings = (
        "Process Input Format",
        "Process Output Format:",
        "Parameters",
    )
    for heading in expected_headings:
        self.assertIn(heading, captured)
def test_structured_data_labeler_fit_predict_take_data_obj(self):
    """Fit and predict a structured labeler using ``dp.Data`` objects.

    The same one-column data and label frames are wrapped as csv, json and
    parquet data objects in turn. For each, a new trainable structured
    labeler must return a non-``None`` value from both ``fit`` and
    ``predict``.
    """
    cell_values = [
        "123 Fake st",
        "1/1/2021",
        "blah",
        "333-44-2341",
        "*****@*****.**",
        "John Doe",
        "123-4567",
    ]
    cell_labels = [
        "ADDRESS",
        "DATETIME",
        "UNKNOWN",
        "SSN",
        "EMAIL_ADDRESS",
        "PERSON",
        "PHONE_NUMBER",
    ]
    data = pd.DataFrame(cell_values)
    labels = pd.DataFrame(cell_labels)

    for data_type in ("csv", "json", "parquet"):
        data_obj = dp.Data(data=data, data_type=data_type)
        label_obj = dp.Data(data=labels, data_type=data_type)
        labeler = dp.DataLabeler(labeler_type="structured", trainable=True)

        fit_result = labeler.fit(x=data_obj, y=label_obj)
        self.assertIsNotNone(fit_result)

        predictions = labeler.predict(data=data_obj)
        self.assertIsNotNone(predictions)
def _pandas(cls: Any, column: str, threshold: float, data_label: str, **kwargs: Any) -> np.ndarray:
    """Implement the yes/no question for the expectation.

    Runs the structured data labeler on ``column`` and reports, per entry,
    whether the confidence assigned to ``data_label`` reaches ``threshold``.

    :param cls: class the metric is bound to (unused in the body).
    :param column: values passed straight to ``labeler.predict``.
    :param threshold: minimum confidence for an entry to count as a match.
    :param data_label: label name, matched case-insensitively (upper-cased)
        against the keys of the labeler's ``label_mapping``.
    :param kwargs: accepted for interface compatibility; not used here.
    :return: result of ``results["conf"][:, index] >= threshold``.
    :raises ValueError: if ``data_label`` is not a key of ``label_mapping``.
    """
    labeler = dp.DataLabeler(labeler_type="structured")

    # Validate the label before predicting so that an invalid label fails
    # fast instead of after the (potentially expensive) model run.
    label_mapping = labeler.label_mapping
    label_key = data_label.upper()
    if label_key not in label_mapping:
        # List the keys actually accepted by this check rather than a
        # hard-coded copy that can drift from the loaded model.
        raise ValueError(
            "The only values acceptable for the data label parameter are "
            "as follows: {}".format(list(label_mapping))
        )
    data_label_ind = label_mapping[label_key]

    # Request confidences rather than label strings from the postprocessor.
    labeler.postprocessor.set_params(is_pred_labels=False)
    results = labeler.predict(
        column,
        predict_options={"show_confidences": True},
    )

    # NOTE(review): assumes results["conf"] is 2-D with one column per
    # label index, as implied by the [:, index] access -- confirm.
    data_label_conf = results["conf"][:, data_label_ind]
    return data_label_conf >= threshold
def test_equality(self, mock_open, mock_load_model):
    """Check ``==`` / ``!=`` between data labelers.

    ``mock_load_model`` is configured through ``_setup_mock_load_model``;
    ``mock_open`` is not referenced directly in the body.
    """
    self._setup_mock_load_model(mock_load_model)

    structured_a, structured_b = [
        dp.DataLabeler(labeler_type='structured') for _ in range(2)
    ]
    unstructured_a, unstructured_b = [
        dp.DataLabeler(labeler_type='unstructured') for _ in range(2)
    ]

    # Labelers of the same type compare equal.
    self.assertEqual(structured_a, structured_b)
    self.assertEqual(unstructured_a, unstructured_b)

    # Labelers of different types compare unequal.
    self.assertNotEqual(structured_a, unstructured_a)

    # Changing the labels on one labeler makes it differ from its twin
    # (unequal because of _label_encoding).
    structured_a.set_labels(['BACKGROUND', 'b', 'c'])
    self.assertNotEqual(structured_a, structured_b)
def test_equality(self, mock_open, mock_load_model):
    """Check ``==`` / ``!=`` between data labelers.

    ``mock_load_model`` is configured through ``_setup_mock_load_model``;
    ``mock_open`` is not referenced directly in the body.
    """
    self._setup_mock_load_model(mock_load_model)

    structured_a, structured_b = [
        dp.DataLabeler(labeler_type="structured") for _ in range(2)
    ]
    unstructured_a, unstructured_b = [
        dp.DataLabeler(labeler_type="unstructured") for _ in range(2)
    ]

    # Labelers of the same type compare equal.
    self.assertEqual(structured_a, structured_b)
    self.assertEqual(unstructured_a, unstructured_b)

    # Labelers of different types compare unequal.
    self.assertNotEqual(structured_a, unstructured_a)

    # Changing the labels on one labeler makes it differ from its twin
    # (unequal because of _label_encoding).
    structured_a.set_labels(["UNKNOWN", "b", "c"])
    self.assertNotEqual(structured_a, structured_b)
def test_multi_labelers(self, *mocks):
    """
    Test Multiple labelers called consecutively.

    A structured labeler, an unstructured labeler with a hand-assigned
    label encoding, and a second structured labeler each run ``predict``
    in turn. There are no assertions; the test passes if nothing raises.

    :return:
    """
    numeric_frame = pd.DataFrame([12, 2, 3, 4, 5]).astype(str)
    data = dp.Data(data=numeric_frame, data_type='parquet')
    text_frame = pd.DataFrame(['atest', 'b', 'c'])
    data2 = dp.Data(data=text_frame, data_type='csv')

    # First structured labeler.
    first_structured = dp.DataLabeler(labeler_type='structured')
    first_structured.predict(data)

    # Unstructured labeler whose encoding maps several names to one index.
    custom_encoding = {
        'PAD': 0,
        'CITY': 1,  # SAME AS BACKGROUND
        'BACKGROUND': 1,
        'ADDRESS': 2,
        'BAN': 3,
        'CREDIT_CARD': 4,
        'EMAIL_ADDRESS': 5,
        'UUID': 6,
        'HASH_OR_KEY': 7,
        'IPV4': 8,
        'IPV6': 9,
        'MAC_ADDRESS': 10,
        'NAME': 11,  # SAME AS PERSON
        'PERSON': 11,
        'PHONE_NUMBER': 12,
        'SSN': 13,
        'URL': 14,
        'DATETIME': 15,
        'INTEGER_BIG': 16,  # SAME AS INTEGER
        'INTEGER': 16,
        'FLOAT': 17,
        'QUANTITY': 18,
        'ORDINAL': 19
    }
    unstructured = dp.DataLabeler(labeler_type='unstructured')
    unstructured._label_encoding = custom_encoding
    unstructured.predict(data)

    # Second structured labeler, created after the other two have run.
    second_structured = dp.DataLabeler(labeler_type='structured')
    second_structured.predict(data2)
def test_multi_labelers(self, *mocks):
    """
    Test Multiple labelers called consecutively.

    A structured labeler, an unstructured labeler with a hand-assigned
    label encoding, and a second structured labeler each run ``predict``
    in turn. There are no assertions; the test passes if nothing raises.

    :return:
    """
    numeric_frame = pd.DataFrame([12, 2, 3, 4, 5]).astype(str)
    data = dp.Data(data=numeric_frame, data_type="parquet")
    text_frame = pd.DataFrame(["atest", "b", "c"])
    data2 = dp.Data(data=text_frame, data_type="csv")

    # First structured labeler.
    first_structured = dp.DataLabeler(labeler_type="structured")
    first_structured.predict(data)

    # Unstructured labeler whose encoding maps several names to one index.
    custom_encoding = {
        "PAD": 0,
        "CITY": 1,  # SAME AS UNKNOWN
        "UNKNOWN": 1,
        "ADDRESS": 2,
        "BAN": 3,
        "CREDIT_CARD": 4,
        "EMAIL_ADDRESS": 5,
        "UUID": 6,
        "HASH_OR_KEY": 7,
        "IPV4": 8,
        "IPV6": 9,
        "MAC_ADDRESS": 10,
        "NAME": 11,  # SAME AS PERSON
        "PERSON": 11,
        "PHONE_NUMBER": 12,
        "SSN": 13,
        "URL": 14,
        "DATETIME": 15,
        "INTEGER_BIG": 16,  # SAME AS INTEGER
        "INTEGER": 16,
        "FLOAT": 17,
        "QUANTITY": 18,
        "ORDINAL": 19,
    }
    unstructured = dp.DataLabeler(labeler_type="unstructured")
    unstructured._label_encoding = custom_encoding
    unstructured.predict(data)

    # Second structured labeler, created after the other two have run.
    second_structured = dp.DataLabeler(labeler_type="structured")
    second_structured.predict(data2)
def _pandas(cls: Any, column: str, threshold: float, **kwargs: Any) -> np.ndarray:
    """Implement the yes/no question for the expectation.

    Runs the structured data labeler on ``column`` and reports, per entry,
    whether the confidence of the predicted label reaches ``threshold``.

    :param cls: class the metric is bound to (unused in the body).
    :param column: values passed straight to ``labeler.predict``.
    :param threshold: minimum confidence for an entry to count as a match.
    :param kwargs: accepted for interface compatibility; not used here.
    :return: element-wise ``>= threshold`` comparison of the selected
        confidences.
    """
    labeler = dp.DataLabeler(labeler_type="structured")
    # Request confidences rather than label strings from the postprocessor.
    labeler.postprocessor.set_params(is_pred_labels=False)
    results = labeler.predict(column, predict_options={"show_confidences": True})

    # NOTE(review): presumably results["pred"] holds one label index per
    # entry and results["conf"] one row per entry / one column per label;
    # np.choose then picks each entry's confidence for its predicted label.
    predicted_index = results["pred"].astype(int, copy=False)
    conf_by_label = results["conf"].T
    predicted_conf = np.choose(predicted_index, conf_by_label)
    return predicted_conf >= threshold
def test_load_data_labeler(self, *mocks):
    """Check ``dp.DataLabeler`` construction for missing, bad and valid types."""
    # Error if no labeler type is specified.
    missing_type_pattern = (
        r'__new__\(\) missing 1 required positional'
        r' argument: \'labeler_type\''
    )
    with self.assertRaisesRegex(TypeError, missing_type_pattern):
        dp.DataLabeler()

    # Error if the labeler type is not one of the allowed values.
    unknown_type_pattern = (
        r'No DataLabeler class types matched the '
        r'input, `fake_labeler`. Allowed types '
        r'\[\'structured\', \'unstructured\'\].'
    )
    with self.assertRaisesRegex(ValueError, unknown_type_pattern):
        dp.DataLabeler(labeler_type='fake_labeler')

    # Both supported types load as BaseDataLabeler instances.
    for labeler_type in ('structured', 'unstructured'):
        data_labeler = dp.DataLabeler(labeler_type=labeler_type)
        self.assertIsInstance(data_labeler, BaseDataLabeler)
def test_load_labeler(self, mock_open, mock_load_model):
    """Load the default structured labeler and check its processors.

    ``mock_load_model`` is configured through ``_setup_mock_load_model``;
    ``mock_open`` is not referenced directly in the body.
    """
    # TODO: Mock exist functions
    self._setup_mock_load_model(mock_load_model)

    # Load the default structured labeler.
    data_labeler = dp.DataLabeler(labeler_type='structured')

    # NOTE(review): this compares two fixtures defined outside this method
    # and does not involve ``data_labeler`` itself.
    expected_mapping = struct_data_labeler_parameters['label_mapping']
    self.assertDictEqual(label_encoding['encoder'], expected_mapping)

    preprocessor = data_labeler.preprocessor
    self.assertIsInstance(preprocessor, data_processing.BaseDataPreprocessor)
    postprocessor = data_labeler.postprocessor
    self.assertIsInstance(postprocessor, data_processing.BaseDataPostprocessor)
def test_load_data_labeler(self, *mocks):
    """Check ``dp.DataLabeler`` construction for missing, bad and valid types."""
    # Error if no labeler type is specified.
    missing_type_pattern = (
        r"__new__\(\) missing 1 required positional"
        r" argument: \'labeler_type\'"
    )
    with self.assertRaisesRegex(TypeError, missing_type_pattern):
        dp.DataLabeler()

    # Error if the labeler type is not one of the allowed values.
    unknown_type_pattern = (
        r"No DataLabeler class types matched the "
        r"input, `fake_labeler`. Allowed types "
        r"\[\'structured\', \'unstructured\'\]."
    )
    with self.assertRaisesRegex(ValueError, unknown_type_pattern):
        dp.DataLabeler(labeler_type="fake_labeler")

    # Both supported types load as BaseDataLabeler instances.
    for labeler_type in ("structured", "unstructured"):
        data_labeler = dp.DataLabeler(labeler_type=labeler_type)
        self.assertIsInstance(data_labeler, BaseDataLabeler)
def test_fit_with_default_model(self):
    """Fit the default trainable unstructured labeler twice.

    The first ``fit`` supplies new labels and uses the default validation
    behaviour; the second passes ``validation_split=0``. After each call
    the structure of the returned value is checked, and finally the
    model's ``_epoch_id`` is expected to be 2.
    """
    samples = [
        [
            "this is my test sentence.",
            [
                (5, 7, "ADDRESS"),
                (11, 20, "INTEGER"),
                (20, 22, "ADDRESS"),
                (22, 24, "INTEGER"),
            ],
        ],
        [
            "How nice.",
            [(0, 2, "ADDRESS"), (4, 5, "INTEGER"), (6, 8, "INTEGER")],
        ],
    ]
    new_labels = ["UNKNOWN", "ADDRESS", "INTEGER"]
    frame = pd.DataFrame(samples * 50)

    # Column 0 holds the text, column 1 the entity annotations.
    default = dp.DataLabeler(labeler_type="unstructured", trainable=True)

    # First fit: new labels, validation left at its default.
    history = default.fit(x=frame[0], y=frame[1], labels=new_labels)
    self.assertEqual(1, len(history))
    entry = history[0]
    self.assertEqual(3, len(entry))
    self.assertIsInstance(entry[0], dict)
    self.assertIsInstance(entry[1], float)
    self.assertIsInstance(entry[2], dict)
    # Original author's note: no bg, pad, but includes micro, macro, weighted.
    self.assertEqual(len(default.labels) + 1, len(entry[2]))

    # Second fit: validation disabled.
    history = default.fit(x=frame[0], y=frame[1], validation_split=0)
    self.assertEqual(1, len(history))
    entry = history[0]
    self.assertEqual(3, len(entry))
    self.assertIsInstance(entry[0], dict)
    # No f1 since there was no validation.
    self.assertIsNone(entry[1])
    self.assertListEqual(entry[2], [])

    # Validate the epoch id after the two fit calls.
    self.assertEqual(2, default.model._epoch_id)
def test_set_labels(self, mock_open, mock_load_model):
    """Check that ``set_labels`` forwards the labels to the model.

    The labeler's ``_model`` is replaced by a ``mock.Mock`` and
    ``set_labels`` is called once with ``requires_zero_mapping`` set to
    ``True`` and once with ``False``; both calls must reach
    ``set_label_mapping`` with ``label_mapping=['a', 'b']``.
    """
    # Setup.
    self._setup_mock_load_model(mock_load_model)
    data_labeler = dp.DataLabeler(labeler_type='structured')
    fake_model = mock.Mock()
    data_labeler._model = fake_model
    fake_model.set_label_mapping.return_value = None

    # Require-pad true first, then false; the expectation is the same.
    for requires_zero in (True, False):
        data_labeler._model.requires_zero_mapping = requires_zero
        data_labeler.set_labels(['a', 'b'])
        data_labeler._model.set_label_mapping.assert_called_with(
            label_mapping=['a', 'b'])