def test_structured_data_labeler_fit_predict_take_data_obj(self):
    data = pd.DataFrame(
        [
            "123 Fake st",
            "1/1/2021",
            "blah",
            "333-44-2341",
            "*****@*****.**",
            "John Doe",
            "123-4567",
        ]
    )
    labels = pd.DataFrame(
        [
            "ADDRESS",
            "DATETIME",
            "UNKNOWN",
            "SSN",
            "EMAIL_ADDRESS",
            "PERSON",
            "PHONE_NUMBER",
        ]
    )
    for dt in ["csv", "json", "parquet"]:
        data_obj = dp.Data(data=data, data_type=dt)
        label_obj = dp.Data(data=labels, data_type=dt)
        labeler = dp.DataLabeler(labeler_type="structured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=data_obj, y=label_obj))
        self.assertIsNotNone(labeler.predict(data=data_obj))
def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp1')
    profile1 = dp.Profiler(data, profiler_options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
    })
    print('running dp2')
    profile2 = dp.Profiler(data, profiler_options=profile_options)

    profile1.update_profile(data)
def test_unstructured_data_labeler_fit_predict_take_data_obj(self):
    # Determine the string index of the end of cell i in the comma-joined data
    def data_ind(i, data):
        # Return -1 in the base case so we don't include a trailing comma
        if i == -1:
            return -1
        # Add 1 on every pass to account for the comma separator
        return len(data[i]) + 1 + data_ind(i - 1, data)

    # Generate an entities list for a set of structured data and labels
    def entities(data, labels):
        return [(0, len(data[0]), labels[0])] + \
            [(data_ind(i - 1, data) + 1, data_ind(i, data), labels[i])
             for i in range(1, len(data))]

    data_cells = [
        "123 Fake st",
        "1/1/2021",
        "blah",
        "555-55-5555",
        "*****@*****.**",
        "John Doe",
        "123-4567",
    ]
    label_cells = [
        "ADDRESS",
        "DATETIME",
        "UNKNOWN",
        "SSN",
        "EMAIL_ADDRESS",
        "PERSON",
        "PHONE_NUMBER",
    ]

    # Test with one large string of data
    data_str = ",".join(data_cells)
    label_str = entities(data_cells, label_cells)
    for dt in ["csv", "json", "parquet"]:
        data_obj = dp.Data(data=pd.DataFrame([data_str]), data_type=dt)
        labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=data_obj, y=[label_str]))
        self.assertIsNotNone(labeler.predict(data=data_obj))

    # Test with the string broken up into different df entries
    data_1 = data_cells[:3]
    data_2 = data_cells[3:5]
    data_3 = data_cells[5:]
    data_df = pd.DataFrame(
        [",".join(data_1), ",".join(data_2), ",".join(data_3)])
    zipped = [(data_1, label_cells[:3]),
              (data_2, label_cells[3:5]),
              (data_3, label_cells[5:])]
    three_labels = [entities(d, l) for (d, l) in zipped]
    for dt in ["csv", "json", "parquet"]:
        data_obj = dp.Data(data=data_df, data_type=dt)
        labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
        self.assertIsNotNone(labeler.fit(x=data_obj, y=three_labels))
        self.assertIsNotNone(labeler.predict(data=data_obj))

    # Test with a text data object
    text_obj = dp.Data(data=data_str, data_type="text")
    labeler = dp.DataLabeler(labeler_type="unstructured", trainable=True)
    self.assertIsNotNone(labeler.fit(x=text_obj, y=[label_str]))
    self.assertIsNotNone(labeler.predict(data=text_obj))
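# A quick worked example of the two helpers above (indices computed by hand
# from the definitions of `data_ind` and `entities`, not library output):
#
#     cells  = ["123 Fake st", "1/1/2021", "blah"]
#     labels = ["ADDRESS", "DATETIME", "UNKNOWN"]
#     ",".join(cells)          # -> "123 Fake st,1/1/2021,blah"
#     entities(cells, labels)  # -> [(0, 11, "ADDRESS"),
#                              #     (12, 20, "DATETIME"),
#                              #     (21, 25, "UNKNOWN")]
#
# Each tuple is (start, end, label) into the joined string; the comma
# separators fall between spans and are left unlabeled.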
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')
    for i in range(3):
        print('running dp =============================', i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.set({
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
        })
        profile = dp.Profiler(data, profiler_options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for col in results['data_stats']:
            columns.append(col)
            predictions.append(results['data_stats'][col]['data_label'])
def test_warning_tf(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "datetime.is_enabled": False
    })
    profile = dp.StructuredProfiler(data, options=profile_options)
    results = profile.report()

    columns = []
    predictions = []
    for i in range(len(results['data_stats'])):
        columns.append(i)
        predictions.append(results['data_stats'][i]['data_label'])
def test_text_data_raises_error(self):
    text_file_path = os.path.join(
        test_root_path, 'data', 'txt/sentence-10x.txt')
    with self.assertRaisesRegex(TypeError,
                                'Cannot provide TextData object'
                                ' to Profiler'):
        profile = dp.Profiler(dp.Data(text_file_path))
def test_save_and_load(self):
    datapth = "dataprofiler/tests/data/"
    test_files = ["csv/guns.csv", "csv/iris.csv"]

    def _clean_report(report):
        # Histogram values come back as numpy arrays; convert them to
        # lists so the two reports can be compared with assertDictEqual
        data_stats = report["data_stats"]
        for key in data_stats:
            stats = data_stats[key]["statistics"]
            if "histogram" in stats:
                if "bin_counts" in stats["histogram"]:
                    stats["histogram"]["bin_counts"] = \
                        stats["histogram"]["bin_counts"].tolist()
                if "bin_edges" in stats["histogram"]:
                    stats["histogram"]["bin_edges"] = \
                        stats["histogram"]["bin_edges"].tolist()
        return report

    for test_file in test_files:
        # Create Data and Profiler objects
        data = dp.Data(os.path.join(datapth, test_file))
        save_profile = dp.Profiler(data)

        # Save and load the profile with mocked IO
        with mock.patch('builtins.open') as m:
            mock_file = setup_save_mock_open(m)
            save_profile.save()
            mock_file.seek(0)
            load_profile = dp.Profiler.load("mock.pkl")

        # Check that the reports are equivalent
        save_report = _clean_report(save_profile.report())
        load_report = _clean_report(load_profile.report())
        self.assertDictEqual(save_report, load_report)
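# `setup_save_mock_open` is defined elsewhere in this test module. A minimal
# sketch of what such a helper could look like, assuming save()/load() write
# and read through whatever handle the mocked `open` returns: back the mock
# with a single in-memory buffer so the round trip never touches disk. This
# is an illustration, not necessarily the module's actual implementation.
from io import BytesIO


def setup_save_mock_open(mock_open):
    """Point a mocked builtins.open at one reusable in-memory buffer."""
    mock_file = BytesIO()
    mock_file.close = lambda: None  # keep the buffer readable after save() closes it
    mock_open.side_effect = lambda *args, **kwargs: mock_file
    return mock_file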
def test_warning_tf_run_dp_multiple_times(self):
    test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, "data")
    path = os.path.join(test_dir, "csv/diamonds.csv")
    for i in range(3):
        print("running dp =============================", i)
        data = dp.Data(path)
        profile_options = dp.ProfilerOptions()
        profile_options.structured_options.set(
            {
                "text.is_enabled": False,
                "int.is_enabled": False,
                "float.is_enabled": False,
                "order.is_enabled": False,
                "category.is_enabled": False,
                "chi2_homogeneity.is_enabled": False,
                "datetime.is_enabled": False,
            }
        )
        profile = dp.StructuredProfiler(data, options=profile_options)
        results = profile.report()

        columns = []
        predictions = []
        for j in range(len(results["data_stats"])):
            columns.append(j)
            predictions.append(results["data_stats"][j]["data_label"])
def test_accepted_inputs(self):
    with self.assertRaisesRegex(TypeError,
                                "Input data must be either a "
                                "`pd.DataFrame` or a `data_profiler.Data` "
                                "and not of type `TextData`."):
        dp.train_structured_labeler(None, None)

    with self.assertRaisesRegex(TypeError,
                                "The output dirpath must be a string."):
        dp.train_structured_labeler(pd.DataFrame([]), save_dirpath=0)

    # doesn't accept text data
    text_data = dp.Data(data='test', data_type='text')
    with self.assertRaisesRegex(TypeError,
                                "Input data must be either a "
                                "`pd.DataFrame` or a `data_profiler.Data` "
                                "and not of type `TextData`."):
        dp.train_structured_labeler(text_data, None)

    with self.assertRaisesRegex(ValueError,
                                "The `save_dirpath` is not valid or not "
                                "accessible."):
        dp.train_structured_labeler(pd.DataFrame([]), "/a/test")

    try:
        data = {
            'BACKGROUND': ["Beep", "Boop"],
            'PERSON': ["GRANT", "MENSHENG"]
        }
        df = pd.DataFrame(data=data)
        dp.train_structured_labeler(df, save_dirpath=None)

        fake_data = dp.Data(data=df, data_type='csv')
        dp.train_structured_labeler(fake_data, save_dirpath=None)

        fake_data = dp.Data(data=df, data_type='json')
        dp.train_structured_labeler(fake_data, save_dirpath=None)

        fake_data = dp.Data(data=df, data_type='parquet')
        dp.train_structured_labeler(fake_data, save_dirpath=None)
    except Exception as e:
        self.fail(str(e))
def test_multi_labelers(self, *mocks):
    """
    Test multiple labelers called consecutively.
    """
    data = dp.Data(data=pd.DataFrame([12, 2, 3, 4, 5]).astype(str),
                   data_type='parquet')
    data2 = dp.Data(data=pd.DataFrame(['atest', 'b', 'c']), data_type='csv')

    structured_labeler_1 = dp.DataLabeler(labeler_type='structured')
    structured_labeler_1.predict(data)

    unstructured_labeler = dp.DataLabeler(labeler_type='unstructured')
    unstructured_labeler._label_encoding = {
        'PAD': 0,
        'CITY': 1,  # SAME AS BACKGROUND
        'BACKGROUND': 1,
        'ADDRESS': 2,
        'BAN': 3,
        'CREDIT_CARD': 4,
        'EMAIL_ADDRESS': 5,
        'UUID': 6,
        'HASH_OR_KEY': 7,
        'IPV4': 8,
        'IPV6': 9,
        'MAC_ADDRESS': 10,
        'NAME': 11,  # SAME AS PERSON
        'PERSON': 11,
        'PHONE_NUMBER': 12,
        'SSN': 13,
        'URL': 14,
        'DATETIME': 15,
        'INTEGER_BIG': 16,  # SAME AS INTEGER
        'INTEGER': 16,
        'FLOAT': 17,
        'QUANTITY': 18,
        'ORDINAL': 19
    }
    unstructured_labeler.predict(data)

    structured_labeler_2 = dp.DataLabeler(labeler_type='structured')
    structured_labeler_2.predict(data2)
def test_multi_labelers(self, *mocks):
    """
    Test multiple labelers called consecutively.
    """
    data = dp.Data(
        data=pd.DataFrame([12, 2, 3, 4, 5]).astype(str), data_type="parquet"
    )
    data2 = dp.Data(data=pd.DataFrame(["atest", "b", "c"]), data_type="csv")

    structured_labeler_1 = dp.DataLabeler(labeler_type="structured")
    structured_labeler_1.predict(data)

    unstructured_labeler = dp.DataLabeler(labeler_type="unstructured")
    unstructured_labeler._label_encoding = {
        "PAD": 0,
        "CITY": 1,  # SAME AS UNKNOWN
        "UNKNOWN": 1,
        "ADDRESS": 2,
        "BAN": 3,
        "CREDIT_CARD": 4,
        "EMAIL_ADDRESS": 5,
        "UUID": 6,
        "HASH_OR_KEY": 7,
        "IPV4": 8,
        "IPV6": 9,
        "MAC_ADDRESS": 10,
        "NAME": 11,  # SAME AS PERSON
        "PERSON": 11,
        "PHONE_NUMBER": 12,
        "SSN": 13,
        "URL": 14,
        "DATETIME": 15,
        "INTEGER_BIG": 16,  # SAME AS INTEGER
        "INTEGER": 16,
        "FLOAT": 17,
        "QUANTITY": 18,
        "ORDINAL": 19,
    }
    unstructured_labeler.predict(data)

    structured_labeler_2 = dp.DataLabeler(labeler_type="structured")
    structured_labeler_2.predict(data2)
def test_warning_tf_multiple_dp_with_update(self):
    test_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, "data")
    path = os.path.join(test_dir, "csv/diamonds.csv")

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set(
        {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False,
        }
    )
    print("running dp1")
    profile1 = dp.StructuredProfiler(data, options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set(
        {
            "text.is_enabled": False,
            "int.is_enabled": False,
            "float.is_enabled": False,
            "order.is_enabled": False,
            "category.is_enabled": False,
            "datetime.is_enabled": False,
            "chi2_homogeneity.is_enabled": False,
            "correlation.is_enabled": False,
        }
    )
    print("running dp2")
    profile2 = dp.StructuredProfiler(data, options=profile_options)

    profile1.update_profile(data)
def test_warning_tf_run_dp_merge(self):
    test_root_path = os.path.dirname(
        os.path.dirname(os.path.realpath(__file__)))
    test_dir = os.path.join(test_root_path, 'data')
    path = os.path.join(test_dir, 'csv/diamonds.csv')

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False
    })
    print('running dp1')
    profile1 = dp.StructuredProfiler(data, options=profile_options)

    data = dp.Data(path)
    profile_options = dp.ProfilerOptions()
    profile_options.structured_options.set({
        "text.is_enabled": False,
        "int.is_enabled": False,
        "float.is_enabled": False,
        "order.is_enabled": False,
        "category.is_enabled": False,
        "datetime.is_enabled": False,
        "chi2_homogeneity.is_enabled": False,
        "correlation.is_enabled": False
    })
    print('running dp2')
    profile2 = dp.StructuredProfiler(data, options=profile_options)

    profile = profile1 + profile2
def test_null_in_file(self):
    filename_null_in_file = os.path.join(
        test_root_path, 'data', 'csv/sparse-first-and-last-column.txt')
    profiler_options = ProfilerOptions()
    profiler_options.set({'data_labeler.is_enabled': False})

    data = dp.Data(filename_null_in_file)
    profile = dp.Profiler(data, profiler_options=profiler_options)
    report = profile.report(report_options={"output_format": "pretty"})

    self.assertEqual(
        report['data_stats']['COUNT']['statistics']['null_types_index'],
        {'': '[2, 3, 4, 5, 7, 8]'}
    )
    self.assertEqual(
        report['data_stats'][' NUMBERS']['statistics']['null_types_index'],
        {'': '[5, 6, 8]', ' ': '[2, 4]'}
    )
def test_check_and_return_valid_data_format(self):
    # test incorrect fit_or_predict value
    with self.assertRaisesRegex(ValueError,
                                '`fit_or_predict` must equal '
                                '`fit` or `predict`'):
        BaseDataLabeler._check_and_return_valid_data_format([], 'oops')

    # test incorrect data type
    with self.assertRaisesRegex(TypeError,
                                "Data must be imported using the"
                                " data_readers, pd.DataFrames, "
                                "np.ndarrays, or lists."):
        BaseDataLabeler._check_and_return_valid_data_format('oops')

    # test proper conversion of 2 dimensional structured data
    two_dim = [["this", "is"], ["two", "dimensions"]]
    two_dim_pred = np.array(["this", "is", "two", "dimensions"])

    # for fit
    self.assertTrue(
        np.array_equal(np.array(two_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           two_dim, fit_or_predict='fit')))
    self.assertTrue(
        np.array_equal(np.array(two_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(two_dim), fit_or_predict='fit')))
    self.assertTrue(
        np.array_equal(np.array(two_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(two_dim), fit_or_predict='fit')))

    # for predict
    self.assertTrue(
        np.array_equal(two_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           two_dim, fit_or_predict='predict')))
    self.assertTrue(
        np.array_equal(two_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(two_dim), fit_or_predict='predict')))
    self.assertTrue(
        np.array_equal(two_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(two_dim), fit_or_predict='predict')))

    # test proper conversion of 1 dimensional data
    one_dim = ["this", "is", "one", "dimension"]
    one_dim_pred = np.array(one_dim)

    # for fit
    self.assertTrue(
        np.array_equal(np.array(one_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           one_dim, fit_or_predict='fit')))
    self.assertTrue(
        np.array_equal(np.array(one_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           pd.Series(one_dim), fit_or_predict='fit')))
    self.assertTrue(
        np.array_equal(np.array(one_dim),
                       BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(one_dim), fit_or_predict='fit')))

    # for predict
    self.assertTrue(
        np.array_equal(one_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           one_dim, fit_or_predict='predict')))
    self.assertTrue(
        np.array_equal(one_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           pd.DataFrame(one_dim), fit_or_predict='predict')))
    self.assertTrue(
        np.array_equal(one_dim_pred,
                       BaseDataLabeler._check_and_return_valid_data_format(
                           np.array(one_dim), fit_or_predict='predict')))

    # test proper conversion of unstructured labels
    labels = [[(0, 4, "UNKNOWN"), (4, 10, "ADDRESS")],
              [(0, 5, "SSN"), (5, 8, "UNKNOWN")]]
    validated_labels = \
        BaseDataLabeler._check_and_return_valid_data_format(labels)
    self.assertIsInstance(validated_labels, np.ndarray)
    self.assertEqual(len(validated_labels), 2)
    self.assertEqual(len(validated_labels[0]), 2)
    self.assertEqual(len(validated_labels[0][0]), 3)
    self.assertEqual(validated_labels[0][0][0], 0)
    self.assertEqual(validated_labels[0][1][1], 10)
    self.assertEqual(validated_labels[1][0][2], "SSN")

    # test proper conversion of data reader objects
    for dt in ["csv", "json", "parquet"]:
        data_obj = dp.Data(data=pd.DataFrame(two_dim), data_type=dt)
        val = BaseDataLabeler._check_and_return_valid_data_format(data_obj)
        self.assertTrue(np.array_equal(np.array(two_dim), val))
def test_accepted_inputs(self):
    with self.assertRaisesRegex(TypeError,
                                "Input data must be either a "
                                "`pd.DataFrame` or a `data_profiler.Data` "
                                "and not of type `TextData`."):
        dp.train_structured_labeler(None)

    with self.assertRaisesRegex(TypeError,
                                "The output dirpath must be a string."):
        dp.train_structured_labeler(pd.DataFrame([]), save_dirpath=0)

    with self.assertRaisesRegex(ValueError,
                                "`default_label` must be a string."):
        dp.train_structured_labeler(pd.DataFrame([]), default_label=1)

    # doesn't accept text data
    text_data = dp.Data(data='test', data_type='text')
    with self.assertRaisesRegex(TypeError,
                                "Input data must be either a "
                                "`pd.DataFrame` or a `data_profiler.Data` "
                                "and not of type `TextData`."):
        dp.train_structured_labeler(text_data)

    with self.assertRaisesRegex(ValueError,
                                "The `save_dirpath` is not valid or not "
                                "accessible."):
        dp.train_structured_labeler(
            pd.DataFrame([]), save_dirpath="/a/test")

    # default label not in the label mapping
    data = {'LABEL1': ["word1", "word2"], 'LABEL2': ["word3", "word4"]}
    df = pd.DataFrame(data=data)
    with self.assertRaisesRegex(ValueError,
                                "The `default_label` of UNKNOWN must "
                                "exist in the label mapping."):
        dp.train_structured_labeler(df)

    try:
        data = {'UNKNOWN': ["Beep", "Boop"],
                'PERSON': ["GRANT", "MENSHENG"]}
        df = pd.DataFrame(data=data)
        dp.train_structured_labeler(df)

        fake_data = dp.Data(data=df, data_type='csv')
        dp.train_structured_labeler(fake_data)

        fake_data = dp.Data(data=df, data_type='json')
        dp.train_structured_labeler(fake_data)

        fake_data = dp.Data(data=df, data_type='parquet')
        dp.train_structured_labeler(fake_data)
    except Exception as e:
        self.fail(str(e))

    # set default label to be in the label mapping
    data = {'LABEL1': ["word1", "word2"], 'LABEL2': ["word3", "word4"]}
    df = pd.DataFrame(data=data)
    try:
        default_label = 'LABEL1'
        data_labeler = dp.train_structured_labeler(
            df, default_label=default_label)
        self.assertTrue(default_label in data_labeler.label_mapping)
        self.assertEqual(default_label,
                         data_labeler.model._parameters['default_label'])
    except Exception as e:
        self.fail(str(e))
# parameter alteration
ALLOW_SUBSAMPLING = True  # allow the profiler to subsample the dataset if large
PERCENT_TO_NAN = 0.0      # value must be between 0 and 100
sample_sizes = [100, 1000, 5000, 7500, int(1e5)]

################################################################################

if __name__ == "__main__":
    # set seeds for reproducibility
    random.seed(0)
    np.random.seed(0)
    dp.set_seed(0)

    # load data
    data = dp.Data('data/time_structured_profiler.csv')

    # [0] allows the model to be initialized and added to the labeler
    sample_sizes = [0] + sample_sizes

    profile_times = []
    for sample_size in sample_sizes:
        # setup time dict
        print(f"Evaluating sample size: {sample_size}")
        df = data.data.sample(sample_size, replace=True).reset_index(drop=True)

        if PERCENT_TO_NAN:
            samples_to_nan = int(len(df) * PERCENT_TO_NAN / 100)
            for col_name in df:
                ind_to_nan = random.sample(list(df.index), samples_to_nan)
                # df.loc avoids the chained assignment df[col_name][ind_to_nan],
                # which pandas may apply to a copy rather than to df itself
                df.loc[ind_to_nan, col_name] = 'None'
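# An alternative NaN-injection sketch (hypothetical, not part of the original
# script): rather than sampling indices per column, build one boolean mask over
# the whole frame and apply it in a single vectorized call. Assumes the same
# 'None' sentinel used above; the exact fraction nulled per column then varies
# randomly instead of being fixed at samples_to_nan.
#
#     mask = np.random.rand(*df.shape) < PERCENT_TO_NAN / 100
#     df = df.mask(mask, 'None')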