def test_compiler_unstructured_reports(self, *mocks):
    """Verify how ``report`` treats the ``vocab``/``words`` statistics keys.

    The profile is rebuilt with ``_create_profile`` after each option
    change, and the ``"statistics"`` section of the report is inspected
    for both values of ``remove_disabled_flag``.
    """
    samples = pd.Series(["Hello Hello", "This is a test grant"])
    compiler = col_pro_compilers.UnstructuredCompiler(samples)
    options = UnstructuredOptions()

    # Scenario 1: vocab disabled, words left at its default.
    options.text.vocab.is_enabled = False
    compiler._create_profile(samples, options)

    stats = compiler.report(remove_disabled_flag=True)["statistics"]
    self.assertNotIn("vocab", stats)
    self.assertIn("words", stats)

    # With the flag off, the vocab key is still expected in the report.
    stats = compiler.report(remove_disabled_flag=False)["statistics"]
    self.assertIn("vocab", stats)
    self.assertIn("words", stats)

    # Scenario 2: vocab re-enabled, so both keys are expected.
    options.text.vocab.is_enabled = True
    compiler._create_profile(samples, options)

    stats = compiler.report(remove_disabled_flag=True)["statistics"]
    self.assertIn("vocab", stats)
    self.assertIn("words", stats)

    # Scenario 3: words disabled while vocab stays enabled.
    options.text.words.is_enabled = False
    compiler._create_profile(samples, options)

    stats = compiler.report(remove_disabled_flag=True)["statistics"]
    self.assertIn("vocab", stats)
    self.assertNotIn("words", stats)
def test_compiler_stats_diff(self, *mocks):
    """Check ``UnstructuredCompiler.diff`` as profilers are switched off.

    Four comparisons are made: both compilers with default options, the
    second compiler without the data labeler, the second compiler with
    the text profiler disabled as well, and finally both compilers built
    from the fully disabled options.
    """
    # NOTE(review): SOURCE contains another definition with this same
    # method name; if both live in one class the later one shadows the
    # earlier -- confirm which is intended to run.

    def prime_labeler(labeler, word_sample_size, counts):
        # Overwrite the labeler's sample sizes and entity counts, then
        # push one more sample through ``update``.
        labeler.char_sample_size = 20
        labeler.word_sample_size = word_sample_size
        labeler.entity_counts = counts
        labeler.update(pd.Series(["a"]))

    def expected_statistics():
        # Fresh copy of the text-statistics diff expected whenever both
        # compilers still carry a text profile.
        return {
            "vocab": [
                ["H", "l"],
                ["e", "o", " ", "T", "h", "i", "s", "a", "t", "g", "r", "n"],
                ["u", "k", "w", "m", "y", "9"],
            ],
            "vocab_count": [
                {"l": 4, "H": 2},
                {
                    " ": 1,
                    "e": 2,
                    "s": 1,
                    "t": 2,
                    "o": 1,
                    "i": "unchanged",
                    "a": "unchanged",
                    "T": "unchanged",
                    "h": "unchanged",
                    "g": "unchanged",
                    "r": "unchanged",
                    "n": -4,
                },
                {"m": 2, "9": 2, "u": 1, "k": 1, "w": 1, "y": 1},
            ],
            "words": [
                ["Hello", "test"],
                ["grant"],
                ["unknown", "name", "9"],
            ],
            "word_count": [
                {"Hello": 2, "test": 1},
                {"grant": "unchanged"},
                {"9": 2, "unknown": 1, "name": 1},
            ],
        }

    first_data = pd.Series(["Hello Hello", "This is a test grant"])
    second_data = pd.Series(["This is unknown", "my name grant", "9", "9"])

    # Test normal diff
    compiler1 = col_pro_compilers.UnstructuredCompiler(first_data)
    compiler2 = col_pro_compilers.UnstructuredCompiler(second_data)
    first_labeler = compiler1._profiles["data_labeler"]
    second_labeler = compiler2._profiles["data_labeler"]

    prime_labeler(
        first_labeler,
        15,
        {
            "word_level": {"UNKNOWN": 5, "TEST": 5, "UNIQUE1": 5},
            "true_char_level": {"UNKNOWN": 4, "TEST": 8, "UNIQUE1": 8},
            "postprocess_char_level": {
                "UNKNOWN": 5,
                "TEST": 10,
                "UNIQUE1": 5,
            },
        },
    )
    prime_labeler(
        second_labeler,
        10,
        {
            "word_level": {"UNKNOWN": 2, "TEST": 4, "UNIQUE2": 4},
            "true_char_level": {"UNKNOWN": 8, "TEST": 8, "UNIQUE2": 4},
            "postprocess_char_level": {
                "UNKNOWN": 5,
                "TEST": 10,
                "UNIQUE2": 5,
            },
        },
    )

    expected_label_diff = {
        "entity_counts": {
            "word_level": {
                "UNKNOWN": 3,
                "TEST": 1,
                "UNIQUE1": [5, None],
                "UNIQUE2": [None, 4],
            },
            "true_char_level": {
                "UNKNOWN": -4,
                "TEST": "unchanged",
                "UNIQUE1": [8, None],
                "UNIQUE2": [None, 4],
            },
            "postprocess_char_level": {
                "UNKNOWN": "unchanged",
                "TEST": "unchanged",
                "UNIQUE1": [5, None],
                "UNIQUE2": [None, 5],
            },
        },
        "entity_percentages": {
            "word_level": {
                "UNKNOWN": 0.1333333333333333,
                "TEST": -0.06666666666666671,
                "UNIQUE1": [0.3333333333333333, None],
                "UNIQUE2": [None, 0.4],
            },
            "true_char_level": {
                "UNKNOWN": -0.2,
                "TEST": "unchanged",
                "UNIQUE1": [0.4, None],
                "UNIQUE2": [None, 0.2],
            },
            "postprocess_char_level": {
                "UNKNOWN": "unchanged",
                "TEST": "unchanged",
                "UNIQUE1": [0.25, None],
                "UNIQUE2": [None, 0.25],
            },
        },
    }
    self.assertDictEqual(
        {
            "statistics": expected_statistics(),
            "data_label": expected_label_diff,
        },
        compiler1.diff(compiler2),
    )

    # Test while disabling a column
    options = UnstructuredOptions()
    options.data_labeler.is_enabled = False
    compiler2 = col_pro_compilers.UnstructuredCompiler(second_data, options)
    self.assertDictEqual(
        {"statistics": expected_statistics()},
        compiler1.diff(compiler2),
    )

    # Test while disabling 2 columns
    options.text.is_enabled = False
    compiler2 = col_pro_compilers.UnstructuredCompiler(second_data, options)
    empty_diff = {}
    self.assertDictEqual(empty_diff, compiler1.diff(compiler2))

    # Test while disabling all columns
    compiler1 = col_pro_compilers.UnstructuredCompiler(first_data, options)
    self.assertDictEqual(empty_diff, compiler1.diff(compiler2))
def test_compiler_stats_diff(self, *mocks):
    """Exercise ``diff`` between two unstructured compilers.

    The default-options diff is compared first, then the second compiler
    is rebuilt without the data labeler, then without the text profiler
    too, and lastly the first compiler is rebuilt from the same disabled
    options.
    """
    # NOTE(review): this method name is defined more than once in
    # SOURCE; within a single class only the last definition would be
    # collected -- verify.
    series_a = pd.Series(['Hello Hello', 'This is a test grant'])
    series_b = pd.Series(['This is unknown', 'my name grant', '9', '9'])

    def build_statistics_diff():
        # Returns a new dict each call so the two assertions that use it
        # never share state.
        shared_chars = {
            ' ': 1,
            'e': 2,
            's': 1,
            't': 2,
            'o': 1,
            'i': 'unchanged',
            'a': 'unchanged',
            'T': 'unchanged',
            'h': 'unchanged',
            'g': 'unchanged',
            'r': 'unchanged',
            'n': -4
        }
        return {
            'vocab': [
                ['H', 'l'],
                ['e', 'o', ' ', 'T', 'h', 'i', 's', 'a', 't', 'g', 'r',
                 'n'],
                ['u', 'k', 'w', 'm', 'y', '9']
            ],
            'vocab_count': [
                {'l': 4, 'H': 2},
                shared_chars,
                {'m': 2, '9': 2, 'u': 1, 'k': 1, 'w': 1, 'y': 1}
            ],
            'words': [
                ['Hello', 'test'],
                ['grant'],
                ['unknown', 'name', '9']
            ],
            'word_count': [
                {'Hello': 2, 'test': 1},
                {'grant': 'unchanged'},
                {'9': 2, 'unknown': 1, 'name': 1}
            ]
        }

    # Test normal diff
    compiler1 = col_pro_compilers.UnstructuredCompiler(series_a)
    compiler2 = col_pro_compilers.UnstructuredCompiler(series_b)
    labeler_a = compiler1._profiles["data_labeler"]
    labeler_b = compiler2._profiles["data_labeler"]

    counts_a = {
        'word_level': {'UNKNOWN': 5, 'TEST': 5, 'UNIQUE1': 5},
        'true_char_level': {'UNKNOWN': 4, 'TEST': 8, 'UNIQUE1': 8},
        'postprocess_char_level': {'UNKNOWN': 5, 'TEST': 10, 'UNIQUE1': 5}
    }
    counts_b = {
        'word_level': {'UNKNOWN': 2, 'TEST': 4, 'UNIQUE2': 4},
        'true_char_level': {'UNKNOWN': 8, 'TEST': 8, 'UNIQUE2': 4},
        'postprocess_char_level': {'UNKNOWN': 5, 'TEST': 10, 'UNIQUE2': 5}
    }
    # Each labeler gets fixed sample sizes and entity counts before one
    # additional ``update`` call.
    for labeler, word_size, counts in ((labeler_a, 15, counts_a),
                                       (labeler_b, 10, counts_b)):
        labeler.char_sample_size = 20
        labeler.word_sample_size = word_size
        labeler.entity_counts = counts
        labeler.update(pd.Series(["a"]))

    expected_diff = {
        'statistics': build_statistics_diff(),
        'data_label': {
            'entity_counts': {
                'word_level': {
                    'UNKNOWN': 3,
                    'TEST': 1,
                    'UNIQUE1': [5, None],
                    'UNIQUE2': [None, 4]
                },
                'true_char_level': {
                    'UNKNOWN': -4,
                    'TEST': 'unchanged',
                    'UNIQUE1': [8, None],
                    'UNIQUE2': [None, 4]
                },
                'postprocess_char_level': {
                    'UNKNOWN': 'unchanged',
                    'TEST': 'unchanged',
                    'UNIQUE1': [5, None],
                    'UNIQUE2': [None, 5]
                }
            },
            'entity_percentages': {
                'word_level': {
                    'UNKNOWN': 0.1333333333333333,
                    'TEST': -0.06666666666666671,
                    'UNIQUE1': [0.3333333333333333, None],
                    'UNIQUE2': [None, 0.4]
                },
                'true_char_level': {
                    'UNKNOWN': -0.2,
                    'TEST': 'unchanged',
                    'UNIQUE1': [0.4, None],
                    'UNIQUE2': [None, 0.2]
                },
                'postprocess_char_level': {
                    'UNKNOWN': 'unchanged',
                    'TEST': 'unchanged',
                    'UNIQUE1': [0.25, None],
                    'UNIQUE2': [None, 0.25]
                }
            }
        }
    }
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

    # Test while disabling a column
    options = UnstructuredOptions()
    options.data_labeler.is_enabled = False
    compiler2 = col_pro_compilers.UnstructuredCompiler(series_b, options)
    expected_diff = {'statistics': build_statistics_diff()}
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

    # Test while disabling 2 columns
    options.text.is_enabled = False
    compiler2 = col_pro_compilers.UnstructuredCompiler(series_b, options)
    expected_diff = {}
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

    # Test while disabling all columns
    compiler1 = col_pro_compilers.UnstructuredCompiler(series_a, options)
    self.assertDictEqual(expected_diff, compiler1.diff(compiler2))
def test_base(self, *mocks):
    """Compare ``UnstructuredCompiler.profile`` against a fixed expectation.

    ``time.time`` is patched while the compiler is constructed so that
    successive calls return 1.0, 2.0, 3.0, ...; every expected ``times``
    entry is 1.0.
    """
    # NOTE(review): SOURCE holds a second ``test_base`` definition; if
    # both belong to one class the later one wins -- confirm.
    from collections import defaultdict

    import pandas as pd

    def empty_levels():
        # One empty ``defaultdict(int)`` per labeling level.
        return {
            "postprocess_char_level": defaultdict(int),
            "true_char_level": defaultdict(int),
            "word_level": defaultdict(int),
        }

    samples = pd.Series(["test", "hi my name is John Doe. 123-432-1234"])

    # Descending list, so ``pop()`` hands out the smallest value first.
    clock_ticks = [float(tick) for tick in range(100, 0, -1)]
    with mock.patch("time.time", side_effect=lambda: clock_ticks.pop()):
        compiler = col_pro_compilers.UnstructuredCompiler(samples)

    expected_label = {
        "entity_counts": empty_levels(),
        "entity_percentages": empty_levels(),
        "times": {"data_labeler_predict": 1.0},
    }
    expected_stats = {
        "times": {"vocab": 1.0, "words": 1.0},
        "vocab_count": {
            " ": 6,
            "-": 2,
            ".": 1,
            "1": 2,
            "2": 3,
            "3": 3,
            "4": 2,
            "D": 1,
            "J": 1,
            "a": 1,
            "e": 3,
            "h": 2,
            "i": 2,
            "m": 2,
            "n": 2,
            "o": 2,
            "s": 2,
            "t": 2,
            "y": 1,
        },
        "vocab": [
            " ", "-", ".", "1", "2", "3", "4", "D", "J", "a",
            "e", "h", "i", "m", "n", "o", "s", "t", "y",
        ],
        "word_count": {
            "123-432-1234": 1,
            "Doe": 1,
            "John": 1,
            "hi": 1,
            "name": 1,
            "test": 1,
        },
        "words": ["test", "hi", "name", "John", "Doe", "123-432-1234"],
    }
    expected_dict = {
        "data_label": expected_label,
        "statistics": expected_stats,
    }

    output_profile = compiler.profile

    # because vocab uses a set, it will be random order every time, hence
    # we need to sort to check exact match between profiles
    if "statistics" in output_profile:
        stats_section = output_profile["statistics"]
        if "vocab" in stats_section:
            stats_section["vocab"] = sorted(stats_section["vocab"])

    self.assertDictEqual(expected_dict, output_profile)
def test_base(self, *mocks):
    """Build an unstructured compiler and check its full profile dict.

    While the compiler is constructed, ``time.time`` is replaced by a
    stub that yields 1.0, 2.0, 3.0, ... on successive calls; the
    expected profile lists 1.0 for every ``times`` entry.
    """
    # NOTE(review): ``test_base`` is defined more than once in SOURCE;
    # in a single class only the last definition would run -- verify.
    import pandas as pd
    from collections import defaultdict

    text_samples = pd.Series(['test',
                              'hi my name is John Doe. 123-432-1234'])

    # Values are stored high-to-low because the stub pops from the end.
    fake_times = [float(value) for value in range(100, 0, -1)]
    time_stub = mock.patch('time.time',
                           side_effect=lambda: fake_times.pop())
    with time_stub:
        compiler = col_pro_compilers.UnstructuredCompiler(text_samples)

    level_names = ('postprocess_char_level', 'true_char_level',
                   'word_level')
    expected_dict = {
        'data_label': {
            'entity_counts': {
                level: defaultdict(int) for level in level_names
            },
            'entity_percentages': {
                level: defaultdict(int) for level in level_names
            },
            'times': {'data_labeler_predict': 1.0}
        },
        'statistics': {
            'times': {'vocab': 1.0, 'words': 1.0},
            'vocab_count': {
                ' ': 6, '-': 2, '.': 1,
                '1': 2, '2': 3, '3': 3, '4': 2,
                'D': 1, 'J': 1,
                'a': 1, 'e': 3, 'h': 2, 'i': 2, 'm': 2,
                'n': 2, 'o': 2, 's': 2, 't': 2, 'y': 1
            },
            'vocab': [
                ' ', '-', '.',
                '1', '2', '3', '4',
                'D', 'J',
                'a', 'e', 'h', 'i', 'm', 'n', 'o', 's', 't', 'y'
            ],
            'word_count': {
                '123-432-1234': 1,
                'Doe': 1,
                'John': 1,
                'hi': 1,
                'name': 1,
                'test': 1
            },
            'words': ['test', 'hi', 'name', 'John', 'Doe', '123-432-1234']
        }
    }

    output_profile = compiler.profile

    # because vocab uses a set, it will be random order every time, hence
    # we need to sort to check exact match between profiles
    has_vocab = ('statistics' in output_profile
                 and 'vocab' in output_profile['statistics'])
    if has_vocab:
        unsorted_vocab = output_profile['statistics']['vocab']
        output_profile['statistics']['vocab'] = sorted(unsorted_vocab)

    self.assertDictEqual(expected_dict, output_profile)