def build_detectors(
        cls,
        detector: DocumentFieldMultilineRegexDetector,
        log_function: Callable) -> List[CsvRegexpsDetectionRow]:
    """
    Compile each row of the detector's CSV data into a CsvRegexpsDetectionRow.

    The data frame returned by ``detector.get_as_pandas_df()`` must have
    exactly two columns: the first one holds the value to report, the second
    one holds the regular expression that triggers it.

    :param detector: source of the CSV data; its ``field_id`` is used in log
        messages and ``regexps_pre_process_lower`` switches the patterns to
        case-insensitive matching
    :param log_function: passed through to ``cls.log_error``
    :return: one detection row per CSV row that could be compiled. An empty
        list is returned when the data frame cannot be obtained, has no rows,
        or does not have exactly two columns. A row that fails is logged and
        skipped; it does not abort the remaining rows.
    """
    try:
        df = detector.get_as_pandas_df()
    except Exception as e:
        cls.log_error(
            log_function,
            f'CSV data is corrupted for field {detector.field_id}', e)
        return []

    if df.shape[0] == 0:
        return []
    if df.shape[1] != 2:
        cls.log_error(
            log_function,
            'CSV data is has wrong number ' +
            f'of columns ({df.shape[1]}) for field {detector.field_id}')
        return []

    detectors = []  # type: List[CsvRegexpsDetectionRow]
    for i, row in df.iterrows():
        try:
            # Each row is a Series indexed by the column labels. Use .iloc for
            # explicit positional access: row[0] on a label-indexed Series is
            # deprecated in recent pandas, and a failed label lookup here
            # would be swallowed below and reported as corrupted data.
            detected_value = row.iloc[0]
            detector_reg_raw = row.iloc[1]
            # a flags value of 0 is the same as passing no flags
            flags = re.IGNORECASE if detector.regexps_pre_process_lower else 0
            reg_pattern = re.compile(detector_reg_raw, flags)
            detectors.append(
                CsvRegexpsDetectionRow(reg_pattern, detected_value, detector))
        except Exception as e:
            # best effort: report the broken row and keep going
            cls.log_error(
                log_function,
                'CSV data is corrupted for field ' +
                f'{detector.field_id} at line #{i}', e)
    return detectors
def test_combine_dfs(self):
    """
    Check the rows left after combining the detector's CSV content
    (``self.csv_text``) with a second data frame.

    The second data frame is built so that, relative to the detector's
    content:
    - first row is the same,
    - second row has the same value but another pattern,
    - third row has the same pattern but another value,
    - last row is brand new.
    The test expects 8 rows afterwards, with the third row's value
    replacing the value previously stored for that pattern.
    """
    # NOTE(review): the regex literals in this test are not raw strings, so
    # "\b" is a backspace character rather than a word boundary. The
    # comparisons below presumably rely on self.csv_text being written the
    # same way - confirm before converting anything here to raw strings.
    another_text = """
,value,pattern
0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","\bfamily\s{1,5}guy(173437)\b"
2,"Eye-Eyes Communications (018951)","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
3,"John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))","\bcharlie\s{1,5}(085292)\b"
"""
    with StringIO(another_text) as cs_stream:
        # column 0 is the unnamed row counter; keep only value and pattern
        df = pd.read_csv(cs_stream, usecols=[1, 2])

    detector = DocumentFieldMultilineRegexDetector()
    detector.csv_content = self.csv_text
    detector.update_checksum()

    detector.combine_with_dataframe(df)
    df_new = detector.get_as_pandas_df()

    # .iloc gives explicit positional access; row[0] on a label-indexed
    # Series is deprecated in recent pandas
    row_val = [
        (row.iloc[0], row.iloc[1])
        for _, row in df_new.iterrows()
    ]  # type: List[Tuple[str, str]]

    self.assertEqual(8, len(row_val))
    self.assertIn((
        'John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))',
        '\bcharlie\s{1,5}(085292)\b',
    ), row_val)
    self.assertIn((
        'Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby ' +
        '(Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); ' +
        'Lobo, Rebecca (Charlotte))',
        '\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b',
    ), row_val)
    self.assertIn((
        'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
        '\bfamily\s{1,5}guy(173437)\b',
    ), row_val)
    self.assertIn((
        'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
        '\bfamily\s{1,5}name(173437)\b',
    ), row_val)
    self.assertIn((
        'Eye-Eyes Communications (018951)',
        '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
    ), row_val)
    # this one is replaced
    self.assertNotIn((
        'All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin ' +
        '(San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))',
        '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
    ), row_val)
def test_get_as_pd(self):
    """
    The detector's CSV content (``self.csv_text``) must be returned as a
    data frame with 6 rows and 2 columns.
    """
    regex_detector = DocumentFieldMultilineRegexDetector()
    regex_detector.csv_content = self.csv_text

    frame = regex_detector.get_as_pandas_df()

    self.assertIsNotNone(frame)
    expected_shape = (6, 2)
    self.assertEqual(expected_shape, frame.shape)