def test_checksum(self):
        """The checksum is populated and changes when the CSV content changes."""
        det = DocumentFieldMultilineRegexDetector()
        det.csv_content = self.csv_text
        det.update_checksum()
        self.assertGreater(len(det.csv_checksum), 10)
        checksum_before = det.csv_checksum

        # Swap the final character so the content differs from the fixture.
        altered_content = det.csv_content[:-1] + ';'
        det.csv_content = altered_content
        det.update_checksum()
        self.assertGreater(len(det.csv_checksum), 10)
        self.assertNotEqual(checksum_before, det.csv_checksum)
    def test_combine_dfs(self):
        # first row is the same, second row - same value, another pattern
        # third row - same pattern, another value, last row - brand new
        another_text = """
,value,pattern
0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","\bfamily\s{1,5}guy(173437)\b"
2,"Eye-Eyes Communications (018951)","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
3,"John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))","\bcharlie\s{1,5}(085292)\b"
            """
        with StringIO(another_text) as cs_stream:
            df = pd.read_csv(cs_stream, usecols=[1, 2])

        detector = DocumentFieldMultilineRegexDetector()
        detector.csv_content = self.csv_text
        detector.update_checksum()
        detector.combine_with_dataframe(df)

        df_new = detector.get_as_pandas_df()
        row_val = []  # type: List[Tuple[str, str]]
        for i, row in df_new.iterrows():
            row_val.append((
                row[0],
                row[1],
            ))

        self.assertEqual(8, len(row_val))
        self.assertTrue((
            'John Smith Archives, LLC d/b/a Charlie (085292) (Flay, Bobby (New York))',
            '\bcharlie\s{1,5}(085292)\b',
        ) in row_val)
        self.assertTrue((
            'Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby '
            +
            '(Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); '
            + 'Lobo, Rebecca (Charlotte))',
            '\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b',
        ) in row_val)
        self.assertTrue((
            'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
            '\bfamily\s{1,5}guy(173437)\b',
        ) in row_val)
        self.assertTrue((
            'Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))',
            '\bfamily\s{1,5}name(173437)\b',
        ) in row_val)
        self.assertTrue((
            'Eye-Eyes Communications (018951)',
            '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
        ) in row_val)
        # this one is replaced
        self.assertFalse((
            'All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin '
            +
            '(San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))',
            '\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b',
        ) in row_val)
 def test_get_as_pd(self):
     """The fixture CSV is exposed as a DataFrame of 6 rows and 2 columns."""
     det = DocumentFieldMultilineRegexDetector()
     det.csv_content = self.csv_text
     frame = det.get_as_pandas_df()
     self.assertIsNotNone(frame)
     expected_shape = (6, 2)
     self.assertEqual(expected_shape, frame.shape)
Exemple #4
0
def setup_mock():
    """Register a CSV-backed regex detector for ``doc_field`` in the mock cache.

    Sets ``uid`` and ``code`` on the ``doc_field`` object defined outside this
    function, builds a detector from an inline CSV table, refreshes its
    checksum and stores it in
    ``CsvRegexpsDetectionCacheMock.detector_by_field`` under the field's uid.
    """
    doc_field.uid = 'ABCDEF'
    doc_field.code = 'client'

    # Inline value/pattern table. The literal is not a raw string, and the
    # pattern cell of the last row spans two lines inside its quotes.
    patterns_csv = """
    ,value,pattern
    0,"Big Bank & Company (004578) (Knight, Bobby (Charlotte); Bryant, Koby (Charlotte); Williams, Gary (Charlotte); Johnson, Magic (Charlotte); Lobo, Rebecca (Charlotte))","\bbig\s{1,5}bank\s{1,5}.{1,5}\s{1,5}company\s{1,5}(004578)\b"
    1,"Family Name Limited (173437) (Tanner, Rebecca (Houston); Saget, Bob (Houston))","family\s{1,5}name\s{1,5}\(173437\)"
    2,"Financial Services & Co. (015607) (Spelling, Tori (Chicago); Priestley, Jason (Dallas); Perry, Luke (New York); Doherty, Shannon (Chicago); Garth, Jenny (Chicago))","\bfinancial\s{1,5}services\s{1,5}.{1,5}(015607)\b"
    3,"Food Wholsale, Inc. (056230) (Jenner, Bruce (Chicago))","\bfood\s{1,5}wholsale,(056230)\b"
    4,"All Eyes Communications (018951) (Moore, Michael (New York); Tarantino, Quentin (San Francisco); Lee, Spike (New York); Levinson, Barry (Charlotte))","\ball\s{1,5}eyes\s{1,5}communications\s{1,5}(018951)\b"
    5,"Joe Smith Archives, LLC d/b/a Foxtrot (085292) (Flay, Bobby (New York))","\bfoxtrot\s{1,5}(085292)\b
    \bjoe\s{1,5}smith\s{1,5}archives\b" """

    regex_detector = DocumentFieldMultilineRegexDetector()
    regex_detector.csv_content = patterns_csv
    regex_detector.document_field = doc_field
    regex_detector.update_checksum()

    # Key the detector by field uid in the mocked cache.
    registry = CsvRegexpsDetectionCacheMock.detector_by_field
    registry[doc_field.uid] = regex_detector
Exemple #5
0
    def save_detector_settings_csv(
            self, detectors_by_value: Dict[str, List[str]]) -> None:
        """
        Store the given detectors as the CSV content of this field's
        DocumentFieldMultilineRegexDetector.

        Each (value, pattern) pair becomes one CSV row; when a pattern occurs
        more than once only its first row is kept.

        - No detector exists for the field: a new one is created and saved.
        - A detector exists and ``self.drop_previous_field_detectors`` is
          truthy: its CSV content is overwritten and its checksum refreshed.
        - Otherwise the new rows are combined with the existing detector via
          ``combine_with_dataframe`` and the detector is saved.

        :param detectors_by_value: maps a detected value to the list of
            patterns that should yield it. May be empty, in which case a
            header-only CSV is produced.
        """
        # Build the frame in a single step: DataFrame.append() was removed in
        # pandas 2.0 and copied the whole frame on every call.
        rows = [
            (field_val, include_reg_value)
            for field_val, patterns in detectors_by_value.items()
            for include_reg_value in patterns
        ]
        # Both columns are always present, so an empty mapping yields an
        # empty frame instead of failing in drop_duplicates below.
        df = pd.DataFrame(rows, columns=['value', 'pattern'])
        df.drop_duplicates(subset='pattern', inplace=True)

        try:
            existing = DocumentFieldMultilineRegexDetector.objects.get(
                document_field_id=self.document_field.uid
            )  # type: DocumentFieldMultilineRegexDetector
        except DocumentFieldMultilineRegexDetector.DoesNotExist:
            # nothing stored for this field yet - create a fresh detector
            detector = DocumentFieldMultilineRegexDetector()
            detector.document_field = self.document_field
            detector.csv_content = df.to_csv()
            detector.update_checksum()
            detector.save()
            return

        # just update CSV content and hashsum
        if self.drop_previous_field_detectors:
            existing.csv_content = df.to_csv()
            existing.update_checksum()
            existing.save()
            return

        # join these options with existing one
        # overwriting duplicates by detected_value or regexp pattern
        # NOTE(review): update_checksum() is not called on this path;
        # presumably combine_with_dataframe refreshes it - confirm.
        existing.combine_with_dataframe(df)
        existing.save()