Example #1
0
    def test_ocr_skip(self):
        """Run the pipeline with "image/*" skipped and expect no matches.

        A scan specification for the "ocr/good" test data is queued with
        "image/*" listed under "skip_mime_types". Afterwards every
        unhandled message must come from the "os2ds_results" queue and
        carry a falsy "matched" value; a message from any other queue
        fails the test.
        """
        # Serialise the source first and the rule second, so the calls
        # happen in the same order as the keys of the specification.
        source_json = FilesystemSource(
            os.path.join(test_data_path, "ocr", "good")).to_json_object()
        rule_json = CPRRule(
            modulus_11=False, ignore_irrelevant=False).to_json_object()

        scan_spec = {
            "scan_tag": {
                "scanner": {"name": "integration_test", "pk": 0},
                "time": "2020-01-01T00:00:00+00:00",
            },
            "source": source_json,
            "rule": rule_json,
            "configuration": {"skip_mime_types": ["image/*"]},
        }

        self.messages.append((scan_spec, "os2ds_scan_specs"))
        self.run_pipeline()

        for body, queue_name in self.unhandled:
            if queue_name != "os2ds_results":
                self.fail(
                    "unexpected message in queue {0}".format(queue_name))
            else:
                self.assertFalse(
                    body["matched"], "OCR match found with OCR disabled")
Example #2
0
    def test_corrupted_container(self):
        """Scan the "pdf/corrupted" test data and expect one problem message.

        After the pipeline has run there must be exactly one unhandled
        message, and the "origin" of its body must be "os2ds_problems".
        """
        source_json = FilesystemSource(
            os.path.join(test_data_path, "pdf", "corrupted")).to_json_object()
        rule_json = CPRRule(
            modulus_11=False, ignore_irrelevant=False).to_json_object()

        scan_spec = {
            "scan_tag": "integration_test",
            "source": source_json,
            "rule": rule_json,
            "configuration": {},
        }

        self.messages.append((scan_spec, "os2ds_scan_specs"))
        self.run_pipeline()

        # Dump the leftover messages so a failing run shows what arrived.
        print(self.unhandled)

        self.assertEqual(len(self.unhandled), 1)
        first_body = self.unhandled[0][0]
        self.assertEqual(first_body["origin"], "os2ds_problems")
Example #3
0
# When `reload_content` is truthy, build a filesystem handle for `fpath` and
# read its content through get_content_from_handle.
# NOTE(review): `reload_content` and `fpath` are defined outside this excerpt.
if reload_content:
    h = FilesystemHandle.make_handle(fpath)
    content = get_content_from_handle(h)

# NOTE(review): disabled experiment -- it would patch
# `extract_surrounding_words` on a single CPRRule instance; the matching
# `newrule` entry at the end of `rules` is commented out as well.
# newrule = CPRRule(modulus_11=True, ignore_irrelevant=False,
#                   examine_context=True)
# newrule.extract_surrounding_words = MethodType(extract_surrounding_words_fixed, newrule)

# Each entry is a (rule instance, description) pair; the loop that follows
# unpacks the pair and prints the description.
# Every rule is built with modulus_11=True and ignore_irrelevant=False; the
# entries differ in the rule class used and in `examine_context`.
rules = [
    (CPRSimple(modulus_11=True, ignore_irrelevant=False,
               examine_context=True), "simple w. context"),
    (CPRComplicated(modulus_11=True,
                    ignore_irrelevant=False,
                    examine_context=True), "'accepted' w. context"),
    (CPRRule(modulus_11=True, ignore_irrelevant=False,
             examine_context=True), "current w. context"),
    # NOTE(review): commented-out duplicate of the CPRRule entry above.
    # (CPRRule(modulus_11=True, ignore_irrelevant=False, examine_context=True),
    #  "current w. context"),
    # CPROld is listed twice: once without and once with examine_context.
    (CPROld(modulus_11=True, ignore_irrelevant=False,
            examine_context=False), "old wo. context"),
    (CPROld(modulus_11=True, ignore_irrelevant=False,
            examine_context=True), "old w. context"),
    # (newrule, "new w. context"),
]

for rule, description in rules:

    print(description)

    @timing
    def f(rule):
Example #4
0
 def setUp(self):
     """Store a CPRRule built with modulus_11=False and
     ignore_irrelevant=False on ``self.rule``."""
     cpr_rule = CPRRule(modulus_11=False, ignore_irrelevant=False)
     self.rule = cpr_rule
Example #5
0
    """@timing decorator
    """
    @wraps(func)
    def wrap(*args, **kw):
        ts = time() * 1000
        result = func(*args, **kw)
        te = time() * 1000
        print('func:{!r}, took: {:.4f} ms'.format(func.__name__, te - ts))
        return result

    return cast(F, wrap)


rules = [
    (CPRRule(modulus_11=True,
             ignore_irrelevant=False,
             examine_context=False,
             blacklist=None), matches, "match all"),
    (CPRRule(modulus_11=True,
             ignore_irrelevant=False,
             examine_context=True,
             blacklist=None), [matches[i] for i in [0, 1, 2, 3, 5, 6, 19]],
     "match using context rules"),
    (CPRRule(modulus_11=True,
             ignore_irrelevant=False,
             examine_context=True,
             blacklist=None,
             whitelist=None), [matches[i] for i in [0, 2, 3, 5, 6, 19]],
     "match setting `whitelist=None`"),
    (CPRRule(modulus_11=True,
             ignore_irrelevant=False,
             examine_context=True,
    def test_simplerule_matches(self):
        candidates = [
            (CPRRule(modulus_11=False, ignore_irrelevant=False), """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""",
             ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX", "2505XXXXXX"]),
            (CPRRule(modulus_11=True, ignore_irrelevant=True), """
2205995008: forbryder,
230500 0003: forbryder,
240501-0006: forbryder,
250501-1987: forbryder""", ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True,
                     ignore_irrelevant=True,
                     examine_context=False), """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""", ["2205XXXXXX", "2305XXXXXX", "2405XXXXXX"]),
            (CPRRule(modulus_11=True,
                     ignore_irrelevant=True,
                     examine_context=True), """
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nr. 2205995008
Vejstrand Kommune, Børn- og Ungeforvaltningen. P-nummer: 2305000003
240501-0006""", ["2405XXXXXX"]),
            (RegexRule("((four|six)( [aopt]+)?|(one|seven) [aopt]+)"), """
one
one potato
two potato
three potato
four
five potato
six potato
seven potato
more!""", ["one potato", "four", "six potato", "seven potato"]),
            (LastModifiedRule(
                datetime(2019, 12, 24, 23, 59, 59, tzinfo=timezone.utc)),
             datetime(2019, 12, 31, 23, 59, 59,
                      tzinfo=timezone.utc), ["2019-12-31T23:59:59+0000"]),
            (LastModifiedRule(
                datetime(2019, 12, 24, 23, 59, 59, tzinfo=timezone.utc)),
             datetime(2019, 5, 22, 0, 0, 1, tzinfo=timezone.utc), None),
            (DimensionsRule(width_range=range(0, 16385),
                            height_range=range(0, 16385),
                            min_dim=256), (128, 256), [[128, 256]]),
            (DimensionsRule(width_range=range(0, 16385),
                            height_range=range(0, 16385),
                            min_dim=256), (128, 255), []),
            (DimensionsRule(width_range=range(256, 1024),
                            height_range=range(256, 1024),
                            min_dim=0), (256, 256), [[256, 256]]),
            (DimensionsRule(width_range=range(256, 1024),
                            height_range=range(256, 1024),
                            min_dim=0), (32, 32), []),
        ]

        for rule, in_value, expected in candidates:
            with self.subTest(rule):
                json = rule.to_json_object()
                back_again = rule.from_json_object(json)
                self.assertEqual(rule, back_again)

            with self.subTest(rule):
                matches = rule.match(in_value)
                if expected:
                    self.assertEqual([match["match"] for match in matches],
                                     expected)
                else:
                    self.assertFalse(list(matches))