def test_default_regex_anomaly(business):
    """Compile patterns for the 'Address ' column and verify the mismatching rows.

    The test first checks the element types of the top pattern stored under
    the keys 9, 5 and 7, then collects the top pattern of every compiled
    entry and asserts which rows of the column fail to match them.
    """
    regex_compiler = DefaultRegexCompiler()
    address_tokenizer = DefaultTokenizer()
    grouper = Group()

    encoded = address_tokenizer.encode(business['Address '])
    patterns = regex_compiler.compile(encoded, grouper.collect(encoded))

    # Every expected sequence is one DIGIT followed by repeated
    # (SPACE_REP, ALPHA) pairs; only the number of pairs differs per key.
    word = [DT.SPACE_REP, DT.ALPHA]
    expected_by_key = {
        9: [DT.DIGIT] + word * 4,
        5: [DT.DIGIT] + word * 2,
        7: [DT.DIGIT] + word * 3,
    }
    for key, expected_types in expected_by_key.items():
        top_elements = patterns[key].top(pattern=True).container
        # NOTE(review): zip stops at the shorter sequence, so a length
        # difference between the pattern and the expectation goes unnoticed.
        for element, truth in zip(top_elements, expected_types):
            assert element.element_type == truth

    top_patterns = [entry.top(pattern=True) for entry in patterns.values()]

    mismatches = regex_compiler.mismatches(encoded, patterns=top_patterns)
    mismatched_rows = business.loc[mismatches, 'Address ']

    # Apart from row 14, the other mismatches are e.g. rows that had "14th"
    # (alphanum) instead of an alpha at position 2.
    assert len(mismatched_rows) == 7
    # Row 14 is 'ATTN HEATHER J HANSEN', which should not match the patterns.
    assert 14 in mismatched_rows.index
def test_patterns_object(business):
    """Inspect the compiled entry under key 7 for the 'Address ' column.

    Asserts that the entry holds exactly one pattern, that this pattern
    records the expected row indices in ``idx``, and that matching the
    column against the entry's top pattern reports the expected anomalies.
    """
    regex_compiler = DefaultRegexCompiler()
    address_tokenizer = DefaultTokenizer()
    grouper = Group()

    encoded = address_tokenizer.encode(business['Address '])
    compiled = regex_compiler.compile(encoded, grouper.collect(encoded))

    entry = compiled[7]
    assert len(entry) == 1

    supporting_rows = {1, 4, 6, 7, 10, 11, 13, 15}
    for _, pattern in entry.items():
        assert pattern.idx == supporting_rows

    anomalies = regex_compiler.mismatches(encoded, entry.top(pattern=True))
    anomalous_index = business.loc[anomalies, 'Address '].index

    expected_anomalies = [0, 2, 3, 5, 8, 9, 12, 14, 16, 17, 18, 19]
    assert list(anomalous_index) == expected_anomalies