def test_default_regex_anomaly(business):
    compiler = DefaultRegexCompiler()
    tokenizer = DefaultTokenizer()
    collector = Group()

    tokenized = tokenizer.encode(business['Address '])
    groups = collector.collect(tokenized)

    patterns = compiler.compile(tokenized, groups)

    types = [[DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA],
             [DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA],
             [DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA]]
    for i, t in zip([9, 5, 7], types):
        for element, truth in zip(patterns[i].top(pattern=True).container, t):
            assert element.element_type == truth

    match_patterns = list()
    for pat in patterns.values():
        match_patterns.append(pat.top(pattern=True))

    mismatches = compiler.mismatches(tokenized, patterns=match_patterns)
    mismatched_rows = business.loc[mismatches, 'Address ']

    assert len(mismatched_rows) == 7  # apart from row 14, the mismatches are values such as those with '14th' (ALPHANUM) instead of ALPHA at position 2  # noqa: E501
    assert 14 in mismatched_rows.index  # row 14 = 'ATTN HEATHER J HANSEN', which shouldn't match the pattern
def test_patternfinder_find(business):
    """test the patternfinder find method"""
    pf = OpencleanPatternFinder(tokenizer='default',
                                aligner='pad',
                                compiler=DefaultRegexCompiler())

    patterns = pf.find(series=business['Address '])
    assert len(patterns) == 4

    types = [
        DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP,
        DT.ALPHA, DT.SPACE_REP, DT.ALPHA
    ]
    assert len(patterns[9]) == 1
    for _, pat in patterns[9].items():
        for element, expected in zip(pat.container, types):
            assert element.element_type == expected

    # test column wise pattern creator
    pf = OpencleanPatternFinder(tokenizer='default',
                                aligner='pad',
                                compiler=DefaultRegexCompiler(method='col'))

    patterns = pf.find(series=business['Address '])
    assert len(patterns) == 4

    types = [
        DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP,
        DT.ALPHA, DT.SPACE_REP, DT.ALPHA
    ]
    assert len(patterns[9]) == 1
    for _, pat in patterns[9].items():
        for element, expected in zip(pat.container, types):
            assert element.element_type == expected
def test_default_regex_compiler_all(business):
    compiler = DefaultRegexCompiler(per_group='all')
    tokenizer = DefaultTokenizer()
    collector = Group()

    tokenized = tokenizer.encode(business['Address '])
    groups = collector.collect(tokenized)

    patterns = compiler.compile(tokenized, groups)

    assert len(patterns) == 4
    assert len(patterns[7]) == 4
    assert len(patterns[11]) == 2
    assert len(patterns[9]) == 1
    assert len(patterns[5]) == 2

    truth = [
        ' '.join([DT.DIGIT, DT.SPACE_REP, DT.ALPHANUM, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA]),
        ' '.join([DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA]),
        ' '.join([DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHANUM, DT.SPACE_REP, DT.ALPHA]),
        ' '.join([DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA])
    ]

    for t, key in zip(truth, patterns[7]):
        assert key == t
def test_evaluator_evaluate(business):
    """Creates a pattern and evaluates it on the same column to see if mismatches are the same as mismatches
    """
    pf = OpencleanPatternFinder(tokenizer='default',
                                collector='group',
                                compiler=DefaultRegexCompiler())

    patterns = pf.find(series=business['Address '])
    eval_pattern = patterns[7].top(pattern=True)

    predicate = pf.compare(eval_pattern,
                           business['Address '].tolist(),
                           negate=True)
    mismatches = business.loc[predicate, 'Address ']
    mismatched_pattern = pf.find(mismatches)

    for mp in mismatched_pattern.values():
        assert mp != eval_pattern

    predicate = pf.compare(eval_pattern,
                           business['Address '].tolist(),
                           negate=False)
    matches = business.loc[predicate, 'Address ']
    matched_pattern = pf.find(matches)

    assert len(matched_pattern) == 1
    assert matched_pattern[7].top(pattern=True) == eval_pattern
def test_padder_regex_col_compile(business):
    dt = DefaultTokenizer()
    rows = dt.encode(business['Address '])

    cr = Cluster(dist='TED', min_samples=3)
    groups = cr.collect(rows)

    ar = Padder()
    padded_tokens = ar.align(rows, groups)

    cp = DefaultRegexCompiler(method='col')
    patterns = cp.compile(padded_tokens, groups)

    for k, pat in patterns.items():
        if k != -1:  # ignore the DBSCAN noise group
            top = pat.top(pattern=True)
            for value in business.loc[top.idx, 'Address ']:
                assert top.compare(value, dt)
def test_patterns_object(business):
    compiler = DefaultRegexCompiler()
    tokenizer = DefaultTokenizer()
    collector = Group()

    tokenized = tokenizer.encode(business['Address '])
    alignments = collector.collect(tokenized)

    patterns = compiler.compile(tokenized, alignments)

    assert len(patterns[7]) == 1
    for k, pat in patterns[7].items():
        assert pat.idx == {1, 4, 6, 7, 10, 11, 13, 15}

    anomalies = compiler.mismatches(tokenized, patterns[7].top(pattern=True))
    assert list(business.loc[anomalies, 'Address '].index) == [
        0, 2, 3, 5, 8, 9, 12, 14, 16, 17, 18, 19
    ]
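
# Note: pat.idx ({1, 4, 6, 7, 10, 11, 13, 15}) and the 12 mismatched indices above are
# disjoint and together cover indices 0-19, i.e. mismatches() appears to return exactly
# the complement of the rows captured by the group's top pattern.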
def test_padder_regex_typeresolver_row_compile(business):
    tr = DefaultTypeResolver(interceptors=[AddressDesignatorResolver()])
    dt = RegexTokenizer(type_resolver=tr)
    rows = dt.encode(business['Address '])

    cr = Cluster(dist='TED', min_samples=3)
    groups = cr.collect(rows)

    ar = Padder()
    padded_tokens = ar.align(rows, groups)

    cp = DefaultRegexCompiler(method='row')
    patterns = cp.compile(padded_tokens, groups)

    for k, pat in patterns.items():
        if k != -1:  # ignore the DBSCAN noise group
            top = pat.top(pattern=True)
            for value in business.loc[top.idx, 'Address ']:
                assert top.compare(value, dt)
def test_default_regex_compiler(business):
    compiler = DefaultRegexCompiler(per_group='top')
    tokenizer = DefaultTokenizer()
    collector = Group()

    tokenized = tokenizer.encode(business['Address '])
    groups = collector.collect(tokenized)

    patterns = compiler.compile(tokenized, groups)

    assert len(patterns) == 4
    types = [
        [DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA],
        [DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA],
        [DT.DIGIT, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA, DT.SPACE_REP, DT.ALPHA]
    ]
    for i, t in zip([9, 5, 7], types):
        for element, truth in zip(patterns[i].top(pattern=True).container, t):
            assert element.element_type == truth
    def create_compiler(compiler):
        """Returns the compile object if the input string matches the compiler identifier

        Parameters
        ----------
        compiler: str
            name string of the compiler
        """
        if compiler == COMPILER_DEFAULT:
            return DefaultRegexCompiler()

        raise ValueError('compiler: {} not found'.format(compiler))
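
# A minimal usage sketch for the factory above, assuming COMPILER_DEFAULT is the
# module's string constant naming the default compiler (its value is not shown in
# this excerpt). Any unrecognized identifier raises ValueError:
#
#     compiler = create_compiler(COMPILER_DEFAULT)   # -> DefaultRegexCompiler()
#     create_compiler('unknown-compiler')            # -> ValueError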
def test_anomalous_values_in_mismatches(checkintime):
    """Test if values not included in the pattern elements are identified as mismatches
    """
    collector = Group()
    tokenizer = DefaultTokenizer()
    compiler = DefaultRegexCompiler(method='col', size_coverage=.9)

    # Get a sample of terms from the column.
    terms = list(checkintime)

    # Tokenize and convert tokens into representation.
    tokenized_terms = tokenizer.encode(terms)

    # Group tokenized terms by number of tokens.
    clusters = collector.collect(tokenized_terms)

    for _, term_ids in clusters.items():
        if len(term_ids) / len(terms) < 0.9:
            # Ignore small clusters.
            continue

        # Return the pattern for the found cluster. This assumes that
        # at most one cluster can satisfy the threshold.
        patterns = compiler.compile(tokenized_terms, {0: term_ids})[0]
        break

    pattern = patterns.top(n=1, pattern=True)

    mismatches = list()
    for term in terms:
        if not pattern.compare(term, tokenizer):
            mismatches.append(term)

    assert mismatches == [
        '04/22/43971 02:40:00 AM +0000', '01/11/43972 04:40:00 PM +0000',
        '10/03/43971 04:40:00 PM +0000', '04/20/43971 12:40:00 AM +0000',
        '05/08/43971 06:40:00 PM +0000', '08/29/43971 06:40:00 AM +0000',
        '08/27/43971 04:40:00 AM +0000', '10/03/43971 12:00:00 AM +0000',
        '10/16/43971 09:20:00 PM +0000'
    ]
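
# A small helper sketch that mirrors the mismatch loop above: given an iterable of raw
# values, a compiled pattern and a tokenizer, collect the values the pattern rejects.
# The helper name is illustrative and not part of the openclean_pattern API;
# pattern.compare(value, tokenizer) is used exactly as in the tests.
def find_pattern_mismatches(values, pattern, tokenizer):
    """Return the values from `values` that do not match `pattern`."""
    return [value for value in values if not pattern.compare(value, tokenizer)]
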
def test_regex_pattern_compile():
    """Tests a pattern using the Value Function
    """
    pf = OpencleanPatternFinder(tokenizer='default',
                                collector='group',
                                compiler=DefaultRegexCompiler())

    pattern = pf.find(series=[ROWS[0]])[7]

    match = ROWS[1]
    assert pattern[pattern.top()].compile(tokenizer=pf.tokenizer).eval(match)

    mismatch = '321-West Broadway 10007'
    assert not pattern[pattern.top()].compile(
        tokenizer=pf.tokenizer).eval(mismatch)
def test_regex_pattern_compare():
    """creates a pattern from ROWS[0] and compares it with ROWS[1]
   """
    pf = OpencleanPatternFinder(tokenizer='default',
                                collector='group',
                                compiler=DefaultRegexCompiler())

    pattern = pf.find(series=[ROWS[0]])[7]

    match = ROWS[1]
    assert pattern.top(pattern=True).compare(value=match,
                                             tokenizer=pf.tokenizer)

    mismatch = '321-West Broadway 10007'
    assert not pattern.top(pattern=True).compare(value=mismatch,
                                                 tokenizer=pf.tokenizer)
def test_func_match():
    """Test functionality of the match operator."""

    pf = OpencleanPatternFinder(tokenizer='default',
                                aligner='group',
                                compiler=DefaultRegexCompiler())

    pattern = pf.find(series=ROWS[0])[7]

    match = ROWS[1][0]
    mismatch = '321-West Broadway 10007'

    # -- IsMatch --------------------------------------------------------------
    f = IsMatch(func=pattern.compare, generator=pf)
    assert f(match)
    assert not f(mismatch)

    # -- IsNotMatch -----------------------------------------------------------
    f = IsNotMatch(func=pattern.compare, generator=pf)
    assert not f(match)
    assert f(mismatch)
def test_pattern_without_anomalous_elements(checkintime, specimen):
    """Test that anomalous values (those outside the size_coverage share of the dataset)
    are excluded during pattern element generation.

    Process for creating PatternElements:
     1. Create sets of values grouped by length (e.g. 2 sets total, one for ['ne', 'st'] and one for ['w']).
     2. Combine the sets in descending order of their frequencies (largest sets down to the smallest).
     3. Stop once 90% of the data has been added.
     4. Sets beyond this point are excluded.

     Example (assuming a 20% threshold, i.e. an 80% coverage target, for 5 values):
         For a PatternElement of type ALPHA with values 'ave', 'str', 'nes', 'jtd', 'st':

        1. Create sets:
            A: length 3, frequency 4: [ave, str, nes, jtd]
            B: length 2, frequency 1: [st]

        2-4. Update the PatternElement:
            create a new PatternElement()
            add set A to it
            check freq/total = 0.8, which meets the 80% coverage target
            don't add set B

        The final PatternElement is ALPHA[3-3] instead of ALPHA[2-3].
        (A plain-Python sketch of this rule follows this test.)
    """
    # The dataset contains 9 anomalous values at the 5th pattern position (the year), so
    # without anomaly removal that element would be DIGIT[4-5] instead of DIGIT[4-4] and
    # the full pattern would be ['DIGIT[2-2]', '/', 'DIGIT[2-2]', '/', 'DIGIT[4-5]', 'SPACE_REP[1-1]',
    #   'DIGIT[2-2]', ':', 'DIGIT[2-2]', ':', 'DIGIT[2-2]', 'SPACE_REP[1-1]',
    #   'ALPHA[2-2]', 'SPACE_REP[1-1]', '+', 'DIGIT[4-4]']

    checkin_truth = [
        '{}[2-2]'.format(DT.DIGIT), '/', '{}[2-2]'.format(DT.DIGIT), '/',
        '{}[4-4]'.format(DT.DIGIT), '{}[1-1]'.format(DT.SPACE_REP),
        '{}[2-2]'.format(DT.DIGIT), ':', '{}[2-2]'.format(DT.DIGIT), ':',
        '{}[2-2]'.format(DT.DIGIT), '{}[1-1]'.format(DT.SPACE_REP),
        '{}[2-2]'.format(DT.ALPHA), '{}[1-1]'.format(DT.SPACE_REP), '+',
        '{}[4-4]'.format(DT.DIGIT)
    ]

    specimen_truth = [
        '{}[4-4]'.format(DT.DIGIT), '/', '{}[2-2]'.format(DT.DIGIT), '/',
        '{}[2-2]'.format(DT.DIGIT)
    ]

    compiler1 = DefaultRegexCompiler(method='col', size_coverage=.9)
    compiler2 = DefaultRegexCompiler(method='row', size_coverage=.9)

    def test(df, compiler, truth):
        collector = Group()
        tokenizer = DefaultTokenizer()

        # Get a sample of terms from the column.
        terms = list(df)

        # Tokenize and convert tokens into representation.
        tokenized_terms = tokenizer.encode(terms)

        # Group tokenized terms by number of tokens.
        clusters = collector.collect(tokenized_terms)

        patterns = None
        for _, term_ids in clusters.items():
            if len(term_ids) / len(terms) < 0.9:
                # Ignore small clusters.
                continue

            # Return the pattern for the found cluster. This assumes that
            # at most one cluster can satisfy the threshold.
            patterns = compiler.compile(tokenized_terms, {0: term_ids})[0]
            break

        tokens = list()
        if patterns:
            for el in patterns.top(n=1, pattern=True):
                if el.punc_list:
                    token = ''.join(el.punc_list)
                else:
                    token = '{}[{}-{}]'.format(el.element_type, el.len_min,
                                               el.len_max)
                tokens.append(token)

        for actual, expected in zip(tokens, truth):
            assert actual == expected

    for df, truth in [(checkintime, checkin_truth),
                      (specimen, specimen_truth)]:
        for compiler in [compiler1, compiler2]:
            test(df, compiler, truth)
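
# A standalone, plain-Python sketch of the coverage rule described in the docstring of
# test_pattern_without_anomalous_elements above: group values by length, add the length
# groups in descending order of frequency, and stop once the accumulated share of values
# reaches the coverage target. The names below are illustrative and not part of the
# openclean_pattern API.
from collections import Counter


def covered_length_range(values, coverage=0.8):
    """Return (len_min, len_max) over the length groups kept by the coverage rule."""
    freq = Counter(len(value) for value in values)   # e.g. {3: 4, 2: 1}
    kept, seen = [], 0
    for length, count in freq.most_common():         # descending frequency
        kept.append(length)
        seen += count
        if seen / len(values) >= coverage:            # coverage target reached
            break
    return min(kept), max(kept)


# Reproduces the docstring example: ALPHA[3-3] rather than ALPHA[2-3].
assert covered_length_range(['ave', 'str', 'nes', 'jtd', 'st'], coverage=0.8) == (3, 3)
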
def test_multi_resolvers_full(urban):
    """
        test multiple resolvers in series: AD -> GEO -> AT
        """
    deft = DefaultTypeResolver(interceptors=[
        AddressDesignatorResolver(),
        GeoSpatialResolver(levels=[0, 1, 2])
    ])
    rt = RegexTokenizer(type_resolver=deft)

    encoded = rt.encode(urban)

    # group the column
    from openclean_pattern.collect.group import Group

    ga = Group()
    grouped = ga.collect(encoded)

    # compile the pattern
    from openclean_pattern.regex.compiler import DefaultRegexCompiler

    rws = DefaultRegexCompiler()
    compiled = rws.compile(encoded, grouped)

    assert len(compiled) == 14
    types = [[
        SupportedDataTypes.DIGIT, SupportedDataTypes.SPACE_REP,
        SupportedDataTypes.ALPHA, SupportedDataTypes.PUNCTUATION,
        SupportedDataTypes.SPACE_REP, SupportedDataTypes.SPACE_REP,
        SupportedDataTypes.ALPHA, SupportedDataTypes.PUNCTUATION,
        SupportedDataTypes.SPACE_REP, SupportedDataTypes.ALPHA,
        SupportedDataTypes.PUNCTUATION, SupportedDataTypes.SPACE_REP,
        SupportedDataTypes.DIGIT
    ],
             [
                 SupportedDataTypes.ALPHA, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ALPHA, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.DIGIT, SupportedDataTypes.PUNCTUATION,
                 SupportedDataTypes.SPACE_REP, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ALPHA, SupportedDataTypes.PUNCTUATION,
                 SupportedDataTypes.SPACE_REP, SupportedDataTypes.ALPHA,
                 SupportedDataTypes.PUNCTUATION, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.DIGIT
             ],
             [
                 SupportedDataTypes.DIGIT, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ALPHA, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ALPHA, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.STREET, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.SUD, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.DIGIT, SupportedDataTypes.PUNCTUATION,
                 SupportedDataTypes.SPACE_REP, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ALPHA, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.STREET, SupportedDataTypes.PUNCTUATION,
                 SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.ADMIN_LEVEL_2,
                 SupportedDataTypes.PUNCTUATION, SupportedDataTypes.SPACE_REP,
                 SupportedDataTypes.DIGIT
             ]]
    for i, t in zip([13, 15, 23], types):
        for element, truth in zip(list(compiled[i].values())[0].container, t):
            assert element.element_type == truth