import pytest

# Imports are a sketch: these names are used throughout the examples below,
# but the exact module layout of the library under test is not shown on this
# page, so the path here is a placeholder.
from pattern_library import (  # hypothetical module path
    Alignment, Cluster, DefaultRegexCompiler, DefaultTokenizer,
    DistanceFactory, NeedlemanWunschAligner, NeighborJoin, Padder,
    ProgressiveAligner, SupportedDataTypes,
)


def test_needlemanwunsch():
    """Test the Needleman-Wunsch pairwise aligner."""
    x = 'W. 125 ST'  # ALPHA PUNCT SPACE DIGIT SPACE ALPHA
    y = 'W125 ST'    # ALPHANUM SPACE ALPHA

    column = [x, y]

    rows = DefaultTokenizer().encode(column)
    nwa = NeedlemanWunschAligner()
    aligned = nwa.align(rows)

    assert len(aligned) == len(rows)
    assert len(aligned[1]) == len(aligned[0])

    # Many optimal alignments are possible, but the trailing ALPHA token and
    # one of the SPACE tokens should always be aligned.
    assert aligned[0][-1].regex_type == aligned[1][-1].regex_type == SupportedDataTypes.ALPHA
    assert (aligned[0][2].regex_type == aligned[1][2].regex_type == SupportedDataTypes.SPACE_REP or
            aligned[0][4].regex_type == aligned[1][4].regex_type == SupportedDataTypes.SPACE_REP)

    z = '12BROADWAY.AVE'  # ALPHANUM PUNCT ALPHA
    rows = DefaultTokenizer().encode([y, z])
    aligned = nwa.align(rows)

    assert len(aligned) == len(rows)
    assert len(aligned[1]) == len(aligned[0])

    assert aligned[0][0].regex_type == aligned[1][0].regex_type == SupportedDataTypes.ALPHANUM
    assert aligned[0][1].regex_type == SupportedDataTypes.SPACE_REP
    assert aligned[1][1].regex_type == SupportedDataTypes.PUNCTUATION
    assert aligned[0][-1].regex_type == aligned[1][-1].regex_type == SupportedDataTypes.ALPHA
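
# The tests below reference a module-level ADDRESSES list that is not shown on
# this page. A minimal sketch whose token types are consistent with the
# assertions that follow (values here are hypothetical; the real data ships
# with the repository's test suite):
ADDRESSES = [
    '123 MAIN',     # DIGIT SPACE ALPHA
    '1A.BROADWAY',  # ALPHANUM PUNCT ALPHA
    'W MAIN 12',    # ALPHA SPACE ALPHA SPACE DIGIT
]
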
def test_progressive_align_wo_tree():
    """Progressively align the three address rows without a guide tree."""
    dt = DefaultTokenizer()
    encoded = dt.encode(ADDRESSES)

    pa = ProgressiveAligner(use_guide_tree=False)
    aln = pa.align(encoded, {0: [0, 1, 2]})[0]

    assert len(aln) == 3
    assert aln[0][0].regex_type == SupportedDataTypes.DIGIT
    assert aln[0][1].regex_type == SupportedDataTypes.SPACE_REP
    assert aln[0][2].regex_type == SupportedDataTypes.ALPHA
    assert aln[0][3].regex_type == SupportedDataTypes.GAP
    assert aln[0][4].regex_type == SupportedDataTypes.GAP

    assert aln[1][0].regex_type == SupportedDataTypes.ALPHANUM
    assert aln[1][1].regex_type == SupportedDataTypes.PUNCTUATION
    assert aln[1][2].regex_type == SupportedDataTypes.ALPHA
    assert aln[1][3].regex_type == SupportedDataTypes.GAP
    assert aln[1][4].regex_type == SupportedDataTypes.GAP

    assert aln[2][0].regex_type == SupportedDataTypes.ALPHA
    assert aln[2][1].regex_type == SupportedDataTypes.SPACE_REP
    assert aln[2][2].regex_type == SupportedDataTypes.ALPHA
    assert aln[2][3].regex_type == SupportedDataTypes.SPACE_REP
    assert aln[2][4].regex_type == SupportedDataTypes.DIGIT
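
# Neighbor joining is a classic guide-tree construction method borrowed from
# phylogenetics; the next three tests exercise it as the row-ordering step
# that progressive alignment can build on.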

def test_neighborjoining_collect():
    """collect should return the rows' join order (here rows 1 and 0, then 2)."""
    dt = DefaultTokenizer()
    encoded = dt.encode(ADDRESSES)

    nj = NeighborJoin()
    order = nj.collect(encoded)

    actual = [1, 0, 2]
    for i, o in enumerate(order[0][0][0]):
        assert o == actual[i]

def test_neighborjoining_order():
    """get_tree_and_order should return token rows in join order."""
    dt = DefaultTokenizer()
    encoded = dt.encode(ADDRESSES)

    nj = NeighborJoin()
    aln, order = nj.get_tree_and_order(encoded)

    actual = [SupportedDataTypes.ALPHANUM, SupportedDataTypes.PUNCTUATION, SupportedDataTypes.ALPHA]
    for i, o in enumerate(order[0][0][0]):
        assert o.regex_type == actual[i]

def test_neighborjoining_distance():
    """Pairwise distances between the first two address rows should be zero."""
    addresses = ADDRESSES[:2]

    dt = DefaultTokenizer()
    encoded = dt.encode(addresses)

    nj = NeighborJoin()
    dists = nj._compute_pairwise_distance(encoded)

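    # The first two ADDRESSES rows evidently sit at pairwise distance zero
    # (consistent with a token-count-based distance over equal-length rows);
    # the matrix is assumed to be a numpy array, hence .all() below.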
    assert len(dists) == 2
    assert (dists == [[0, 0], [0, 0]]).all()

def test_distance_absolute_compute(dates):
    """ABS distances from a probe value to each date row match expectations."""
    test = '12TH JANUARY 2011'
    test_tokens = DefaultTokenizer().tokens(rowidx=0, value=test)
    train_tokens = DefaultTokenizer().encode(dates)

    dist = DistanceFactory.create('ABS')
    distances = list()
    for row in train_tokens:
        distances.append(dist.compute(test_tokens, row))

    assert distances == [4, 4, 0, 2]
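
# The [4, 4, 0, 2] expectations above are consistent with 'ABS' measuring the
# absolute difference in token counts: the probe '12TH JANUARY 2011' has five
# tokens, while the date rows aligned below span 9, 9, 5 and 3 tokens. This is
# an inference from these tests, not a documented guarantee.
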
def test_progressive_align_dates(dates):
    """Progressively align the two date groups and check the token layout."""
    dt = DefaultTokenizer()

    encoded = dt.encode(dates)

    pa = ProgressiveAligner(use_guide_tree=False)
    aln = pa.align(encoded, {0: [0, 1, 2], 1: [3]})

    for al in aln:
        assert isinstance(al, Alignment)

    al = aln[0]
    assert len(al) == 3
    assert al[0][0].regex_type == SupportedDataTypes.ALPHA
    assert al[0][1].regex_type == SupportedDataTypes.PUNCTUATION
    assert al[0][2].regex_type == SupportedDataTypes.SPACE_REP
    assert al[0][3].regex_type == SupportedDataTypes.ALPHANUM
    assert al[0][4].regex_type == SupportedDataTypes.SPACE_REP
    assert al[0][5].regex_type == SupportedDataTypes.ALPHA
    assert al[0][6].regex_type == SupportedDataTypes.PUNCTUATION
    assert al[0][7].regex_type == SupportedDataTypes.SPACE_REP
    assert al[0][8].regex_type == SupportedDataTypes.DIGIT

    assert al[1][0].regex_type == SupportedDataTypes.ALPHA
    assert al[1][1].regex_type == SupportedDataTypes.PUNCTUATION
    assert al[1][2].regex_type == SupportedDataTypes.SPACE_REP
    assert al[1][3].regex_type == SupportedDataTypes.ALPHA
    assert al[1][4].regex_type == SupportedDataTypes.SPACE_REP
    assert al[1][5].regex_type == SupportedDataTypes.DIGIT
    assert al[1][6].regex_type == SupportedDataTypes.PUNCTUATION
    assert al[1][7].regex_type == SupportedDataTypes.SPACE_REP
    assert al[1][8].regex_type == SupportedDataTypes.DIGIT

    assert al[2][0].regex_type == SupportedDataTypes.ALPHA
    assert al[2][1].regex_type == SupportedDataTypes.GAP
    assert al[2][2].regex_type == SupportedDataTypes.SPACE_REP
    assert al[2][3].regex_type == SupportedDataTypes.ALPHA
    assert al[2][4].regex_type == SupportedDataTypes.GAP
    assert al[2][5].regex_type == SupportedDataTypes.GAP
    assert al[2][6].regex_type == SupportedDataTypes.GAP
    assert al[2][7].regex_type == SupportedDataTypes.SPACE_REP
    assert al[2][8].regex_type == SupportedDataTypes.DIGIT

    al = aln[1]
    assert len(al) == 1
    assert al[0][0].regex_type == SupportedDataTypes.ALPHA
    assert al[0][1].regex_type == SupportedDataTypes.SPACE_REP
    assert al[0][2].regex_type == SupportedDataTypes.DIGIT
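
# The dates fixture is not shown on this page. A minimal conftest-style sketch
# whose token shapes are consistent with the assertions above (values are
# hypothetical; the real fixture ships with the repository's test data):
@pytest.fixture
def dates():
    return [
        'MON, 12TH JANUARY, 2011',  # 9 tokens
        'MON, JANUARY 12, 2011',    # 9 tokens
        'MON JANUARY 2011',         # 5 tokens (aligned with four gaps)
        'JANUARY 2011',             # 3 tokens (the singleton group)
    ]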

def test_padder_regex_col_compile(business):
    """Column-wise compiled patterns should match the values behind the top pattern."""
    dt = DefaultTokenizer()
    rows = dt.encode(business['Address '])

    cr = Cluster(dist='TED', min_samples=3)
    groups = cr.collect(rows)

    ar = Padder()
    padded_tokens = ar.align(rows, groups)

    cp = DefaultRegexCompiler(method='col')
    patterns = cp.compile(padded_tokens, groups)

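    # Every value that backs the most frequent pattern should still match it
    # when re-checked with the same tokenizer.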
    for k, pat in patterns.items():
        if k != -1:  # ignore noise group for dbscan
            for value in business.loc[pat.top(pattern=True).idx, 'Address ']:
                assert pat.top(pattern=True).compare(value, dt)

def test_padder_align(business):
    """Padder should equalize token-row lengths within each cluster."""
    rows = DefaultTokenizer().encode(business['Address '])

    groups = Cluster(dist='TED', min_samples=3).collect(rows)
    padded_tokens = Padder().align(rows, groups)

    for group, idx in groups.items():
        length = len(padded_tokens[idx[0]])
        for i in idx:
            assert len(padded_tokens[i]) == length

def test_distance_absolute_cluster(business):
    """Clustering with ABS distance should group rows by token count."""
    rows = DefaultTokenizer().encode(business['Address '])
    groups = Cluster(dist='ABS', min_samples=3).collect(rows)

    for group, idx in groups.items():
        # skip the noise group; every other group should hold rows with the
        # same number of tokens
        if group != -1:
            length = len(rows[idx[0]])
            for i in idx:
                assert len(rows[i]) == length

def test_distance_ted_compute(business):
    """TED clustering should collect these eight addresses into group 0."""
    rows = DefaultTokenizer().encode(business['Address '])

    aligned = Cluster(dist='TED', min_samples=3).collect(rows)

    assert len(business.loc[aligned[0], 'Address ']) == 8

    results = ['4356 NE DAVIS ST', '2947 NE VILLAGE CT', '1754 ORCHARD HOME DR',
               '6740 NE PORTLAND HIGHWAY', '33555 NE KRAMIEN ROAD', '5290 SW CHESTNUT AVE',
               '3590 SE CHARTER PL', '18355 SHADY HOLLOW WAY']

    for i in business.loc[aligned[0], 'Address ']:
        assert i in results
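
# The business fixture is likewise not shown here; it is evidently a pandas
# DataFrame with an 'Address ' column (note the trailing space in the column
# name). A conftest-style sketch with a hypothetical path standing in for the
# repository's sample data:
@pytest.fixture
def business():
    import pandas as pd  # assumed dependency
    return pd.read_csv('business.csv')  # hypothetical path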