Beispiel #1
0
def test1():
    """Check `ks.shingleseqs_k` on an empty string and a single space."""
    # (input text, k, expected shingle sequences)
    cases = [
        ("", 1, [[]]),
        (" ", 1, [[" "]]),
        (" ", 2, [[" "], []]),
    ]
    for text, k, expected in cases:
        assert ks.shingleseqs_k(text, k) == expected
Beispiel #2
0
def test1():
    """Check that `ks.identify_vocab` returns exactly `n_max_vocab` items."""
    texts = ['abc d abc de abc def', 'abc defg abc def gh abc def ghi']
    shingled = []
    for text in texts:
        shingled.append(ks.shingleseqs_k(text, k=9))
    # the vocabulary length must equal each requested size from 1 to 14
    for size in range(1, 15):
        vocab = ks.identify_vocab(shingled, n_max_vocab=size)
        assert len(vocab) == size
Beispiel #3
0
def test7():
    """Encode shingled documents with CEWS patterns and compare the shapes.

    The encoded result must have the same nested lengths as the shingled
    input, and the patterns must account for every entry of the CEWS memo.
    """
    k = 5
    docs = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # generate all shingles
    shingled = [ks.shingleseqs_k(text, k=k) for text in docs]
    assert len(shingled) == len(docs)
    assert len(shingled[0]) == k

    # count every shingle over all documents, then run the CEWS algorithm
    db = Counter([])
    for seqs in shingled:
        db = db + Counter(itertools.chain.from_iterable(seqs))
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)

    # encode shingles with patterns
    patterns = ks.shingles_to_patterns(memo)
    encoded = ks.encode_with_patterns(shingled, patterns, len(patterns))
    n_patterns = sum(map(len, patterns.values()))
    assert n_patterns == len(memo)

    # the encoding must mirror the shingled structure at every nesting level
    assert len(encoded) == len(shingled)
    for enc_doc, shi_doc in zip(encoded, shingled):
        assert len(enc_doc) == len(shi_doc)
        for enc_seq, shi_seq in zip(enc_doc, shi_doc):
            assert len(enc_seq) == len(shi_seq)
Beispiel #4
0
def test3():
    """Check `ks.shingleseqs_k` on "12345" for every k from 0 to 6.

    The expected output for a given k is the first k entries of `levels`;
    the sixth level is empty because the text has only five characters.
    """
    text = "12345"
    levels = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45'],
        ['123', '234', '345'],
        ['1234', '2345'],
        ['12345'],
        [],
    ]
    for k in range(7):
        assert ks.shingleseqs_k(text, k) == levels[:k]
Beispiel #5
0
def test42():
    """Check center padding with placeholder 'x' and evenpad='post', k=6."""
    options = {
        'k': 6,
        'padding': 'center',
        'placeholder': 'x',
        'evenpad': 'post',
    }
    # one row per shingle length; every row has five entries
    expected = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45', 'x'],
        ['x', '123', '234', '345', 'x'],
        ['x', '1234', '2345', 'x', 'x'],
        ['x', 'x', '12345', 'x', 'x'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    assert ks.shingleseqs_k("12345", **options) == expected
Beispiel #6
0
def test10():
    """Check the column count of a stacked multi-match corpus encoding."""
    k = 5
    corpus = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # generate all shingles
    shingled = [ks.shingleseqs_k(text, k=k) for text in corpus]
    assert len(shingled) == len(corpus)
    assert len(shingled[0]) == k

    # count every shingle over the corpus, then run the CEWS algorithm
    db = Counter([])
    for seqs in shingled:
        db = db + Counter(itertools.chain.from_iterable(seqs))
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)
    patterns = ks.shingles_to_patterns(memo)

    # encode; the second return value is not inspected by this test
    encoded, shingled = ks.encode_multi_match_corpus(
        corpus, k=k, PATTERNS=patterns, num_matches=3, stack=True)
    n_columns = encoded.shape[1]
    assert n_columns == 3 * k - 3
Beispiel #7
0
def test5():
    """Check the vocabulary order produced by sortmode='log-x-length'."""
    shingled = []
    for text in ('a', 'ab'):
        shingled.append(ks.shingleseqs_k(text, k=2))
    vocab = ks.identify_vocab(shingled,
                              sortmode='log-x-length',
                              n_min_count=1,
                              n_max_vocab=None)
    expected = ['ab', 'a', 'b']
    assert vocab == expected
Beispiel #8
0
def test2():
    """Check the vocabulary order produced by sortmode='prefer-shorter'."""
    def shingle(text):
        return ks.shingleseqs_k(text, k=2)

    shingled = list(map(shingle, ['a', 'ab']))
    vocab = ks.identify_vocab(shingled,
                              sortmode='prefer-shorter',
                              n_min_count=1,
                              n_max_vocab=None)
    expected = ['a', 'b', 'ab']
    assert vocab == expected
Beispiel #9
0
def test2():
    """Check that `ks.shingleseqs_k` returns an empty list for k of 0 or -1."""
    for k in (0, -1):
        result = ks.shingleseqs_k(" ", k)
        assert result == []
Beispiel #10
0
def test43():
    """Check padding='pre' with placeholder 'x' on "12345" for k=6."""
    text = "12345"
    # one row per shingle length; placeholders fill the front of each row
    expected = [
        ['1', '2', '3', '4', '5'],
        ['x', '12', '23', '34', '45'],
        ['x', 'x', '123', '234', '345'],
        ['x', 'x', 'x', '1234', '2345'],
        ['x', 'x', 'x', 'x', '12345'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    result = ks.shingleseqs_k(text, k=6, padding='pre', placeholder='x')
    assert result == expected