コード例 #1
0
ファイル: test_shingleseqs.py プロジェクト: ulf1/kshingle
def test1():
    """Check ``ks.shingleseqs_k`` on an empty string and on a single blank."""
    # (text, k, expected result) triples, checked in order.
    cases = [
        ("", 1, [[]]),
        (" ", 1, [[" "]]),
        (" ", 2, [[" "], []]),
    ]
    for text, k, expected in cases:
        assert ks.shingleseqs_k(text, k) == expected
コード例 #2
0
ファイル: test_identify_vocab.py プロジェクト: ulf1/kshingle
def test1():
    """Check that ``ks.identify_vocab`` returns ``n_max_vocab`` entries.

    The vocabulary is requested with every size from 1 to 14 and its length
    must equal the requested size each time.
    """
    texts = ['abc d abc de abc def', 'abc defg abc def gh abc def ghi']
    shingled = [ks.shingleseqs_k(text, k=9) for text in texts]
    for limit in range(1, 15):
        vocab = ks.identify_vocab(shingled, n_max_vocab=limit)
        assert len(vocab) == limit
コード例 #3
0
def test7():
    """Shingle four text lines, run CEWS, and encode the shingles.

    Asserts that the number of patterns summed over all ``PATTERNS`` values
    equals ``len(memo)`` and that the encoded output has the same nested
    lengths as the shingled input.
    """
    k = 5
    texts = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # Generate all shingles, one entry per text line.
    shingled = [ks.shingleseqs_k(text, k=k) for text in texts]
    assert len(shingled) == len(texts)
    assert len(shingled[0]) == k

    # Accumulate the shingle counts of every line into one Counter.
    db = Counter([])
    for seqs in shingled:
        db = db + Counter(itertools.chain(*seqs))

    # Run the CEWS algorithm on the accumulated counts.
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)

    # Encode the shingles with the derived patterns.
    patterns = ks.shingles_to_patterns(memo)
    encoded = ks.encode_with_patterns(shingled, patterns, len(patterns))

    n_patterns = sum([len(group) for group in patterns.values()])
    assert n_patterns == len(memo)

    # The outer lengths are asserted equal before pairing up the elements.
    assert len(encoded) == len(shingled)
    for enc_doc, shi_doc in zip(encoded, shingled):
        assert len(enc_doc) == len(shi_doc)
        for enc_seq, shi_seq in zip(enc_doc, shi_doc):
            assert len(enc_seq) == len(shi_seq)
コード例 #4
0
ファイル: test_shingleseqs.py プロジェクト: ulf1/kshingle
def test3():
    """Check ``ks.shingleseqs_k`` on ``"12345"`` for k = 0 through 6."""
    # Expected sequences in result order; the expected result for a given
    # k is the first k entries of this table (none for k = 0).
    levels = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45'],
        ['123', '234', '345'],
        ['1234', '2345'],
        ['12345'],
        [],
    ]
    for k in range(7):
        assert ks.shingleseqs_k("12345", k) == levels[:k]
コード例 #5
0
ファイル: test_shingleseqs.py プロジェクト: ulf1/kshingle
def test42():
    """Check ``padding='center'`` with placeholder 'x' and ``evenpad='post'``."""
    options = {
        'k': 6,
        'padding': 'center',
        'placeholder': 'x',
        'evenpad': 'post',
    }
    expected = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45', 'x'],
        ['x', '123', '234', '345', 'x'],
        ['x', '1234', '2345', 'x', 'x'],
        ['x', 'x', '12345', 'x', 'x'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    assert ks.shingleseqs_k("12345", **options) == expected
コード例 #6
0
def test10():
    """Encode a four-line corpus with ``ks.encode_multi_match_corpus``.

    Patterns are derived from the corpus via CEWS; the test asserts that the
    second dimension of the encoded result equals ``3 * k - 3``.
    """
    k = 5
    corpus = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # Generate all shingles, one entry per corpus line.
    shingled = [ks.shingleseqs_k(text, k=k) for text in corpus]
    assert len(shingled) == len(corpus)
    assert len(shingled[0]) == k

    # Accumulate the shingle counts of every line into one Counter.
    db = Counter([])
    for seqs in shingled:
        db = db + Counter(itertools.chain(*seqs))

    # Run the CEWS algorithm and turn its result into patterns.
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)
    patterns = ks.shingles_to_patterns(memo)

    # Encode; the second element of the returned pair is not checked here.
    encoded, _ = ks.encode_multi_match_corpus(
        corpus, k=k, PATTERNS=patterns, num_matches=3, stack=True)
    assert encoded.shape[1] == 3 * k - 3
コード例 #7
0
ファイル: test_identify_vocab.py プロジェクト: ulf1/kshingle
def test5():
    """Check the vocabulary order for ``sortmode='log-x-length'``."""
    texts = ['a', 'ab']
    shingled = [ks.shingleseqs_k(text, k=2) for text in texts]
    options = {
        'sortmode': 'log-x-length',
        'n_min_count': 1,
        'n_max_vocab': None,
    }
    vocab = ks.identify_vocab(shingled, **options)
    assert vocab == ['ab', 'a', 'b']
コード例 #8
0
ファイル: test_identify_vocab.py プロジェクト: ulf1/kshingle
def test2():
    """Check the vocabulary order for ``sortmode='prefer-shorter'``."""
    texts = ['a', 'ab']
    shingled = [ks.shingleseqs_k(text, k=2) for text in texts]
    options = {
        'sortmode': 'prefer-shorter',
        'n_min_count': 1,
        'n_max_vocab': None,
    }
    vocab = ks.identify_vocab(shingled, **options)
    assert vocab == ['a', 'b', 'ab']
コード例 #9
0
ファイル: test_shingleseqs.py プロジェクト: ulf1/kshingle
def test2():
    """Check that ``ks.shingleseqs_k`` returns ``[]`` for k = 0 and k = -1."""
    for k in (0, -1):
        assert ks.shingleseqs_k(" ", k) == []
コード例 #10
0
ファイル: test_shingleseqs.py プロジェクト: ulf1/kshingle
def test43():
    """Check ``padding='pre'`` with placeholder 'x' on ``"12345"``, k = 6."""
    expected = [
        ['1', '2', '3', '4', '5'],
        ['x', '12', '23', '34', '45'],
        ['x', 'x', '123', '234', '345'],
        ['x', 'x', 'x', '1234', '2345'],
        ['x', 'x', 'x', 'x', '12345'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    result = ks.shingleseqs_k("12345", k=6, padding='pre', placeholder='x')
    assert result == expected