def test1():
    """Shingle the empty string and a single blank for k=1 and k=2."""
    # NOTE(review): another `test1` appears in this file view; if both live
    # in the same module the later definition shadows this one -- confirm.
    cases = [
        ("", 1, [[]]),
        (" ", 1, [[" "]]),
        (" ", 2, [[" "], []]),
    ]
    for text, k, expected in cases:
        assert ks.shingleseqs_k(text, k) == expected
def test1():
    """Check that identify_vocab returns exactly n_max_vocab entries.

    The limit is swept over 1..14 on two short documents shingled with k=9.
    """
    # NOTE(review): another `test1` appears in this file view; if both live
    # in the same module only the later definition is kept -- confirm.
    texts = [
        'abc d abc de abc def',
        'abc defg abc def gh abc def ghi',
    ]
    shingled = [ks.shingleseqs_k(text, k=9) for text in texts]

    # check length
    for limit in range(1, 15):
        vocab = ks.identify_vocab(shingled, n_max_vocab=limit)
        assert len(vocab) == limit
def test7():
    """Run shingling -> CEWS -> pattern encoding and compare nested lengths.

    The encoded output must have the same nested lengths as the shingled
    input: per document, per level within a document, and per entry within
    a level.
    """
    k = 5
    docs = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # generate all shingles
    shingled = [ks.shingleseqs_k(doc, k=k) for doc in docs]
    assert len(shingled) == len(docs)
    assert len(shingled[0]) == k

    # run CEWS algorithm on the shingle counts accumulated over all documents
    db = Counter()
    for doc_shingles in shingled:
        db = db + Counter(itertools.chain.from_iterable(doc_shingles))
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)

    # encode shingles with patterns
    patterns = ks.shingles_to_patterns(memo)
    encoded = ks.encode_with_patterns(shingled, patterns, len(patterns))

    n_patterns = sum(len(pats) for pats in patterns.values())
    assert n_patterns == len(memo)

    assert len(encoded) == len(shingled)
    for enc_doc, shi_doc in zip(encoded, shingled):
        assert len(enc_doc) == len(shi_doc)
        for enc_level, shi_level in zip(enc_doc, shi_doc):
            assert len(enc_level) == len(shi_level)
def test3():
    """Shingle "12345" for k = 0..6 and compare against the expected levels."""
    text = "12345"
    levels = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45'],
        ['123', '234', '345'],
        ['1234', '2345'],
        ['12345'],
    ]

    # k = 0..5: the expected result is the first k levels (k=0 -> empty list)
    for k in range(len(levels) + 1):
        assert ks.shingleseqs_k(text, k) == levels[:k]

    # k = 6 is longer than the text: one additional, empty level is expected
    assert ks.shingleseqs_k(text, 6) == levels + [[]]
def test42():
    """Check center padding with placeholder 'x' and evenpad='post'.

    Every expected row has five entries; the sixth row (k=6 on a
    five-character text) consists of placeholders only.
    """
    expected = [
        ['1', '2', '3', '4', '5'],
        ['12', '23', '34', '45', 'x'],
        ['x', '123', '234', '345', 'x'],
        ['x', '1234', '2345', 'x', 'x'],
        ['x', 'x', '12345', 'x', 'x'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    result = ks.shingleseqs_k(
        "12345", k=6, padding='center', placeholder='x', evenpad='post')
    assert result == expected
def test10():
    """Encode a corpus with encode_multi_match_corpus and check its width.

    With k=5, num_matches=3 and stack=True the second dimension of the
    encoded result must equal 3 * k - 3.
    """
    k = 5
    corpus = [
        "Lorem ipsum dolor sit amet, consetetur sadipscing elitr, sed diam ",
        "nonumy eirmod tempor invidunt ut labore et dolore magna aliquyam ",
        "erat, sed diam voluptua. At vero eos et accusam et justo duo ",
        "dolores et ea rebum. Stet clita kasd gubergren, no sea takimata "
    ]

    # generate all shingles
    shingled = [ks.shingleseqs_k(doc, k=k) for doc in corpus]
    assert len(shingled) == len(corpus)
    assert len(shingled[0]) == k

    # run CEWS algorithm on the shingle counts accumulated over all documents
    db = Counter()
    for doc_shingles in shingled:
        db = db + Counter(itertools.chain.from_iterable(doc_shingles))
    memo = ks.cews(db, threshold=0.8, min_samples_split=10, max_wildcards=2)
    patterns = ks.shingles_to_patterns(memo)

    # encode; the second return value is not inspected by this test
    encoded, _ = ks.encode_multi_match_corpus(
        corpus, k=k, PATTERNS=patterns, num_matches=3, stack=True)
    assert encoded.shape[1] == 3 * k - 3
def test5():
    """Check the vocabulary order produced by sortmode='log-x-length'."""
    texts = ['a', 'ab']
    shingled = [ks.shingleseqs_k(text, k=2) for text in texts]
    vocab = ks.identify_vocab(
        shingled,
        sortmode='log-x-length',
        n_min_count=1,
        n_max_vocab=None,
    )
    expected = ['ab', 'a', 'b']
    assert vocab == expected
def test2():
    """Check the vocabulary order produced by sortmode='prefer-shorter'."""
    # NOTE(review): another `test2` appears in this file view; if both live
    # in the same module the later definition shadows this one -- confirm.
    texts = ['a', 'ab']
    shingled = [ks.shingleseqs_k(text, k=2) for text in texts]
    vocab = ks.identify_vocab(
        shingled,
        sortmode='prefer-shorter',
        n_min_count=1,
        n_max_vocab=None,
    )
    expected = ['a', 'b', 'ab']
    assert vocab == expected
def test2():
    """Check that k=0 and k=-1 both yield an empty result."""
    # NOTE(review): another `test2` appears in this file view; if both live
    # in the same module only the later definition is kept -- confirm.
    for k in (0, -1):
        assert ks.shingleseqs_k(" ", k) == []
def test43():
    """Check padding='pre' with placeholder 'x'.

    Every expected row has five entries with the placeholders at the front;
    the sixth row (k=6 on a five-character text) consists of placeholders
    only.
    """
    expected = [
        ['1', '2', '3', '4', '5'],
        ['x', '12', '23', '34', '45'],
        ['x', 'x', '123', '234', '345'],
        ['x', 'x', 'x', '1234', '2345'],
        ['x', 'x', 'x', 'x', '12345'],
        ['x', 'x', 'x', 'x', 'x'],
    ]
    result = ks.shingleseqs_k("12345", k=6, padding='pre', placeholder='x')
    assert result == expected