def test_invalid_n(self):
    """An n-gram size below zero must make the tokenizer raise ValueError.

    The invalid value is assigned after construction to sidestep any
    validation done by FieldHashingProperties' __init__.
    """
    props = FieldHashingProperties(ngram=2, k=20, positional=True)
    props.ngram = -6  # deliberately bypass constructor validation
    with self.assertRaises(
            ValueError,
            msg='Expected raise ValueError on invalid n.'):
        tokenize = get_tokenizer(props)
        tokenize('prawn')
def stream_bloom_filters(
        dataset,  # type: Iterable[Sequence[Text]]
        keys,     # type: Sequence[Sequence[bytes]]
        schema    # type: Schema
        ):
    # type: (...) -> Iterable[Tuple[bitarray, Text, int]]
    """ Yield a composite Bloom filter (CLK) for each record of a dataset.

        The per-field tokenizers are built eagerly, once, before any
        record is processed; hashing itself is lazy via the returned
        generator.

        :param dataset: An iterable of indexable records.
        :param keys: A tuple of two lists of secret keys used in the HMAC.
        :param schema: An instantiated Schema instance
        :return: Generator yielding bloom filters as 3-tuples
    """
    field_tokenizers = [
        tokenizer.get_tokenizer(field.hashing_properties)
        for field in schema.fields
    ]
    return (crypto_bloom_filter(record, field_tokenizers, schema, keys)
            for record in dataset)
import unittest from clkhash.field_formats import FieldHashingProperties from clkhash.tokenizer import get_tokenizer __author__ = 'shardy' # some tokenizers p1_20 = get_tokenizer( FieldHashingProperties(ngram=1, k=20) ) p2_20 = get_tokenizer( FieldHashingProperties(ngram=2, k=20) ) p1_20_true = get_tokenizer( FieldHashingProperties(ngram=1, k=20, positional=True) ) dummy = get_tokenizer(None) class TestTokenizer(unittest.TestCase): def test_unigram_1(self): self.assertEqual(list(p1_20("1/2/93", ignore='/')), ['1', '2', '9', '3']) def test_unigram_2(self): self.assertEqual(list(p1_20("1*2*93", ignore='*')),
def test_bigram_duplicate(self):
    """A bigram that occurs twice must be emitted once per occurrence."""
    props = FieldHashingProperties(ngram=2, positional=False)
    tokenize = get_tokenizer(props)
    expected = [' a', 'ab', 'ba', 'ab', 'b ']
    self.assertEqual(list(tokenize("abab")), expected)
def test_unigram_1(self):
    """Unigram tokenization drops every character named in `ignore`."""
    props = FieldHashingProperties(ngram=1, positional=False)
    tokenize = get_tokenizer(props)
    self.assertEqual(list(tokenize("1/2/93", ignore='/')),
                     ['1', '2', '9', '3'])
def test_bigram_2(self):
    """Ignored characters are removed before the bigrams are formed."""
    props = FieldHashingProperties(ngram=2, positional=False)
    tokenize = get_tokenizer(props)
    expected = [' s', 'st', 'tv', 'v ']
    self.assertEqual(list(tokenize("steve", ignore='e')), expected)
def test_bigram_1(self):
    """Bigram tokenization pads the word with a leading and trailing space."""
    props = FieldHashingProperties(ngram=2, positional=False)
    tokenize = get_tokenizer(props)
    expected = [' s', 'st', 'te', 'ev', 've', 'e ']
    self.assertEqual(list(tokenize("steve")), expected)
def test_positional_unigram_duplicate(self):
    """Positional unigrams of repeated characters differ by 1-based index."""
    props = FieldHashingProperties(ngram=1, positional=True)
    tokenize = get_tokenizer(props)
    self.assertEqual(list(tokenize("111")),
                     ['1 1', '2 1', '3 1'])
def test_positional_unigram_2(self):
    """Positional unigrams keep non-alphanumeric characters when not ignored."""
    props = FieldHashingProperties(ngram=1, positional=True)
    tokenize = get_tokenizer(props)
    expected = ['1 1', '2 *', '3 2', '4 *']
    self.assertEqual(list(tokenize("1*2*")), expected)
def test_unigram_duplicate(self):
    """Non-positional unigrams preserve duplicates and their order."""
    props = FieldHashingProperties(ngram=1, positional=False)
    tokenize = get_tokenizer(props)
    self.assertEqual(list(tokenize("1212")),
                     ['1', '2', '1', '2'])