def test_fp_rate(self):
    """Check that ``fp_rate=0.5`` leaves positive examples without the pattern.

    Draws 1000 examples from a seeded generator, keeps the sequences whose
    second element is truthy, and asserts that fewer than 60% of those
    sequences contain the pattern.
    """
    pattern = [(10, 123), (10, 456), (10, 789)]
    eg = ExamplesGenerator(seed=7357, pattern=pattern, fp_rate=0.5)
    examples = [next(eg()) for _ in range(1000)]
    # Each example is indexed as (sequence, flag); e[1] is presumably the
    # positive/negative label -- confirm against ExamplesGenerator.
    positive_examples = [e[0] for e in examples if e[1]]
    has_pattern = [eg.has_pattern(e, pattern) for e in positive_examples]
    # assertLess reports both operands on failure, unlike assertTrue(a < b).
    self.assertLess(sum(has_pattern), 0.6 * len(positive_examples))
def test_has_pattern(self):
    """Check ``has_pattern`` on a sequence without the pattern and on one with it.

    The first assertion uses the generator from ``_get_eg`` on a plain
    ``list(range(100))``; the second uses a fresh generator and a short
    hand-written sequence in which some pattern tokens occur twice.
    """
    eg, _ = self._get_eg()
    # A sequence that never had the pattern inserted must not match.
    self.assertFalse(eg.has_pattern(list(range(100)), eg.pattern))

    # Test recursive find: 123 and 456 each appear twice before the next
    # pattern token shows up in the example.
    eg = ExamplesGenerator(seed=7357, pattern=[(3, 123), (3, 456), (3, 789)])
    example = [1, 123, 123, 1, 456, 456, 1, 1, 789]
    self.assertTrue(eg.has_pattern(example, eg.pattern))
def test_multiple_patterns(self):
    """Check that both configured patterns occur, the first far more often.

    Draws 10000 examples from a generator configured with two patterns and
    asserts that (a) the two match counts together exceed 4000 and (b) the
    first pattern matches more than three times as often as the second.
    """
    eg = ExamplesGenerator(
        seed=7357,
        multiple_patterns=[
            [
                [(10, 123), (10, 456), (10, 789)],
                [(10, 789), (10, 456), (10, 123)],
            ],
            # Presumably the per-pattern sampling weights -- confirm against
            # ExamplesGenerator.
            [0.8, 0.2],
        ],
    )
    examples = [next(eg()) for _ in range(10000)]
    has_pattern_0 = [eg.has_pattern(e[0], eg.all_patterns[0]) for e in examples]
    has_pattern_1 = [eg.has_pattern(e[0], eg.all_patterns[1]) for e in examples]
    # assertGreater reports the actual counts on failure, unlike assertTrue(a > b).
    self.assertGreater(sum(has_pattern_0) + sum(has_pattern_1), 4000)
    self.assertGreater(sum(has_pattern_0), sum(has_pattern_1) * 3)
from prefixspan import PrefixSpan from data_sources.data_generator import ExamplesGenerator, get_multiple_patterns VOCAB_SIZE = 1000 SEQ_LEN = 250 multiple_patterns = get_multiple_patterns(10) NUM_EXAMPLES = 200 MIN_FREQ = 25 MIN_LEN = 5 MIN_DIST = 3 data_generator = ExamplesGenerator(seq_len=SEQ_LEN, vocab_size=VOCAB_SIZE, seed=111, multiple_patterns=multiple_patterns) data_sequences = [next(data_generator()) for _ in range(NUM_EXAMPLES)] positive_sequences = [s[0] for s in data_sequences if s[1] == 1] negative_sequences = [s[0] for s in data_sequences if s[1] == 0] positive_seq = PrefixSpan(positive_sequences).frequent(MIN_FREQ) long_seq = [s for s in positive_seq if len(s[1]) >= MIN_LEN] seq_by_freq = sorted(long_seq, key=lambda x: x[0], reverse=True) def distance_from_seqs(s, s_list: list): """return distance (in terms of number of different tokens) between the sequence s and the list of sequence s_list""" if not s_list:
def _get_eg(self):
    """Build a seeded generator and a sequence that carries its pattern.

    Returns:
        A ``(generator, sequence)`` tuple. The sequence is the result of
        ``insert_pattern`` applied to ``list(range(100))`` and is verified
        with ``has_pattern`` before being returned.
    """
    generator = ExamplesGenerator(
        seed=7357,
        pattern=[(10, 123), (10, 456), (10, 789)],
    )
    base_sequence = list(range(100))
    with_pattern = generator.insert_pattern(base_sequence, generator.pattern)
    found = generator.has_pattern(with_pattern, generator.pattern)
    self.assertTrue(found)
    return generator, with_pattern
def test_init(self):
    """Smoke test: constructing the generator with no arguments must not raise."""
    ExamplesGenerator()
def test_proportions(self):
    """Pin how many of 10000 seeded examples contain the pattern."""
    generator = ExamplesGenerator(
        seed=7357,
        pattern=[(10, 123), (10, 456), (10, 789)],
    )
    # Draw everything first, then count, so the draw sequence is untouched.
    drawn = [next(generator()) for _ in range(10000)]
    match_count = sum(
        generator.has_pattern(item[0], generator.pattern) for item in drawn
    )
    self.assertEqual(match_count, 4953)
def test_pattern_indices(self):
    """Pin the indices reported for a pattern inserted into ``list(range(100))``."""
    generator = ExamplesGenerator(
        seed=7357,
        pattern=[(10, 123), (10, 456), (10, 789)],
    )
    sequence = generator.insert_pattern(list(range(100)), generator.pattern)
    indices = generator.find_pattern_indices(sequence, generator.pattern)
    self.assertEqual(indices, [8, 14, 18])
def test_insert_pattern(self):
    """Pin where a two-token pattern lands in a ten-element sequence."""
    generator = ExamplesGenerator(
        seq_len=10,
        seed=7357,
        pattern=[(5, 123), (2, 456)],
    )
    sequence = generator.insert_pattern(list(range(10)), generator.pattern)
    # With this seed both pattern tokens end up adjacent at positions 4 and 5.
    inserted_window = sequence[4:6]
    self.assertEqual(inserted_window, [123, 456])
def test_call_seed(self):
    """Check that the seed given at construction fixes the values drawn later."""
    generator = ExamplesGenerator(seq_len=10, seed=456)
    # Draw ten examples, keeping only the first element of each.
    sequences = [next(generator())[0] for _ in range(10)]
    leading_tokens = list(sequences[0][:3])
    self.assertEqual(leading_tokens, [27, 43, 89])