def complex_test_2():
    from collections import Counter

    def foo1(counter, iterable):
        counter.update(iterable)
        return counter

    # Double each input list, then fold every element into a Counter seed.
    p = [(lambda x: x * 2, modifier.map),
         (foo1, modifier.reduce, Counter())]
    print(apply_pipeline([[1, 2, 3, 4], [1, 2, 3], [1, 2], [1]], p))
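# Expected output, assuming standard map/reduce semantics for apply_pipeline
# (see the sketch at the end of this section): the map stage doubles each list
# (list * 2 concatenates it with itself) and the reduce stage folds every
# element into the Counter seed, giving Counter({1: 8, 2: 6, 3: 4, 4: 2}).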
def simple_test_1_with_print():
    def foo1(x):
        print('Map 1')
        return x * 2

    def foo2(x):
        print('Filter 1')
        return x > 5

    def foo3(x):
        print('Map 2')
        return x - 1

    def foo4(x, y):
        print('Reduce 1')
        return x + y

    p = [(foo1, modifier.map),
         (foo2, modifier.filter),
         (foo3, modifier.map),
         (foo4, modifier.reduce)]
    print(apply_pipeline(iter(range(0, 10)), p))
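# With a lazy, generator-based apply_pipeline (an assumption; see the sketch
# at the end of this section), the prints interleave per element: 'Map 1',
# then 'Filter 1', with 'Map 2' and 'Reduce 1' appearing only for items that
# pass the filter, rather than each stage running over the whole input first.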
import pickle
from collections import Counter

import spacy

# Assumption: `nlp` is a spaCy language model; the model name below is a guess.
nlp = spacy.load('en_core_web_sm')


def all_questions():
    # Stream the corpus line by line, logging progress every million lines.
    counter = 0
    with open('input/all_questions_corpus.txt', 'r', encoding="ISO-8859-1") as input_file:
        for line in input_file:
            counter += 1
            if counter % 1000000 == 0:
                print(counter)
            yield line.strip()


def get_lemma_set(question_document):
    # One (lemma, POS) pair per document, so the final counts are document
    # frequencies rather than raw term frequencies.
    return set((word.lemma_, word.pos_)
               for word in question_document
               if not word.is_stop and not word.is_punct)


def update_counter(counter, iterable):
    counter.update(iterable)
    return counter


pipeline = [(nlp, modifier.map),
            (get_lemma_set, modifier.map),
            (update_counter, modifier.reduce, Counter())]

word_counter = apply_pipeline(all_questions(), pipeline)
print(len(word_counter))
print(word_counter.most_common(20))

with open('input/document_frequencies.pickle', 'wb') as output_file:
    pickle.dump(word_counter, output_file)
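# Usage sketch: reload the pickled Counter, which maps (lemma, POS) pairs to
# the number of questions containing them (useful for IDF-style weighting).
# The lookup key below is hypothetical.
#
# with open('input/document_frequencies.pickle', 'rb') as input_file:
#     document_frequencies = pickle.load(input_file)
# print(document_frequencies.get(('be', 'VERB'), 0))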
def create_features(input_file_path):
    # Keep only the id and the computed feature columns from each entry.
    for entry in apply_pipeline(read_dataset(input_file_path), nlp_pipeline):
        yield {k: v for k, v in entry.items()
               if k.endswith('feature') or k == 'id'}
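# Usage sketch: read_dataset and nlp_pipeline are defined elsewhere in the
# project, and the path below is hypothetical.
#
# for features in create_features('input/dataset.tsv'):
#     print(features['id'], len(features))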
def complex_test_1():
    p = [(lambda x: x * 2, modifier.map),
         (windowify(2), modifier.window),
         (lambda x: sum(x[0]) > 4, modifier.filter),
         (dewindowify, modifier.window),
         (lambda x, y: x + y, modifier.reduce)]
    print(apply_pipeline(iter(range(0, 10)), p))
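# windowify and dewindowify are not defined in this section. Below is a
# minimal sketch of one plausible shape, inferred from the test above:
# windowify(n) yields (window, item) pairs, so the filter's x[0] is the list
# of the most recent n items, and dewindowify recovers the bare items. This
# is an assumption, not necessarily the project's actual implementation.


def windowify(size):
    def stage(iterable):
        window = []
        for item in iterable:
            window.append(item)
            if len(window) > size:
                window.pop(0)
            if len(window) == size:
                # Copy the window so later mutation does not leak out.
                yield (list(window), item)
    return stage


def dewindowify(iterable):
    for window, item in iterable:
        yield item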
def simple_test_1():
    p = [(lambda x: x * 2, modifier.map),
         (lambda x: x > 5, modifier.filter),
         (lambda x: x - 1, modifier.map),
         (lambda x, y: x + y, modifier.reduce)]
    print(apply_pipeline(iter(range(0, 10)), p))
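# All of the snippets above assume a `modifier` enum and an `apply_pipeline`
# function that are not shown in this section. A minimal, lazy sketch of one
# possible implementation follows; the real project code may differ.

from enum import Enum
from functools import reduce


class modifier(Enum):
    map = 'map'
    filter = 'filter'
    reduce = 'reduce'
    window = 'window'


def apply_pipeline(iterable, pipeline):
    for step in pipeline:
        func, mod = step[0], step[1]
        if mod is modifier.map:
            iterable = map(func, iterable)
        elif mod is modifier.filter:
            iterable = filter(func, iterable)
        elif mod is modifier.window:
            # Window stages rewrite the whole stream (see windowify/dewindowify).
            iterable = func(iterable)
        elif mod is modifier.reduce:
            # An optional third tuple element is the initial accumulator.
            if len(step) > 2:
                return reduce(func, iterable, step[2])
            return reduce(func, iterable)
    return iterable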