def test_simple_mapper():
    input_node = Input(input=get_numbers())
    mapper_node = Map(simple_mapper)(input_node)
    assert mapper_node.input == input_node
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == get_numbers()
def test_square_mapper():
    input_node = Input(input=get_numbers())
    mapper_node = Map(square_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    answer = [{'a': i**2} for i in range(1, 6)]
    assert res == answer
def test_person_mapper():
    input_node = Input(input=get_persons())
    mapper_node = Map(person_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == [
        {"name": "Andrey", "id": 1},
        {"name": "Leonid", "id": 2},
        {"name": "Sergey", "id": 1},
    ]
def test_empty_mapper():
    input_node = Input(input=[])
    mapper_node = Map(simple_mapper)(input_node)
    graph = Graph(input_node=input_node, output_node=mapper_node)
    res = graph.run()
    assert res == []
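The fixtures and mapper callables used by these tests are not shown in this section. A minimal sketch consistent with the assertions above could look like the following; the bodies (and the extra 'age' field) are reconstructions, not the project's actual helpers:

# Hypothetical test fixtures, reconstructed from the assertions above.
def get_numbers():
    return [{'a': i} for i in range(1, 6)]


def get_persons():
    # The extra 'age' field is an assumption; person_mapper is assumed to drop it.
    return [
        {"name": "Andrey", "id": 1, "age": 30},
        {"name": "Leonid", "id": 2, "age": 40},
        {"name": "Sergey", "id": 1, "age": 50},
    ]


def simple_mapper(record):
    # Identity mapper: pass every record through unchanged.
    yield record


def square_mapper(record):
    # Square the 'a' field of every record.
    yield {'a': record['a'] ** 2}


def person_mapper(record):
    # Keep only the 'name' and 'id' fields.
    yield {"name": record["name"], "id": record["id"]}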
import re


def split_text(record):
    """
    Split rows with a 'text' field into rows with a 'word' field
    (one row for every occurrence of every word in the text).
    """
    new_text = re.sub('[^A-Za-z]+', ' ', record['text'])
    tokens = new_text.split()
    for token in tokens:
        yield {
            'doc_id': record['doc_id'],
            'word': token.lower(),
        }


def word_counter(rows):
    """
    Count the occurrences of a word.
    """
    yield {'word': rows[0]['word'], 'number': len(rows)}


if __name__ == "__main__":
    input_node = Input()
    mapper = Map(split_text)(input_node)
    sort = Sort("word")(mapper)
    reduce = Reduce(word_counter, "word")(sort)
    graph = Graph(input_node=input_node, output_node=reduce)
    graph.run(input_file="data/text_corpus.txt",
              output_file=open("word_count.txt", "w"))
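For reference, a tiny hand-driven run of the two operations above; the on-disk format of data/text_corpus.txt is not shown here, so the record is built inline under the assumption that each record carries 'doc_id' and 'text' fields:

# Hand-driven illustration of split_text and word_counter on a single record.
record = {'doc_id': 1, 'text': "Hello, hello world!"}
tokens = list(split_text(record))
# tokens == [{'doc_id': 1, 'word': 'hello'},
#            {'doc_id': 1, 'word': 'hello'},
#            {'doc_id': 1, 'word': 'world'}]

# After sorting by 'word', the reducer is called once per group of equal keys.
hello_group = [t for t in tokens if t['word'] == 'hello']
print(list(word_counter(hello_group)))  # [{'word': 'hello', 'number': 2}]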
words = [] for row in rows: pmi = math.log(row['number_in_doc'] * row['words_in_total'] / row['sum_of_docs'] / row['words_in_doc']) words.append((row['word'], pmi)) words.sort(key=lambda x: x[1], reverse=True) yield {'doc_id': rows[0]['doc_id'], 'top_words': words[:10]} if __name__ == "__main__": input_node = Input() docs_count_reducer = Reduce(docs_count)(input_node) split_mapper = Map(split_text, "tokenizer")(docs_count_reducer) sort_by_word_doc_id = Sort(by=['word', 'doc_id'])(split_mapper) words_in_doc_reducer = Reduce(count_words_in_doc, key=('word', 'doc_id'))(sort_by_word_doc_id) sum_of_docs = Reduce(count_sum_of_docs, key=('word'))(words_in_doc_reducer) sort_by_doc_id = Sort(by="doc_id")(sum_of_docs) word_in_one_doc_reducer = Reduce(count_words_in_one_doc, key="doc_id")(sort_by_doc_id) words_reducer = Reduce(count_words)(word_in_one_doc_reducer) pmi_reducer = Reduce(count_pmi, "doc_id")(words_reducer)
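    # The original listing stops at the final reducer. A continuation that wires it
    # into a graph and runs it would presumably mirror the word-count example above;
    # this is a hypothetical sketch, and the output file name is a placeholder.
    graph = Graph(input_node=input_node, output_node=pmi_reducer)
    graph.run(input_file="data/text_corpus.txt",
              output_file=open("pmi.txt", "w"))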