def _test_concat(self):
    """
    Tests concatenating K datasets into a new Dampr.

    The leading underscore means runners that collect ``test*`` methods
    skip this case.
    """
    word1 = Dampr.memory("abcdefg")
    # NOTE(review): the value returned by concat() is discarded and the
    # assertion below runs ``word1`` itself -- confirm whether concat()
    # mutates in place or returns a new pipeline that should be run instead.
    word1.concat(Dampr.memory("hijklmn"))

    results = sorted(word1.run())
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(results, list('abcdefghijklmn'))
def main(fname):
    """
    Prints word statistics for the text file ``fname``: total word count,
    the ten most frequent words, a histogram of occurrences by word length
    and the average word length.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')

    # One tokenized stream feeds every statistic below
    tokens = Dampr.text(fname, 1024**2).flat_map(lambda line: line.split())

    # Words with their counts, most frequent first
    counted = tokens.count(lambda x: x)
    top_words = counted.sort_by(lambda word_count: -word_count[1])

    # Total number of words seen, folded under the single key 1
    total_count = top_words.fold_by(
        key=lambda word: 1,
        value=lambda x: x[1],
        binop=lambda x, y: x + y)

    # Occurrences per character length, ordered by length
    per_length = top_words.fold_by(
        lambda tc: len(tc[0]),
        value=lambda tc: tc[1],
        binop=lambda x, y: x + y)
    word_lengths = per_length.sort_by(lambda cl: cl[0])

    # Average character length: sum(length * occurrences) / total words
    weighted = word_lengths.map(lambda wl: wl[0] * wl[1])
    weighted_sum = weighted.a_group_by(lambda x: 1).sum()
    avg_word_lengths = weighted_sum \
        .join(total_count) \
        .reduce(lambda awl, tc: next(awl)[1] / float(next(tc)[1]))

    # All four pipelines are submitted in one run
    totals, ranked, histogram, average = Dampr.run(
        total_count, top_words, word_lengths, avg_word_lengths,
        name="word-stats")

    banner = "*" * 10
    print()
    print(banner)
    print("Word Stats")
    print(banner)
    print("Total Words Found: ", totals.read(1)[0][1])

    print("\nTop 10 words")
    print("\n************")
    for token, occurrences in ranked.read(10):
        print(token, occurrences)

    print("\nCharacter histogram")
    print("\n*******************")
    for size, occurrences in histogram.read(20):
        print(size, occurrences)

    print("\nAverage Word Length: ", average.read(1)[0][1])
def test_file_link(self):
    """
    Tests that symlinked directories are only read when
    ``followlinks=True`` is passed to ``Dampr.text``.
    """
    import os

    base = '/tmp/_dampr_test_link'
    dirnames = []
    try:
        # Ten directories, each holding a file 'foo' containing its index
        for i in range(10):
            dirname = os.path.join('/tmp', '_test_dampr_dir_{}'.format(i))
            if os.path.isdir(dirname):
                shutil.rmtree(dirname)

            os.makedirs(dirname)
            dirnames.append(dirname)
            fname = os.path.join(dirname, 'foo')
            with open(fname, 'w') as out:
                out.write(str(i))

        # Symlink into a new directory
        if os.path.isdir(base):
            shutil.rmtree(base)

        os.makedirs(base)
        # Registered only once it exists so the cleanup never targets a
        # directory that was not created
        dirnames.append(base)
        for i in (1, 3, 5):
            os.symlink(dirnames[i],
                    os.path.join(base, os.path.basename(dirnames[i])))

        # Yields nothing!
        results = Dampr.text(base) \
                .map(int) \
                .fold_by(lambda x: 1, lambda x,y: x + y) \
                .read()

        self.assertEqual(results, [])

        # Yields something!
        results = Dampr.text(base, followlinks=True) \
                .map(int) \
                .fold_by(lambda x: 1, lambda x,y: x + y) \
                .read()

        self.assertEqual(results, [(1, 1 + 3 + 5)])
    finally:
        # Clean up even when an assertion fails so nothing lingers in /tmp
        for d in dirnames:
            shutil.rmtree(d)
def test_stream_blocks(self):
    """
    Tests stream blocks: a top-2 letter count built from
    ``partition_map`` and ``partition_reduce``.
    """
    import heapq

    def map_topk(it):
        # Keep only the two largest (count, symbol) pairs seen in this
        # stream; everything is emitted under the single key 1
        heap = []
        for symbol, count in it:
            heapq.heappush(heap, (count, symbol))
            if len(heap) > 2:
                heapq.heappop(heap)

        return ((1, x) for x in heap)

    def reduce_topk(it):
        # Flatten every candidate and keep the overall two largest
        counts = (v for k, vit in it for v in vit)
        for count, symbol in heapq.nlargest(2, counts):
            yield symbol, count

    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    letter_counts = word.flat_map(lambda w: list(w)).count()
    topk = letter_counts \
            .partition_map(map_topk) \
            .partition_reduce(reduce_topk)

    results = sorted(topk.run())
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(results, [('a', 4), ('i', 7)])
def test_combine(self):
    """
    Tests that two pipelines submitted to one ``Dampr.run`` call each
    produce their own result.
    """
    even = self.items.filter(lambda x: x % 2 == 0)
    odd = self.items.filter(lambda x: x % 2 == 1)

    even_ve, odd_ve = Dampr.run(even, odd)

    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual([10, 12, 14, 16, 18], list(even_ve))
    self.assertEqual([11, 13, 15, 17, 19], list(odd_ve))
def test_len(self):
    """
    Tests the number of items in a collection, including an empty one.
    """
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(self.items.len().read(), [10])
    self.assertEqual(Dampr.memory([]).len().read(), [0])
def intersect(self, keys, min_match=None):
    """
    Builds a pipeline yielding the lines indexed under at least
    ``min_match`` of the given keys.

    keys      -- a single key, or a list/tuple of keys
    min_match -- ``None`` requires as many matches as there are keys; a
                 float is treated as a fraction of ``len(keys)`` and
                 truncated to an int; anything else is used as the
                 minimum match count

    Returns ``Dampr.memory(paths).flat_map(read_db)``: for every path from
    ``read_paths`` the index is queried and the line at each matching
    offset is read from the file.
    """
    if not isinstance(keys, (list, tuple)):
        keys = [keys]

    if min_match is None:
        min_match = len(keys)

    if isinstance(min_match, float):
        min_match = int(min_match * len(keys))

    paths = read_paths(self.path, self.suffix)

    # Keys are bound as parameters instead of being spliced into the SQL.
    # The previous double-quoted form is parsed by SQLite as an identifier
    # first (so the keys "key" or "offset" matched the column itself), and
    # any key containing a quote corrupted the statement.  Keys are still
    # rendered to text first, as the formatted query compared them as text.
    # NOTE: SQLite caps the number of bound parameters per statement, so
    # extremely large key lists may need batching.
    params = [u'{}'.format(key) for key in keys]
    params.append(int(min_match))
    placeholders = u','.join(u'?' for _ in keys)
    query = u"""
        select offset from
            (select offset, count(*) as c from key_index
                where key in ({})
                group by offset)
        where c >= ?
        order by offset asc""".format(placeholders)

    def read_db(fname):
        db = self.open_db(fname)
        cur = db.cursor()
        cur.execute(query, params)
        with codecs.open(fname, encoding='utf-8') as f:
            # Offsets are the byte positions recorded when the file was
            # indexed; seek there and read that single line back
            for (offset,) in cur:
                f.seek(offset)
                yield f.readline()

    return Dampr.memory(paths).flat_map(read_db)
def test_disjoint(self):
    """
    Tests that joining two groupings with no keys in common yields
    nothing: ``items2`` is keyed by ``-x`` for 0..9 while ``self.items``
    is keyed by its own values.
    """
    items2 = Dampr.memory(list(range(10))) \
            .group_by(lambda x: -x)

    output = self.items.group_by(lambda x: x) \
            .join(items2) \
            .run()

    output = [v for k, v in output]

    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual([], output)
def test_read_input(self):
    """
    Checks that custom taps can be passed to ``Dampr.read_input``: two
    RangeDataset inputs are folded into one sum.
    """
    small, large = RangeDataset(5), RangeDataset(10)

    totals = Dampr.read_input(small, large) \
            .fold_by(lambda x: 1, lambda x, y: x + y) \
            .read()

    expected = sum(range(5)) + sum(range(10))
    self.assertEqual(totals[0][1], expected)
def test_read_url(self):
    """
    Checks that a URL can be read as pipeline input by pulling the <h1>
    line out of http://www.example.com.
    """
    source = UrlsInput(["http://www.example.com"])

    headings = Dampr.read_input(source) \
            .filter(lambda line: 'h1' in line) \
            .map(lambda line: line.strip()) \
            .read()

    self.assertEqual(headings, ['<h1>Example Domain</h1>'])
def test_repartition(self):
    """
    Tests joining against a pipeline that was grouped by ``-x`` and then
    reduced to per-key sums; the join with ``self.items`` is expected to
    produce nothing.
    """
    items2 = Dampr.memory(list(range(10))) \
            .group_by(lambda x: -x) \
            .reduce(lambda k, vs: sum(vs))

    output = self.items.group_by(lambda x: x) \
            .join(items2) \
            .run()

    output = [v for k, v in output]

    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual([], output)
def test_reduce_join(self):
    """
    Tests reducing a join: both sides are grouped by parity and each
    key's left and right values are merged into one sorted list.
    """
    items2 = Dampr.memory(list(range(10)))
    res = self.items \
            .group_by(lambda x: x % 2) \
            .join(items2.group_by(lambda x: x % 2)) \
            .reduce(lambda l, r: sorted(itertools.chain(l, r))) \
            .run()

    output = list(res)

    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual((0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]), output[0])
    self.assertEqual((1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]), output[1])
def test_top_k(self):
    """
    Tests getting the top k items: the five letters with the highest
    counts in a single word.
    """
    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    topk = word.flat_map(lambda w: list(w)).count() \
            .topk(5, lambda x: x[1])

    results = sorted(topk.run())
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(results,
            [('a', 4), ('c', 3), ('i', 7), ('l', 3), ('s', 3)])
def test_left_join(self):
    """
    Tests ``left_reduce`` on a join by using it as an anti-join: items
    whose key has no match in ``to_remove`` (10..12) are kept.
    """
    to_remove = Dampr.memory(list(range(10, 13)))

    # After left_reduce each record is (key, (left_values, right_values));
    # keep those with no right-hand values and unwrap the single left value
    output = self.items.group_by(lambda x: x) \
            .join(to_remove.group_by(lambda x: x)) \
            .left_reduce(lambda l, r: (list(l), list(r))) \
            .filter(lambda llrs: len(llrs[1][1]) == 0) \
            .map(lambda llrs: llrs[1][0][0]) \
            .sort_by(lambda x: x) \
            .run()

    output = list(output)

    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(list(range(13, 20)), output)
def main(fname):
    """
    Counts the whitespace-separated words in ``fname`` and prints them as
    "word: count", most frequent first, then deletes the run's results.
    """
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s %(levelname)s %(message)s')

    tokens = Dampr.text(fname).flat_map(lambda x: x.split())

    # Every token contributes 1 to the tally kept under its own key
    tallies = tokens.fold_by(
        lambda x: x,
        value=lambda x: 1,
        binop=lambda x, y: x + y)
    wc = tallies.sort_by(lambda x: -x[1])

    results = wc.run("word-count")
    for token, tally in results:
        print("{}:".format(token), tally)

    results.delete()
def test_read_input(self):
    """
    Checks that custom taps work as inputs, using a locally defined
    Dataset that yields (i, i) pairs.
    """

    class RangeDataset(Dataset):
        """Dataset emitting (i, i) for every i below ``n``."""

        def __init__(self, n):
            self.n = n

        def read(self):
            for value in range(self.n):
                yield value, value

    sources = (RangeDataset(5), RangeDataset(10))

    totals = Dampr.read_input(*sources) \
            .fold_by(lambda x: 1, lambda x, y: x + y) \
            .read()

    expected = sum(range(5)) + sum(range(10))
    self.assertEqual(totals[0][1], expected)
def test_blocks(self):
    """
    Tests Custom Blocks: a top-k letter count built from a BlockMapper
    and a BlockReducer subclass.
    """
    import heapq

    class TopKMapper(BlockMapper):
        """Keeps the k largest (count, letter) pairs it is given."""

        def __init__(self, k):
            self.k = k

        def start(self):
            self.heap = []

        def add(self, _k, lc):
            # lc is a (letter, count) pair; the heap orders by count
            heapq.heappush(self.heap, (lc[1], lc[0]))
            if len(self.heap) > self.k:
                heapq.heappop(self.heap)

            # Nothing is emitted per record; output happens in finish()
            return iter([])

        def finish(self):
            # Every survivor goes out under the single key 1
            for cl in self.heap:
                yield 1, cl

    class TopKReducer(BlockReducer):
        """Emits the k largest (count, letter) pairs for a key."""

        def __init__(self, k):
            self.k = k

        def start(self):
            pass

        def add(self, k, it):
            for count, letter in heapq.nlargest(self.k, it):
                yield letter, (letter, count)

    word = Dampr.memory(["supercalifragilisticexpialidociousa"])
    letter_counts = word.flat_map(lambda w: list(w)).count()
    topk = letter_counts \
            .custom_mapper(TopKMapper(2)) \
            .custom_reducer(TopKReducer(2))

    results = sorted(topk.run())
    # assertEqual: the assertEquals alias no longer exists on Python 3.12+
    self.assertEqual(results, [('a', 4), ('i', 7)])
def union(self, keys):
    """
    Builds a pipeline yielding the lines indexed under any of the given
    keys.

    keys -- a single key, or a list/tuple of keys

    Returns ``Dampr.memory(paths).flat_map(read_db)``: for every path from
    ``read_paths`` the index is queried for the distinct matching offsets,
    in ascending order, and the line at each offset is read from the file.
    """
    if not isinstance(keys, (list, tuple)):
        keys = [keys]

    paths = read_paths(self.path, self.suffix)

    # Keys are bound as parameters instead of being spliced into the SQL.
    # The previous double-quoted form is parsed by SQLite as an identifier
    # first (so the keys "key" or "offset" matched the column itself), and
    # any key containing a quote corrupted the statement.  Keys are still
    # rendered to text first, as the formatted query compared them as text.
    # NOTE: SQLite caps the number of bound parameters per statement, so
    # extremely large key lists may need batching.
    params = [u'{}'.format(key) for key in keys]
    placeholders = u','.join(u'?' for _ in keys)
    query = u"""select distinct offset
        from key_index
        where key in ({})
        order by offset asc""".format(placeholders)

    def read_db(fname):
        db = self.open_db(fname)
        cur = db.cursor()
        cur.execute(query, params)
        with codecs.open(fname, encoding='utf-8') as f:
            # Offsets are the byte positions recorded when the file was
            # indexed; seek there and read that single line back
            for (offset,) in cur:
                f.seek(offset)
                yield f.readline()

    return Dampr.memory(paths).flat_map(read_db)
def test_file_glob(self):
    """
    Tests that we can read file globs: ten files are written but only
    those matching ``_test_dampr_[135]`` are summed.
    """
    import os

    files = []
    try:
        for i in range(10):
            path = os.path.join('/tmp', '_test_dampr_{}'.format(i))
            with open(path, 'w') as out:
                # Registered as soon as the file exists so it is always
                # removed afterwards
                files.append(path)
                out.write(str(i))

        results = Dampr.text("/tmp/_test_dampr_[135]") \
                .map(int) \
                .fold_by(lambda x: 1, lambda x,y: x + y) \
                .read()

        self.assertEqual(results, [(1, 1 + 3 + 5)])
    finally:
        # Clean up even when the assertion fails so nothing lingers in /tmp
        for fname in files:
            os.unlink(fname)
def build(self, key_f, force=False):
    """
    Indexes every path returned by ``read_paths(self.path, False)``.

    key_f -- called with each line of a file; every key it yields is
             stored together with the byte offset at which that line
             starts
    force -- when false, paths for which ``self.exists`` is true are
             skipped

    Returns the result of reading the pipeline that sums the per-file key
    counts under the single key 1.
    """
    paths = sorted(read_paths(self.path, False))

    def index_file(fname):
        logging.debug("Indexing %s", fname)
        db = self.create_db(fname)

        def rows():
            # Yields (key, byte offset of the line the key came from)
            position = 0
            with codecs.open(fname, encoding='utf-8') as f:
                line = f.readline()
                while len(line) != 0:
                    for key in key_f(line):
                        yield key, position

                    # Offsets count encoded bytes, not decoded characters
                    position += len(line.encode('utf-8'))
                    line = f.readline()

        cursor = db.cursor()
        cursor.executemany("INSERT INTO key_index values (?, ?)", rows())
        db.commit()

        # The lookup index is created after the bulk insert
        cursor.execute("create index key_idx on key_index (key)")
        db.commit()

        cursor.execute("select count(*) from key_index")
        count = cursor.fetchone()[0]
        logging.debug("Keys indexed for %s: %s", fname, count)
        return count

    pending = Dampr.memory(paths) \
            .filter(lambda fname: force or not self.exists(fname))
    per_file = pending.map(index_file)

    return per_file \
            .fold_by(key=lambda x: 1, binop=lambda x,y: x + y) \
            .read(name="indexing")
import os
import re
import sys
import math
import multiprocessing

from dampr import Dampr, setup_logging

# Script: computes per-term document frequency and IDF for the text file
# named by sys.argv[1] and writes (term, df, idf) rows to /tmp/idfs.

# Split the file into roughly one chunk per CPU.  Floor division keeps the
# chunk size an integer on Python 3, where "/" would produce a float.
chunk_size = os.stat(sys.argv[1]).st_size // multiprocessing.cpu_count()

docs = Dampr.text(sys.argv[1], chunk_size + 1)

RX = re.compile(r'[^\w]+')

# Each line counts as one document; the set() makes a term count once per
# document.
# NOTE(review): RX.split() returns '' when a line starts or ends with a
# non-word character, so the empty string may be counted as a term --
# confirm whether that is intended.
doc_freq = docs \
        .flat_map(lambda x: set(RX.split(x.lower()))) \
        .count(reduce_buffer=float('inf'))

# Pair every (term, df) record with the total document count
idf = doc_freq.cross_right(docs.len(),
        lambda df, total: (df[0], df[1], math.log(1 + (float(total) / df[1]))),
        memory=True)

idf.sink_tsv("/tmp/idfs").run()
def setUp(self):
    """
    Creates the shared fixture: the integers 10 through 19 held in an
    in-memory Dampr built with ``partitions=2``.
    """
    self.items = Dampr.memory(list(range(10, 20)), partitions=2)