Example #1
0
    def _test_concat(self):
        """
        Tests concatenating K datasets into a new Dampr

        The leading underscore means default unittest discovery does not
        collect this method.
        """

        word1 = Dampr.memory("abcdefg")
        # NOTE(review): the value returned by concat() is discarded.  If
        # concat() builds a new Dampr rather than mutating word1, the run
        # below only sees the first dataset -- confirm before enabling.
        word1.concat(Dampr.memory("hijklmn"))

        results = sorted(word1.run())
        self.assertEqual(results, list('abcdefghijklmn'))
Example #2
0
def main(fname):
    """
    Builds four word-statistics pipelines over the text file `fname`, runs
    them together and prints: the total word count, up to 10 top words, up
    to 20 entries of the word-length histogram and the average word length.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    # Shared root: every pipeline below starts from the split-out words
    tokens = Dampr.text(fname, 1024**2).flat_map(lambda row: row.split())

    # (word, count) pairs sorted on the negated count
    top_words = (tokens
                 .count(lambda w: w)
                 .sort_by(lambda pair: -pair[1]))

    # All per-word counts added together under the single key 1
    total_count = top_words.fold_by(
        key=lambda _pair: 1,
        value=lambda pair: pair[1],
        binop=lambda a, b: a + b)

    # Counts summed per word length, sorted by that length
    word_lengths = (top_words
                    .fold_by(lambda pair: len(pair[0]),
                             value=lambda pair: pair[1],
                             binop=lambda a, b: a + b)
                    .sort_by(lambda entry: entry[0]))

    # sum(length * count) divided by the total number of words
    avg_word_lengths = (word_lengths
                        .map(lambda entry: entry[0] * entry[1])
                        .a_group_by(lambda _v: 1)
                        .sum()
                        .join(total_count)
                        .reduce(lambda lhs, rhs:
                                next(lhs)[1] / float(next(rhs)[1])))

    total_res, top_res, length_res, avg_res = Dampr.run(
        total_count, top_words, word_lengths, avg_word_lengths,
        name="word-stats")

    banner = "*" * 10
    print()
    print(banner)
    print("Word Stats")
    print(banner)
    print("Total Words Found: ", total_res.read(1)[0][1])

    print("\nTop 10 words")
    print("\n************")
    for token, occurrences in top_res.read(10):
        print(token, occurrences)

    print("\nCharacter histogram")
    print("\n*******************")
    for size, occurrences in length_res.read(20):
        print(size, occurrences)

    print("\nAverage Word Length: ", avg_res.read(1)[0][1])
Example #3
0
    def test_file_link(self):
        """
        Tests that symlinked directories are only read when
        followlinks=True is passed
        """
        import os
        # Every directory created below; removed again in the finally block
        # so a failing assertion does not leave them behind in /tmp.
        dirnames = []
        try:
            for i in range(10):
                dirname = os.path.join('/tmp', '_test_dampr_dir_{}'.format(i))
                if os.path.isdir(dirname):
                    shutil.rmtree(dirname)

                os.makedirs(dirname)
                dirnames.append(dirname)

                fname = os.path.join(dirname, 'foo')
                with open(fname, 'w') as out:
                    out.write(str(i))

            # Symlink into a new directory
            base = '/tmp/_dampr_test_link'
            if os.path.isdir(base):
                shutil.rmtree(base)

            os.makedirs(base)
            # Registered only once it exists, so cleanup never trips over it
            dirnames.append(base)

            for i in (1, 3, 5):
                os.symlink(dirnames[i],
                           os.path.join(base, os.path.basename(dirnames[i])))

            # Yields nothing!
            results = Dampr.text(base) \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [])

            # Yields something!
            results = Dampr.text(base, followlinks=True) \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [(1, 1 + 3 + 5)])
        finally:
            for d in dirnames:
                shutil.rmtree(d)
Example #4
0
    def test_stream_blocks(self):
        """
        Tests stream blocks by computing the two most frequent letters with
        partition_map / partition_reduce
        """
        import heapq

        def map_topk(it):
            # Min-heap of (count, symbol); popping whenever it grows past
            # two entries leaves the two largest counts seen by this call.
            heap = []
            for symbol, count in it:
                heapq.heappush(heap, (count, symbol))
                if len(heap) > 2:
                    heapq.heappop(heap)

            # Every survivor is emitted under the same key, 1
            return ((1, x) for x in heap)

        def reduce_topk(it):
            # Flatten the (key, values) groups, then keep the overall top two
            counts = (v for k, vit in it for v in vit)
            for count, symbol in heapq.nlargest(2, counts):
                yield symbol, count

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        letter_counts = word.flat_map(lambda w: list(w)).count()

        topk = letter_counts \
                .partition_map(map_topk) \
                .partition_reduce(reduce_topk)

        results = sorted(topk.run())
        self.assertEqual(results, [('a', 4), ('i', 7)])
Example #5
0
    def test_combine(self):
        """
        Tests running two pipelines that share a source in one Dampr.run call
        """
        even = self.items.filter(lambda x: x % 2 == 0)
        odd = self.items.filter(lambda x: x % 2 == 1)

        # Dampr.run returns one result per pipeline, in argument order
        even_ve, odd_ve = Dampr.run(even, odd)
        self.assertEqual([10, 12, 14, 16, 18], list(even_ve))
        self.assertEqual([11, 13, 15, 17, 19], list(odd_ve))
Example #6
0
    def test_len(self):
        """
        Tests the number of items in a collection, including an empty one.
        """

        self.assertEqual(self.items.len().read(), [10])
        self.assertEqual(Dampr.memory([]).len().read(), [0])
Example #7
0
    def intersect(self, keys, min_match=None):
        """
        Reads the indexed lines whose offset is listed under at least
        `min_match` of the given keys.

        keys: a single key, or a list/tuple of keys.
        min_match: minimum number of matching keys per offset.  None (the
            default) means all of them; a float is taken as a fraction of
            len(keys) and truncated to an int.

        Returns the Dampr produced by flat-mapping the per-file index lookup
        over read_paths(self.path, self.suffix).
        """
        if not isinstance(keys, (list, tuple)):
            keys = [keys]

        if min_match is None:
            min_match = len(keys)

        if isinstance(min_match, float):
            min_match = int(min_match * len(keys))

        paths = read_paths(self.path, self.suffix)

        # Keys are bound as parameters instead of being pasted into the SQL
        # inside double quotes: SQLite resolves a double-quoted token as an
        # identifier first, so a key named like a column (e.g. "offset")
        # silently compared the column with itself, and a key containing a
        # quote broke the statement.  Keys are still rendered to text first,
        # exactly as the formatted query did.
        # NOTE: SQLite limits the number of bound parameters per statement,
        # so a very large key list would need to be batched.
        params = [u'{}'.format(key) for key in keys]
        params.append(min_match)
        placeholders = u','.join(u'?' for _ in keys)
        query = u"""
            select offset from
            (select offset, count(*) as c
                from key_index
                where key in ({})
                group by offset) where c >= ?
            order by offset asc""".format(placeholders)

        def read_db(fname):
            db = self.open_db(fname)

            cur = db.cursor()
            cur.execute(query, params)
            with codecs.open(fname, encoding='utf-8') as f:
                for (offset,) in cur:
                    # Offsets are positions of line starts within fname
                    f.seek(offset)
                    yield f.readline()

        return Dampr.memory(paths).flat_map(read_db)
Example #8
0
 def test_disjoint(self):
     """
     Joins two groupings whose keys are not expected to overlap and checks
     that the join produces nothing
     """
     items2 = Dampr.memory(list(range(10))) \
             .group_by(lambda x: -x)
     output = self.items.group_by(lambda x: x) \
             .join(items2) \
             .run()
     output = [v for k, v in output]
     self.assertEqual([], output)
Example #9
0
    def test_read_input(self):
        """
        Checks that several custom input taps are read as a single dataset.
        """
        source = Dampr.read_input(RangeDataset(5), RangeDataset(10))
        # Fold every value under the one key 1 so the result is a grand total
        folded = source.fold_by(lambda _v: 1, lambda a, b: a + b)
        results = folded.read()

        expected_total = sum(range(5)) + sum(range(10))
        self.assertEqual(results[0][1], expected_total)
Example #10
0
    def test_read_url(self):
        """
        Checks that lines can be read from a URL input.
        """
        source = UrlsInput(["http://www.example.com"])
        headings = (Dampr.read_input(source)
                    .filter(lambda text: 'h1' in text)
                    .map(lambda text: text.strip())
                    .read())

        self.assertEqual(headings, ['<h1>Example Domain</h1>'])
Example #11
0
    def test_repartition(self):
        """
        Joins a grouping against an already-reduced grouping; no keys are
        expected to match, so the join should produce nothing
        """
        items2 = Dampr.memory(list(range(10))) \
                .group_by(lambda x: -x) \
                    .reduce(lambda k, vs: sum(vs))

        output = self.items.group_by(lambda x: x) \
                .join(items2) \
                .run()

        output = [v for k, v in output]
        self.assertEqual([], output)
Example #12
0
    def test_reduce_join(self):
        """
        Tests reducing a join of two groupings keyed by parity: each key
        should end up with the sorted values of both sides
        """
        items2 = Dampr.memory(list(range(10)))
        res = self.items \
                .group_by(lambda x: x % 2) \
                .join(items2.group_by(lambda x: x % 2)) \
                    .reduce(lambda l, r: sorted(itertools.chain(l, r))) \
                .run()

        output = list(res)
        self.assertEqual((0, [0, 2, 4, 6, 8, 10, 12, 14, 16, 18]), output[0])
        self.assertEqual((1, [1, 3, 5, 7, 9, 11, 13, 15, 17, 19]), output[1])
Example #13
0
    def test_top_k(self):
        """
        Tests getting the top k items
        """

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        # Rank the (letter, count) pairs by their count
        topk = word.flat_map(lambda w: list(w)).count() \
                .topk(5, lambda x: x[1])

        # Sorted so the comparison does not depend on output order
        results = sorted(topk.run())
        self.assertEqual(results, [('a', 4), ('c', 3), ('i', 7), ('l', 3),
                                   ('s', 3)])
Example #14
0
    def test_left_join(self):
        """
        Tests a left join used as an anti-join: items whose key also shows
        up in `to_remove` are dropped
        """
        to_remove = Dampr.memory(list(range(10, 13)))

        # After left_reduce each record is (key, (left_values, right_values));
        # keep the records with no right-hand values and unwrap the single
        # left-hand value.
        output = self.items.group_by(lambda x: x) \
                .join(to_remove.group_by(lambda x: x)) \
                    .left_reduce(lambda l, r: (list(l), list(r))) \
                .filter(lambda llrs: len(llrs[1][1]) == 0) \
                .map(lambda llrs: llrs[1][0][0]) \
                .sort_by(lambda x: x) \
                .run()

        output = list(output)
        self.assertEqual(list(range(13, 20)), output)
Example #15
0
def main(fname):
    """
    Counts the whitespace-separated words of the text file `fname` and
    prints each word with its count, sorted on the negated count.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    wc = (Dampr.text(fname)
          .flat_map(lambda line: line.split())
          .fold_by(lambda w: w,
                   value=lambda _w: 1,
                   binop=lambda a, b: a + b)
          .sort_by(lambda pair: -pair[1]))

    results = wc.run("word-count")
    for word, total in results:
        print("{}:".format(word), total)

    # Finished with the results once they have been printed
    results.delete()
Example #16
0
    def test_read_input(self):
        """
        Checks that custom Dataset taps can be used as pipeline input.
        """
        # Minimal tap: read() yields the pair (i, i) for each i below `n`
        class RangeDataset(Dataset):
            def __init__(self, n):
                self.n = n

            def read(self):
                for value in range(self.n):
                    yield value, value

        source = Dampr.read_input(RangeDataset(5), RangeDataset(10))
        # Fold every value under the one key 1 so the result is a grand total
        folded = source.fold_by(lambda _v: 1, lambda a, b: a + b)
        results = folded.read()

        expected_total = sum(range(5)) + sum(range(10))
        self.assertEqual(results[0][1], expected_total)
Example #17
0
    def test_blocks(self):
        """
        Tests Custom Blocks by computing the two most frequent letters with
        a custom mapper and a custom reducer
        """
        import heapq

        class TopKMapper(BlockMapper):
            # Keeps the k largest (count, letter) pairs it is given and only
            # emits them from finish().
            def __init__(self, k):
                self.k = k

            def start(self):
                self.heap = []

            def add(self, _k, lc):
                # lc is a (letter, count) pair; store count first so the
                # min-heap orders by count and evicts the smallest.
                heapq.heappush(self.heap, (lc[1], lc[0]))
                if len(self.heap) > self.k:
                    heapq.heappop(self.heap)

                # Nothing is emitted per record
                return iter([])

            def finish(self):
                # Everything is emitted under the same key, 1
                for cl in self.heap:
                    yield 1, cl

        class TopKReducer(BlockReducer):
            # Emits the k largest (count, letter) pairs of each group
            def __init__(self, k):
                self.k = k

            def start(self):
                pass

            def add(self, k, it):
                for count, letter in heapq.nlargest(self.k, it):
                    yield letter, (letter, count)

        word = Dampr.memory(["supercalifragilisticexpialidociousa"])
        letter_counts = word.flat_map(lambda w: list(w)).count()

        topk = letter_counts \
                .custom_mapper(TopKMapper(2)) \
                .custom_reducer(TopKReducer(2))

        results = sorted(topk.run())
        self.assertEqual(results, [('a', 4), ('i', 7)])
Example #18
0
    def union(self, keys):
        """
        Reads the indexed lines whose offset is listed under any of the
        given keys.

        keys: a single key, or a list/tuple of keys.

        Returns the Dampr produced by flat-mapping the per-file index lookup
        over read_paths(self.path, self.suffix).
        """
        if not isinstance(keys, (list, tuple)):
            keys = [keys]

        paths = read_paths(self.path, self.suffix)

        # Keys are bound as parameters instead of being pasted into the SQL
        # inside double quotes: SQLite resolves a double-quoted token as an
        # identifier first, so a key named like a column (e.g. "offset")
        # was compared as that column, and a key containing a quote broke
        # the statement.  Keys are still rendered to text first, exactly as
        # the formatted query did.
        # NOTE: SQLite limits the number of bound parameters per statement,
        # so a very large key list would need to be batched.
        params = ['{}'.format(key) for key in keys]
        query = """select distinct offset from key_index
            where key in ({}) order by offset asc""".format(
                ','.join('?' for _ in keys))

        def read_db(fname):
            db = self.open_db(fname)

            cur = db.cursor()
            cur.execute(query, params)
            with codecs.open(fname, encoding='utf-8') as f:
                for (offset,) in cur:
                    # Offsets are positions of line starts within fname
                    f.seek(offset)
                    yield f.readline()

        return Dampr.memory(paths).flat_map(read_db)
Example #19
0
    def test_file_glob(self):
        """
        Tests that we can read file globs
        """
        import os
        # Every file written below; unlinked again in the finally block so a
        # failing assertion does not leave them behind in /tmp.
        files = []
        try:
            for i in range(10):
                path = os.path.join('/tmp', '_test_dampr_{}'.format(i))
                with open(path, 'w') as out:
                    out.write(str(i))

                files.append(path)

            # The character class selects only files 1, 3 and 5
            results = Dampr.text("/tmp/_test_dampr_[135]") \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [(1, 1 + 3 + 5)])
        finally:
            for fname in files:
                os.unlink(fname)
Example #20
0
    def build(self, key_f, force=False):
        """
        Indexes the files returned by read_paths(self.path, False).

        key_f: called with every line of a file; each key it yields is
            stored together with the byte offset at which that line starts.
        force: when true every file is indexed; otherwise files for which
            self.exists(fname) is true are skipped.

        Returns what reading the folded per-file key counts produces.
        """
        paths = sorted(read_paths(self.path, False))

        def index_file(fname):
            logging.debug("Indexing %s", fname)
            db = self.create_db(fname)

            def key_offsets():
                # Byte position at which the current line starts
                position = 0
                with codecs.open(fname, encoding='utf-8') as handle:
                    line = handle.readline()
                    # readline() returns an empty string at end of file
                    while line:
                        for key in key_f(line):
                            yield key, position

                        # Advance by the encoded size, not the char count
                        position += len(line.encode('utf-8'))
                        line = handle.readline()

            cursor = db.cursor()
            cursor.executemany("INSERT INTO key_index values (?, ?)",
                               key_offsets())
            db.commit()
            cursor.execute("create index key_idx on key_index (key)")
            db.commit()
            cursor.execute("select count(*) from key_index")
            row = cursor.fetchone()
            count = row[0]
            logging.debug("Keys indexed for %s: %s", fname, count)

            return count

        pending = Dampr.memory(paths).filter(
            lambda path: force or not self.exists(path))
        totals = pending.map(index_file).fold_by(
            key=lambda _n: 1, binop=lambda a, b: a + b)
        return totals.read(name="indexing")
Example #21
0
import os
import re
import sys
import math
import multiprocessing

from dampr import Dampr, setup_logging

# Aim for roughly one chunk per CPU.  Floor division keeps the chunk size an
# integer on Python 3 as well, where "/" would turn it into a float; the +1
# below keeps the size non-zero.
chunk_size = os.stat(sys.argv[1]).st_size // multiprocessing.cpu_count()
docs = Dampr.text(sys.argv[1], chunk_size + 1)

# Tokens are the pieces between runs of non-word characters
RX = re.compile(r'[^\w]+')
# set() makes each token count at most once per input record
doc_freq = docs \
        .flat_map(lambda x: set(RX.split(x.lower()))) \
        .count(reduce_buffer=float('inf'))

# Combine every (token, frequency) pair with the record count and emit
# (token, frequency, log(1 + total / frequency))
idf = doc_freq.cross_right(docs.len(),
                           lambda df, total:
                           (df[0], df[1], math.log(1 +
                                                   (float(total) / df[1]))),
                           memory=True)

idf.sink_tsv("/tmp/idfs").run()
Example #22
0
 def setUp(self):
     """
     Builds the shared fixture: the integers 10 through 19, loaded with
     partitions=2
     """
     self.items = Dampr.memory(list(range(10, 20)), partitions=2)