Esempio n. 1
0
    def test_file_link(self):
        """
        Tests that symlinked directories are only read when followlinks is set

        Ten directories are created under /tmp, each holding a file ``foo``
        containing the directory's index.  Directories 1, 3 and 5 are then
        symlinked into a fresh base directory, which is read twice: once with
        the default settings (expected to yield nothing) and once with
        ``followlinks=True`` (expected to yield the sum 1 + 3 + 5).
        """
        import os
        dirnames = []
        try:
            for i in range(10):
                dirname = os.path.join('/tmp', '_test_dampr_dir_{}'.format(i))
                # Clear out leftovers from an earlier, aborted run
                if os.path.isdir(dirname):
                    shutil.rmtree(dirname)

                os.makedirs(dirname)
                dirnames.append(dirname)

                fname = os.path.join(dirname, 'foo')
                with open(fname, 'w') as out:
                    out.write(str(i))

            # Symlink into a new directory
            base = '/tmp/_dampr_test_link'
            if os.path.isdir(base):
                shutil.rmtree(base)

            dirnames.append(base)
            os.makedirs(base)

            for i in (1, 3, 5):
                os.symlink(dirnames[i],
                           os.path.join(base, os.path.basename(dirnames[i])))

            # Yields nothing!
            results = Dampr.text(base) \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [])

            # Yields something!
            results = Dampr.text(base, followlinks=True) \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [(1, 1 + 3 + 5)])
        finally:
            # Always remove the fixtures, even when an assertion above fails;
            # `base` is registered before it is created, hence the isdir check.
            for d in dirnames:
                if os.path.isdir(d):
                    shutil.rmtree(d)
Esempio n. 2
0
def main(fname):
    """
    Compute word statistics for the text at ``fname`` and print them.

    Four result sets are built from a single tokenized source and executed
    together under the name "word-stats": the total word count, the words
    ordered by descending frequency, a histogram keyed by word length, and
    the average word length.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    # Single tokenized source that every other pipeline derives from
    tokens = Dampr.text(fname, 1024**2).flat_map(lambda line: line.split())

    # (word, count) pairs, most frequent first
    counted = tokens.count(lambda w: w)
    by_frequency = counted.sort_by(lambda pair: -pair[1])

    # Sum of all per-word counts, folded under the single key 1
    grand_total = by_frequency.fold_by(key=lambda _: 1,
                                       value=lambda pair: pair[1],
                                       binop=lambda a, b: a + b)

    # (word length, occurrences) pairs, shortest length first
    per_length = by_frequency.fold_by(key=lambda pair: len(pair[0]),
                                      value=lambda pair: pair[1],
                                      binop=lambda a, b: a + b)
    length_histogram = per_length.sort_by(lambda entry: entry[0])

    # Sum of length * occurrences, divided by the total number of words
    weighted = length_histogram.map(lambda entry: entry[0] * entry[1])
    weighted_sum = weighted.a_group_by(lambda _: 1).sum()
    mean_length = weighted_sum.join(grand_total).reduce(
        lambda lhs, rhs: next(lhs)[1] / float(next(rhs)[1]))

    total_res, top_res, hist_res, mean_res = Dampr.run(grand_total,
                                                       by_frequency,
                                                       length_histogram,
                                                       mean_length,
                                                       name="word-stats")

    banner = "*" * 10
    print()
    print(banner)
    print("Word Stats")
    print(banner)
    print("Total Words Found: ", total_res.read(1)[0][1])

    print("\nTop 10 words")
    print("\n************")
    for token, occurrences in top_res.read(10):
        print(token, occurrences)

    print("\nCharacter histogram")
    print("\n*******************")
    for size, occurrences in hist_res.read(20):
        print(size, occurrences)

    print("\nAverage Word Length: ", mean_res.read(1)[0][1])
Esempio n. 3
0
def main(fname):
    """
    Count the words in the text at ``fname`` and print them by frequency.

    Lines are split on whitespace, each word is tallied, and the tallies are
    printed as ``word: count`` in descending count order.  The stored results
    of the run are deleted afterwards.
    """
    logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                        level=logging.INFO)

    tokens = Dampr.text(fname).flat_map(lambda line: line.split())

    # Each occurrence contributes 1; occurrences of the same word are summed
    tallies = tokens.fold_by(lambda word: word,
                             value=lambda _: 1,
                             binop=lambda a, b: a + b)

    # Negated count gives a descending sort
    ranked = tallies.sort_by(lambda pair: -pair[1])

    output = ranked.run("word-count")
    for word, total in output:
        print("{}:".format(word), total)

    output.delete()
Esempio n. 4
0
    def test_file_glob(self):
        """
        Tests that we can read file globs

        Ten files named ``/tmp/_test_dampr_<i>`` are written, each containing
        its own index.  The glob ``_test_dampr_[135]`` must then select only
        files 1, 3 and 5, whose contents sum to 1 + 3 + 5.
        """
        import os
        files = []
        try:
            for i in range(10):
                path = os.path.join('/tmp', '_test_dampr_{}'.format(i))
                # Register before writing so a partially created file is
                # still cleaned up
                files.append(path)
                with open(path, 'w') as out:
                    out.write(str(i))

            results = Dampr.text("/tmp/_test_dampr_[135]") \
                    .map(int) \
                    .fold_by(lambda x: 1, lambda x,y: x + y) \
                    .read()

            self.assertEqual(results, [(1, 1 + 3 + 5)])
        finally:
            # Always remove the fixtures, even when the assertion fails
            for fname in files:
                if os.path.exists(fname):
                    os.unlink(fname)
Esempio n. 5
0
import os
import re
import sys
import math
import multiprocessing

from dampr import Dampr, setup_logging

# Size the chunks so the input file splits into roughly one chunk per CPU.
# Floor division keeps the size an integer: on Python 3 a plain `/` would
# produce a float, which is not a valid byte count.  The `+ 1` below keeps
# the size non-zero for files smaller than the CPU count.
chunk_size = os.stat(sys.argv[1]).st_size // multiprocessing.cpu_count()
docs = Dampr.text(sys.argv[1], chunk_size + 1)

# Tokens are runs of word characters; everything else is a separator.
RX = re.compile(r'[^\w]+')

# Document frequency: each line contributes every distinct lower-cased token
# at most once (hence the set).
# NOTE(review): RX.split can emit an empty string for lines that start or end
# with a separator, so '' is counted as a token too -- confirm this is intended.
doc_freq = docs \
        .flat_map(lambda x: set(RX.split(x.lower()))) \
        .count(reduce_buffer=float('inf'))

# Pair every (token, doc_freq) record with the total number of lines and emit
# (token, doc_freq, log(1 + total / doc_freq)).
idf = doc_freq.cross_right(docs.len(),
                           lambda df, total:
                           (df[0], df[1], math.log(1 +
                                                   (float(total) / df[1]))),
                           memory=True)

idf.sink_tsv("/tmp/idfs").run()