def test_file_link(self):
    """Verify that symlinked directories are only traversed with followlinks=True."""
    import os

    created = []
    for idx in range(10):
        path = os.path.join('/tmp', '_test_dampr_dir_{}'.format(idx))
        if os.path.isdir(path):
            shutil.rmtree(path)
        os.makedirs(path)
        created.append(path)
        with open(os.path.join(path, 'foo'), 'w') as handle:
            handle.write(str(idx))

    # Build a fresh directory holding symlinks to a few of the dirs above
    link_root = '/tmp/_dampr_test_link'
    if os.path.isdir(link_root):
        shutil.rmtree(link_root)
    created.append(link_root)
    os.makedirs(link_root)
    for idx in (1, 3, 5):
        target = created[idx]
        os.symlink(target, os.path.join(link_root, os.path.basename(target)))

    # Without followlinks the symlinked files are invisible
    hidden = Dampr.text(link_root) \
            .map(int) \
            .fold_by(lambda x: 1, lambda x, y: x + y) \
            .read()
    self.assertEqual(hidden, [])

    # With followlinks the linked files are read and summed
    visible = Dampr.text(link_root, followlinks=True) \
            .map(int) \
            .fold_by(lambda x: 1, lambda x, y: x + y) \
            .read()
    self.assertEqual(visible, [(1, 1 + 3 + 5)])

    for path in created:
        shutil.rmtree(path)
def main(fname):
    """Compute and print word statistics for *fname*: total words, the
    most frequent words, a character-length histogram, and the average
    word length."""
    logging.basicConfig(level=logging.INFO,
            format='%(asctime)s %(levelname)s %(message)s')

    # Every pipeline below branches off this shared root dataset
    tokens = Dampr.text(fname, 1024 ** 2) \
            .flat_map(lambda line: line.split())

    # Words ranked by descending frequency
    ranked = tokens.count(lambda x: x) \
            .sort_by(lambda wc: -wc[1])

    # Total number of words seen across the input
    total = ranked.fold_by(key=lambda word: 1,
            value=lambda x: x[1],
            binop=lambda x, y: x + y)

    # Histogram: word length -> total occurrences of words of that length
    length_hist = ranked \
            .fold_by(lambda tc: len(tc[0]),
                    value=lambda tc: tc[1],
                    binop=lambda x, y: x + y) \
            .sort_by(lambda cl: cl[0])

    # Average characters per word: weighted length sum joined against the total
    mean_len = length_hist \
            .map(lambda wl: wl[0] * wl[1]) \
            .a_group_by(lambda x: 1) \
                .sum() \
            .join(total) \
                .reduce(lambda awl, tc: next(awl)[1] / float(next(tc)[1]))

    total_r, ranked_r, hist_r, mean_r = Dampr.run(
            total, ranked, length_hist, mean_len,
            name="word-stats")

    print()
    print("*" * 10)
    print("Word Stats")
    print("*" * 10)
    print("Total Words Found: ", total_r.read(1)[0][1])

    print("\nTop 10 words")
    print("\n************")
    for word, count in ranked_r.read(10):
        print(word, count)

    print("\nCharacter histogram")
    print("\n*******************")
    for length, occurrences in hist_r.read(20):
        print(length, occurrences)

    print("\nAverage Word Length: ", mean_r.read(1)[0][1])
def main(fname):
    """Run a word count over *fname* and print each word with its
    frequency, most frequent first."""
    logging.basicConfig(level=logging.INFO,
            format='%(asctime)s %(levelname)s %(message)s')

    counts = Dampr.text(fname) \
            .flat_map(lambda line: line.split()) \
            .fold_by(lambda word: word,
                    value=lambda word: 1,
                    binop=lambda a, b: a + b) \
            .sort_by(lambda pair: -pair[1])

    dataset = counts.run("word-count")
    for word, total in dataset:
        print("{}:".format(word), total)

    # Remove the materialized result from disk
    dataset.delete()
def test_file_glob(self):
    """Verify that a glob pattern selects only the matching input files."""
    import os

    created = []
    for idx in range(10):
        fname = os.path.join('/tmp', '_test_dampr_{}'.format(idx))
        with open(fname, 'w') as handle:
            handle.write(str(idx))
        created.append(fname)

    # Only files 1, 3 and 5 match the character-class glob
    totals = Dampr.text("/tmp/_test_dampr_[135]") \
            .map(int) \
            .fold_by(lambda x: 1, lambda x, y: x + y) \
            .read()

    self.assertEqual(totals, [(1, 1 + 3 + 5)])

    for fname in created:
        os.unlink(fname)
import os
import re
import sys
import math
import multiprocessing

from dampr import Dampr, setup_logging

# Split the input so each CPU core gets roughly one chunk.
# FIX: use floor division — under Python 3, '/' yields a float, and a
# float chunk size would be passed into Dampr.text below.
chunk_size = os.stat(sys.argv[1]).st_size // multiprocessing.cpu_count()
docs = Dampr.text(sys.argv[1], chunk_size + 1)

# Token separator: one or more non-word characters
RX = re.compile(r'[^\w]+')

# Document frequency: number of lines each distinct token appears in.
# The set() dedupes tokens within a line so each line counts at most once.
doc_freq = docs \
        .flat_map(lambda x: set(RX.split(x.lower()))) \
        .count(reduce_buffer=float('inf'))

# Smoothed inverse document frequency: (token, df, log(1 + N / df))
idf = doc_freq.cross_right(docs.len(),
        lambda df, total: (df[0], df[1], math.log(1 + (float(total) / df[1]))),
        memory=True)

idf.sink_tsv("/tmp/idfs").run()