@gen.coroutine
def _read_csv(fn, executor=None, hdfs=None, lazy=False, lineterminator='\n',
              header=True, names=None, **kwargs):
    from hdfs3 import HDFileSystem
    from dask import do
    import pandas as pd
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    kwargs['lineterminator'] = lineterminator
    filenames = hdfs.glob(fn)
    blockss = [read_binary(fn, executor, hdfs, lazy=True,
                           delimiter=lineterminator)
               for fn in filenames]
    if names is None and header:
        # Sample the head of the first file to infer column names
        with hdfs.open(filenames[0]) as f:
            head = pd.read_csv(f, nrows=5, **kwargs)
            names = head.columns

    # The first block of each file carries the header row, so it alone
    # gets skiprows=1; the remaining blocks are headerless.
    dfs1 = [[do(buffer_to_csv)(blocks[0], names=names, skiprows=1, **kwargs)] +
            [do(buffer_to_csv)(b, names=names, **kwargs)
             for b in blocks[1:]]
            for blocks in blockss]
    dfs2 = sum(dfs1, [])
    if lazy:
        from dask.dataframe import from_imperative
        raise gen.Return(from_imperative(dfs2, columns=names))
    else:
        futures = executor.compute(*dfs2)
        from distributed.collections import _futures_to_dask_dataframe
        df = yield _futures_to_dask_dataframe(futures)
        raise gen.Return(df)
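# _read_csv above assumes a module-level buffer_to_csv helper that is not
# shown in this section. A minimal sketch of what it might look like; the
# name and keyword handling are taken from the call sites above, while the
# body is an assumption:
import io
import pandas as pd


def buffer_to_csv(buff, **kwargs):
    """ Hypothetical sketch: parse one block of raw CSV bytes. """
    return pd.read_csv(io.BytesIO(buff), **kwargs)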
@gen.coroutine
def _read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
               executor=None, hdfs=None, lazy=True, collection=True):
    from hdfs3 import HDFileSystem
    from dask import do
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = sorted(hdfs.glob(fn))
    blocks = [block for fn in filenames
              for block in read_bytes(fn, executor, hdfs, lazy=True,
                                      delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]
    if lazy:
        from dask.bag import from_imperative
        if collection:
            ensure_default_get(executor)
            raise gen.Return(from_imperative(lines))
        else:
            raise gen.Return(lines)
    else:
        futures = executor.compute(lines)
        from distributed.collections import _futures_to_dask_bag
        if collection:
            ensure_default_get(executor)
            b = yield _futures_to_dask_bag(futures)
            raise gen.Return(b)
        else:
            raise gen.Return(futures)
def read_csv(block_lists, header, head, kwargs, lazy=True, collection=True,
             executor=None):
    """ Convert blocks of bytes to a dask.dataframe or other high-level object

    This accepts a list of lists of futures/values of bytes where each list
    corresponds to one file, and the futures/values of bytes concatenate to
    comprise the entire file, in order.

    Parameters
    ----------
    block_lists: list of lists of futures of bytes
        The lists of bytestrings with each list corresponding to one logical
        file
    header: bytestring
        The header, found at the front of the first file, to be prepended to
        all blocks
    head: pd.DataFrame
        An example Pandas DataFrame to be used for metadata
    kwargs: dict
        Keyword arguments to pass down to ``pd.read_csv``
    lazy: boolean, optional (defaults to True)
    collection: boolean, optional (defaults to True)

    Returns
    -------
    A dask.dataframe, or list of futures or values, depending on the value of
    lazy and collection.
    """
    from dask.dataframe import from_imperative
    executor = default_executor(executor)

    # The first block of each file already contains the header, so it gets
    # an empty prefix; every later block has the header bytes prepended.
    dfs1 = [[do(bytes_read_csv)(blocks[0], '', kwargs)] +
            [do(bytes_read_csv)(b, header, kwargs)
             for b in blocks[1:]]
            for blocks in block_lists]
    dfs2 = sum(dfs1, [])

    ensure_default_get(executor)
    if collection:
        result = from_imperative(dfs2, head)
    else:
        result = dfs2
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)
    return result
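# read_csv above similarly leans on a bytes_read_csv helper. A hedged
# sketch, inferred from the call sites: the first block of a file passes
# '' and is parsed as-is, while later blocks get the shared header bytes
# prepended so pd.read_csv sees consistent column names.
from io import BytesIO
import pandas as pd


def bytes_read_csv(b, header, kwargs):
    """ Hypothetical sketch: prepend header bytes to a block, then parse. """
    data = header + b if header else b
    return pd.read_csv(BytesIO(data), **kwargs)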
def read_text(fn, encoding='utf-8', errors='strict', lineterminator='\n',
              executor=None, hdfs=None, lazy=True, collection=True):
    """ Read text lines from HDFS

    Parameters
    ----------
    fn: string
        filename or globstring of files on HDFS
    collection: boolean, optional
        Whether or not to return a high level collection
    lazy: boolean, optional
        Whether or not to start reading immediately

    Returns
    -------
    Dask bag (if collection=True) or Futures or dask values
    """
    from hdfs3 import HDFileSystem
    from dask import do
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    ensure_default_get(executor)
    filenames = sorted(hdfs.glob(fn))
    blocks = [block for fn in filenames
              for block in read_bytes(fn, executor, hdfs, lazy=True,
                                      delimiter=lineterminator.encode())]
    strings = [do(bytes.decode)(b, encoding, errors) for b in blocks]
    lines = [do(unicode.split)(s, lineterminator) for s in strings]

    from dask.bag import from_imperative
    if collection:
        result = from_imperative(lines).filter(None)
    else:
        result = lines
    if not lazy:
        if collection:
            result = executor.persist(result)
        else:
            result = executor.compute(result)
    return result
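# A minimal usage sketch for read_text; the scheduler address and the
# globstring are hypothetical:
from distributed import Executor

e = Executor('127.0.0.1:8786')
b = read_text('/data/logs/*.txt', executor=e)        # lazy dask Bag
words = b.map(lambda line: line.split()).concat()
top = words.frequencies().topk(10, key=lambda kv: kv[1]).compute()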
def test_progress_stream(e, s, a, b):
    futures = e.map(div, [1] * 10, range(10))

    x = 1
    for i in range(5):
        x = do(inc)(x)
    future = e.compute(x)

    yield _wait(futures + [future])

    stream = yield progress_stream(s.address, interval=0.010)
    msg = yield read(stream)
    assert msg == {'all': {'div': 10, 'inc': 5, 'finalize': 1},
                   'erred': {'div': 1},
                   'in_memory': {'div': 9, 'finalize': 1},
                   'released': {'div': 1, 'inc': 5}}

    d = progress_quads(msg)
    assert d == {'name': ['div', 'inc', 'finalize'],
                 'all': [10, 5, 1],
                 'in_memory': [9, 0, 1],
                 'in_memory_right': [1, 1, 1],
                 'fraction': ['10 / 10', '5 / 5', '1 / 1'],
                 'erred': [1, 0, 0],
                 'erred_left': [0.9, 1, 1],
                 'released': [1, 5, 0],
                 'released_right': [0.1, 1, 0],
                 'top': [0.7, 1.7, 2.7],
                 'center': [0.5, 1.5, 2.5],
                 'bottom': [0.3, 1.3, 2.3]}

    stream.close()
@gen.coroutine
def _read_avro(fn, executor=None, hdfs=None, lazy=False, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = hdfs.glob(fn)

    blockss = []
    for fn in filenames:
        with hdfs.open(fn, 'r') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'])
        # NB: the inner comprehension re-iterates all filenames on every
        # pass of the outer loop, duplicating blocks across files.
        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])

    lazy_values = [do(avro_body)(b, header)
                   for blocks in blockss for b in blocks]

    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(*lazy_values)
        raise gen.Return(futures)
@gen.coroutine
def _read_avro(path, executor=None, hdfs=None, lazy=True, **kwargs):
    """ See distributed.hdfs.read_avro for docstring """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = walk_glob(hdfs, path)

    blockss = []
    for fn in filenames:
        with hdfs.open(fn, 'rb') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'].decode())
        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])  # TODO: why is filenames used twice?

    lazy_values = [do(avro_body)(b, header)
                   for blocks in blockss for b in blocks]

    if lazy:
        raise gen.Return(lazy_values)
    else:
        futures = executor.compute(lazy_values)
        raise gen.Return(futures)
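# Why splitting on header['sync'] works: an Avro container file begins with
# a header and is followed by data blocks, each terminated by the same
# 16-byte sync marker recorded in that header. Cutting the raw byte stream
# on the marker therefore aligns with whole blocks, and not_zero=True skips
# the header at offset zero. A small local illustration; 'example.avro' is
# a hypothetical file:
import fastavro

with open('example.avro', 'rb') as f:
    header = fastavro.reader(f)._header
assert len(header['sync']) == 16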
def read_text(bucket_name, prefix='', path_delimiter='', encoding='utf-8',
              errors='strict', lineterminator='\n', executor=None, anon=None,
              collection=True, lazy=True, compression=None):
    """ Read lines of text from S3

    Parameters
    ----------
    bucket_name: string
        Name of S3 bucket like ``'my-bucket'``
    prefix: string
        Prefix of key name to match like ``'/data/2016/'``
    path_delimiter: string (optional)
        Delimiter like ``'/'`` to define implicit S3 directory structure
    compression: {None, 'gzip'}

    Returns
    -------
    Dask bag
    """
    from dask import do
    import dask.bag as db
    executor = default_executor(executor)
    blocks = read_bytes(bucket_name, prefix, path_delimiter,
                        executor=executor, lazy=True, anon=anon)
    if compression:
        blocks = map(do(decompress[compression]), blocks)
    lists = [b.decode(encoding, errors).split(lineterminator) for b in blocks]
    if collection:
        ensure_default_get(executor)
        b = db.from_imperative(lists).filter(None)
        if lazy:
            return b
        else:
            return executor.persist(b)[0]
    else:
        if lazy:
            ensure_default_get(executor)
            return lists
        else:
            return executor.compute(lists)
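# A usage sketch for the S3 variant; the bucket, prefix, and scheduler
# address are hypothetical:
from distributed import Executor

e = Executor('127.0.0.1:8786')
b = read_text('my-bucket', prefix='data/2016/', compression='gzip',
              executor=e)
print(b.take(3))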
def test_from_imperative():
    from dask import do
    dfs = [do(tm.makeTimeDataFrame)(i) for i in range(1, 5)]
    df = dd.from_imperative(dfs, columns=['A', 'B', 'C', 'D'])

    assert (df.compute().columns == df.columns).all()
    assert list(df.map_partitions(len).compute()) == [1, 2, 3, 4]

    ss = [df.A for df in dfs]
    s = dd.from_imperative(ss, columns='A')

    assert s.compute().name == s.name
    assert list(s.map_partitions(len).compute()) == [1, 2, 3, 4]
def read_avro(path, executor=None, hdfs=None, lazy=True, **kwargs):
    """ Read avro encoded data from bytes on HDFS

    Parameters
    ----------
    path: string
        filename or globstring of avro files on HDFS
    lazy: boolean, optional
        If True return dask Value objects

    Returns
    -------
    List of futures of Python objects
    """
    from hdfs3 import HDFileSystem
    from dask import do
    import fastavro
    hdfs = hdfs or HDFileSystem()
    executor = default_executor(executor)
    filenames = walk_glob(hdfs, path)

    blockss = []
    for fn in filenames:
        with hdfs.open(fn, 'rb') as f:
            av = fastavro.reader(f)
            header = av._header
        schema = json.loads(header['meta']['avro.schema'].decode())
        blockss.extend([read_bytes(fn, executor, hdfs, lazy=True,
                                   delimiter=header['sync'], not_zero=True)
                        for fn in filenames])  # TODO: why is filenames used twice?

    lazy_values = [do(avro_body)(b, header)
                   for blocks in blockss for b in blocks]

    if lazy:
        return lazy_values
    else:
        futures = executor.compute(lazy_values)
        return futures
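# A usage sketch for read_avro; the globstring is hypothetical. Each future
# materializes to the list of records decoded from one Avro block:
futures = read_avro('/data/events/*.avro', lazy=False)
sample = futures[0].result()       # records from the first block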
def test_progress_stream(c, s, a, b):
    futures = c.map(div, [1] * 10, range(10))

    x = 1
    for i in range(5):
        x = do(inc)(x)
    future = c.compute(x)

    yield _wait(futures + [future])

    stream = yield progress_stream(s.address, interval=0.010)
    msg = yield read(stream)
    nbytes = msg.pop('nbytes')
    assert msg == {'all': {'div': 10, 'inc': 5, 'finalize': 1},
                   'erred': {'div': 1},
                   'memory': {'div': 9, 'finalize': 1},
                   'released': {'inc': 5}}
    assert set(nbytes) == set(msg['all'])
    assert all(v > 0 for v in nbytes.values())

    assert progress_quads(msg)

    stream.close()
def test_progress_stream(c, s, a, b):
    futures = c.map(div, [1] * 10, range(10))

    x = 1
    for i in range(5):
        x = do(inc)(x)
    future = c.compute(x)

    yield _wait(futures + [future])

    comm = yield progress_stream(s.address, interval=0.010)
    msg = yield comm.read()
    nbytes = msg.pop('nbytes')
    assert msg == {'all': {'div': 10, 'inc': 5, 'finalize': 1},
                   'erred': {'div': 1},
                   'memory': {'div': 9, 'finalize': 1},
                   'released': {'inc': 5}}
    assert set(nbytes) == set(msg['all'])
    assert all(v > 0 for v in nbytes.values())

    assert progress_quads(msg)

    yield comm.close()
                      centers=centers, cluster_std=cluster_std)
    try:
        X = da.concatenate([X, da.from_array(np.concatenate(
            (data, extra), axis=0), chunks=(1000, 2))], axis=0)
    except NameError:
        # First iteration: X does not exist yet, so create it
        X = da.from_array(np.concatenate(
            (data, extra), axis=0), chunks=(1000, 2))

N = X.shape[0]
del data, extra, labels_true


def distance(a, b):
    """ Slow version of ``add`` to simulate work """
    return np.sum(np.sqrt(np.sum((a - b) ** 2, axis=1)))


# Parallel:
t = time()
pairs = [dask.do(distance)(X[i:], X[:N - i]) for i in xrange(1, N)]
result = dask.do(sum)(pairs)
my_sum = result.compute()
print 'parallel:\t{} s'.format(time() - t)

# Serial: comment this out if you use a high N, as it will eat RAM!
t = time()
Y = scipy.spatial.distance.pdist(X, 'euclidean')
print 'serial:\t\t{} s'.format(time() - t)

# There is minor rounding error after 8 decimal places.
assert np.round(np.sum(Y)) == np.round(my_sum)
print 'sum = {}'.format(my_sum)
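# Why the shifted slices above cover every pair exactly once: offset i
# pairs row j with row j + i, so i = 1..N-1 enumerates each unordered pair
# of rows a single time. A quick NumPy-only check on a small hypothetical
# array:
import numpy as np
import scipy.spatial.distance

Xs = np.random.randn(5, 2)
shifted = sum(np.sum(np.sqrt(np.sum((Xs[i:] - Xs[:5 - i]) ** 2, axis=1)))
              for i in range(1, 5))
assert np.allclose(shifted, scipy.spatial.distance.pdist(Xs).sum())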
def parallel_estimate_pi(nsamples):
    points = [do(is_inside_circle)() for i in range(nsamples)]
    return 4. * do(sum)(points) / nsamples
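# parallel_estimate_pi assumes an is_inside_circle helper; a hypothetical
# sketch that samples one point uniformly from the unit square and reports
# whether it lands inside the quarter circle of radius 1:
import random


def is_inside_circle():
    x, y = random.random(), random.random()
    return x * x + y * y <= 1

# Note that this version creates one task per sample, so scheduler overhead
# dominates; the chunked variant at the end of this section batches k
# samples per task instead.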
    return svm.score(X[test_idx, :], y[test_idx])


d = 10
n = 100
y = np.sign(np.random.randn(n))
X = np.random.randn(n, d)
reg_params = np.logspace(-2, 2, 5)

n_folds = 4
kf_test = cross_validation.KFold(n, n_folds=n_folds)

score_params = []  # NB: accumulates across outer folds without resetting
test_scores = []
for model_sel_idx, test_idx in kf_test:
    X_train = X[model_sel_idx]
    y_train = y[model_sel_idx]
    for reg_param in reg_params:
        kf = cross_validation.KFold(len(model_sel_idx), n_folds=n_folds)
        scores = [do(train_test)(reg_param, train_idx, val_idx,
                                 X_train, y_train)
                  for train_idx, val_idx in kf]
        score = do(sum)(scores) / n_folds
        score_params.append((score, reg_param))
    best_param = do(max)(score_params)[1]
    test_scores.append(do(train_test)(best_param, model_sel_idx, test_idx,
                                      X, y))

test_score = do(sum)(test_scores) / n_folds
print(test_score.compute())
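# The snippet above shows only the final line of train_test. A hypothetical
# completion consistent with that tail; the model choice is assumed:
from sklearn.svm import LinearSVC


def train_test(reg_param, train_idx, test_idx, X, y):
    svm = LinearSVC(C=reg_param)
    svm.fit(X[train_idx], y[train_idx])
    return svm.score(X[test_idx, :], y[test_idx])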
def parallel_estimate_pi(nsamples, k):
    points = [do(how_many_inside_circle)(k)
              for i in range(int(nsamples / k))]
    if nsamples % k != 0:  # doesn't divide cleanly
        points.append(do(how_many_inside_circle)(nsamples % k))
    return 4. * do(sum)(points) / nsamples
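# how_many_inside_circle is assumed above; a hypothetical sketch that
# batches k Monte Carlo samples into a single task and returns the count
# that fall inside the quarter circle:
import random


def how_many_inside_circle(k):
    return sum(random.random() ** 2 + random.random() ** 2 <= 1
               for _ in range(k))

# e.g. parallel_estimate_pi(10**6, 10**4).compute() runs 100 tasks of
# 10,000 samples each, amortizing the per-task scheduling cost.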