def from_imperative(dfs, metadata=None, divisions=None, columns=None):
    """ Create DataFrame from many imperative objects

    Parameters
    ----------
    dfs: list of Values
        An iterable of dask.imperative.Value objects, such as come from
        dask.do.  These comprise the individual partitions of the resulting
        dataframe.
    metadata: list or string of column names, or empty dataframe
    divisions: list or None
    """
    if columns is not None:
        print("Deprecation warning: Use metadata argument, not columns")
        metadata = columns
    from dask.imperative import Value
    if isinstance(dfs, Value):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = 'from-imperative-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))

    if divisions is None:
        divisions = [None] * (len(dfs) + 1)

    if isinstance(metadata, str):
        return Series(merge(dsk, dsk2), name, metadata, divisions)
    else:
        return DataFrame(merge(dsk, dsk2), name, metadata, divisions)
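# A minimal standalone sketch (not from the source) of the graph-merge
# pattern used above: per-value graphs are combined with toolz.merge, and
# each partition key is aliased to the underlying value's key.  The graph
# contents here are toy stand-ins.
from toolz import merge

graphs = [{'a': 1}, {'b': 2}, {'c': 3}]   # stand-ins for each df.dask
keys = ['a', 'b', 'c']                    # stand-ins for each df.key
name = 'from-imperative-demo'             # hypothetical name
dsk = merge(graphs)                       # merge also accepts a list of dicts
dsk2 = dict(zip([(name, i) for i in range(3)], keys))
assert merge(dsk, dsk2) == {'a': 1, 'b': 2, 'c': 3,
                            (name, 0): 'a', (name, 1): 'b', (name, 2): 'c'}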
def test_to_task_dask():
    a = delayed(1, name='a')
    b = delayed(2, name='b')
    task, dask = to_task_dask([a, b, 3])
    assert task == ['a', 'b', 3]

    task, dask = to_task_dask((a, b, 3))
    assert task == (tuple, ['a', 'b', 3])
    assert dict(dask) == merge(a.dask, b.dask)

    task, dask = to_task_dask({a: 1, b: 2})
    assert (task == (dict, [['b', 2], ['a', 1]]) or
            task == (dict, [['a', 1], ['b', 2]]))
    assert dict(dask) == merge(a.dask, b.dask)

    f = namedtuple('f', ['x', 'y'])
    x = f(1, 2)
    task, dask = to_task_dask(x)
    assert task == x
    assert dict(dask) == {}

    # Issue https://github.com/dask/dask/issues/2107
    class MyClass(dict):
        pass

    task, dask = to_task_dask(MyClass())
    assert type(task) is MyClass
    assert dict(dask) == {}
def __getitem__(self, key):
    if isinstance(key, (str, unicode)):
        name = self._name + '.' + key
        if key in self.columns:
            dsk = dict(((name, i), (operator.getitem, (self._name, i), key))
                       for i in range(self.npartitions))
            return Series(merge(self.dask, dsk), name, key, self.divisions)
    if isinstance(key, list):
        name = '%s[%s]' % (self._name, str(key))
        if all(k in self.columns for k in key):
            dsk = dict(((name, i), (operator.getitem,
                                    (self._name, i), (list, key)))
                       for i in range(self.npartitions))
            return DataFrame(merge(self.dask, dsk), name, key,
                             self.divisions)
    if isinstance(key, Series) and self.divisions == key.divisions:
        name = next(names)
        dsk = dict(((name, i), (operator.getitem, (self._name, i),
                                (key._name, i)))
                   for i in range(self.npartitions))
        return DataFrame(merge(self.dask, key.dask, dsk), name,
                         self.columns, self.divisions)
    raise NotImplementedError()
def _loc(self, ind):
    """ Helper function for the .loc accessor """
    if not self.known_divisions:
        raise ValueError(
            "Can not use loc on DataFrame without known divisions")
    name = next(names)
    if not isinstance(ind, slice):
        part = self._partition_of_index_value(ind)
        dsk = {(name, 0): (lambda df: df.loc[ind], (self._name, part))}
        return type(self)(merge(self.dask, dsk), name, self.column_info, [])
    else:
        assert ind.step in (None, 1)
        if ind.start:
            start = self._partition_of_index_value(ind.start)
        else:
            start = 0
        if ind.stop is not None:
            stop = self._partition_of_index_value(ind.stop)
        else:
            stop = self.npartitions - 1
        if stop == start:
            dsk = {(name, 0): (_loc, (self._name, start),
                               ind.start, ind.stop)}
        else:
            dsk = merge(
                {(name, 0): (_loc, (self._name, start), ind.start, None)},
                dict(((name, i), (self._name, start + i))
                     for i in range(1, stop - start)),
                {(name, stop - start): (_loc, (self._name, stop),
                                        None, ind.stop)})
        return type(self)(merge(self.dask, dsk), name, self.column_info,
                          self.divisions[start:stop])
def elemwise(op, *args, **kwargs):
    """ Elementwise operation for dask.Dataframes """
    columns = kwargs.get('columns', None)
    name = kwargs.get('name', None)

    _name = next(names)

    frames = [arg for arg in args if isinstance(arg, _Frame)]
    other = [(i, arg) for i, arg in enumerate(args)
             if not isinstance(arg, _Frame)]

    if other:
        op2 = partial_by_order(op, other)
    else:
        op2 = op

    assert all(f.divisions == frames[0].divisions for f in frames)
    assert all(f.npartitions == frames[0].npartitions for f in frames)

    dsk = dict(((_name, i), (op2,) + frs)
               for i, frs in enumerate(zip(*[f._keys() for f in frames])))

    if columns is not None:
        return DataFrame(merge(dsk, *[f.dask for f in frames]),
                         _name, columns, frames[0].divisions)
    else:
        column_name = name or consistent_name(n for f in frames
                                              for n in f.columns)
        return Series(merge(dsk, *[f.dask for f in frames]),
                      _name, column_name, frames[0].divisions)
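# A hedged illustration (not from the source) of how elemwise builds one
# task per partition: the input frames' partition keys are zipped together
# and each tuple becomes a task applying the operator.  Key names here are
# hypothetical.
import operator

a_keys = [('a', 0), ('a', 1)]   # stand-ins for one frame's _keys()
b_keys = [('b', 0), ('b', 1)]   # stand-ins for another frame's _keys()
name = 'elemwise-demo'
dsk = dict(((name, i), (operator.add,) + frs)
           for i, frs in enumerate(zip(a_keys, b_keys)))
assert dsk[(name, 0)] == (operator.add, ('a', 0), ('b', 0))
assert dsk[(name, 1)] == (operator.add, ('a', 1), ('b', 1))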
def to_hdf(df, path_or_buf, key, mode='a', append=False, complevel=0,
           complib=None, fletcher32=False, get=get_sync, dask_kwargs=None,
           name_function=None, compute=True, **kwargs):
    name = 'to-hdf-' + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, 'to_hdf')

    # if path_or_buf is string, format using i_name
    if isinstance(path_or_buf, str):
        if path_or_buf.count('*') + key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in "
                             "file path and dataset key")
        fmt_obj = lambda path_or_buf, i_name: path_or_buf.replace('*', i_name)
    else:
        if key.count('*') > 1:
            raise ValueError("A maximum of one asterisk is accepted in "
                             "dataset key")
        fmt_obj = lambda path_or_buf, _: path_or_buf

    if name_function is None:
        name_function = build_name_function(df.npartitions - 1)

    # we guarantee partition order is preserved when it is saved and read
    # so we enforce name_function to maintain the order of its input.
    if '*' in key or (isinstance(path_or_buf, str) and '*' in path_or_buf):
        formatted_names = [name_function(i) for i in range(df.npartitions)]
        if formatted_names != sorted(formatted_names):
            warn("In order to preserve order between partitions "
                 "name_function must preserve the order of its input")

    dsk = dict()
    i_name = name_function(0)
    dsk[(name, 0)] = (_link, None,
                      (apply, pd_to_hdf,
                       (tuple, [(df._name, 0),
                                fmt_obj(path_or_buf, i_name),
                                key.replace('*', i_name)]),
                       merge(kwargs,
                             {'mode': mode, 'format': 'table',
                              'append': append, 'complevel': complevel,
                              'complib': complib,
                              'fletcher32': fletcher32})))
    for i in range(1, df.npartitions):
        i_name = name_function(i)
        dsk[(name, i)] = (_link, (name, i - 1),
                          (apply, pd_to_hdf,
                           (tuple, [(df._name, i),
                                    fmt_obj(path_or_buf, i_name),
                                    key.replace('*', i_name)]),
                           merge(kwargs,
                                 {'mode': 'a', 'format': 'table',
                                  'append': True, 'complevel': complevel,
                                  'complib': complib,
                                  'fletcher32': fletcher32})))

    dask_kwargs = dask_kwargs or {}

    dsk = merge(df.dask, dsk)
    key = (name, df.npartitions - 1)
    if compute:
        return DataFrame._get(dsk, key, get=get, **dask_kwargs)
    else:
        return Delayed(key, [dsk])
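# A minimal sketch of the `_link` chaining trick above, assuming `_link`
# simply ignores its first argument: each task takes the previous task's
# key as a dummy dependency, so partitions are appended to the HDF file in
# order even under a parallel scheduler.  `_link_demo` is hypothetical.
def _link_demo(dep, value):
    # `dep` exists only to force ordering; its value is unused.
    return value

dsk_demo = {('hdf-demo', 0): (_link_demo, None, 'wrote part 0')}
for i in range(1, 3):
    dsk_demo[('hdf-demo', i)] = (_link_demo, ('hdf-demo', i - 1),
                                 'wrote part %d' % i)
# ('hdf-demo', 2) now transitively depends on parts 0 and 1.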
def set_partition(f, index, divisions, get=threaded.get, **kwargs):
    """ Set new partitioning along index given divisions """
    divisions = unique(divisions)
    name = next(names)
    if isinstance(index, Series):
        assert index.divisions == f.divisions
        dsk = dict(((name, i), (f._partition_type.set_index, block, ind))
                   for i, (block, ind)
                   in enumerate(zip(f._keys(), index._keys())))
        f2 = type(f)(merge(f.dask, index.dask, dsk), name,
                     f.column_info, f.divisions)
    else:
        dsk = dict(((name, i), (f._partition_type.set_index, block, index))
                   for i, block in enumerate(f._keys()))
        f2 = type(f)(merge(f.dask, dsk), name, f.column_info, f.divisions)

    head = f2.head()
    pf = pframe(like=head, divisions=divisions, **kwargs)

    def append(block):
        pf.append(block)
        return 0

    f2.map_blocks(append).compute(get=get)
    pf.flush()

    return from_pframe(pf)
def from_imperative(dfs, columns, divisions=None):
    """ Create DataFrame from many imperative objects

    Parameters
    ----------
    dfs: list of Values
        An iterable of dask.imperative.Value objects, such as come from
        dask.do.  These comprise the individual partitions of the resulting
        dataframe.
    columns: list or string
        The list of column names if the result is a DataFrame,
        or the single column name if the result is a Series.
    divisions: list or None
    """
    from dask.imperative import Value
    if isinstance(dfs, Value):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = 'from-imperative-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))

    if divisions is None:
        divisions = [None] * (len(dfs) + 1)

    if isinstance(columns, str):
        return Series(merge(dsk, dsk2), name, columns, divisions)
    else:
        return DataFrame(merge(dsk, dsk2), name, columns, divisions)
def apply(self, latitude, longitude, latitude_mask, **kwargs):
    latitude = (latitude.T - data.train_gps_mean[0]) / data.train_gps_std[0]
    longitude = (longitude.T - data.train_gps_mean[1]) / data.train_gps_std[1]
    latitude_mask = latitude_mask.T

    rec_in = tensor.concatenate((latitude[:, :, None],
                                 longitude[:, :, None]),
                                axis=2)
    path = self.rec.apply(
        merge(self.fwd_fork.apply(rec_in, as_dict=True),
              {'mask': latitude_mask}),
        merge(self.bkwd_fork.apply(rec_in, as_dict=True),
              {'mask': latitude_mask}))[0]

    last_id = tensor.cast(latitude_mask.sum(axis=0) - 1, dtype='int64')
    path_representation = (path[0][:, -self.config.rec_state_dim:],
                           path[last_id - 1,
                                tensor.arange(last_id.shape[0])]
                           [:, :self.config.rec_state_dim])

    embeddings = tuple(self.context_embedder.apply(
        **{k: kwargs[k] for k in self.context_embedder.inputs}))

    inputs = tensor.concatenate(path_representation + embeddings, axis=1)
    outputs = self.rec_to_output.apply(inputs)

    return outputs
def f(c, a, b):
    data = yield _scatter((c.ip, c.port), [1, 2, 3])
    assert c.ip in str(data[0])
    assert c.ip in repr(data[0])
    assert merge(a.data, b.data) == \
        {d.key: i for d, i in zip(data, [1, 2, 3])}
    assert set(c.who_has) == {d.key for d in data}
    assert all(len(v) == 1 for v in c.who_has.values())

    result = yield [d._get() for d in data]
    assert result == [1, 2, 3]

    yield data[0]._delete()

    assert merge(a.data, b.data) == \
        {d.key: i for d, i in zip(data[1:], [2, 3])}
    assert data[0].key not in c.who_has

    data = yield scatter_to_workers((c.ip, c.port),
                                    [a.address, b.address],
                                    [4, 5, 6])

    m = merge(a.data, b.data)
    for d, v in zip(data, [4, 5, 6]):
        assert m[d.key] == v

    result = yield _gather((c.ip, c.port), data)
    assert result == [4, 5, 6]
def apply(self, source_sentence, source_sentence_mask):
    """Produces source annotations, either non-recurrently or with
    a bidirectional RNN architecture.
    """
    # Time as first dimension
    source_sentence = source_sentence.T
    source_sentence_mask = source_sentence_mask.T

    embeddings = self.lookup.apply(source_sentence)
    representation = self.bidirs[0].apply(
        merge(self.fwd_forks[0].apply(embeddings, as_dict=True),
              {'mask': source_sentence_mask}),
        merge(self.back_forks[0].apply(embeddings, as_dict=True),
              {'mask': source_sentence_mask}))
    for i in xrange(1, self.n_layers):
        if self.skip_connections:
            inp = tensor.concatenate([representation, embeddings], axis=2)
        else:
            inp = representation
        representation = self.bidirs[i].apply(
            merge(self.fwd_forks[i].apply(inp, as_dict=True),
                  {'mask': source_sentence_mask}),
            merge(self.back_forks[i].apply(inp, as_dict=True),
                  {'mask': source_sentence_mask}))
    return representation, source_sentence_mask
def to_hdf(df, path_or_buf, key, mode='a', append=False, complevel=0,
           complib=None, fletcher32=False, get=get_sync, dask_kwargs=None,
           **kwargs):
    name = 'to-hdf-' + uuid.uuid1().hex

    pd_to_hdf = getattr(df._partition_type, 'to_hdf')

    dsk = dict()
    dsk[(name, 0)] = (_link, None,
                      (apply, pd_to_hdf,
                       (tuple, [(df._name, 0), path_or_buf, key]),
                       merge(kwargs,
                             {'mode': mode, 'format': 'table',
                              'append': append, 'complevel': complevel,
                              'complib': complib,
                              'fletcher32': fletcher32})))
    for i in range(1, df.npartitions):
        dsk[(name, i)] = (_link, (name, i - 1),
                          (apply, pd_to_hdf,
                           (tuple, [(df._name, i), path_or_buf, key]),
                           merge(kwargs,
                                 {'mode': 'a', 'format': 'table',
                                  'append': True, 'complevel': complevel,
                                  'complib': complib,
                                  'fletcher32': fletcher32})))

    dask_kwargs = dask_kwargs or {}

    DataFrame._get(merge(df.dask, dsk), (name, df.npartitions - 1),
                   get=get, **dask_kwargs)
def elemwise(op, *args, **kwargs):
    """ Elementwise operation for dask.Dataframes """
    columns = kwargs.get('columns', None)
    name = kwargs.get('name', None)

    _name = 'elemwise' + next(tokens)

    dfs = [arg for arg in args if isinstance(arg, _Frame)]
    other = [(i, arg) for i, arg in enumerate(args)
             if not isinstance(arg, _Frame)]

    if other:
        op2 = partial_by_order(op, other)
    else:
        op2 = op

    if not all(df.divisions == dfs[0].divisions for df in dfs):
        msg = 'All dask.DataFrames and dask.Series must have the same divisions'
        raise ValueError(msg)
    if not all(df.npartitions == dfs[0].npartitions for df in dfs):
        msg = 'All dask.DataFrames and dask.Series must have the same npartitions'
        raise ValueError(msg)

    dsk = dict(((_name, i), (op2,) + frs)
               for i, frs in enumerate(zip(*[df._keys() for df in dfs])))

    if columns is not None:
        return DataFrame(merge(dsk, *[df.dask for df in dfs]),
                         _name, columns, dfs[0].divisions)
    else:
        column_name = name or consistent_name(n for df in dfs
                                              for n in df.columns)
        return Series(merge(dsk, *[df.dask for df in dfs]),
                      _name, column_name, dfs[0].divisions)
def _loc_slice(self, ind):
    name = 'loc-slice' + next(tokens)
    assert ind.step in (None, 1)
    if ind.start:
        start = _partition_of_index_value(self.divisions, ind.start)
    else:
        start = 0
    if ind.stop is not None:
        stop = _partition_of_index_value(self.divisions, ind.stop)
    else:
        stop = self.npartitions - 1

    istart = _coerce_loc_index(self.divisions, ind.start)
    istop = _coerce_loc_index(self.divisions, ind.stop)

    if stop == start:
        dsk = {(name, 0): (_loc, (self._name, start), ind.start, ind.stop)}
        divisions = [istart, istop]
    else:
        dsk = merge(
            {(name, 0): (_loc, (self._name, start), ind.start, None)},
            dict(((name, i), (self._name, start + i))
                 for i in range(1, stop - start)),
            {(name, stop - start): (_loc, (self._name, stop),
                                    None, ind.stop)})
        divisions = ((max(istart, self.divisions[start])
                      if ind.start is not None
                      else self.divisions[0],) +
                     self.divisions[start + 1:stop + 1] +
                     (min(istop, self.divisions[stop + 1])
                      if ind.stop is not None
                      else self.divisions[-1],))

    assert len(divisions) == len(dsk) + 1

    return type(self)(merge(self.dask, dsk), name, self.column_info,
                      divisions)
def f(c, a, b):
    keys = yield _scatter((c.ip, c.port), [1, 2, 3])
    assert merge(a.data, b.data) == \
        {k: i for k, i in zip(keys, [1, 2, 3])}
    assert set(c.who_has) == set(keys)
    assert all(len(v) == 1 for v in c.who_has.values())

    keys2, who_has, nbytes = yield scatter_to_workers(
        [a.address, b.address], [4, 5, 6])

    m = merge(a.data, b.data)
    for k, v in zip(keys2, [4, 5, 6]):
        assert m[k] == v
    assert isinstance(who_has, dict)
    assert set(concat(who_has.values())) == {a.address, b.address}
    assert len(who_has) == len(keys2)

    assert isinstance(nbytes, dict)
    assert set(nbytes) == set(who_has)
    assert all(isinstance(v, int) for v in nbytes.values())

    result = yield _gather((c.ip, c.port), keys2)
    assert result == [4, 5, 6]
def from_imperative(values):
    """ Create bag from many imperative objects

    Parameters
    ----------
    values: list of Values
        An iterable of dask.imperative.Value objects, such as come from
        dask.do.  These comprise the individual partitions of the
        resulting bag.

    Returns
    -------
    Bag

    Examples
    --------
    >>> b = from_imperative([x, y, z])  # doctest: +SKIP
    """
    from dask.imperative import Value
    if isinstance(values, Value):
        values = [values]
    dsk = merge(v.dask for v in values)

    name = 'bag-from-imperative-' + tokenize(*values)
    names = [(name, i) for i in range(len(values))]
    values = [v.key for v in values]
    dsk2 = dict(zip(names, values))

    return Bag(merge(dsk, dsk2), name, len(values))
def read_csv(fn, *args, **kwargs):
    chunksize = kwargs.pop('chunksize', 2**16)
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    header = kwargs.get('header', 1)
    nlines = linecount(fn) - header
    nchunks = int(ceil(1.0 * nlines / chunksize))

    read = next(read_csv_names)
    blockdivs = tuple(range(chunksize, nlines, chunksize))

    one_chunk = pd.read_csv(fn, *args, nrows=100, **kwargs)

    cols = []
    if categorize or index:
        if categorize:
            category_columns = [c for c in one_chunk.dtypes.index
                                if one_chunk.dtypes[c] == 'O']
        else:
            category_columns = []
        cols = category_columns + ([index] if index else [])
        d = read_csv(fn, *args, **merge(kwargs,
                                        dict(chunksize=chunksize,
                                             usecols=cols,
                                             categorize=False,
                                             parse_dates=None)))
        categories = [d[c].drop_duplicates() for c in category_columns]

        if index:
            quantiles = d[index].quantiles(
                np.linspace(0, 100, nchunks + 1)[1:-1])
            result = compute(quantiles, *categories)
            quantiles, categories = result[0], result[1:]
        else:
            categories = compute(*categories)

        categories = dict(zip(category_columns, categories))

    kwargs['chunksize'] = chunksize
    load = {(read, -1): (partial(pd.read_csv, *args, **kwargs), fn)}
    load.update(dict(((read, i), (get_chunk, (read, i - 1), chunksize * i))
                     for i in range(nchunks)))

    name = next(names)
    dsk = dict(((name, i), (getitem, (read, i), 0))
               for i in range(nchunks))

    result = DataFrame(merge(dsk, load), name, one_chunk.columns, blockdivs)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_blocks(func, columns=result.columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    groups = groupby(attrgetter('_optimize'), args)

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = args[0]._default_get
        if not all(a._default_get == get for a in args):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])
    keys = [arg._keys() for arg in args]
    results = get(dsk, keys, **kwargs)

    return tuple(a._finalize(a, r) for a, r in zip(args, results))
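# A standalone demonstration (not from the source) of the grouping step in
# compute(): collections are bucketed by their optimizer so that each
# optimizer runs once over the union of its collections' graphs.
from operator import attrgetter
from toolz import groupby

class Coll(object):
    # toy stand-in for a dask collection exposing _optimize
    def __init__(self, name, opt):
        self.name = name
        self._optimize = opt

opt_a, opt_b = object(), object()
colls = [Coll('x', opt_a), Coll('y', opt_a), Coll('z', opt_b)]
groups = groupby(attrgetter('_optimize'), colls)
assert sorted(len(v) for v in groups.values()) == [1, 2]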
def persist(self, collections):
    """ Persist dask collections on cluster

    Starts computation of the collection on the cluster in the background.
    Provides a new dask collection that is semantically identical to the
    previous one, but now based off of futures currently in execution.

    Parameters
    ----------
    collections: sequence or single dask object
        Collections like dask.array or dataframe or dask.value objects

    Returns
    -------
    List of collections, or single collection, depending on type of input.

    Examples
    --------
    >>> xx = executor.persist(x)  # doctest: +SKIP
    >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

    See Also
    --------
    Executor.compute
    """
    if isinstance(collections, (tuple, list, set, frozenset)):
        singleton = False
    else:
        singleton = True
        collections = [collections]

    assert all(isinstance(c, Base) for c in collections)

    groups = groupby(lambda x: x._optimize, collections)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])

    d = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: v[0] for k, v in d.items()}
    dependencies = {k: v[1] for k, v in d.items()}

    for k, v in dsk2.items():
        dependencies[k] |= set(_deps(dsk, v))

    names = list({k for c in collections for k in flatten(c._keys())})

    self._send_to_scheduler({'op': 'update-graph',
                             'tasks': valmap(dumps_task, dsk2),
                             'dependencies': dependencies,
                             'keys': names,
                             'client': self.id})
    result = [redict_collection(c, {k: Future(k, self)
                                    for k in flatten(c._keys())})
              for c in collections]
    if singleton:
        return first(result)
    else:
        return result
def compute(self, *args, **kwargs):
    """ Compute dask collections on cluster

    Parameters
    ----------
    args: iterable of dask objects
        Collections like dask.array or dataframe or dask.value objects
    sync: bool (optional)
        Returns Futures if False (default) or concrete values if True

    Returns
    -------
    Tuple of Futures or concrete values

    Examples
    --------
    >>> from dask import do, value
    >>> from operator import add
    >>> x = do(add)(1, 2)
    >>> y = do(add)(x, x)
    >>> xx, yy = executor.compute(x, y)  # doctest: +SKIP
    >>> xx  # doctest: +SKIP
    <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
    >>> xx.result()  # doctest: +SKIP
    3
    >>> yy.result()  # doctest: +SKIP
    6
    """
    sync = kwargs.pop('sync', False)
    assert not kwargs
    if sync:
        return dask.compute(*args, get=self.get)

    variables = [a for a in args if isinstance(a, Base)]

    groups = groupby(lambda x: x._optimize, variables)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])
    names = ['finalize-%s' % tokenize(v) for v in variables]
    dsk2 = {name: (v._finalize, v, v._keys())
            for name, v in zip(names, variables)}

    self.loop.add_callback(self.scheduler_queue.put_nowait,
                           {'op': 'update-graph',
                            'dsk': merge(dsk, dsk2),
                            'keys': names})

    i = 0
    futures = []
    for arg in args:
        if isinstance(arg, Base):
            futures.append(Future(names[i], self))
            i += 1
        else:
            futures.append(arg)

    return futures
def from_dask_array(x, columns=None):
    """ Convert dask Array to dask DataFrame

    Converts a 2d array into a DataFrame and a 1d array into a Series.

    Parameters
    ----------
    x: da.Array
    columns: list or string
        list of column names if DataFrame, single string if Series

    Examples
    --------
    >>> import dask.array as da
    >>> import dask.dataframe as dd
    >>> x = da.ones((4, 2), chunks=(2, 2))
    >>> df = dd.io.from_dask_array(x, columns=['a', 'b'])
    >>> df.compute()
       a  b
    0  1  1
    1  1  1
    2  1  1
    3  1  1
    """
    name = 'from-dask-array' + next(tokens)
    divisions = [0]
    for c in x.chunks[0]:
        divisions.append(divisions[-1] + c)

    index = [(range, a, b)
             for a, b in zip(divisions[:-1], divisions[1:])]

    divisions[-1] -= 1

    if x.ndim == 1:
        dsk = dict(((name, i), (pd.Series, chunk, ind, x.dtype, columns))
                   for i, (chunk, ind) in enumerate(zip(x._keys(), index)))
        return Series(merge(x.dask, dsk), name, columns, divisions)

    elif x.ndim == 2:
        if columns is None:
            raise ValueError("Must provide columns for DataFrame")
        if len(columns) != x.shape[1]:
            raise ValueError("Columns must be the same length as array width\n"
                             "  columns: %s\n  width: %d"
                             % (str(columns), x.shape[1]))
        if len(x.chunks[1]) > 1:
            x = x.rechunk({1: x.shape[1]})
        dsk = dict(((name, i), (pd.DataFrame, chunk[0], ind, columns))
                   for i, (chunk, ind) in enumerate(zip(x._keys(), index)))
        return DataFrame(merge(x.dask, dsk), name, columns, divisions)

    else:
        raise ValueError("Array must have one or two dimensions.  Had %d"
                         % x.ndim)
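# A quick standalone check (not from the source) of the divisions
# arithmetic above: cumulative chunk sizes give the partition starts, and
# the last division is decremented because divisions label index values
# inclusively (the final one is the last valid index).
chunks0 = (2, 2)        # x.chunks[0] for da.ones((4, 2), chunks=(2, 2))
divisions = [0]
for c in chunks0:
    divisions.append(divisions[-1] + c)
divisions[-1] -= 1
assert divisions == [0, 2, 3]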
def apply(self, source_sentence_tbf, source_sentence_mask_tb=None):
    representation_tbf = self.bidir.apply(
        merge(self.fwd_fork.apply(source_sentence_tbf, as_dict=True),
              {'mask': source_sentence_mask_tb}),
        merge(self.back_fork.apply(source_sentence_tbf, as_dict=True),
              {'mask': source_sentence_mask_tb}))
    return representation_tbf
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If the object is a dask collection, it's
        computed and the result is returned. Otherwise it's passed through
        unchanged.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default
        is to check the global settings first, and then fall back to
        defaults for the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    if kwargs.get('optimize_graph', True):
        groups = groupby(attrgetter('_optimize'), variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val], **kwargs)
                     for opt, val in groups.items()])
    else:
        dsk = merge(var.dask for var in variables)

    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
def from_delayed(dfs, meta=None, divisions=None, prefix='from-delayed',
                 metadata=None):
    """ Create Dask DataFrame from many Dask Delayed objects

    Parameters
    ----------
    dfs : list of Delayed
        An iterable of ``dask.delayed.Delayed`` objects, such as come from
        ``dask.delayed``.  These comprise the individual partitions of the
        resulting dataframe.
    $META
    divisions : tuple, str, optional
        Partition boundaries along the index.
        For tuple, see http://dask.pydata.io/en/latest/dataframe-partitions.html
        For string 'sorted' will compute the delayed values to find index
        values.  Assumes that the indexes are mutually sorted.
        If None, then won't use index information
    prefix : str, optional
        Prefix to prepend to the keys.
    """
    if metadata is not None and meta is None:
        warn("Deprecation warning: Use meta keyword, not metadata")
        meta = metadata
    from dask.delayed import Delayed
    if isinstance(dfs, Delayed):
        dfs = [dfs]
    dsk = merge(df.dask for df in dfs)

    name = prefix + '-' + tokenize(*dfs)
    names = [(name, i) for i in range(len(dfs))]
    values = [df.key for df in dfs]
    dsk2 = dict(zip(names, values))
    dsk3 = merge(dsk, dsk2)

    if meta is None:
        meta = dfs[0].compute()
    if isinstance(meta, (str, pd.Series)):
        Frame = Series
    else:
        Frame = DataFrame

    if divisions is None or divisions == 'sorted':
        divs = [None] * (len(dfs) + 1)
    else:
        divs = tuple(divisions)
        if len(divs) != len(dfs) + 1:
            raise ValueError("divisions should be a tuple of len(dfs) + 1")

    df = Frame(dsk3, name, meta, divs)

    if divisions == 'sorted':
        from ..core import compute_divisions
        divisions = compute_divisions(df)
        df.divisions = divisions

    return df
def read_csv(fn, *args, **kwargs):
    chunkbytes = kwargs.pop('chunkbytes', 2**25)  # 32 MB
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if '*' in fn:
        return concat([read_csv(f, *args, **kwargs)
                       for f in sorted(glob(fn))])

    token = tokenize(os.path.getmtime(fn), args, kwargs)
    name = 'read-csv-%s-%s' % (fn, token)

    columns = kwargs.pop('columns')
    header = kwargs.pop('header')

    if 'nrows' in kwargs:  # Just create single partition
        dsk = {(name, 0): (apply, pd.read_csv, (fn,),
                           assoc(kwargs, 'header', header))}
        result = DataFrame(dsk, name, columns, [None, None])
    else:
        # Chunk sizes and numbers
        total_bytes = file_size(fn, kwargs['compression'])
        nchunks = int(ceil(total_bytes / chunkbytes))
        divisions = [None] * (nchunks + 1)

        first_kwargs = merge(kwargs, dict(header=header, compression=None))
        rest_kwargs = merge(kwargs, dict(header=None, compression=None))

        # Create dask graph
        dsk = dict(((name, i), (_read_csv, fn, i, chunkbytes,
                                kwargs['compression'], rest_kwargs))
                   for i in range(1, nchunks))
        dsk[(name, 0)] = (_read_csv, fn, 0, chunkbytes,
                          kwargs['compression'], first_kwargs)

        result = DataFrame(dsk, name, columns, divisions)

    if categorize or index:
        categories, quantiles = categories_and_quantiles(
            fn, args, kwargs, index, categorize, chunkbytes=chunkbytes)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_partitions(func, columns=columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result
def gather_from_workers(who_has, rpc=rpc, close=True, permissive=False):
    """ Gather data directly from peers

    Parameters
    ----------
    who_has: dict
        Dict mapping keys to sets of workers that may have that key

    Returns dict mapping key to value

    See Also
    --------
    gather
    _gather
    """
    bad_addresses = set()
    who_has = {k: set(v) for k, v in who_has.items()}
    results = dict()
    all_bad_keys = set()

    while len(results) + len(all_bad_keys) < len(who_has):
        d = defaultdict(list)
        rev = dict()
        bad_keys = set()
        for key, addresses in who_has.items():
            if key in results:
                continue
            try:
                addr = random.choice(list(addresses - bad_addresses))
                d[addr].append(key)
                rev[key] = addr
            except IndexError:
                bad_keys.add(key)

        if bad_keys:
            if permissive:
                all_bad_keys |= bad_keys
            else:
                raise KeyError(*bad_keys)

        rpcs = {addr: rpc(addr) for addr in d}
        try:
            coroutines = [rpcs[address].get_data(keys=keys, close=close)
                          for address, keys in d.items()]
            response = yield ignore_exceptions(coroutines, EnvironmentError)
        finally:
            for r in rpcs.values():
                r.close_rpc()

        response = merge(response)
        bad_addresses |= {v for k, v in rev.items() if k not in response}
        results.update(response)

    if permissive:
        raise Return((results, all_bad_keys))
    else:
        raise Return(results)
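# merge() also accepts a single list of dicts, which is how the per-worker
# responses above are flattened into one mapping (standalone demo, not from
# the source).
from toolz import merge

responses = [{'x': 1}, {'y': 2}, {'z': 3}]   # one dict per contacted worker
assert merge(responses) == {'x': 1, 'y': 2, 'z': 3}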
def _construct_dask_df_with_divisions(df):
    """Construct the new task graph and make a new dask.dataframe around it"""
    divisions = _get_divisions(df)
    name = 'csv-index' + df._name
    dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
           for i in range(df.npartitions)}

    from toolz import merge

    if isinstance(df, dd.DataFrame):
        return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
    elif isinstance(df, dd.Series):
        return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
def annotate_fusions_for_assembly(
        fusions,             # type: Iterable[Fusion]
        reference,           # type: TranscriptReference
        assembly,            # type: TranscriptReference
        skip_annotated=True  # type: bool
):  # type: (...) -> Iterable[Fusion]
    """Annotates fusions using the assembled GTF."""

    def _exon_region(exon):
        return (exon.chromosome, exon.start, exon.end)

    for fusion in fusions:
        if skip_annotated and 'gene_id' in fusion.metadata:
            # Already annotated
            yield fusion
        else:
            # Identify overlapped transcripts.
            transcripts = assembly.overlap_transcripts(fusion.genome_region)

            if len(transcripts) > 0:
                for transcript in transcripts:
                    # Lookup genes that overlap with exons.
                    exons = assembly.get_exons(transcript.id)
                    genes = set(
                        itertools.chain.from_iterable(
                            reference.overlap_genes(_exon_region(exon))
                            for exon in exons))

                    if len(genes) > 0:
                        for gene in genes:
                            # Yield with information from overlapping genes.
                            new_meta = {
                                'gene_name': gene.name,
                                'gene_strand': gene.strand,
                                'gene_id': gene.id,
                                'novel_transcript': transcript.id
                            }
                            yield fusion._replace(metadata=toolz.merge(
                                fusion.metadata, new_meta))
                    else:
                        # No gene overlap, yield with transcript info.
                        new_meta = {
                            'gene_name': transcript.id,
                            'gene_id': transcript.id,
                            'gene_strand': transcript.strand,
                            'novel_transcript': transcript.id
                        }
                        yield fusion._replace(
                            metadata=toolz.merge(fusion.metadata, new_meta))
            else:
                # No overlap.
                yield fusion
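# A self-contained sketch of the metadata-update pattern above, with a
# hypothetical two-field Fusion record: merge() returns a new dict, so
# _replace() yields a new record without mutating the original.
from collections import namedtuple
from toolz import merge

Fusion = namedtuple('Fusion', ['name', 'metadata'])
fusion = Fusion('f1', {'support': 12})
updated = fusion._replace(
    metadata=merge(fusion.metadata, {'gene_id': 'ENSG0'}))
assert fusion.metadata == {'support': 12}   # original left untouched
assert updated.metadata == {'support': 12, 'gene_id': 'ENSG0'}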
def _construct_dask_df_with_divisions(df):
    """Construct the new task graph and make a new dask.dataframe around it."""
    divisions = _get_divisions(df)

    # pylint: disable=protected-access
    name = 'csv-index' + df._name
    dsk = {(name, i): (_add_to_index, (df._name, i), divisions[i])
           for i in range(df.npartitions)}
    # pylint: enable=protected-access

    from toolz import merge  # pylint: disable=g-import-not-at-top

    if isinstance(df, dd.DataFrame):
        return dd.DataFrame(merge(dsk, df.dask), name, df.columns, divisions)
    elif isinstance(df, dd.Series):
        return dd.Series(merge(dsk, df.dask), name, df.name, divisions)
def reduction(self, perpartition, aggregate, split_every=None,
              out_type=Item, name=None):
    """ Reduce collection with reduction operators

    Parameters
    ----------
    perpartition: function
        reduction to apply to each partition
    aggregate: function
        reduction to apply to the results of all partitions
    split_every: int (optional)
        Group partitions into groups of this size while performing reduction
        Defaults to 8
    out_type: {Bag, Item}
        The out type of the result, Item if a single element, Bag if a list
        of elements.  Defaults to Item.

    Examples
    --------
    >>> b = from_sequence(range(10))
    >>> b.reduction(sum, sum).compute()
    45
    """
    if split_every is None:
        split_every = 8
    if split_every is False:
        split_every = self.npartitions

    token = tokenize(self, perpartition, aggregate, split_every)
    a = '%s-part-%s' % (name or funcname(perpartition), token)
    dsk = dict(((a, i), (perpartition, (self.name, i)))
               for i in range(self.npartitions))
    k = self.npartitions
    b = a
    fmt = '%s-aggregate-%s' % (name or funcname(aggregate), token)
    depth = 0

    while k > 1:
        c = fmt + str(depth)
        dsk2 = dict(((c, i), (aggregate, [(b, j) for j in inds]))
                    for i, inds in enumerate(partition_all(split_every,
                                                           range(k))))
        dsk.update(dsk2)
        k = len(dsk2)
        b = c
        depth += 1

    if out_type is Item:
        dsk[b] = dsk.pop((b, 0))
        return Item(merge(self.dask, dsk), b)
    else:
        return Bag(merge(self.dask, dsk), b, 1)
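# The aggregation loop above builds a reduction tree: each level groups
# split_every inputs per aggregate task, shrinking the task count by that
# factor per round.  A pure-Python trace of the level sizes (not from the
# source):
from toolz import partition_all

k, split_every, levels = 100, 8, []
while k > 1:
    k = len(list(partition_all(split_every, range(k))))
    levels.append(k)
assert levels == [13, 2, 1]   # 100 partitions -> 13 -> 2 -> 1 result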
def __init__(self, n_workers=None, threads_per_worker=None, processes=True,
             loop=None, start=None, host=None, ip=None, scheduler_port=0,
             silence_logs=logging.WARN, dashboard_address=":8787",
             worker_dashboard_address=None, diagnostics_port=None,
             services=None, worker_services=None, service_kwargs=None,
             asynchronous=False, security=None, protocol=None,
             blocked_handlers=None, interface=None, worker_class=None,
             scheduler_kwargs=None, **worker_kwargs):
    if ip is not None:
        # In the future we should warn users about this move
        # warnings.warn("The ip keyword has been moved to host")
        host = ip

    if diagnostics_port is not None:
        warnings.warn("diagnostics_port has been deprecated. "
                      "Please use `dashboard_address=` instead")
        dashboard_address = diagnostics_port

    if threads_per_worker == 0:
        warnings.warn("Setting `threads_per_worker` to 0 is discouraged. "
                      "Please set to None or to a specific int to get "
                      "best behavior.")
        threads_per_worker = None

    if "dashboard" in worker_kwargs:
        warnings.warn("Setting `dashboard` is discouraged. "
                      "Please set `dashboard_address` to affect the "
                      "scheduler (more common) and "
                      "`worker_dashboard_address` for the worker "
                      "(less common).")

    self.status = None
    self.processes = processes

    if security is None:
        # Falsey values load the default configuration
        security = Security()
    elif security is True:
        # True indicates self-signed temporary credentials should be used
        security = Security.temporary()
    elif not isinstance(security, Security):
        raise TypeError("security must be a Security object")

    if protocol is None:
        if host and "://" in host:
            protocol = host.split("://")[0]
        elif security and security.require_encryption:
            protocol = "tls://"
        elif not self.processes and not scheduler_port:
            protocol = "inproc://"
        else:
            protocol = "tcp://"
    if not protocol.endswith("://"):
        protocol = protocol + "://"

    if host is None and not protocol.startswith("inproc") and not interface:
        host = "127.0.0.1"

    services = services or {}
    worker_services = worker_services or {}

    if n_workers is None and threads_per_worker is None:
        if processes:
            n_workers, threads_per_worker = nprocesses_nthreads()
        else:
            n_workers = 1
            threads_per_worker = CPU_COUNT
    if n_workers is None and threads_per_worker is not None:
        n_workers = max(1, CPU_COUNT // threads_per_worker)
    if n_workers and threads_per_worker is None:
        # Overcommit threads per worker, rather than undercommit
        threads_per_worker = max(1, int(math.ceil(CPU_COUNT / n_workers)))
    if n_workers and "memory_limit" not in worker_kwargs:
        worker_kwargs["memory_limit"] = parse_memory_limit("auto", 1,
                                                           n_workers)

    worker_kwargs.update({
        "nthreads": threads_per_worker,
        "services": worker_services,
        "dashboard_address": worker_dashboard_address,
        "dashboard": worker_dashboard_address is not None,
        "interface": interface,
        "protocol": protocol,
        "security": security,
        "silence_logs": silence_logs,
    })

    scheduler = {
        "cls": Scheduler,
        "options": toolz.merge(
            dict(
                host=host,
                services=services,
                service_kwargs=service_kwargs,
                security=security,
                port=scheduler_port,
                interface=interface,
                protocol=protocol,
                dashboard=dashboard_address is not None,
                dashboard_address=dashboard_address,
                blocked_handlers=blocked_handlers,
            ),
            scheduler_kwargs or {},
        ),
    }

    worker = {
        "cls": worker_class or (Worker if not processes else Nanny),
        "options": worker_kwargs,
    }
    workers = {i: worker for i in range(n_workers)}

    super(LocalCluster, self).__init__(
        scheduler=scheduler,
        workers=workers,
        worker=worker,
        loop=loop,
        asynchronous=asynchronous,
        silence_logs=silence_logs,
        security=security,
    )
def cluster(nworkers=2, nanny=False, worker_kwargs={}, active_rpc_timeout=0,
            scheduler_kwargs={}):
    with pristine_loop() as loop:
        with check_active_rpc(loop, active_rpc_timeout):
            if nanny:
                _run_worker = run_nanny
            else:
                _run_worker = run_worker

            # The scheduler queue will receive the scheduler's address
            scheduler_q = mp_context.Queue()

            # Launch scheduler
            scheduler = mp_context.Process(target=run_scheduler,
                                           args=(scheduler_q, nworkers + 1),
                                           kwargs=scheduler_kwargs)
            scheduler.daemon = True
            scheduler.start()

            # Launch workers
            workers = []
            for i in range(nworkers):
                q = mp_context.Queue()
                fn = '_test_worker-%s' % uuid.uuid1()
                kwargs = merge({'ncores': 1, 'local_dir': fn},
                               worker_kwargs)
                proc = mp_context.Process(target=_run_worker,
                                          args=(q, scheduler_q),
                                          kwargs=kwargs)
                workers.append({'proc': proc, 'queue': q, 'dir': fn})

            for worker in workers:
                worker['proc'].start()
            for worker in workers:
                worker['address'] = worker['queue'].get()

            saddr = scheduler_q.get()

            start = time()
            try:
                with rpc(saddr) as s:
                    while True:
                        ncores = loop.run_sync(s.ncores)
                        if len(ncores) == nworkers:
                            break
                        if time() - start > 5:
                            raise Exception("Timeout on cluster creation")

                yield {'proc': scheduler, 'address': saddr}, workers
            finally:
                logger.debug("Closing out test cluster")

                loop.run_sync(lambda: disconnect_all(
                    [w['address'] for w in workers], timeout=0.5))
                loop.run_sync(lambda: disconnect(saddr, timeout=0.5))

                scheduler.terminate()
                for proc in [w['proc'] for w in workers]:
                    with ignoring(EnvironmentError):
                        proc.terminate()

                scheduler.join(timeout=2)
                for proc in [w['proc'] for w in workers]:
                    proc.join(timeout=2)

                for q in [w['queue'] for w in workers]:
                    q.close()

                for fn in glob('_test_worker-*'):
                    shutil.rmtree(fn)
def mkdict(row, symbols=self._lookup_most_recent_symbols(sids)):
    return merge(row, symbols[row['sid']])
def to_textfiles(b, path, name_function=str, compression='infer',
                 encoding=system_encoding, compute=True):
    """ Write bag to disk, one filename per partition, one line per element

    **Paths**: This will create one file for each partition in your bag.  You
    can specify the filenames in a variety of ways.

    Use a globstring

    >>> b.to_textfiles('/path/to/data/*.json.gz')  # doctest: +SKIP

    The * will be replaced by the increasing sequence 0, 1, ...

    ::

        /path/to/data/0.json.gz
        /path/to/data/1.json.gz

    Use a globstring and a ``name_function=`` keyword argument.  The
    name_function function should expect an integer and produce a string.

    >>> from datetime import date, timedelta
    >>> def name(i):
    ...     return str(date(2015, 1, 1) + i * timedelta(days=1))

    >>> name(0)
    '2015-01-01'
    >>> name(15)
    '2015-01-16'

    >>> b.to_textfiles('/path/to/data/*.json.gz', name_function=name)  # doctest: +SKIP

    ::

        /path/to/data/2015-01-01.json.gz
        /path/to/data/2015-01-02.json.gz
        ...

    You can also provide an explicit list of paths.

    >>> paths = ['/path/to/data/alice.json.gz', '/path/to/data/bob.json.gz', ...]  # doctest: +SKIP
    >>> b.to_textfiles(paths)  # doctest: +SKIP

    **Compression**: Filenames with extensions corresponding to known
    compression algorithms (gz, bz2) will be compressed accordingly.
    """
    if isinstance(path, (str, unicode)):
        if '*' in path:
            paths = [path.replace('*', name_function(i))
                     for i in range(b.npartitions)]
        else:
            paths = [os.path.join(path, '%s.part' % name_function(i))
                     for i in range(b.npartitions)]
    elif isinstance(path, (tuple, list, set)):
        assert len(path) == b.npartitions
        paths = path
    else:
        raise ValueError("Path should be either\n"
                         "1.  A list of paths -- ['foo.json', 'bar.json', ...]\n"
                         "2.  A directory -- 'foo/'\n"
                         "3.  A path with a * in it -- 'foo.*.json'")

    def get_compression(path, compression=compression):
        if compression == 'infer':
            compression = infer_compression(path)
        return compression

    name = 'to-textfiles-' + uuid.uuid4().hex
    dsk = dict(((name, i), (write, (b.name, i), path,
                            get_compression(path), encoding))
               for i, path in enumerate(paths))

    result = Bag(merge(b.dask, dsk), name, b.npartitions)

    if compute:
        result.compute()
    else:
        return result
def foldby(self, key, binop, initial=no_default, combine=None,
           combine_initial=no_default):
    """ Combined reduction and groupby

    Foldby provides a combined groupby and reduce for efficient parallel
    split-apply-combine tasks.

    The computation

    >>> b.foldby(key, binop, init)  # doctest: +SKIP

    is equivalent to the following:

    >>> def reduction(group):  # doctest: +SKIP
    ...     return reduce(binop, group, init)  # doctest: +SKIP

    >>> b.groupby(key).map(lambda (k, v): (k, reduction(v)))  # doctest: +SKIP

    But uses minimal communication and so is *much* faster.

    >>> b = from_sequence(range(10))
    >>> iseven = lambda x: x % 2 == 0
    >>> add = lambda x, y: x + y
    >>> dict(b.foldby(iseven, add))  # doctest: +SKIP
    {True: 20, False: 25}

    **Key Function**

    The key function determines how to group the elements in your bag.
    In the common case where your bag holds dictionaries then the key
    function often gets out one of those elements.

    >>> def key(x):
    ...     return x['name']

    This case is so common that it is special cased, and if you provide a
    key that is not a callable function then dask.bag will turn it into one
    automatically.  The following are equivalent:

    >>> b.foldby(lambda x: x['name'], ...)  # doctest: +SKIP
    >>> b.foldby('name', ...)  # doctest: +SKIP

    **Binops**

    It can be tricky to construct the right binary operators to perform
    analytic queries.  The ``foldby`` method accepts two binary operators,
    ``binop`` and ``combine``.  For each, the two inputs and the output
    must all be of the same type.

    Binop takes a running total and a new element and produces a new total:

    >>> def binop(total, x):
    ...     return total + x['amount']

    Combine takes two totals and combines them:

    >>> def combine(total1, total2):
    ...     return total1 + total2

    Each of these binary operators may have a default first value for
    total, before any other value is seen.  For addition binary operators
    like above this is often ``0`` or the identity element for your
    operation.

    >>> b.foldby('name', binop, 0, combine, 0)  # doctest: +SKIP

    See Also
    --------
    toolz.reduceby
    pyspark.combineByKey
    """
    token = tokenize(self, key, binop, initial, combine, combine_initial)
    a = 'foldby-a-' + token
    b = 'foldby-b-' + token
    if combine is None:
        combine = binop
    if initial is not no_default:
        dsk = dict(((a, i), (reduceby, key, binop, (self.name, i), initial))
                   for i in range(self.npartitions))
    else:
        dsk = dict(((a, i), (reduceby, key, binop, (self.name, i)))
                   for i in range(self.npartitions))

    def combine2(acc, x):
        return combine(acc, x[1])

    if combine_initial is not no_default:
        dsk2 = {(b, 0): (dictitems,
                         (reduceby, 0, combine2,
                          (toolz.concat, (map, dictitems, list(dsk.keys()))),
                          combine_initial))}
    else:
        dsk2 = {(b, 0): (dictitems,
                         (merge_with, (partial, reduce, combine),
                          list(dsk.keys())))}

    return type(self)(merge(self.dask, dsk, dsk2), b, 1)
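# foldby's per-partition step is toolz.reduceby; calling it directly
# reproduces the docstring example (runnable as-is):
from toolz import reduceby

iseven = lambda x: x % 2 == 0
add = lambda x, y: x + y
assert reduceby(iseven, add, range(10), 0) == {True: 20, False: 25}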
def set_partition(df, index, divisions, compute=False, drop=True, **kwargs):
    """ Group DataFrame by index

    Sets a new index and partitions data along that index according to
    divisions.  Divisions are often found by computing approximate quantiles.
    The function ``set_index`` will do both of these steps.

    Parameters
    ----------
    df: DataFrame/Series
        Data that we want to re-partition
    index: string or Series
        Column to become the new index
    divisions: list
        Values to form new divisions between partitions
    drop: bool, default True
        Whether to delete columns to be used as the new index

    See Also
    --------
    set_index
    shuffle
    partd
    """
    if isinstance(index, Series):
        assert df.divisions == index.divisions
        metadata = df._pd.set_index(index._pd, drop=drop)
    elif np.isscalar(index):
        metadata = df._pd.set_index(index, drop=drop)
    else:
        raise ValueError('index must be Series or scalar, {0} given'.format(
            type(index)))

    token = tokenize(df, index, divisions)
    always_new_token = uuid.uuid1().hex

    import partd

    p = ('zpartd-' + always_new_token,)

    # Get Categories
    catname = 'set-partition--get-categories-old-' + always_new_token
    catname2 = 'set-partition--get-categories-new-' + always_new_token

    dsk1 = {catname: (get_categories, df._keys()[0]),
            p: (partd.PandasBlocks, (partd.Buffer, (partd.Dict,),
                                     (partd.File,))),
            catname2: (new_categories, catname,
                       index.name if isinstance(index, Series) else index)}

    # Partition data on disk
    name = 'set-partition--partition-' + always_new_token
    if isinstance(index, _Frame):
        dsk2 = dict(((name, i),
                     (_set_partition, part, ind, divisions, p, drop))
                    for i, (part, ind)
                    in enumerate(zip(df._keys(), index._keys())))
    else:
        dsk2 = dict(((name, i),
                     (_set_partition, part, index, divisions, p, drop))
                    for i, part in enumerate(df._keys()))

    # Barrier
    barrier_token = 'barrier-' + always_new_token
    dsk3 = {barrier_token: (barrier, list(dsk2))}

    if compute:
        dsk = merge(df.dask, dsk1, dsk2, dsk3)
        if isinstance(index, _Frame):
            dsk.update(index.dask)
        p, barrier_token, categories = df._get(
            dsk, [p, barrier_token, catname], **kwargs)
        dsk4 = {catname2: categories}
    else:
        dsk4 = {}

    # Collect groups
    name = 'set-partition--collect-' + token
    if compute and not categories:
        dsk4.update(dict(((name, i),
                          (_set_collect, i, p, barrier_token, df.columns))
                         for i in range(len(divisions) - 1)))
    else:
        dsk4.update(dict(((name, i),
                          (_categorize, catname2,
                           (_set_collect, i, p, barrier_token, df.columns)))
                         for i in range(len(divisions) - 1)))

    dsk = merge(df.dask, dsk1, dsk2, dsk3, dsk4)
    if isinstance(index, Series):
        dsk.update(index.dask)

    if compute:
        dsk = cull(dsk, list(dsk4.keys()))

    return DataFrame(dsk, name, metadata, divisions)
def __repr__(self):
    return "lazy_dict({})".format(
        t.merge(t.valmap(lambda _: "...", self.thunks), self.realized))
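# valmap maps a function over a dict's values; the __repr__ above uses it
# to render unevaluated thunks as "..." (standalone demo, not from the
# source).
from toolz import valmap

thunks = {'a': lambda: 1, 'b': lambda: 2}
assert valmap(lambda _: '...', thunks) == {'a': '...', 'b': '...'}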
See Also:
    toolz.functoolz.curry
"""
import toolz
import toolz.curried_exceptions
from .functoolz import curry
import inspect


def _nargs(f):
    try:
        return len(inspect.getargspec(f).args)
    except TypeError:
        return None


def _should_curry(f):
    do_curry = set((toolz.map, toolz.filter, toolz.sorted, toolz.reduce))
    return (callable(f) and _nargs(f) and _nargs(f) > 1
            or f in do_curry)


_d = dict((name, curry(f) if _should_curry(f) else f)
          for name, f in toolz.__dict__.items()
          if '__' not in name)

_exceptions = dict((name, curry(f) if callable(f) else f)
                   for name, f in toolz.curried_exceptions.__dict__.items()
                   if '__' not in name)

locals().update(toolz.merge(_d, _exceptions))
def extra(self):
    return merge({"prefix": self.prefix}, template_variables)
def slice_wrap_lists(out_name, in_name, blockdims, index):
    """
    Fancy indexing along blocked array dasks

    Handles index of type list.  Calls slice_slices_and_integers for the rest.

    See Also
    --------
    take - handle slicing with lists ("fancy" indexing)
    slice_slices_and_integers - handle slicing with slices and integers
    """
    assert all(isinstance(i, (slice, list, Integral, np.ndarray))
               for i in index)
    if not len(blockdims) == len(index):
        raise IndexError("Too many indices for array")

    # Do we have more than one list in the index?
    where_list = [i for i, ind in enumerate(index)
                  if isinstance(ind, np.ndarray) and ind.ndim > 0]
    if len(where_list) > 1:
        raise NotImplementedError("Don't yet support nd fancy indexing")
    # Is the single list an empty list? In this case just treat it as a zero
    # length slice
    if where_list and not index[where_list[0]].size:
        index = list(index)
        index[where_list.pop()] = slice(0, 0, 1)
        index = tuple(index)

    # No lists, hooray! just use slice_slices_and_integers
    if not where_list:
        return slice_slices_and_integers(out_name, in_name, blockdims, index)

    # Replace all lists with full slices  [3, 1, 0] -> slice(None, None, None)
    index_without_list = tuple(slice(None, None, None)
                               if isinstance(i, np.ndarray)
                               else i
                               for i in index)

    # lists and full slices.  Just use take
    if all(isinstance(i, np.ndarray) or i == slice(None, None, None)
           for i in index):
        axis = where_list[0]
        blockdims2, dsk3 = take(out_name, in_name, blockdims,
                                index[where_list[0]], axis=axis)
    # Mixed case.  Both slices/integers and lists.  slice/integer then take
    else:
        # Do first pass without lists
        tmp = 'slice-' + tokenize((out_name, in_name, blockdims, index))
        dsk, blockdims2 = slice_slices_and_integers(
            tmp, in_name, blockdims, index_without_list)

        # After collapsing some axes due to int indices, adjust axis parameter
        axis = where_list[0]
        axis2 = axis - sum(1 for i, ind in enumerate(index)
                           if i < axis and isinstance(ind, Integral))

        # Do work
        blockdims2, dsk2 = take(out_name, tmp, blockdims2,
                                index[axis], axis=axis2)
        dsk3 = merge(dsk, dsk2)

    return dsk3, blockdims2
def _get_data(clauses, values, keys):
    result = frappe.db.sql(
        """
            SELECT
                e.bank_name AS bank_name,
                e.bank_ac_no AS bank_ac_no,
                e.employee_name AS employee_name,
                sl.name AS salary_slip,
                sl.start_date AS start_date,
                a.account_number AS account_number
            FROM `tabSalary Slip` AS sl
            LEFT JOIN `tabEmployee` AS e ON e.name = sl.employee
            LEFT JOIN `tabPayroll Entry` AS pe ON pe.name = sl.payroll_entry
            LEFT JOIN `tabAccount` AS a ON a.name = pe.payment_account
            WHERE {clauses}
        """.format(clauses=clauses),
        values=values,
        as_dict=1,
    )

    get_amounts = compose(
        partial(groupby, "salary_slip"),
        lambda type: frappe.db.sql(
            """
                SELECT
                    sl.name AS salary_slip,
                    SUM(sd.amount) AS amount
                FROM `tabSalary Detail` AS sd
                LEFT JOIN `tabSalary Slip` AS sl ON sl.name = sd.parent
                WHERE
                    sd.parentfield = %(parentfield)s AND
                    sd.parent IN %(salary_slips)s AND
                    sd.salary_component IN %(components)s
                GROUP BY sl.name
            """,
            values=merge(
                values,
                {
                    "salary_slips": [x.get("salary_slip") for x in result],
                    "parentfield": type,
                },
            ),
            as_dict=1,
        )
        if result
        else {},
    )
    get_amount = compose(
        lambda x: x.get("amount", 0),
        excepts(StopIteration, first, lambda _: {}),
        lambda col, key: col.get(key, []),
    )
    earnings = get_amounts("earnings")
    deductions = get_amounts("deductions")

    def add_remarks(row):
        start_date = row.get("start_date")
        return merge(
            row,
            {"remarks": "{} SAL".format(start_date.strftime("%b").upper())},
        )

    def set_amounts(row):
        salary_slip = row.get("salary_slip")
        amount = get_amount(earnings, salary_slip) - get_amount(
            deductions, salary_slip)
        return merge(row, {"amount": amount})

    make_row = compose(partial(pick, keys), add_remarks, set_amounts)

    return with_report_generation_time([make_row(x) for x in result], keys)
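# merge() gives right-most dicts precedence, which lets the row
# transformers above both add new keys and override existing ones
# (standalone demo with toy values, not from the source):
from toolz import merge

row = {'salary_slip': 'SS-0001', 'amount': 0}
assert merge(row, {'amount': 4200}) == {'salary_slip': 'SS-0001',
                                        'amount': 4200}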
def set_amounts(row):
    salary_slip = row.get("salary_slip")
    amount = get_amount(earnings, salary_slip) - get_amount(
        deductions, salary_slip)
    return merge(row, {"amount": amount})
def fn(row):
    sales_order = row.get("sales_order")
    return merge(row,
                 {"outstanding": outstanding_amounts.get(sales_order, 0)})
def cluster(nworkers=2, nanny=False, worker_kwargs={}):
    if nanny:
        _run_worker = run_nanny
    else:
        _run_worker = run_worker
    scheduler_q = Queue()
    scheduler = Process(target=run_scheduler, args=(scheduler_q,))
    scheduler.daemon = True
    scheduler.start()
    sport = scheduler_q.get()

    workers = []
    for i in range(nworkers):
        q = Queue()
        fn = '_test_worker-%s' % uuid.uuid1()
        proc = Process(target=_run_worker, args=(q, sport),
                       kwargs=merge({'ncores': 1, 'local_dir': fn},
                                    worker_kwargs))
        workers.append({'proc': proc, 'queue': q, 'dir': fn})

    for worker in workers:
        worker['proc'].start()

    for worker in workers:
        worker['port'] = worker['queue'].get()

    loop = IOLoop()
    s = rpc(ip='127.0.0.1', port=sport)
    start = time()
    try:
        while True:
            ncores = loop.run_sync(s.ncores)
            if len(ncores) == nworkers:
                break
            if time() - start > 5:
                raise Exception("Timeout on cluster creation")

        yield {'proc': scheduler, 'port': sport}, workers
    finally:
        logger.debug("Closing out test cluster")
        with ignoring(socket.error, TimeoutError, StreamClosedError):
            loop.run_sync(lambda: disconnect('127.0.0.1', sport),
                          timeout=0.5)
        scheduler.terminate()
        scheduler.join(timeout=2)

        for port in [w['port'] for w in workers]:
            with ignoring(socket.error, TimeoutError, StreamClosedError):
                loop.run_sync(lambda: disconnect('127.0.0.1', port),
                              timeout=0.5)
        for proc in [w['proc'] for w in workers]:
            with ignoring(Exception):
                proc.terminate()
            proc.join(timeout=2)
        for q in [w['queue'] for w in workers]:
            q.close()
        for fn in glob('_test_worker-*'):
            shutil.rmtree(fn)
        loop.close(all_fds=True)
def test_bundle(self):
    url_map = merge(
        {
            format_wiki_url(
                self.api_key,
                symbol,
                self.start_date,
                self.end_date,
            ): test_resource_path('quandl_samples', symbol + '.csv.gz')
            for symbol in self.symbols
        },
        {
            format_metadata_url(self.api_key, n): test_resource_path(
                'quandl_samples',
                'metadata-%d.csv.gz' % n,
            )
            for n in (1, 2)
        },
    )
    zipline_root = self.enter_instance_context(tmp_dir()).path
    environ = {
        'ZIPLINE_ROOT': zipline_root,
        'QUANDL_API_KEY': self.api_key,
    }

    with patch_read_csv(url_map, strict=True):
        ingest('quandl', environ=environ)

    bundle = load('quandl', environ=environ)
    sids = 0, 1, 2, 3
    assert_equal(set(bundle.asset_finder.sids), set(sids))

    for equity in bundle.asset_finder.retrieve_all(sids):
        assert_equal(equity.start_date, self.asset_start, msg=equity)
        assert_equal(equity.end_date, self.asset_end, msg=equity)

    sessions = self.calendar.all_sessions
    actual = bundle.equity_daily_bar_reader.load_raw_arrays(
        self.columns,
        sessions[sessions.get_loc(self.asset_start, 'bfill')],
        sessions[sessions.get_loc(self.asset_end, 'ffill')],
        sids,
    )
    expected_pricing, expected_adjustments = self._expected_data(
        bundle.asset_finder,
    )
    assert_equal(actual, expected_pricing, array_decimal=2)

    adjustments_for_cols = bundle.adjustment_reader.load_adjustments(
        self.columns,
        sessions,
        pd.Index(sids),
    )
    for column, adjustments, expected in zip(self.columns,
                                             adjustments_for_cols,
                                             expected_adjustments):
        assert_equal(
            adjustments,
            expected,
            msg=column,
        )
def main(mode, config, use_bokeh=False):
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'],
        config['topical_embedding_dim'],
        config['enc_nhids'],
        config['topical_word_num'],
        config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhids
    # because the context given by the encoder is a bidirectional context

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        # target_topic_sentence_mask = tensor.lmatrix('target_topic_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        # dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representations[0, :,
                                            (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'

        cg = ComputationGraph(cost)
        costs_computer = function([target_sentence, target_sentence_mask,
                                   source_sentence, source_sentence_mask,
                                   source_topical_word,
                                   target_topic_sentence,
                                   target_topic_binary_sentence],
                                  (perplexity),
                                  on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()
        # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info(' {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info(' {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        #     # Add sampling
        #     if config['hook_samples'] >= 1:
        #         logger.info("Building sampler")
        #         extensions.append(
        #             Sampler(model=search_model, data_stream=tr_stream,
        #                     model_name=config['model_name'],
        #                     hook_samples=config['hook_samples'],
        #                     every_n_batches=config['sampling_freq'],
        #                     src_vocab_size=config['src_vocab_size']))
        #
        # # Add early stopping based on bleu
        # if False:
        #     logger.info("Building bleu validator")
        #     extensions.append(
        #         BleuValidator(sampling_input, samples=samples, config=config,
        #                       model=search_model, data_stream=dev_stream,
        #                       normalize=config['normalized_bleu'],
        #                       every_n_batches=config['bleu_val_freq'],
        #                       n_best=3,
        #                       track_n_models=6))
        #
        # logger.info("Building perplexity validator")
        # extensions.append(
        #     pplValidation(config=config,
        #                   model=costs_computer, data_stream=dev_stream,
        #                   model_name=config['model_name'],
        #                   every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost,
            parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]),
            on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        # tw_vocab_overlap = shared(tw_vocab_overlap_matrix)

        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[
            0, :, (sampling_representation.shape[2] / 2):]
        generated = decoder.generate(sampling_input,
                                     sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        # loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream
= get_dev_stream_with_topicalq(**config) ts = test_stream.get_epoch_iterator() rts = open(config['val_set_source']).readlines() ftrans_original = open(config['val_output_orig'], 'w') saved_weights = [] total_cost = 0.0 pbar = ProgressBar(max_value=len(rts)).start() for i, (line, line_raw) in enumerate(zip(ts, rts)): trans_in = line_raw.split() seq = sutils._oov_to_unk(line[0], config['src_vocab_size'], unk_idx) seq1 = line[1] input_topical = numpy.tile(seq1, (config['beam_size'], 1)) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs, attendeds, weights = \ beam_search.search( input_values={sampling_input: input_,source_topical_word:input_topical,tw_vocab_overlap:tw_vocab_overlap_matrix}, tw_vocab_overlap=tw_vocab_overlap_matrix, max_length=3*len(seq), eol_symbol=trg_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] weight = weights[best][:, :len(trans_in)] trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i + 1)) trans_out = '<UNK>' saved_weights.append(weight) print(' '.join(trans_out), file=ftrans_original) pbar.update(i + 1) pbar.finish() logger.info("Total cost of the test: {}".format(total_cost)) cPickle.dump(saved_weights, open(config['attention_weights'], 'wb')) ftrans_original.close() # ap = afterprocesser(config) # ap.main() elif mode == 'score': logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') target_topic_sentence = tensor.lmatrix('target_topic') target_topic_binary_sentence = tensor.lmatrix('target_binary_topic') source_topical_word = tensor.lmatrix('source_topical') topic_embedding = topical_transformer.apply(source_topical_word) # Get cost of the model representations = encoder.apply(source_sentence, source_sentence_mask) costs = decoder.cost(representations, source_sentence_mask, target_sentence, target_sentence_mask, target_topic_sentence, target_topic_binary_sentence, topic_embedding) config['batch_size'] = 1 config['sort_k_batches'] = 1 # Get test set stream test_stream = get_tr_stream_with_topic_target(**config) logger.info("Building sampling model") logger.info("Loading the model..") model = Model(costs) loader = LoadNMT(config['validation_load']) loader.set_model_parameters(model, loader.load_parameters_default()) costs_computer = function([ target_sentence, target_sentence_mask, source_sentence, source_sentence_mask, source_topical_word, target_topic_sentence, target_topic_binary_sentence ], (costs), on_unused_input='ignore') iterator = test_stream.get_epoch_iterator() scores = [] att_weights = [] for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb, tb_mask) in enumerate(iterator): costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt, tb]) cost = costs.sum() print(i, cost) scores.append(cost) print(sum(scores) / 10007)
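# In the 'translate' branch above, beam-search costs are length-normalized
# before the best hypothesis is chosen. A minimal numpy sketch of that
# selection step; the candidate sequences and costs are made-up values:
import numpy

trans = [[4, 9, 2], [4, 9, 7, 7, 2]]   # token-id sequences from the beam
costs = numpy.array([6.0, 7.5])        # total (lower-is-better) model costs
lengths = numpy.array([len(s) for s in trans])
normalized = costs / lengths           # per-token cost
best = numpy.argsort(normalized)[0]
assert best == 1  # the longer hypothesis wins once length is factored out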
def load_adjusted_array(self, columns, dates, sids, mask): n, p = self.split_next_and_previous_event_columns(columns) return merge( self.load_next_events(n, dates, sids, mask), self.load_previous_events(p, dates, sids, mask), )
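# load_adjusted_array splits the requested columns into "next" and
# "previous" event groups, loads each group separately, and merges the
# per-column results back into one mapping. The split/merge shape in
# isolation -- the predicate and loader below are stand-ins, not zipline APIs:
from toolz import merge

def split_columns(columns):
    nxt = [c for c in columns if c.startswith('next_')]
    prev = [c for c in columns if c.startswith('previous_')]
    return nxt, prev

def load(group):
    return {c: 'array-for-%s' % c for c in group}

n, p = split_columns(['next_close', 'previous_close'])
assert set(merge(load(n), load(p))) == {'next_close', 'previous_close'}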
def slice_wrap_lists(out_name, in_name, blockdims, index): """ Fancy indexing along blocked array dasks Handles index of type list. Calls slice_slices_and_integers for the rest See Also -------- take - handle slicing with lists ("fancy" indexing) slice_slices_and_integers - handle slicing with slices and integers """ shape = tuple(map(sum, blockdims)) assert all(isinstance(i, (slice, list, int, long)) for i in index) assert len(blockdims) == len(index) for bd, i in zip(blockdims, index): check_index(i, sum(bd)) # Change indices like -1 to 9 index2 = posify_index(shape, index) # Do we have more than one list in the index? where_list = [i for i, ind in enumerate(index) if isinstance(ind, list)] if len(where_list) > 1: raise NotImplementedError("Don't yet support nd fancy indexing") # Replace all lists with full slices [3, 1, 0] -> slice(None, None, None) index_without_list = tuple( slice(None, None, None) if isinstance(i, list) else i for i in index2) # No lists, hooray! just use slice_slices_and_integers if index2 == index_without_list: return slice_slices_and_integers(out_name, in_name, blockdims, index2) # lists and full slices. Just use take if all( isinstance(i, list) or i == slice(None, None, None) for i in index2): axis = where_list[0] blockdims2, dsk3 = take(out_name, in_name, blockdims, index2[where_list[0]], axis=axis) # Mixed case. Both slices/integers and lists. slice/integer then take else: # Do first pass without lists tmp = 'slice-' + tokenize((out_name, in_name, blockdims, index)) dsk, blockdims2 = slice_slices_and_integers(tmp, in_name, blockdims, index_without_list) # After collapsing some axes due to int indices, adjust axis parameter axis = where_list[0] axis2 = axis - sum(1 for i, ind in enumerate(index2) if i < axis and isinstance(ind, (int, long))) # Do work blockdims2, dsk2 = take(out_name, tmp, blockdims2, index2[axis], axis=axis2) dsk3 = merge(dsk, dsk2) return dsk3, blockdims2
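# slice_wrap_lists first rewrites negative indices like -1 into their
# positive equivalents (posify_index). A minimal stand-in covering the
# integer and list cases only; the real helper also handles slices:
def posify(dim, ind):
    if isinstance(ind, int):
        return ind + dim if ind < 0 else ind
    if isinstance(ind, list):
        return [i + dim if i < 0 else i for i in ind]
    return ind

assert posify(10, -1) == 9
assert posify(10, [2, -2]) == [2, 8]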
def apply(self, func): name = 'apply-{0}-{1}'.format(funcname(func), tokenize(self, func)) dsk = {name: (func, self.key)} return Item(merge(self.dask, dsk), name)
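# Item.apply just adds one task to the graph and wraps the merged dict in
# a new Item. The same one-task pattern against a raw dask graph --
# dask.get is the real single-machine scheduler; the keys are arbitrary:
import dask
from toolz import merge

dsk = {'x': 1}
dsk2 = {'y': (lambda v: v + 1, 'x')}
assert dask.get(merge(dsk, dsk2), 'y') == 2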
def gen_cluster( nthreads=[("127.0.0.1", 1), ("127.0.0.1", 2)], ncores=None, scheduler="127.0.0.1", timeout=10, security=None, Worker=Worker, client=False, scheduler_kwargs={}, worker_kwargs={}, client_kwargs={}, active_rpc_timeout=1, config={}, check_new_threads=True, ): from distributed import Client """ Coroutine test with small cluster @gen_cluster() def test_foo(scheduler, worker1, worker2): yield ... # use tornado coroutines See also: start end """ if ncores is not None: warnings.warn("ncores= has moved to nthreads=") nthreads = ncores worker_kwargs = merge( {"memory_limit": TOTAL_MEMORY, "death_timeout": 5}, worker_kwargs ) def _(func): if not iscoroutinefunction(func): func = gen.coroutine(func) def test_func(): result = None workers = [] with clean(threads=check_new_threads, timeout=active_rpc_timeout) as loop: @gen.coroutine def coro(): with dask.config.set(config): s = False for i in range(5): try: s, ws = yield start_cluster( nthreads, scheduler, loop, security=security, Worker=Worker, scheduler_kwargs=scheduler_kwargs, worker_kwargs=worker_kwargs, ) except Exception as e: logger.error( "Failed to start gen_cluster, retrying", exc_info=True, ) else: workers[:] = ws args = [s] + workers break if s is False: raise Exception("Could not start cluster") if client: c = yield Client( s.address, loop=loop, security=security, asynchronous=True, **client_kwargs ) args = [c] + args try: future = func(*args) if timeout: future = gen.with_timeout( timedelta(seconds=timeout), future ) result = yield future if s.validate: s.validate_state() finally: if client and c.status not in ("closing", "closed"): yield c._close(fast=s.status == "closed") yield end_cluster(s, workers) yield gen.with_timeout( timedelta(seconds=1), cleanup_global_workers() ) try: c = yield default_client() except ValueError: pass else: yield c._close(fast=True) for i in range(5): if all(c.closed() for c in Comm._instances): break else: yield gen.sleep(0.05) else: L = [c for c in Comm._instances if not c.closed()] Comm._instances.clear() # raise ValueError("Unclosed Comms", L) print("Unclosed Comms", L) raise gen.Return(result) result = loop.run_sync( coro, timeout=timeout * 2 if timeout else timeout ) for w in workers: if getattr(w, "data", None): try: w.data.clear() except EnvironmentError: # zict backends can fail if their storage directory # was already removed pass del w.data return result return test_func return _
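# gen_cluster seeds worker_kwargs with defaults via merge; because later
# dicts take precedence in toolz.merge, whatever the caller passes wins.
# A minimal sketch with illustrative values:
from toolz import merge

defaults = {'memory_limit': '1GB', 'death_timeout': 5}
user_kwargs = {'death_timeout': 60}
assert merge(defaults, user_kwargs) == {'memory_limit': '1GB',
                                        'death_timeout': 60}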
def get(self): with log_errors(): self.render("workers.html", title="Workers", scheduler=self.server, **toolz.merge(self.server.__dict__, ns, self.extra))
def cluster( nworkers=2, nanny=False, worker_kwargs={}, active_rpc_timeout=1, scheduler_kwargs={} ): ws = weakref.WeakSet() enable_proctitle_on_children() with clean(timeout=active_rpc_timeout, threads=False) as loop: if nanny: _run_worker = run_nanny else: _run_worker = run_worker # The scheduler queue will receive the scheduler's address scheduler_q = mp_context.Queue() # Launch scheduler scheduler = mp_context.Process( name="Dask cluster test: Scheduler", target=run_scheduler, args=(scheduler_q, nworkers + 1), kwargs=scheduler_kwargs, ) ws.add(scheduler) scheduler.daemon = True scheduler.start() # Launch workers workers = [] for i in range(nworkers): q = mp_context.Queue() fn = "_test_worker-%s" % uuid.uuid4() kwargs = merge( {"nthreads": 1, "local_dir": fn, "memory_limit": TOTAL_MEMORY}, worker_kwargs, ) proc = mp_context.Process( name="Dask cluster test: Worker", target=_run_worker, args=(q, scheduler_q), kwargs=kwargs, ) ws.add(proc) workers.append({"proc": proc, "queue": q, "dir": fn}) for worker in workers: worker["proc"].start() try: for worker in workers: worker["address"] = worker["queue"].get(timeout=5) except Empty: raise pytest.xfail.Exception("Worker failed to start in test") saddr = scheduler_q.get() start = time() try: try: security = scheduler_kwargs["security"] rpc_kwargs = {"connection_args": security.get_connection_args("client")} except KeyError: rpc_kwargs = {} with rpc(saddr, **rpc_kwargs) as s: while True: nthreads = loop.run_sync(s.ncores) if len(nthreads) == nworkers: break if time() - start > 5: raise Exception("Timeout on cluster creation") # avoid sending processes down to function yield {"address": saddr}, [ {"address": w["address"], "proc": weakref.ref(w["proc"])} for w in workers ] finally: logger.debug("Closing out test cluster") loop.run_sync( lambda: disconnect_all( [w["address"] for w in workers], timeout=0.5, rpc_kwargs=rpc_kwargs ) ) loop.run_sync(lambda: disconnect(saddr, timeout=0.5, rpc_kwargs=rpc_kwargs)) scheduler.terminate() scheduler_q.close() scheduler_q._reader.close() scheduler_q._writer.close() for w in workers: w["proc"].terminate() w["queue"].close() w["queue"]._reader.close() w["queue"]._writer.close() scheduler.join(2) del scheduler for proc in [w["proc"] for w in workers]: proc.join(timeout=2) with ignoring(UnboundLocalError): del worker, w, proc del workers[:] for fn in glob("_test_worker-*"): with ignoring(OSError): shutil.rmtree(fn) try: client = default_client() except ValueError: pass else: client.close() start = time() while any(proc.is_alive() for proc in ws): text = str(list(ws)) sleep(0.2) assert time() < start + 5, ("Workers still around after five seconds", text)
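# cluster() hands each worker process a Queue and reads the worker's
# address back out of it. A minimal sketch of that handshake with stdlib
# multiprocessing, no distributed machinery; the address is a stand-in:
import multiprocessing

def run_worker(q):
    q.put('tcp://127.0.0.1:12345')  # a real worker would report its bound address

if __name__ == '__main__':
    q = multiprocessing.Queue()
    proc = multiprocessing.Process(target=run_worker, args=(q,))
    proc.start()
    address = q.get(timeout=5)
    proc.join()
    assert address.startswith('tcp://')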
def _nargs(f): try: return len(inspect.getargspec(f).args) except TypeError: return 0 def _should_curry(f): do_curry = frozenset((toolz.map, toolz.filter, toolz.sorted, toolz.reduce)) return (callable(f) and _nargs(f) > 1 or f in do_curry) def _curry_namespace(ns): return dict((name, toolz.curry(f) if _should_curry(f) else f) for name, f in ns.items() if '__' not in name) locals().update( toolz.merge( _curry_namespace(vars(toolz)), _curry_namespace(vars(exceptions)), )) # Clean up the namespace. del _nargs del _should_curry del exceptions del toolz
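# The namespace rewrite above wraps multi-argument callables in
# toolz.curry so they can be partially applied. What curry buys you:
from toolz import curry

@curry
def add(x, y):
    return x + y

add1 = add(1)          # partial application instead of a TypeError
assert add1(2) == 3
assert add(1, 2) == 3  # full application still works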
def compile(fn_graph, get=dask.get): fn_param_info = t.valmap(_param_info, fn_graph) global_param_info = {} for param_info in fn_param_info.values(): for kw, value in param_info.items(): if kw in global_param_info and global_param_info[kw] != value: global_param_info[kw] = _AMBIGUOUS else: global_param_info[kw] = value computed_args = set(fn_graph.keys()) required_params, defaulted = u.split_keys_by_val(_is_required, global_param_info) required_params = required_params - computed_args all_params = required_params.union(defaulted) default_args = u.select_keys(global_param_info, defaulted) def to_task(res_key, param_info): fn = fn_graph[res_key] dask_args = tuple(param_info.keys()) if _is_curry_func(fn): # wrap the fn but persist the args, and kargs on it args = tuple([default_args.get(p, p) for p in param_info.keys()]) set_varargs, set_kargs = _partial_inputs(fn) def wrapper(*args): kwargs = t.merge(set_kargs, dict(zip(param_info.keys(), args))) return fn(*set_varargs, **kwargs) wrapper.__name__ = _func_name(fn) # we maintain the curry/partial func info wrapper.func = _func_name(fn) wrapper.keywords = fn.keywords wrapper.args = fn.args return (wrapper, ) + dask_args return (fn, ) + dask_args base_dask = { k: to_task(k, param_info) for k, param_info in fn_param_info.items() } outputs = list(fn_graph.keys()) def funk(get=get, **kargs): param_keys = set(kargs.keys()) missing_keys = required_params - param_keys if missing_keys: raise TypeError( 'missing these keyword arguments: {}'.format(missing_keys)) extra_keys = param_keys - all_params if extra_keys: raise TypeError( 'unexpected keyword arguments passed in: {}'.format( extra_keys)) dsk = t.merge(base_dask, default_args, kargs) res = get(dsk, outputs) return dict(zip(outputs, res)) funk.required = required_params funk.defaults = default_args funk.base_dask = base_dask funk.full_dask = t.merge(base_dask, dict(zip(all_params, repeat(_UNSPECIFIED)))) # TODO: use bolton's FunctionBuilder to set kargs so it has a useful function signature return funk
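# compile() above turns {result_name: function} into a dask graph whose
# task arguments are parameter names, then resolves keyword arguments at
# call time by merging them into the graph as plain values. The core
# trick, hand-rolled; dask.get is real, the toy function is made up:
import dask
from toolz import merge

base_dask = {'total': (lambda a, b: a + b, 'a', 'b')}  # args resolved by key

def funk(**kwargs):
    return dask.get(merge(base_dask, kwargs), 'total')

assert funk(a=1, b=2) == 3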
def unpack_as_lists_of_keys(*args): parts, dsks = zip(*map(_unpack_keys_dask, args)) if len(set(map(len, parts))) != 1: raise ValueError("inputs must all have the same number " "of partitions along the first dimension") return tuple(parts) + (merge(dsks), )
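# unpack_as_lists_of_keys transposes per-argument (keys, graph) pairs with
# zip(*...) and merges all the graphs at the end. toolz.merge accepts a
# single iterable of dicts, which is why merge(dsks) needs no unpacking:
from toolz import merge

pairs = [(['x-0', 'x-1'], {'x-0': 1, 'x-1': 2}),
         (['y-0', 'y-1'], {'y-0': 3, 'y-1': 4})]
parts, dsks = zip(*pairs)
assert merge(dsks) == {'x-0': 1, 'x-1': 2, 'y-0': 3, 'y-1': 4}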
def _set_period_columns(sales, periods): def groupby_filter(sl): def fn(p): return p.get("start_date") <= sl.get("posting_date") <= p.get("end_date") return fn groupby_fn = compose( partial(get, "key", default=None), excepts(StopIteration, first, lambda __: {}), partial(flip, filter, periods), groupby_filter, ) sales_grouped = groupby(groupby_fn, sales) def summer(key): return compose(sum, partial(pluck, key)) def seg_filter(x): return lambda sale: sale.get("item_code") == x def seger(sum_fn, x): return compose( sum_fn, partial(flip, filter, get(x.get("key"), sales_grouped, [])), seg_filter, ) def total_fn(sum_fn): return compose(sum_fn, partial(flip, filter, sales), seg_filter) summer_qty = summer("qty") summer_amount = summer("amount") segregator_fns = [ merge( x, { "seger_qty": seger(summer_qty, x), "seger_amount": seger(summer_amount, x), }, ) for x in periods ] def seg_reducer(item_code): def fn(a, p): key = get("key", p, None) seger_qty = get("seger_qty", p, lambda __: None) seger_amount = get("seger_amount", p, lambda __: None) return merge( a, { key: seger_qty(item_code), "{}_amount".format(key): seger_amount(item_code), }, ) return fn def fn(item): item_code = item.get("item_code") total_qty = total_fn(summer_qty) total_amount = total_fn(summer_amount) return merge( item, reduce(seg_reducer(item_code), segregator_fns, {}), { "total_qty": total_qty(item_code), "total_amount": total_amount(item_code), }, ) return fn
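# _set_period_columns leans on toolz.groupby to bucket sales into periods
# and on pluck to sum a single field per bucket. The two primitives in
# isolation; the sale rows are made up:
from toolz import groupby, pluck

sales = [{'period': 'jan', 'qty': 2}, {'period': 'jan', 'qty': 3},
         {'period': 'feb', 'qty': 5}]
grouped = groupby(lambda s: s['period'], sales)
totals = {k: sum(pluck('qty', v)) for k, v in grouped.items()}
assert totals == {'jan': 5, 'feb': 5}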
def execute_aggregation_dataframe(op, data, scope=None, **kwargs): assert op.metrics, 'no metrics found during aggregation execution' if op.sort_keys: raise NotImplementedError( 'sorting on aggregations not yet implemented' ) predicates = op.predicates if predicates: predicate = functools.reduce( operator.and_, (execute(p, scope=scope, **kwargs) for p in predicates), ) data = data.loc[predicate] columns = {} if op.by: grouping_key_pairs = list( zip(op.by, map(operator.methodcaller('op'), op.by)) ) grouping_keys = [ by_op.name if isinstance(by_op, ops.TableColumn) else execute(by, scope=scope, **kwargs).rename(by.get_name()) for by, by_op in grouping_key_pairs ] columns.update( (by_op.name, by.get_name()) for by, by_op in grouping_key_pairs if hasattr(by_op, 'name') ) source = data.groupby(grouping_keys) else: source = data new_scope = toolz.merge(scope, {op.table.op(): source}) pieces = [ pd.Series( execute(metric, scope=new_scope, **kwargs), name=metric.get_name() ) for metric in op.metrics ] # group by always needs a reset to get the grouping key back as a column result = pd.concat(pieces, axis=1).reset_index() result.columns = [columns.get(c, c) for c in result.columns] if op.having: # .having(...) is only accessible on groupby, so this should never # raise if not op.by: raise ValueError( 'Filtering out aggregation values is not allowed without at ' 'least one grouping key' ) # TODO(phillipc): Don't recompute identical subexpressions predicate = functools.reduce( operator.and_, ( execute(having, scope=new_scope, **kwargs) for having in op.having ), ) assert len(predicate) == len( result ), 'length of predicate does not match length of DataFrame' result = result.loc[predicate.values] return result
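# execute_aggregation_dataframe computes each metric as a Series against a
# shared groupby, concatenates the pieces column-wise, and resets the index
# to recover the grouping key as a column. The pandas shape of that, minus
# ibis; the column names are made up:
import pandas as pd

data = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]})
source = data.groupby('g')
pieces = [source['v'].sum().rename('total'),
          source['v'].mean().rename('avg')]
result = pd.concat(pieces, axis=1).reset_index()
assert list(result.columns) == ['g', 'total', 'avg']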
def add_remarks(row): start_date = row.get("start_date") return merge( row, {"remarks": "{} SAL".format(start_date.strftime("%b").upper())} )
def fn(row): docname = row.get("sales_order") return merge(row, {"Draft": row.get("creation")}, comments.get(docname, {}))