def setup(self):
    """Wait for the requested keys to reach the scheduler, then start tracking.

    This is a generator: it yields ``gen.sleep`` futures while polling, so it
    is presumably driven as a tornado coroutine (the decorator is not visible
    in this block -- confirm).

    On completion ``self.keys`` and ``self.all_keys`` are dicts that map the
    result of ``self.func(key)`` to a set of keys: ``self.keys`` holds the
    keys still being tracked, ``self.all_keys`` every key of interest.
    """
    keys = self.keys

    # Poll until every requested key is known to the scheduler.
    while not keys.issubset(self.scheduler.tasks):
        yield gen.sleep(0.05)

    tasks = [self.scheduler.tasks[k] for k in keys]

    # Cleared while the plugin is being registered; re-populated below.
    self.keys = None

    self.scheduler.add_plugin(self)  # subtle race condition here
    self.all_keys, errors = dependent_keys(tasks, complete=self.complete)
    if not self.complete:
        self.keys = self.all_keys.copy()
    else:
        self.keys, _ = dependent_keys(tasks, complete=False)
    self.all_keys.update(keys)
    # Errored keys that are of interest stay in the tracked set.
    self.keys |= errors & self.all_keys

    # Nothing left to track: stop immediately.
    if not self.keys:
        self.stop(exception=None, key=None)

    # Group keys by func name
    self.keys = valmap(set, groupby(self.func, self.keys))
    self.all_keys = valmap(set, groupby(self.func, self.all_keys))
    # Every group present in all_keys also gets an (empty) entry in keys.
    for k in self.all_keys:
        if k not in self.keys:
            self.keys[k] = set()

    # Replay the 'erred' transition for keys that had already failed.
    for k in errors:
        self.transition(k, None, 'erred', exception=True)
    logger.debug("Set up Progress keys")
def function(scheduler, p):
    """Summarise the progress object ``p`` as a plain dict.

    The result carries per-group counts of all keys (``'all'``), per-group
    counts of the keys still tracked (``'remaining'``) and the ``'status'``.
    When the status is ``'error'`` the entries of ``p.extra`` are merged in.
    ``scheduler`` is accepted but not used.
    """
    summary = {
        'all': valmap(len, p.all_keys),
        'remaining': valmap(len, p.keys),
        'status': p.status,
    }
    if p.status != 'error':
        return summary
    summary.update(p.extra)
    return summary
def _scatter(self, data, workers=None, broadcast=False):
    """ Scatter data to local data dictionary

    Rather than send data out to the cluster we keep data local.  However
    we do report to the scheduler that the local worker has the scattered
    data.  This allows other workers to come by and steal this data if
    desired.

    Keywords like ``broadcast=`` do not work, however operations like
    ``.replicate`` work fine after calling scatter, which can fill in for
    this functionality.

    Parameters
    ----------
    data : list, tuple, set, frozenset or dict
        Values to scatter.  For a dict the keys name the data; keys that
        are not ``bytes``/``str`` are converted with ``tokey`` first.
    workers, broadcast
        Must be left at their defaults (``None`` / ``False``).

    Returns (via ``gen.Return``)
    ----------------------------
    A dict of futures keyed like ``data`` when ``data`` is a dict,
    otherwise a collection of the same type as ``data`` holding futures.

    Raises
    ------
    NotImplementedError
        If ``workers`` or ``broadcast`` is given.
    TypeError
        If ``data`` is not one of the supported container types.
    """
    with log_errors():
        if not (workers is None and broadcast is False):
            raise NotImplementedError("Scatter from worker doesn't support workers or broadcast keywords")

        # Normalise non-string dict keys, then map the futures back onto
        # the caller's original keys.
        if isinstance(data, dict) and not all(isinstance(k, (bytes, str))
                                              for k in data):
            d = yield self._scatter(keymap(tokey, data), workers, broadcast)
            raise gen.Return({k: d[tokey(k)] for k in data})

        if isinstance(data, (list, tuple, set, frozenset)):
            keys = []
            for x in data:
                try:
                    keys.append(tokenize(x))
                except Exception:
                    # Fall back to a random key when the value cannot be
                    # tokenized.  ``Exception`` (not a bare except) so that
                    # KeyboardInterrupt/SystemExit still propagate.
                    keys.append(str(uuid.uuid1()))
            data2 = dict(zip(keys, data))
        elif isinstance(data, dict):
            keys = set(data)
            data2 = data
        else:
            raise TypeError("Don't know how to scatter %s" % type(data))

        # Computed once and reused for the scheduler report below.
        nbytes = valmap(sizeof, data2)

        # The worker's data is updated through its event loop rather than
        # directly from here -- thread safety matters.
        self.worker.loop.add_callback(self.worker.data.update, data2)

        yield self.scheduler.update_data(
            who_has={key: [self.worker.address] for key in data2},
            nbytes=nbytes,
            client=self.id)

        if isinstance(data, dict):
            out = {k: Future(k, self) for k in data}
        elif isinstance(data, (tuple, list, set, frozenset)):
            out = type(data)([Future(k, self) for k in keys])
        else:
            raise TypeError(
                "Input to scatter must be a list or dict")

        # The data is already available, so mark every future finished.
        for key in keys:
            self.futures[key]['status'] = 'finished'
            self.futures[key]['event'].set()

        raise gen.Return(out)
def setup(self, keys, complete):
    """Run ``Progress.setup`` and then bucket the keys by ``self.func``.

    After this call ``self.keys`` and ``self.all_keys`` are dicts mapping
    ``self.func(key)`` to a set of keys; every group in ``self.all_keys``
    also has an entry (possibly empty) in ``self.keys``.

    Returns whatever ``Progress.setup`` returned.
    """
    errors = Progress.setup(self, keys, complete)

    # Group keys by func name
    self.keys = valmap(set, groupby(self.func, self.keys))
    self.all_keys = valmap(set, groupby(self.func, self.all_keys))

    # Groups with nothing left to track still get an empty set.
    for group in self.all_keys:
        self.keys.setdefault(group, set())

    logger.debug("Set up Progress keys")
    return errors
def expect_dtypes(**named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.

    Each keyword maps an argument name to a numpy dtype or a tuple of
    dtypes.  A ``TypeError`` is raised immediately if a specification is
    neither, and at call time if an argument's ``dtype`` is not among the
    expected ones.

    Usage
    -----
    >>> from numpy import dtype, arange, int8, float64
    >>> @expect_dtypes(x=dtype(int8))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3, dtype=int8), 'foo')
    (array([0, 1, 2], dtype=int8), 'foo')
    >>> foo(arange(3, dtype=float64), 'foo')  # doctest: +NORMALIZE_WHITESPACE
    ...                                       # doctest: +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a value with dtype 'int8' for argument 'x',
    but got 'float64' instead.
    """
    for name, type_ in iteritems(named):
        if not isinstance(type_, (dtype, tuple)):
            # Report the offending value (``type_``), not the numpy
            # ``dtype`` class itself.
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )

    # A single dtype is normalised to a one-element tuple.
    @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_dtype(dtypes):
        """
        Factory for dtype-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # name.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.name
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a value with dtype {dtype_str} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                dtype_str=' or '.join(repr(d.name) for d in dtypes),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            # A fresh ``object()`` default never matches any dtype, so
            # values without a ``dtype`` attribute are rejected.
            if getattr(argvalue, 'dtype', object()) not in dtypes:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_dtype, named))
def expect_element(*_pos, **named):
    """
    Preprocessing decorator that checks each named argument is a member of
    the collection given for it.

    Usage
    -----
    >>> @expect_element(x=('a', 'b'))
    ... def foo(x):
    ...    return x.upper()
    ...
    >>> foo('a')
    'A'
    >>> foo('b')
    'B'
    >>> foo('c')
    Traceback (most recent call last):
       ...
    ValueError: foo() expected a value in ('a', 'b') for argument 'x', but got 'c' instead.  # noqa

    Notes
    -----
    Membership is tested with the `in` operator (__contains__), so any
    custom container that supports the container protocol can be used.

    Raises
    ------
    TypeError
        If any positional argument is supplied.
    """
    if not _pos:
        return preprocess(**valmap(_expect_element, named))
    raise TypeError("expect_element() only takes keyword arguments.")
def expect_dtypes(*_pos, **named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.

    Each keyword maps an argument name to a numpy dtype or a tuple of
    dtypes.

    Usage
    -----
    >>> from numpy import dtype, arange
    >>> @expect_dtypes(x=dtype(int))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3), 'foo')
    (array([0, 1, 2]), 'foo')
    >>> foo(arange(3, dtype=float), 'foo')
    Traceback (most recent call last):
       ...
    TypeError: foo() expected an argument with dtype 'int64' for argument 'x', but got dtype 'float64' instead.  # noqa

    Raises
    ------
    TypeError
        If positional arguments are supplied, or if a specification is
        neither a numpy dtype nor a tuple.
    """
    if _pos:
        raise TypeError("expect_dtypes() only takes keyword arguments.")

    for name, type_ in iteritems(named):
        if not isinstance(type_, (dtype, tuple)):
            # Report the offending value (``type_``), not the numpy
            # ``dtype`` class itself.
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )
    return preprocess(**valmap(_expect_dtype, named))
def get_has_what(self, stream, keys=None):
    """Return entries of ``self.has_what`` with their values as lists.

    With ``keys=None`` every entry is returned.  Otherwise only the given
    keys are looked up, each first passed through ``coerce_to_address``; an
    empty ``keys`` yields an empty dict.  ``stream`` is not used.
    """
    if keys is None:
        return valmap(list, self.has_what)
    if keys:
        keys = [coerce_to_address(key) for key in keys]
    return {k: list(self.has_what[k]) for k in keys}
def merge_ownership_periods(mappings):
    """
    Close the gaps between ownership periods.

    ``mappings`` is a dict whose values are lists of OwnershipPeriod
    objects.  The result has the same keys; each value becomes a tuple of
    new OwnershipPeriod objects, sorted, where every period's end is set to
    the start of the period that follows it.  The last period's end is set
    to the max Timestamp (UTC).
    """
    def _close_gaps(periods):
        # A fake trailing period whose start is the max timestamp, so the
        # final real period is extended up to it.
        sentinel = OwnershipPeriod(
            pd.Timestamp.max.tz_localize('utc'),
            None,
            None,
            None,
        )
        ordered = sorted(periods)
        ordered.append(sentinel)
        return tuple(
            OwnershipPeriod(
                current.start,
                following.start,
                current.sid,
                current.value,
            )
            for current, following in sliding_window(2, ordered)
        )

    return valmap(_close_gaps, mappings)
def test_retrieve_specific_type(self, type_, lookup_name, failure_type):
    """Type-specific lookups return only assets of ``type_``.

    Looking up sids of the matching type must give a dict of ``type_``
    instances keyed by sid; looking up sids of the other type, or a mix of
    both, must raise ``failure_type``.
    """
    equities = make_simple_equity_info(
        range(5),
        start_date=pd.Timestamp("2014-01-01"),
        end_date=pd.Timestamp("2015-01-01"),
    )
    max_equity = equities.index.max()
    futures = make_commodity_future_info(
        first_sid=max_equity + 1,
        root_symbols=["CL"],
        years=[2014],
    )
    equity_sids = [0, 1]
    future_sids = [max_equity + offset for offset in (1, 2, 3)]

    if type_ == Equity:
        success_sids, fail_sids = equity_sids, future_sids
    else:
        success_sids, fail_sids = future_sids, equity_sids

    with tmp_asset_finder(equities=equities, futures=futures) as finder:
        lookup = getattr(finder, lookup_name)
        # Run twice to exercise caching.
        for _ in range(2):
            results = lookup(success_sids)
            self.assertIsInstance(results, dict)
            self.assertEqual(set(results.keys()), set(success_sids))
            expected_ints = dict(zip(success_sids, success_sids))
            self.assertEqual(valmap(int, results), expected_ints)
            found_types = {type(asset) for asset in itervalues(results)}
            self.assertEqual({type_}, found_types)

            with self.assertRaises(failure_type):
                lookup(fail_sids)

            # Should fail if **any** of the assets are bad.
            mixed_sids = [success_sids[0], fail_sids[0]]
            with self.assertRaises(failure_type):
                lookup(mixed_sids)
def test_multiprogress(s, a, b):
    """MultiProgress groups keys with ``key_split`` and drains each group.

    The 'x' group must be empty once 'x-3' is in memory, and both groups
    must be empty (status 'finished') once 'y-2' is in memory.
    """
    sched, report = Queue(), Queue()
    s.handle_queues(sched, report)

    graph = {
        'x-1': (inc, 1),
        'x-2': (inc, 'x-1'),
        'x-3': (inc, 'x-2'),
        'y-1': (dec, 'x-3'),
        'y-2': (dec, 'y-1'),
    }
    deps = {
        'x-2': ['x-1'],
        'x-3': ['x-2'],
        'y-1': ['x-3'],
        'y-2': ['y-1'],
    }
    s.update_graph(tasks=valmap(dumps_task, graph),
                   keys=['y-2'],
                   dependencies=deps)

    p = MultiProgress(['y-2'], scheduler=s, func=key_split)
    yield p.setup()

    assert p.keys == {'x': {'x-1', 'x-2', 'x-3'},
                      'y': {'y-1', 'y-2'}}

    # Consume reports until 'x-3' is reported in memory.
    msg = yield report.get()
    while not (msg['op'] == 'key-in-memory' and msg['key'] == 'x-3'):
        msg = yield report.get()

    assert p.keys == {'x': set(),
                      'y': {'y-1', 'y-2'}}

    # Consume reports until 'y-2' is reported in memory.
    msg = yield report.get()
    while not (msg['op'] == 'key-in-memory' and msg['key'] == 'y-2'):
        msg = yield report.get()

    assert p.keys == {'x': set(),
                      'y': set()}

    assert p.status == 'finished'
def assemble_ast(tag:str, idsclasses: Mapping[str, str], attrs: Mapping[str, str], body: list):
    """
    Small helper function for the template_2_ast function that assembles the
    appropriate ast element given the tag name, a dictionary of ids/classes
    from the tag name, further attrs, and a list of children or the body.
    For most components, there won't be any children.

    The ``attrs`` mapping passed in is left untouched; the merged attributes
    are built in a fresh dict.

    :param tag: tag or component name; a name starting with an uppercase
        ASCII letter is treated as a component
    :param idsclasses: may hold ``'id'`` and/or ``'class'`` strings, which
        are appended (space separated) to the ones in ``attrs``
    :param attrs: further attributes; dict values (e.g. ``style``) are
        rendered as ``'key: value;'`` pairs joined by spaces
    :param body: list of children / body nodes, passed through unchanged
    :return: ``{'name', 'props', 'children'}`` for a component, otherwise
        ``{'tag', 'attrs', 'body'}``
    """
    iscomponent = re.match(r'^[A-Z]', tag)

    # Work on a copy so the caller's mapping is never modified.
    merged = dict(attrs)
    for key in ('id', 'class'):
        merged[key] = (merged.get(key, '') + ' ' + idsclasses.get(key, '')).strip()

    # remove the empty attributes to avoid clutter and save bytes.
    merged = {
        key: val for key, val in merged.items()
        if not (isinstance(val, str) and val.strip() == '')
    }

    # special handling for the "style" attribute, since that can be a
    # dictionary
    merged = {
        key: (' '.join('{}: {};'.format(k, v) for k, v in val.items())
              if isinstance(val, dict) else val)
        for key, val in merged.items()
    }

    if iscomponent:
        return {'name': tag, 'props': merged, 'children': body}
    else:
        return {'tag': tag, 'attrs': merged, 'body': body}
def symbol_ownership_map(self):
    """Build the map from (company_symbol, share_class_symbol) to ownerships.

    Every row of ``self.equity_symbol_mappings`` becomes a SymbolOwnership.
    For each symbol pair the ownerships are sorted and each one's end is
    replaced by the start of the next; the last one ends at the max
    Timestamp (UTC).  The values of the returned dict are tuples.
    """
    rows = sa.select(self.equity_symbol_mappings.c).execute().fetchall()

    mappings = {}
    for row in rows:
        symbol_pair = (row.company_symbol, row.share_class_symbol)
        mappings.setdefault(symbol_pair, []).append(
            SymbolOwnership(
                pd.Timestamp(row.start_date, unit="ns", tz="utc"),
                pd.Timestamp(row.end_date, unit="ns", tz="utc"),
                row.sid,
                row.symbol,
            )
        )

    def _close_gaps(ownerships):
        # concat with a fake ownership object to make the last
        # end date be max timestamp
        sentinel = SymbolOwnership(
            pd.Timestamp.max.tz_localize("utc"), None, None, None
        )
        windows = sliding_window(2, concatv(sorted(ownerships), [sentinel]))
        return tuple(
            SymbolOwnership(current.start, following.start,
                            current.sid, current.symbol)
            for current, following in windows
        )

    # The factory hands valmap the existing dict, so the results are
    # written back into ``mappings`` rather than into a new dict.
    return valmap(_close_gaps, mappings, factory=lambda: mappings)
def into(a, b, **kwargs):
    """Read the CSV described by ``b`` with ``pd.read_csv``.

    ``a`` is not used in the body.  The dialect of ``b`` is merged with
    ``kwargs`` (``kwargs`` wins) and only keywords that ``pd.read_csv``
    accepts are forwarded.  Columns whose numpy dtype is a ``datetime64``
    are passed via ``parse_dates`` instead of ``dtype``.  Gzip
    compression is requested when ``b.open`` is ``gzip.open``.

    Returns the result of ``pd.read_csv``.
    """
    dialect = b.dialect.copy()
    # These entries are not forwarded; tolerate them being absent.
    dialect.pop('lineterminator', None)
    dialect.pop('strict', None)

    schema = b.schema
    # Strip option markers ('?') from the schema before deriving dtypes.
    if '?' in str(schema):
        schema = dshape(str(schema).replace('?', ''))

    dtypes = valmap(to_numpy_dtype, schema[0].dict)

    datenames = [name for name, dt in dtypes.items()
                 if np.issubdtype(dt, np.datetime64)]

    dtypes = {k: v for k, v in dtypes.items()
              if not np.issubdtype(v, np.datetime64)}

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.merge(dialect, kwargs)
    options = toolz.keyfilter(lambda k: k in kws, options)

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    return pd.read_csv(b.path,
                       skiprows=1 if b.header else 0,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=b.columns,
                       **options)
def check_type(*ty_args, **ty_kwargs):
    """
    [Decorator] Check the types of the input arguments; a failed check
    raises CheckError (per the original note; the raise itself is not in
    this function -- presumably it happens inside ``type_check``).

    :param ty_args: tuple of types
    :param ty_kwargs: dict of types
    :return: the result of ``arg_process`` applied to the built checkers
    """
    valid_spec = (type, tuple)

    # Reject any invalid positional specification.
    for ty in ty_args:
        if isinstance(ty, valid_spec):
            continue
        raise TypeError(
            "check_type() expected a type or tuple of types"
            ", but got {type_} instead.".format(
                type_=ty,
            )
        )

    # Reject any invalid keyword specification.
    for name, ty in six.iteritems(ty_kwargs):
        if isinstance(ty, valid_spec):
            continue
        raise TypeError(
            "check_type() expected a type or tuple of types for "
            "argument '{name}', but got {type_} instead.".format(
                name=name, type_=ty,
            )
        )

    # Apply type_check to every specification.
    positional_checks = [type_check(ty) for ty in ty_args]
    keyword_checks = valmap(type_check, ty_kwargs)
    return arg_process(*positional_checks, **keyword_checks)
def expect_types(*_pos, **named):
    """
    Preprocessing decorator that checks each named argument against an
    expected type (or tuple of types).

    Usage
    -----
    >>> @expect_types(x=int, y=str)
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(2, '3')
    (2, '3')
    >>> foo(2.0, '3')
    Traceback (most recent call last):
       ...
    TypeError: foo() expected an argument of type 'int' for argument 'x', but got float instead.  # noqa

    Raises
    ------
    TypeError
        If positional arguments are supplied, or if a specification is
        neither a type nor a tuple.
    """
    if _pos:
        raise TypeError("expect_types() only takes keyword arguments.")

    valid_spec = (type, tuple)
    for name, type_ in iteritems(named):
        if isinstance(type_, valid_spec):
            continue
        message = (
            "expect_types() expected a type or tuple of types for "
            "argument '{name}', but got {type_} instead."
        ).format(name=name, type_=type_)
        raise TypeError(message)

    checkers = valmap(_expect_type, named)
    return preprocess(**checkers)
def md(template, *args, **kwargs):
    """Wraps string.format with naive markdown escaping

    Every string argument has the characters ``* # _ ~ ` >`` prefixed with
    a backslash before being substituted into ``template``.  The template
    itself is not escaped.  Non-string arguments (numbers, ...) are passed
    through unchanged so that they can be formatted normally instead of
    failing on ``.replace``.

    Returns the formatted string.
    """
    def escape(s):
        if not isinstance(s, str):
            return s
        for char in ('*', '#', '_', '~', '`', '>'):
            s = s.replace(char, '\\' + char)
        return s

    escaped_kwargs = {name: escape(value) for name, value in kwargs.items()}
    return template.format(*map(escape, args), **escaped_kwargs)
def __init__(self, constructors, name, *args, **kwargs):
    """Initialise after checking that no argument is bound twice.

    Every positional and keyword argument is passed through
    ``self._unwrap_name``.  If the same unwrapped value appears more than
    once -- at two positions, at a position and a keyword, or at two
    keywords -- a ``TypeError`` naming both locations is raised.

    After the parent initialiser has run, ``name`` is removed from
    ``constructors`` and the mapping is kept on ``self._constructors``.
    """
    args = tuple(map(self._unwrap_name, args))
    kwargs = valmap(self._unwrap_name, kwargs)

    # Maps each argument to where it was first bound: an int for a
    # positional index, otherwise the keyword name.
    already_bound = {}
    for n, arg in enumerate(args):
        if arg in already_bound:
            raise TypeError(
                'argument %r at position %d is already bound to the'
                ' positional argument at index %d' % (
                    arg,
                    n,
                    already_bound[arg],
                ),
            )
        already_bound[arg] = n

    for k, arg in kwargs.items():
        if arg in already_bound:
            loc = already_bound[arg]
            raise TypeError(
                'argument %r at keyword %s is already bound to the %s' % (
                    arg,
                    k,
                    ('positional argument at index %d' % loc)
                    if isinstance(loc, int) else
                    ('keyword argument %r' % loc),
                ),
            )
        # Record keyword bindings too, so a value repeated under two
        # keywords is detected and reported.
        already_bound[arg] = k

    super().__init__(constructors, name, *args, **kwargs)
    del constructors[name]
    self._constructors = constructors
def test_compression_binary(fmt):
    """Files compressed with ``fmt`` read back as the original bytes."""
    from dask.bytes.core import open_files

    compressed = valmap(compression.compress[fmt], files)
    with filetexts(compressed, mode='b'):
        handles = open_files('.test.accounts.*', compression=fmt)
        lazy_reads = [handle.read() for handle in handles]
        data = compute(*lazy_reads)
        expected = [files[name] for name in sorted(files)]
        assert list(data) == expected
def expect_kinds(**named):
    """
    Preprocessing decorator that verifies inputs have expected dtype kinds.

    Each keyword maps an argument name to a kind string or a tuple of kind
    strings.  A ``TypeError`` is raised immediately if a specification is
    neither, and at call time if an argument's ``dtype.kind`` is not among
    the expected ones.

    Usage
    -----
    >>> from numpy import int64, int32, float32
    >>> @expect_kinds(x='i')
    ... def foo(x):
    ...    return x
    ...
    >>> foo(int64(2))
    2
    >>> foo(int32(2))
    2
    >>> foo(float32(2))
    Traceback (most recent call last):
       ...
    TypeError: foo() expected a numpy object of kind 'i' for argument 'x', but got 'f' instead.  # noqa
    """
    for name, kind in iteritems(named):
        if not isinstance(kind, (str, tuple)):
            # Report this function's own name and the offending value.
            raise TypeError(
                "expect_kinds() expected a string or tuple of strings"
                " for argument {name!r}, but got {kind} instead.".format(
                    name=name, kind=kind,
                )
            )

    # A single kind is normalised to a one-element tuple.
    @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_kind(kinds):
        """
        Factory for kind-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # kind.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.kind
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a numpy object of kind {kinds} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                kinds=' or '.join(map(repr, kinds)),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            # A fresh ``object()`` default never matches any kind.
            if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_kind, named))
def check_subset(*ss_args, **ss_kwargs):
    """
    [Decorator] Check that input arguments are subsets of a given
    collection; a failed check raises CheckError (per the original note;
    the raise itself is not in this function -- presumably it happens
    inside ``subset_check``).

    :param ss_args: tuple of allowed-value collections
    :param ss_kwargs: dict of allowed-value collections
    :return: the result of ``arg_process`` applied to the built checkers
    """
    valid_spec = (list, set, type(None))

    # Reject any invalid positional specification.
    for ss in ss_args:
        if isinstance(ss, valid_spec):
            continue
        raise TypeError(
            "check_subset() expected a list or set or None of values"
            ", but got {subset_} or tuple instead.".format(
                subset_=str(type(ss)),
            )
        )

    # Reject any invalid keyword specification.
    for name, ss in six.iteritems(ss_kwargs):
        if isinstance(ss, valid_spec):
            continue
        raise TypeError(
            "check_subset() expected a list or set of values for "
            "argument '{name_}', but got {subset_} or tuple instead.".format(
                name_=name, subset_=str(type(ss)),
            )
        )

    # Apply subset_check to every specification.
    positional_checks = [subset_check(ss) for ss in ss_args]
    keyword_checks = valmap(subset_check, ss_kwargs)
    return arg_process(*positional_checks, **keyword_checks)
def __init__(self, data=None, formats=None, authorization=None, allow_profiler=False, profiler_output=None, profile_by_default=False, allow_add=False):
    """Create the Flask app and register the ``api`` blueprint on it.

    ``data`` may be a mapping (``_Data`` values are replaced by their
    ``.data``), a ``_Data`` object (replaced by ``data._resources()``) or
    ``None`` (an empty dict is used).  ``formats`` defaults to
    ``(json,)``.  The remaining options are forwarded unchanged to
    ``register_blueprint``.
    """
    # ``collections.Mapping`` was removed in Python 3.10; the ABC lives in
    # ``collections.abc``.  Fall back for interpreters without it.
    try:
        from collections.abc import Mapping
    except ImportError:  # Python 2
        from collections import Mapping

    if isinstance(data, Mapping):
        data = valmap(lambda v: v.data if isinstance(v, _Data) else v,
                      data)
    elif isinstance(data, _Data):
        data = data._resources()

    app = self.app = Flask('blaze.server.server')
    if data is None:
        data = {}
    app.register_blueprint(api,
                           data=data,
                           formats=formats if formats is not None else (json,),
                           authorization=authorization,
                           allow_profiler=allow_profiler,
                           profiler_output=profiler_output,
                           profile_by_default=profile_by_default,
                           allow_add=allow_add)
    self.data = data
def test_multi_progressbar_widget(s, a, b):
    """MultiProgressWidget shows one bar per key group.

    After listening, the 'x' and 'y' bars must be full and styled
    'success', the 'e' bar empty, the widget status 'error', and the bars
    ordered by decreasing capacity.
    """
    graph = {
        'x-1': (inc, 1),
        'x-2': (inc, 'x-1'),
        'x-3': (inc, 'x-2'),
        'y-1': (dec, 'x-3'),
        'y-2': (dec, 'y-1'),
        'e': (throws, 'y-2'),
        'other': (inc, 123),
    }
    deps = {
        'x-2': ['x-1'],
        'x-3': ['x-2'],
        'y-1': ['x-3'],
        'y-2': ['y-1'],
        'e': ['y-2'],
    }
    s.update_graph(tasks=valmap(dumps_task, graph),
                   keys=['e'],
                   dependencies=deps)

    p = MultiProgressWidget(['e'], scheduler=(s.ip, s.port))
    yield p.listen()

    assert p.bars['x'].value == 1.0
    assert p.bars['y'].value == 1.0
    assert p.bars['e'].value == 0.0
    assert '3 / 3' in p.bar_texts['x'].value
    assert '2 / 2' in p.bar_texts['y'].value
    assert '0 / 1' in p.bar_texts['e'].value

    assert p.bars['x'].bar_style == 'success'
    assert p.bars['y'].bar_style == 'success'
    # assert p.bars['e'].bar_style == 'danger'

    assert p.status == 'error'

    # The capacity is the number after the slash in each row's text.
    capacities = []
    for row in p.bar_widgets.children:
        fraction = re.search(r'\d+ / \d+', row.children[0].value).group()
        capacities.append(int(fraction.split(' / ')[1]))
    assert sorted(capacities, reverse=True) == capacities
def _get(self, dsk, keys, restrictions=None, raise_on_error=True):
    """Send the graph ``dsk`` to the scheduler and gather ``keys``.

    Generator that finishes with ``gen.Return``.  With ``raise_on_error``
    true the gathered result is returned and gather errors propagate;
    otherwise the result is ``('OK', value)`` on success or
    ``('error', exception)`` on failure.
    """
    flatkeys = list(flatten([keys]))
    futures = {key: Future(key, self) for key in flatkeys}

    # unpack_remotedata gives a pair per task: element 0 is used as the
    # task, element 1 as its dependency set.
    unpacked = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: pair[0] for k, pair in unpacked.items()}
    # Skip tasks that compare equal to their own key.
    dsk3 = {k: v for k, v in dsk2.items() if (k == v) is not True}

    dependencies = {k: pair[1] for k, pair in unpacked.items()}
    for k, v in dsk3.items():
        dependencies[k] |= set(_deps(dsk, v))

    message = {'op': 'update-graph',
               'tasks': valmap(dumps_task, dsk3),
               'dependencies': dependencies,
               'keys': flatkeys,
               'restrictions': restrictions or {},
               'client': self.id}
    self._send_to_scheduler(message)

    packed = pack_data(keys, futures)

    if raise_on_error:
        result = yield self._gather(packed)
        raise gen.Return(result)

    try:
        gathered = yield self._gather(packed)
    except Exception as e:
        outcome = 'error', e
    else:
        outcome = 'OK', gathered
    raise gen.Return(outcome)
def persist(self, collections):
    """ Persist dask collections on cluster

    Sends the merged, optimized graph of the collections to the scheduler
    and hands back equivalent collections that are built on futures for
    their keys.

    Parameters
    ----------
    collections: sequence or single dask object
        Collections like dask.array or dataframe or dask.value objects

    Returns
    -------
    List of collections, or single collection, depending on type of input.

    Examples
    --------
    >>> xx = executor.persist(x)  # doctest: +SKIP
    >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

    See Also
    --------
    Executor.compute
    """
    singleton = not isinstance(collections, (tuple, list, set, frozenset))
    if singleton:
        collections = [collections]

    assert all(isinstance(c, Base) for c in collections)

    # Collections sharing an optimizer are optimized together.
    groups = groupby(lambda x: x._optimize, collections)
    optimized = [
        opt(merge([v.dask for v in val]), [v._keys() for v in val])
        for opt, val in groups.items()
    ]
    dsk = merge(optimized)

    # unpack_remotedata gives a pair per task: element 0 is used as the
    # task, element 1 as its dependency set.
    unpacked = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: pair[0] for k, pair in unpacked.items()}
    dependencies = {k: pair[1] for k, pair in unpacked.items()}

    for k, v in dsk2.items():
        dependencies[k] |= set(_deps(dsk, v))

    names = list({k for c in collections for k in flatten(c._keys())})

    message = {'op': 'update-graph',
               'tasks': valmap(dumps_task, dsk2),
               'dependencies': dependencies,
               'keys': names,
               'client': self.id}
    self._send_to_scheduler(message)

    result = [
        redict_collection(c, {k: Future(k, self)
                              for k in flatten(c._keys())})
        for c in collections
    ]
    return first(result) if singleton else result
def optimize(dsk, keys, **kwargs):
    """Cull ``dsk`` to ``keys``, fuse it, then rewrite every task.

    ``keys`` may be a single key or a (possibly nested) list of keys.
    Extra keyword arguments are accepted and ignored.
    """
    wanted = list(core.flatten(keys)) if isinstance(keys, list) else [keys]
    culled = cull(dsk, wanted)
    fused = fuse(culled)
    return valmap(rewrite_rules.rewrite, fused)
def test_compression(s3, fmt, blocksize):
    """Compressed S3 files read through ``read_bytes`` match the originals."""
    compressed = valmap(compress[fmt], files)
    with s3_context('compress', compressed) as s3:
        sample, values = read_bytes('compress/test/accounts.*', s3=s3,
                                    compression=fmt, blocksize=blocksize)
        first_name = sorted(files)[0]
        assert sample.startswith(files[first_name][:10])

        results = compute(*concat(values))
        expected = b''.join([files[name] for name in sorted(files)])
        assert b''.join(results) == expected
def lazify(dsk):
    """
    Apply ``lazify_task`` to every task of ``dsk``, dropping calls to
    ``list`` that are not needed.

    See Also:
        ``dask.bag.core.lazify_task``
    """
    lazified = valmap(lazify_task, dsk)
    return lazified
def test_delete_data(s, a, b):
    """Deleting keys on the scheduler removes them from the workers."""
    payload = {'x': 1, 'y': 2, 'z': 3}
    yield s.scatter(data=valmap(dumps, payload))

    assert set(a.data) | set(b.data) == {'x', 'y', 'z'}
    assert merge(a.data, b.data) == {'x': 1, 'y': 2, 'z': 3}

    s.delete_data(keys=['x', 'y'])
    yield s.clear_data_from_workers()

    remaining = set(a.data) | set(b.data)
    assert remaining == {'z'}
def test_compression_text(fmt):
    """Files compressed with ``fmt`` read back as the decoded text."""
    compressed = valmap(compression.compress[fmt], files)
    with filetexts(compressed, mode='b'):
        handles = open_text_files('.test.accounts.*', compression=fmt)
        contents = []
        for handle in handles:
            with handle as f:
                contents.append(f.read())
        expected = [files[name].decode() for name in sorted(files)]
        assert list(contents) == expected
def test_many_Progresss(s, a, b):
    """Ten Progress objects on the same key all end up 'finished'."""
    sched, report = Queue(), Queue()
    s.handle_queues(sched, report)

    graph = {'x': (inc, 1),
             'y': (inc, 'x'),
             'z': (inc, 'y')}
    deps = {'y': ['x'],
            'z': ['y']}
    s.update_graph(tasks=valmap(dumps_task, graph),
                   keys=['z'],
                   dependencies=deps)

    bars = [Progress(keys=['z'], scheduler=s) for i in range(10)]
    yield [bar.setup() for bar in bars]

    # Consume reports until 'z' is reported in memory.
    msg = yield report.get()
    while not (msg['op'] == 'key-in-memory' and msg['key'] == 'z'):
        msg = yield report.get()

    assert all(bar.status == 'finished' for bar in bars)
def test_add_worker(s, a, b):
    """Adding a worker that already holds data keeps scheduler state valid."""
    w = Worker(s.address, ncores=3)
    w.data["x-5"] = 6
    w.data["y"] = 1
    yield w

    dsk = {"x-{}".format(i): (inc, i) for i in range(10)}
    no_dependencies = {key: set() for key in dsk}
    s.update_graph(
        tasks=valmap(dumps_task, dsk),
        keys=list(dsk),
        client="client",
        dependencies=no_dependencies,
    )

    s.add_worker(
        address=w.address,
        keys=list(w.data),
        ncores=w.ncores,
        services=s.services,
    )

    s.validate_state()

    assert w.ip in s.host_info
    expected_addresses = {a.address, b.address, w.address}
    assert s.host_info[w.ip]["addresses"] == expected_addresses
    yield w.close()
def test_add_worker(s, a, b):
    """Adding a worker that already holds data keeps scheduler state valid."""
    w = Worker(s.ip, s.port, ncores=3)
    w.data['x-5'] = 6
    w.data['y'] = 1
    yield w._start(0)

    dsk = {'x-{}'.format(i): (inc, i) for i in range(10)}
    no_dependencies = {key: set() for key in dsk}
    s.update_graph(tasks=valmap(dumps_task, dsk),
                   keys=list(dsk),
                   client='client',
                   dependencies=no_dependencies)

    s.add_worker(address=w.address,
                 keys=list(w.data),
                 ncores=w.ncores,
                 services=s.services)

    s.validate_state()

    assert w.ip in s.host_info
    expected_addresses = {a.address, b.address, w.address}
    assert s.host_info[w.ip]['addresses'] == expected_addresses
    yield w._close()
def coerce_types(**kwargs):
    """
    Preprocessing decorator that applies type coercions.

    Parameters
    ----------
    **kwargs : dict[str -> (type, callable)]
         Keyword arguments mapping function parameter names to pairs of
         (from_type, to_type).  Each pair is unpacked into ``coerce``.

    Examples
    --------
    >>> @coerce_types(x=(float, int), y=(int, str))
    ... def func(x, y):
    ...     return (x, y)
    ...
    >>> func(1.0, 3)
    (1, '3')
    """
    coercions = valmap(lambda pair: coerce(*pair), kwargs)
    return preprocess(**coercions)
def test_compression(s3, fmt, blocksize):
    """Compressed S3 files read back intact; blocked reads are rejected.

    When both a compression format and a blocksize are given,
    ``read_bytes`` must raise ``ValueError``.  Otherwise the sample and the
    concatenated blocks must match the original files.
    """
    if fmt == "zip" and sys.version_info.minor == 5:
        pytest.skip("zipfile is read-only on py35")
    if fmt not in compress:
        pytest.skip("compression function not provided")

    s3._cache.clear()
    pattern = "s3://compress/test/accounts.*"
    with s3_context("compress", valmap(compress[fmt], files)):
        if fmt and blocksize:
            with pytest.raises(ValueError):
                read_bytes(pattern, compression=fmt, blocksize=blocksize)
            return

        sample, values = read_bytes(
            pattern, compression=fmt, blocksize=blocksize
        )
        first_name = sorted(files)[0]
        assert sample.startswith(files[first_name][:10])
        assert sample.endswith(b"\n")

        results = compute(*concat(values))
        expected = b"".join([files[name] for name in sorted(files)])
        assert b"".join(results) == expected
def into(a, b, **kwargs):
    """Read the CSV described by ``b`` with ``pd.read_csv``.

    ``a`` is not used in the body.  The dialect of ``b`` (without its
    'lineterminator' and 'strict' entries) is merged with ``kwargs`` and
    only keywords that ``pd.read_csv`` accepts are forwarded.  Columns
    whose numpy dtype is a ``datetime64`` go to ``parse_dates`` instead of
    ``dtype``.  ``names`` (default ``b.columns``) is used for both
    ``names`` and ``usecols``.
    """
    dialect = b.dialect.copy()
    dialect.pop('lineterminator')  # required entry: KeyError if absent
    dialect.pop('strict', None)

    schema = b.schema
    # Strip option markers ('?') from the schema before deriving dtypes.
    if '?' in str(schema):
        schema = dshape(str(schema).replace('?', ''))

    all_dtypes = valmap(to_numpy_dtype, schema[0].dict)

    datenames = []
    dtypes = {}
    for name, dt in all_dtypes.items():
        if np.issubdtype(dt, np.datetime64):
            datenames.append(name)
        else:
            dtypes[name] = dt

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.keyfilter(lambda k: k in kws,
                              toolz.merge(dialect, kwargs))

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    names = options.pop('names', b.columns)
    usecols = names

    return pd.read_csv(b.path,
                       header=0 if b.header else None,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=names,
                       usecols=usecols,
                       **options)
def load_adjusted_array(self, columns, dates, assets, mask):
    """Query the expression for the date range and delegate the load.

    Rows whose sid is not in ``assets`` are dropped, the rest are grouped
    by sid into one Series per sid (announcement values indexed by
    timestamp), and an ``EarningsCalendarLoader`` built from those Series
    performs the actual load.
    """
    raw = ffill_query_in_range(
        self._expr,
        dates[0],
        dates[-1],
        self._odo_kwargs,
    )

    sids = raw.loc[:, SID_FIELD_NAME]
    unwanted = sids[~sids.isin(assets)].index
    raw.drop(unwanted, inplace=True)

    def mkseries(idx, raw_loc=raw.loc):
        values = raw_loc[
            idx,
            [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
        ].values
        timestamps = pd.DatetimeIndex(values[:, 0])
        return pd.Series(index=timestamps, data=values[:, 1])

    groups = raw.groupby(SID_FIELD_NAME).groups
    loader = EarningsCalendarLoader(
        dates,
        valmap(mkseries, groups),
        dataset=self._dataset,
    )
    return loader.load_adjusted_array(columns, dates, assets, mask)
def merge_ownership_periods(mappings):
    """
    Close the gaps between ownership periods.

    ``mappings`` is a dict whose values are lists of OwnershipPeriod
    objects.  The result has the same keys; each value becomes a tuple of
    new OwnershipPeriod objects, sorted, where every period's end is set to
    the start of the period that follows it.  The last period's end is set
    to the max Timestamp (UTC).
    """
    def _gapless(periods):
        # concat with a fake ownership object to make the last
        # end date be max timestamp
        fake_last = OwnershipPeriod(
            pd.Timestamp.max.tz_localize('utc'),
            None,
            None,
            None,
        )
        merged = []
        pairs = sliding_window(2, concatv(sorted(periods), [fake_last]))
        for current, following in pairs:
            merged.append(
                OwnershipPeriod(
                    current.start,
                    following.start,
                    current.sid,
                    current.value,
                )
            )
        return tuple(merged)

    return valmap(_gapless, mappings)
def test_add_worker(s, a, b):
    """Adding a worker with bytes keys keeps scheduler state valid."""
    w = Worker(s.ip, s.port, ncores=3, ip='127.0.0.1')
    w.data['x-5'] = 6
    w.data['y'] = 1
    yield w._start(0)

    dsk = {('x-%d' % i).encode(): (inc, i) for i in range(10)}
    no_dependencies = {key: set() for key in dsk}
    s.update_graph(tasks=valmap(dumps_task, dsk),
                   keys=list(dsk),
                   client='client',
                   dependencies=no_dependencies)

    s.add_worker(address=w.address,
                 keys=list(w.data),
                 ncores=w.ncores,
                 services=s.services,
                 coerce_address=False)

    s.validate_state()

    assert w.ip in s.host_info
    expected_ports = set(map(str, [a.port, b.port, w.port]))
    assert s.host_info[w.ip]['ports'] == expected_ports
def test_nanny_process_failure(s):
    """The nanny restarts its worker process after the worker exits.

    The worker is made to call ``sys.exit``; the process must die and come
    back within two seconds each, re-register with the scheduler, and both
    worker directories must be gone after the nanny closes.
    """
    n = Nanny(s.ip, s.port, ncores=2, ip='127.0.0.1', loop=s.loop)
    yield n._start()
    nn = rpc(ip=n.ip, port=n.port)
    first_dir = n.worker_dir

    assert os.path.exists(first_dir)

    ww = rpc(ip=n.ip, port=n.worker_port)
    yield ww.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))
    with ignoring(StreamClosedError):
        yield ww.compute(function=dumps(sys.exit),
                         args=dumps((0,)),
                         key='z')

    began = time()
    while n.process.is_alive():  # wait while process dies
        yield gen.sleep(0.01)
        assert time() - began < 2

    began = time()
    while not n.process.is_alive():  # wait while process comes back
        yield gen.sleep(0.01)
        assert time() - began < 2

    # Wait until the restarted worker is registered and has a directory.
    began = time()
    while n.worker_address not in s.ncores or n.worker_dir is None:
        yield gen.sleep(0.01)
        assert time() - began < 2

    second_dir = n.worker_dir

    yield n._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != n.worker_dir
    nn.close_streams()
    s.stop()
def load_raw_arrays(self, dts, assets, fields=None):
    """Shareholder holding changes.

    Reads the 'holder' table for rows whose ``declared_date`` lies between
    ``dts[0]`` and ``dts[1]``, splits the frame into components with
    ``unpack_df_to_component_dict``, optionally narrows every component to
    ``fields`` and keeps only the components whose key is one of the sids
    of ``assets``.
    """
    sids = [a.sid for a in assets]
    table = self.metadata.tables['holder']
    sql = select([
        table.c.sid,
        table.c.declared_date,
        table.c.股东,
        table.c.方式,
        cast(table.c.变动股本, Numeric(10, 2)),
        cast(table.c.总持仓, Integer),
        cast(table.c.占总股本比, Numeric(10, 5)),
        cast(table.c.总流通股, Integer),
        cast(table.c.占流通比, Numeric(10, 5)),
    ]).where(table.c.declared_date.between(dts[0], dts[1]))

    column_names = [
        'sid', 'declared_date', '股东', '方式', '变动股本',
        '总持仓', '占总股本比', '总流通股', '占流通比',
    ]
    rows = self.engine.execute(sql).fetchall()
    frame = pd.DataFrame(rows, columns=column_names)
    frame.set_index('sid', inplace=True)
    frame.drop_duplicates(inplace=True)

    def _select(component):
        # Narrow to the requested fields only when some were given.
        if fields:
            return component.loc[:, fields]
        return component

    frame_dct = unpack_df_to_component_dict(frame, 'declared_date')
    frame_dct = valmap(_select, frame_dct)
    holder_frame = keyfilter(lambda sid: sid in sids, frame_dct)
    return holder_frame
def test_multi_progressbar_widget_after_close(s, a, b):
    """A widget on the 'x-*' keys exposes an 'x' bar after listening."""
    graph = {
        'x-1': (inc, 1),
        'x-2': (inc, 'x-1'),
        'x-3': (inc, 'x-2'),
        'y-1': (dec, 'x-3'),
        'y-2': (dec, 'y-1'),
        'e': (throws, 'y-2'),
        'other': (inc, 123),
    }
    deps = {
        'x-2': {'x-1'},
        'x-3': {'x-2'},
        'y-1': {'x-3'},
        'y-2': {'y-1'},
        'e': {'y-2'},
    }
    s.update_graph(tasks=valmap(dumps_task, graph),
                   keys=['e'],
                   dependencies=deps)

    p = MultiProgressWidget(['x-1', 'x-2', 'x-3'], scheduler=(s.ip, s.port))
    yield p.listen()

    assert 'x' in p.bars
def load_raw_arrays(self, dts, assets, fields=None):
    """Load rows of the 'massive' table grouped into components.

    Reads the rows whose ``declared_date`` lies between ``dts[0]`` and
    ``dts[1]``, splits the frame into components with
    ``unpack_df_to_component_dict``, optionally narrows every component to
    ``fields`` and keeps only the components whose key is one of the sids
    of ``assets``.
    """
    sids = [a.sid for a in assets]

    # Fetch the data
    table = self.metadata.tables['massive']
    sql = select([
        table.c.declared_date,
        table.c.sid,
        cast(table.c.bid_price, Numeric(10, 2)),
        cast(table.c.discount, Numeric(10, 5)),
        cast(table.c.bid_volume, Integer),
        table.c.buyer,
        table.c.seller,
        table.c.cjeltszb,
    ]).where(table.c.declared_date.between(dts[0], dts[1]))

    column_names = [
        'declared_date', 'sid', 'bid_price', 'discount',
        'bid_volume', 'buyer', 'seller', 'cjeltszb',
    ]
    rows = self.engine.execute(sql).fetchall()
    frame = pd.DataFrame(rows, columns=column_names)
    frame.set_index('sid', inplace=True)
    frame.drop_duplicates(inplace=True)

    def _select(component):
        # Narrow to the requested fields only when some were given.
        if fields:
            return component.loc[:, fields]
        return component

    frame_dct = unpack_df_to_component_dict(frame, 'declared_date')
    frame_dct = valmap(_select, frame_dct)
    massive_frame = keyfilter(lambda sid: sid in sids, frame_dct)
    return massive_frame
def load_adjusted_array(self, columns, dates, assets, mask):
    """Query the expression for the needed range and delegate the load.

    The lower query bound is the minimum, over sids, of each sid's latest
    timestamp at or before ``dates[0]`` (or ``dates[0]`` itself when that
    is null); the upper bound is ``dates[-1]``.  The rows are grouped by
    sid into one Series per sid (announcement values indexed by
    timestamp) and handed to an ``EarningsCalendarLoader``.
    """
    expr = self._expr
    odo_kwargs = self._odo_kwargs

    filtered = expr[expr[TS_FIELD_NAME] <= dates[0]]
    latest_per_sid = bz.by(
        filtered[SID_FIELD_NAME],
        timestamp=filtered[TS_FIELD_NAME].max(),
    )
    lower = odo(latest_per_sid.timestamp.min(), pd.Timestamp, **odo_kwargs)
    if pd.isnull(lower):
        # If there is no lower date, just query for data in the date
        # range. It must all be null anyways.
        lower = dates[0]

    in_range = (
        (expr[TS_FIELD_NAME] >= lower) &
        (expr[TS_FIELD_NAME] <= dates[-1])
    )
    raw = odo(expr[in_range], pd.DataFrame, **odo_kwargs)

    sids = raw.loc[:, SID_FIELD_NAME]
    # NOTE(review): this mask only selects rows whose sid is null AND not
    # in ``assets`` -- confirm that dropping just those rows is intended.
    to_drop = sids[~(sids.isin(assets) | sids.notnull())].index
    raw.drop(to_drop, inplace=True)

    def mkseries(idx, raw_loc=raw.loc):
        values = raw_loc[
            idx,
            [TS_FIELD_NAME, ANNOUNCEMENT_FIELD_NAME]
        ].values
        timestamps = pd.DatetimeIndex(values[:, 0])
        return pd.Series(index=timestamps, data=values[:, 1])

    groups = raw.groupby(SID_FIELD_NAME).groups
    loader = EarningsCalendarLoader(
        dates,
        valmap(mkseries, groups),
        dataset=self._dataset,
    )
    return loader.load_adjusted_array(columns, dates, assets, mask)
def test_nanny_process_failure(c, s):
    """Kill a nanny's worker process and check that the nanny replaces it.

    After the worker exits, the nanny must switch to a new process, that
    process must come alive, and the worker must be listed in ``s.ncores``
    again.  Closing the nanny must remove both worker directories.
    """
    nanny = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    yield nanny._start()

    first_dir = nanny.worker_dir
    assert os.path.exists(first_dir)

    original_process = nanny.process
    worker_rpc = rpc(nanny.worker_address)
    yield worker_rpc.update_data(data=valmap(dumps, {'x': 1, 'y': 2}))

    with ignoring(CommClosedError):
        yield c._run(sys.exit, 0, workers=[nanny.worker_address])

    # wait while process dies
    start = time()
    while nanny.process is original_process:
        yield gen.sleep(0.01)
        assert time() - start < 5

    # wait while process comes back
    start = time()
    while not isalive(nanny.process):
        yield gen.sleep(0.01)
        assert time() - start < 5

    # wait until the worker is registered again and has a directory
    start = time()
    while not (nanny.worker_address in s.ncores
               and nanny.worker_dir is not None):
        yield gen.sleep(0.01)
        assert time() - start < 5

    second_dir = nanny.worker_dir

    yield nanny._close()
    assert not os.path.exists(second_dir)
    assert not os.path.exists(first_dir)
    assert first_dir != nanny.worker_dir
    worker_rpc.close_rpc()
    s.stop()
def __init__(self, data=None, formats=None, authorization=None,
             allow_profiler=False, profiler_output=None,
             profile_by_default=False):
    """Create the Flask application and register the ``api`` blueprint.

    Parameters
    ----------
    data : mapping, _Data or None
        A mapping has every ``_Data`` value replaced by its ``.data``
        attribute; a bare ``_Data`` is replaced by ``data._resources()``;
        ``None`` becomes an empty dict.
    formats : optional
        Passed to the blueprint; ``(json,)`` is used when this is ``None``.
    authorization, allow_profiler, profiler_output, profile_by_default
        Forwarded unchanged to ``app.register_blueprint``.

    The Flask app is stored on ``self.app`` and the normalised data on
    ``self.data``.
    """
    # ``collections.Mapping`` was removed in Python 3.10; the ABCs live in
    # ``collections.abc``.  Fall back for interpreters without that module.
    try:
        from collections.abc import Mapping
    except ImportError:  # Python 2
        from collections import Mapping

    if isinstance(data, Mapping):
        data = valmap(lambda v: v.data if isinstance(v, _Data) else v,
                      data)
    elif isinstance(data, _Data):
        data = data._resources()

    app = self.app = Flask('blaze.server.server')
    if data is None:
        data = {}
    app.register_blueprint(api,
                           data=data,
                           formats=formats if formats is not None else (json, ),
                           authorization=authorization,
                           allow_profiler=allow_profiler,
                           profiler_output=profiler_output,
                           profile_by_default=profile_by_default)
    self.data = data
def expect_element(__funcname=_qualified_name, **named):
    """
    Preprocessing decorator that verifies inputs are elements of some
    expected collection.

    Notes
    -----
    A special argument, __funcname, can be provided as a string to override
    the function name shown in error messages.  This is most often used on
    __init__ or __new__ methods to make errors refer to the class name
    instead of the function name.

    Membership is tested with the `in` operator (__contains__), so any
    custom container that supports the container protocol can be used.
    """
    def _expect_element(collection):
        # Sets and frozensets are shown as a sorted tuple to keep the error
        # message less verbose; anything else is shown as-is.
        shown = (
            tuple(sorted(collection))
            if isinstance(collection, (set, frozenset))
            else collection
        )
        message = (
            "%(funcname)s() expected a value in {collection} "
            "for argument '%(argname)s', but got %(actual)s instead."
        ).format(collection=shown)
        not_a_member = complement(op.contains(collection))
        return make_check(
            ValueError,
            message,
            not_a_member,
            repr,
            funcname=__funcname,
        )

    checks = valmap(_expect_element, named)
    return preprocess(**checks)
def main(scheduler, host, worker_port, listen_address, contact_address,
         nanny_port, nthreads, nprocs, nanny, name, memory_limit, pid_file,
         reconnect, resources, bokeh, bokeh_port, local_directory,
         scheduler_file, interface, death_timeout, preload, bokeh_prefix,
         tls_ca_file, tls_cert, tls_key):
    """Validate the worker options, start ``nprocs`` workers and run them.

    Conflicting options are logged and end the process with ``exit(1)``.
    One ``Nanny`` (when ``nanny`` is truthy) or ``Worker`` is created per
    process, and the IOLoop runs until any of them reports
    ``status == 'closed'``.

    Raises
    ------
    ValueError
        If neither ``scheduler`` nor ``scheduler_file`` is given, or if both
        ``interface`` and ``host`` are given.
    """
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    # --- Reject option combinations that cannot work together ---
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker. You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    # --- Parse the addresses; a malformed one is reported and is fatal ---
    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    # The port used for the start address belongs to whichever object is
    # created below: the nanny or the bare worker.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # Default thread count: split ``_ncores`` evenly across the processes.
    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        # Remove the pid file again when the interpreter exits.
        atexit.register(del_pid_file)

    services = {}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            # BokehWorker could not be imported: run without that service.
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    # ``resources`` arrives as "NAME=VALUE" pairs separated by commas and/or
    # whitespace; turn it into a {name: float} dict.
    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port, 'listen_address': listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if not scheduler and not scheduler_file:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    # One Nanny/Worker per requested process, all sharing the same options.
    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          contact_address=contact_address,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield [n.start(addr) for n in nannies]
        # Poll until any of the workers reports that it has closed.
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def call_neural_network(state):
    """Run every model in ``MODELS`` on ``state`` and merge the outputs back.

    ``state`` is mutated: ``'dt'`` is popped, ``'SOLIN'`` is added, and each
    model output ``key`` is stored as ``'F' + key + 'NN'``.  Only the array
    entries present before ``'SOLIN'`` is added are passed to the models.
    Debug saving and zarr logging depend on ``DEBUG`` and
    ``OUTPUT_INTERVAL``.
    """
    logger = logging.getLogger(__name__)

    # Pre-process the inputs
    # ----------------------
    dt = state.pop('dt')
    kwargs = {
        name: value
        for name, value in state.items()
        if isinstance(value, np.ndarray)
    }

    state['SOLIN'] = compute_insolation(state['lat'], state['day'])

    # Compute the output of all the models
    # ------------------------------------
    merged_outputs = {}
    for model in MODELS:
        logger.info("Calling NN")

        nz = 34  # len(model.heights)
        lower_atmos = get_lower_atmosphere(kwargs, nz)

        # add a singleton dimension and convert to float32
        batched = {}
        for name, value in lower_atmos.items():
            batched[name] = value[np.newaxis].astype(np.float32)

        # call the neural network
        out = call_with_numpy_dict(model, batched)

        # remove the singleton first dimension
        out = valmap(np.squeeze, out)
        out = expand_lower_atmosphere(state,
                                      out,
                                      n_in=nz,
                                      n_out=state['QT'].shape[0])

        merged_outputs.update(
            {'F' + key + 'NN': out[key] for key in out})

    # update the state
    state.update(merged_outputs)

    # Debugging info below here
    # -------------------------
    nstep = int(state['nstep'])
    output_this_step = OUTPUT_INTERVAL and (nstep - 1) % OUTPUT_INTERVAL == 0

    if DEBUG:
        debug_payload = {
            'args': (kwargs, dt),
            'out': merged_outputs,
        }
        save_debug(debug_payload, state)

        try:
            logger.info("Mean Precip: %f" % out['Prec'].mean())
        except KeyError:
            pass

    if output_this_step:
        zarr_logger.append_all(kwargs)
        zarr_logger.append_all(merged_outputs)
        zarr_logger.append('time', np.array([state['day']]))
for base in ['A', 'C', 'G', 'T']: new = cb[:i] + base + cb[i+1:] if cb == new: continue if new in counter_some_error: shadow_counts.append(counter_some_error[new]) shadows[cb] = sum(shadow_counts) real[cb] = freq df.append({'n_shadows': shadows[cb], 'n_real': freq, 'cb': cb}) df = pd.DataFrame(df) df['error_rate'] = df['n_shadows'] / df['n_real'] # error_rates = [shadows[cb] / real[cb] for cb, freq in most_common_CBS] return df dfs = toolz.valmap(lambda x: estimate_error_rate_shadows(x[0], x[1]), res_dict) """ the above its the per read error rate: #correct reads / #wrong reads But how does that translate to a per-base error: - to get 16BP correct, we need (1-p_err)^16 - a single error is 16 * err * (1-err)^15 %correct = (1-p_err)^16 c**(1/16) = 1-p p = 1- c**(1/16) """ right = U0 / (U1 + U0)
def test_read_csv_compression(fmt, blocksize):
    """Reading ``fmt``-compressed CSV files must reproduce ``expected``."""
    compressed = valmap(compress[fmt], csv_files)
    with filetexts(compressed, mode='b'):
        ddf = dd.read_csv('2014-01-*.csv',
                          compression=fmt,
                          blocksize=blocksize)
        # Row order/index is not part of the comparison, so reset both sides.
        actual = ddf.compute(scheduler='sync').reset_index(drop=True)
        wanted = expected.reset_index(drop=True)
        assert_eq(actual, wanted, check_dtype=False)
def test_read_csv_compression(fmt, blocksize):
    """Reading ``fmt``-compressed CSV files must reproduce ``expected``."""
    compressed = valmap(compress[fmt], files)
    with filetexts(compressed, mode='b'):
        frame = read_csv('2014-01-*.csv',
                         compression=fmt,
                         blocksize=blocksize)
        # Row order/index is not part of the comparison, so reset both sides.
        actual = frame.compute(get=get_sync).reset_index(drop=True)
        wanted = expected.reset_index(drop=True)
        eq(actual, wanted, check_dtype=False)
def main(
    scheduler,
    host,
    worker_port,
    listen_address,
    contact_address,
    nanny_port,
    nthreads,
    nprocs,
    nanny,
    name,
    memory_limit,
    pid_file,
    reconnect,
    resources,
    dashboard,
    bokeh_port,
    local_directory,
    scheduler_file,
    interface,
    protocol,
    death_timeout,
    preload,
    preload_argv,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    dashboard_address,
):
    """Validate the worker options, start ``nprocs`` workers and run them.

    Conflicting options are logged and end the process with ``exit(1)``.
    One ``Nanny`` (when ``nanny`` is truthy) or ``Worker`` is created per
    process, and the IOLoop runs until any of them reports
    ``status == "closed"``.  A non-``None`` ``bokeh_port`` triggers a
    warning and overrides ``dashboard_address``.

    Raises
    ------
    ValueError
        If no scheduler address is available from ``scheduler``,
        ``scheduler_file`` or the ``scheduler-address`` config value.
    """
    # Triple the three GC thresholds.
    g0, g1, g2 = gc.get_threshold()  # https://github.com/dask/distributed/issues/1653
    gc.set_threshold(g0 * 3, g1 * 3, g2 * 3)

    enable_proctitle_on_current()
    enable_proctitle_on_children()

    if bokeh_port is not None:
        warnings.warn(
            "The --bokeh-port flag has been renamed to --dashboard-address. "
            "Consider adding ``--dashboard-address :%d`` " % bokeh_port)
        dashboard_address = bokeh_port

    sec = Security(tls_ca_file=tls_ca_file,
                   tls_worker_cert=tls_cert,
                   tls_worker_key=tls_key)

    # --- Reject option combinations that cannot work together ---
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    if contact_address and not listen_address:
        logger.error(
            "Failed to launch worker. "
            "Must specify --listen-address when --contact-address is given")
        exit(1)

    if nprocs > 1 and listen_address:
        logger.error("Failed to launch worker. "
                     "You cannot specify --listen-address when nprocs > 1.")
        exit(1)

    if (worker_port or host) and listen_address:
        logger.error(
            "Failed to launch worker. "
            "You cannot specify --listen-address when --worker-port or --host is given."
        )
        exit(1)

    # --- Parse the addresses; a malformed one is reported and is fatal ---
    try:
        if listen_address:
            (host, worker_port) = get_address_host_port(listen_address, strict=True)

        if contact_address:
            # we only need this to verify it is getting parsed
            (_, _) = get_address_host_port(contact_address, strict=True)
        else:
            # if contact address is not present we use the listen_address for contact
            contact_address = listen_address
    except ValueError as e:
        logger.error("Failed to launch worker. " + str(e))
        exit(1)

    # The port handed to the constructor belongs to whichever object is
    # created below: the nanny or the bare worker.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # Default thread count: split ``_ncores`` evenly across the processes.
    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        # Remove the pid file again when the interpreter exits.
        atexit.register(del_pid_file)

    services = {}

    # ``resources`` arrives as "NAME=VALUE" pairs separated by commas and/or
    # whitespace; turn it into a {name: float} dict.
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {"worker_port": worker_port, "listen_address": listen_address}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs["service_ports"] = {"nanny": nanny_port}
        t = Worker

    if (not scheduler and not scheduler_file
            and dask.config.get("scheduler-address", None) is None):
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if death_timeout is not None:
        death_timeout = parse_timedelta(death_timeout, "s")

    # One Nanny/Worker per requested process.  When a name is given and
    # several processes are started, each gets "<name>-<index>".
    nannies = [
        t(scheduler,
          scheduler_file=scheduler_file,
          ncores=nthreads,
          services=services,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          preload_argv=preload_argv,
          security=sec,
          contact_address=contact_address,
          interface=interface,
          protocol=protocol,
          host=host,
          port=port,
          dashboard_address=dashboard_address if dashboard else None,
          # NOTE(review): "bokhe" looks like a misspelling (of "bokeh"?).
          # If the receiver looks this dict up by service name, the prefix
          # is presumably never applied -- confirm the expected key.
          service_kwargs={"bokhe": {
              "prefix": dashboard_prefix
          }},
          name=name if nprocs == 1 or not name else name + "-" + str(i),
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        if nanny:
            yield [n.close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        # Poll until any of the workers reports that it has closed.
        while all(n.status != "closed" for n in nannies):
            yield gen.sleep(0.2)

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def test_repeated_groupby():
    """Materialising the same groupby twice must give equal group sizes."""
    bag = db.range(10, npartitions=4)
    grouped = bag.groupby(lambda value: value % 3)

    first_sizes = valmap(len, dict(grouped))
    second_sizes = valmap(len, dict(grouped))
    assert first_sizes == second_sizes
def expect_kinds(**named):
    """
    Preprocessing decorator that verifies inputs have expected dtype kinds.

    Each keyword maps an argument name to a kind code, or to a tuple of
    acceptable kind codes.

    Examples
    --------
    >>> from numpy import int64, int32, float32
    >>> @expect_kinds(x='i')
    ... def foo(x):
    ...    return x
    ...
    >>> foo(int64(2))
    2
    >>> foo(int32(2))
    2
    >>> foo(float32(2))  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a numpy object of kind 'i' for argument 'x',
    but got 'f' instead.

    Raises
    ------
    TypeError
        At decoration time, if any expected kind is neither a string nor a
        tuple.
    """
    for name, kind in named.items():
        if not isinstance(kind, (str, tuple)):
            # Report the offending value itself (the message used to format
            # an unrelated name) under this decorator's real name.
            raise TypeError(
                "expect_kinds() expected a string or tuple of strings"
                " for argument {name!r}, but got {kind} instead.".format(
                    name=name, kind=kind,
                )
            )

    @preprocess(kinds=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_kind(kinds):
        """
        Factory for kind-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # kind.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.kind
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a numpy object of kind {kinds} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=_qualified_name(func),
                kinds=' or '.join(map(repr, kinds)),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            # A fresh ``object()`` default is never in ``kinds``, so values
            # without ``.dtype.kind`` are rejected.
            if getattrs(argvalue, ('dtype', 'kind'), object()) not in kinds:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_kind, named))
def expect_dtypes(__funcname=_qualified_name, **named):
    """
    Preprocessing decorator that verifies inputs have expected numpy dtypes.

    Each keyword maps an argument name to a numpy dtype, or to a tuple of
    acceptable dtypes.  ``__funcname`` may be a string, used verbatim as the
    function name in error messages, or a callable applied to the decorated
    function to produce that name.

    Usage
    -----
    >>> from numpy import dtype, arange, int8, float64
    >>> @expect_dtypes(x=dtype(int8))
    ... def foo(x, y):
    ...    return x, y
    ...
    >>> foo(arange(3, dtype=int8), 'foo')
    (array([0, 1, 2], dtype=int8), 'foo')
    >>> foo(arange(3, dtype=float64), 'foo')  # doctest: +NORMALIZE_WHITESPACE
    ...                                       # doctest: +ELLIPSIS
    Traceback (most recent call last):
       ...
    TypeError: ...foo() expected a value with dtype 'int8' for argument 'x',
    but got 'float64' instead.

    Raises
    ------
    TypeError
        At decoration time, if any expected value is neither a numpy dtype
        nor a tuple.
    """
    for name, type_ in named.items():
        if not isinstance(type_, (dtype, tuple)):
            # Report the offending value itself; the message used to format
            # the ``dtype`` class instead of what the caller passed.
            raise TypeError(
                "expect_dtypes() expected a numpy dtype or tuple of dtypes"
                " for argument {name!r}, but got {dtype} instead.".format(
                    name=name, dtype=type_,
                )
            )

    if isinstance(__funcname, str):
        def get_funcname(_):
            return __funcname
    else:
        get_funcname = __funcname

    @preprocess(dtypes=call(lambda x: x if isinstance(x, tuple) else (x,)))
    def _expect_dtype(dtypes):
        """
        Factory for dtype-checking functions that work with the @preprocess
        decorator.
        """
        def error_message(func, argname, value):
            # If the bad value has a dtype, but it's wrong, show the dtype
            # name.  Otherwise just show the value.
            try:
                value_to_show = value.dtype.name
            except AttributeError:
                value_to_show = value
            return (
                "{funcname}() expected a value with dtype {dtype_str} "
                "for argument {argname!r}, but got {value!r} instead."
            ).format(
                funcname=get_funcname(func),
                dtype_str=' or '.join(repr(d.name) for d in dtypes),
                argname=argname,
                value=value_to_show,
            )

        def _actual_preprocessor(func, argname, argvalue):
            # A fresh ``object()`` default is never in ``dtypes``, so values
            # without a ``dtype`` attribute are rejected.
            if getattr(argvalue, 'dtype', object()) not in dtypes:
                raise TypeError(error_message(func, argname, argvalue))
            return argvalue

        return _actual_preprocessor

    return preprocess(**valmap(_expect_dtype, named))
except AttributeError: infer_pandas_dtype = pd.lib.infer_dtype _ibis_dtypes = toolz.valmap( np.dtype, { dt.Boolean: np.bool_, dt.Null: np.object_, dt.Array: np.object_, dt.String: np.object_, dt.Binary: np.object_, dt.Date: 'datetime64[ns]', dt.Time: 'timedelta64[ns]', dt.Timestamp: 'datetime64[ns]', dt.Int8: np.int8, dt.Int16: np.int16, dt.Int32: np.int32, dt.Int64: np.int64, dt.UInt8: np.uint8, dt.UInt16: np.uint16, dt.UInt32: np.uint32, dt.UInt64: np.uint64, dt.Float32: np.float32, dt.Float64: np.float64, dt.Decimal: np.object_, dt.Struct: np.object_, }, ) _numpy_dtypes = toolz.keymap( np.dtype,
def main(
    scheduler,
    host,
    nthreads,
    name,
    memory_limit,
    device_memory_limit,
    rmm_pool_size,
    pid_file,
    resources,
    dashboard,
    dashboard_address,
    local_directory,
    scheduler_file,
    interface,
    death_timeout,
    preload,
    dashboard_prefix,
    tls_ca_file,
    tls_cert,
    tls_key,
    enable_tcp_over_ucx,
    enable_infiniband,
    enable_nvlink,
    enable_rdmacm,
    net_devices,
    **kwargs,
):
    """Start one ``Nanny`` per GPU and run them until they finish.

    The process count is the number of entries in ``CUDA_VISIBLE_DEVICES``
    or, when that variable is unset, ``get_n_gpus()``.  Each nanny gets its
    own ``CUDA_VISIBLE_DEVICES`` environment, CPU-affinity and RMM-pool
    plugins, UCX configuration and a ``DeviceHostFile`` data specification.

    Raises
    ------
    ValueError
        If no scheduler address is available, if both ``interface`` and
        ``host`` are given, or if ``rmm_pool_size`` is set but ``rmm``
        cannot be imported.
    """
    enable_proctitle_on_current()
    enable_proctitle_on_children()

    # TLS is only configured when all three files are supplied.
    if tls_ca_file and tls_cert and tls_key:
        sec = Security(
            tls_ca_file=tls_ca_file, tls_worker_cert=tls_cert, tls_worker_key=tls_key
        )
    else:
        sec = None

    try:
        nprocs = len(os.environ["CUDA_VISIBLE_DEVICES"].split(","))
    except KeyError:
        nprocs = get_n_gpus()

    if not nthreads:
        # NOTE(review): ``min(1, ...)`` can only yield 0 or 1, and yields 0
        # when there are fewer CPUs than processes -- confirm this is the
        # intended default rather than ``max``.
        nthreads = min(1, multiprocessing.cpu_count() // nprocs)

    memory_limit = parse_memory_limit(memory_limit, nthreads, total_cores=nprocs)

    if pid_file:
        with open(pid_file, "w") as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)

        # Remove the pid file again when the interpreter exits.
        atexit.register(del_pid_file)

    services = {}

    if dashboard:
        try:
            from distributed.dashboard import BokehWorker
        except ImportError:
            # BokehWorker could not be imported: run without that service.
            pass
        else:
            if dashboard_prefix:
                result = (BokehWorker, {"prefix": dashboard_prefix})
            else:
                result = BokehWorker
            services[("dashboard", dashboard_address)] = result

    # ``resources`` arrives as "NAME=VALUE" pairs separated by commas and/or
    # whitespace; turn it into a {name: float} dict.
    if resources:
        resources = resources.replace(",", " ").split()
        resources = dict(pair.split("=") for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    # Only ``preload_argv`` is read from the incoming ``**kwargs``; the name
    # is then rebound to the fixed keyword arguments passed to every Nanny.
    preload_argv = kwargs.get("preload_argv", [])
    kwargs = {"worker_port": None, "listen_address": None}
    t = Nanny

    if not scheduler and not scheduler_file and "scheduler-address" not in config:
        raise ValueError(
            "Need to provide scheduler address like\n"
            "dask-worker SCHEDULER_ADDRESS:8786"
        )

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if rmm_pool_size is not None:
        try:
            import rmm  # noqa F401
        except ImportError:
            raise ValueError(
                "RMM pool requested but module 'rmm' is not available. "
                "For installation instructions, please see "
                "https://github.com/rapidsai/rmm"
            )  # pragma: no cover
        rmm_pool_size = parse_bytes(rmm_pool_size)

    # One Nanny per GPU index ``i``.
    nannies = [
        t(
            scheduler,
            scheduler_file=scheduler_file,
            nthreads=nthreads,
            services=services,
            loop=loop,
            resources=resources,
            memory_limit=memory_limit,
            interface=get_ucx_net_devices(
                cuda_device_index=i,
                ucx_net_devices=net_devices,
                get_openfabrics=False,
                get_network=True,
            ),
            preload=(list(preload) or []) + ["dask_cuda.initialize"],
            preload_argv=(list(preload_argv) or []) + ["--create-cuda-context"],
            security=sec,
            env={"CUDA_VISIBLE_DEVICES": cuda_visible_devices(i)},
            plugins={CPUAffinity(get_cpu_affinity(i)), RMMPool(rmm_pool_size)},
            name=name if nprocs == 1 or not name else name + "-" + str(i),
            local_directory=local_directory,
            config={
                "ucx": get_ucx_config(
                    enable_tcp_over_ucx=enable_tcp_over_ucx,
                    enable_infiniband=enable_infiniband,
                    enable_nvlink=enable_nvlink,
                    enable_rdmacm=enable_rdmacm,
                    net_devices=net_devices,
                    cuda_device_index=i,
                )
            },
            data=(
                DeviceHostFile,
                {
                    # "auto" or 0 means: use the device's total memory.
                    "device_memory_limit": get_device_total_memory(index=i)
                    if (device_memory_limit == "auto" or device_memory_limit == int(0))
                    else parse_bytes(device_memory_limit),
                    "memory_limit": memory_limit,
                    "local_directory": local_directory,
                },
            ),
            **kwargs,
        )
        for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        # Unregister all workers from scheduler
        yield [n._close(timeout=2) for n in nannies]

    def on_signal(signum):
        logger.info("Exiting on signal %d", signum)
        close_all()

    @gen.coroutine
    def run():
        yield nannies
        yield [n.finished() for n in nannies]

    install_signal_handlers(loop, cleanup=on_signal)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")
def test_groupby_with_indexer(): b = db.from_sequence([[1, 2, 3], [1, 4, 9], [2, 3, 4]]) result = dict(b.groupby(0)) assert valmap(sorted, result) == {1: [[1, 2, 3], [1, 4, 9]], 2: [[2, 3, 4]]}
def main(scheduler, host, worker_port, http_port, nanny_port, nthreads,
         nprocs, nanny, name, memory_limit, pid_file, reconnect, resources,
         bokeh, bokeh_port, local_directory, scheduler_file, interface,
         death_timeout, preload, bokeh_prefix, tls_ca_file, tls_cert, tls_key):
    """Validate the worker options, start ``nprocs`` workers and run them.

    Conflicting options are logged and end the process with ``exit(1)``.
    When ``scheduler_file`` is given, the scheduler address is read from the
    ``'address'`` entry of that JSON file.  One ``Nanny`` (when ``nanny`` is
    truthy) or ``Worker`` is created per process, and the IOLoop runs until
    any of them reports ``status == 'closed'``; afterwards ``close_all`` is
    run on the loop.

    Raises
    ------
    ValueError
        If no scheduler address could be determined, or if both
        ``interface`` and ``host`` are given.
    """
    sec = Security(
        tls_ca_file=tls_ca_file,
        tls_worker_cert=tls_cert,
        tls_worker_key=tls_key,
    )

    # The port used for the start address belongs to whichever object is
    # created below: the nanny or the bare worker.
    if nanny:
        port = nanny_port
    else:
        port = worker_port

    # --- Reject option combinations that cannot work together ---
    if nprocs > 1 and worker_port != 0:
        logger.error(
            "Failed to launch worker. You cannot use the --port argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and name:
        logger.error(
            "Failed to launch worker. You cannot use the --name argument when nprocs > 1."
        )
        exit(1)

    if nprocs > 1 and not nanny:
        logger.error(
            "Failed to launch worker. You cannot use the --no-nanny argument when nprocs > 1."
        )
        exit(1)

    # Default thread count: split ``_ncores`` evenly across the processes.
    if not nthreads:
        nthreads = _ncores // nprocs

    if pid_file:
        with open(pid_file, 'w') as f:
            f.write(str(os.getpid()))

        def del_pid_file():
            if os.path.exists(pid_file):
                os.remove(pid_file)
        # Remove the pid file again when the interpreter exits.
        atexit.register(del_pid_file)

    services = {('http', http_port): HTTPWorker}

    if bokeh:
        try:
            from distributed.bokeh.worker import BokehWorker
        except ImportError:
            # BokehWorker could not be imported: run without that service.
            pass
        else:
            if bokeh_prefix:
                result = (BokehWorker, {'prefix': bokeh_prefix})
            else:
                result = BokehWorker
            services[('bokeh', bokeh_port)] = result

    # ``resources`` arrives as "NAME=VALUE" pairs separated by commas and/or
    # whitespace; turn it into a {name: float} dict.
    if resources:
        resources = resources.replace(',', ' ').split()
        resources = dict(pair.split('=') for pair in resources)
        resources = valmap(float, resources)
    else:
        resources = None

    loop = IOLoop.current()

    if nanny:
        kwargs = {'worker_port': worker_port}
        t = Nanny
    else:
        kwargs = {}
        if nanny_port:
            kwargs['service_ports'] = {'nanny': nanny_port}
        t = Worker

    if scheduler_file:
        # Block until the file exists, then try up to ten times to read a
        # complete JSON document with an 'address' entry from it.
        while not os.path.exists(scheduler_file):
            sleep(0.01)
        for i in range(10):
            try:
                with open(scheduler_file) as f:
                    cfg = json.load(f)
                scheduler = cfg['address']
                break
            except (ValueError, KeyError):  # race with scheduler on file
                sleep(0.01)

    if not scheduler:
        raise ValueError("Need to provide scheduler address like\n"
                         "dask-worker SCHEDULER_ADDRESS:8786")

    if interface:
        if host:
            raise ValueError("Can not specify both interface and host")
        else:
            host = get_ip_interface(interface)

    if host or port:
        addr = uri_from_host_port(host, port, 0)
    else:
        # Choose appropriate address for scheduler
        addr = None

    # One Nanny/Worker per requested process, all sharing the same options.
    nannies = [
        t(scheduler,
          ncores=nthreads,
          services=services,
          name=name,
          loop=loop,
          resources=resources,
          memory_limit=memory_limit,
          reconnect=reconnect,
          local_dir=local_directory,
          death_timeout=death_timeout,
          preload=preload,
          security=sec,
          **kwargs) for i in range(nprocs)
    ]

    @gen.coroutine
    def close_all():
        try:
            if nanny:
                yield [n._close(timeout=2) for n in nannies]
        finally:
            # Stop the loop whether or not closing succeeded.
            loop.stop()

    def handle_signal(signum, frame):
        logger.info("Exiting on signal %d", signum)
        if loop._running:
            loop.add_callback_from_signal(loop.stop)
        else:
            exit(0)

    # NOTE: We can't use the generic install_signal_handlers() function from
    # distributed.cli.utils because we're handling the signal differently.
    signal.signal(signal.SIGINT, handle_signal)
    signal.signal(signal.SIGTERM, handle_signal)

    for n in nannies:
        n.start(addr)

    @gen.coroutine
    def run():
        # Poll until any of the workers reports that it has closed.
        while all(n.status != 'closed' for n in nannies):
            yield gen.sleep(0.2)

    try:
        loop.run_sync(run)
    except (KeyboardInterrupt, TimeoutError):
        pass
    finally:
        logger.info("End worker")

    # Clean exit: unregister all workers from scheduler
    loop.run_sync(close_all)