def fake_metadata(num_exps, num_plates, num_wells, num_sites, datasets=('train', 'test')):
    rows = []
    for dataset in datasets:
        for _ in range(num_exps):
            cell_type = np.random.choice(CELL_TYPES)
            experiment_number = np.random.choice([1, 2, 3, 4, 5, 6, 7, 8, 9])
            experiment = '{}-{:02d}'.format(cell_type, experiment_number)
            base_row = {
                'experiment': experiment,
                'cell_type': cell_type,
                'dataset': dataset
            }
            for p in range(1, num_plates + 1):
                base_row = t.assoc(base_row, 'plate', p)
                rand_wells = np.random.choice(ALL_WELLS, num_wells, replace=False)
                for well in rand_wells:
                    well_row = t.merge(base_row, {
                        'well': well,
                        'well_type': 'treatment',
                        'sirna': 1.0
                    })
                    for site in range(1, num_sites + 1):
                        rows.append(t.assoc(well_row, 'site', site))
    df = pd.DataFrame(rows)
    return df

def data_for_app_GET(self):
    """ List all configurationkeys and rangeconstraints of a specific application.
        Returns the application with configurationkeys, rangeconstraints and exclusionconstraints.
    """
    app_id = self.request.swagger_data['id']
    app = Application.get(app_id)
    if app is None:
        print_log(datetime.datetime.now(), 'GET',
                  '/applications/' + str(app_id) + '/rangeconstraints',
                  'Get all things of one application', None)
        return self.createResponse(None, 400)
    if app.apikey is None:
        app = self.set_app_apikey(app, app_id)
    configurationkeys = app.configurationkeys
    ranges = list(concat(list(map(lambda _: _.rangeconstraints, configurationkeys))))
    exclusions = self.get_app_exclusionconstraints(app_id)
    app_data = app.as_dict()
    app_data = assoc(app_data, 'configurationkeys',
                     list(map(lambda _: _.as_dict(), configurationkeys)))
    app_data = assoc(app_data, 'rangeconstraints',
                     list(map(lambda _: _.as_dict(), ranges)))
    app_data = assoc(app_data, 'exclusionconstraints',
                     list(map(lambda _: _.as_dict(), exclusions)))
    return app_data

def __init__(self, center, workers):
    self.center = assoc(start_center(center), 'address', center)
    self.workers = [
        assoc(start_worker(center, worker), 'address', worker)
        for worker in workers
    ]
    sleep(1)
    self.report()

def todo_app(state, action):
    if action['type'] == ActionTypes.ADD_TODO:
        # Append the new text to the immutable tuple of todos.
        todos = state['todos'] + (action['text'],)
        return toolz.assoc(state, 'todos', todos)
    elif action['type'] == ActionTypes.COMPLETE_TODO:
        # Drop the todo at the given index.
        todos = state['todos'][:action['index']] + state['todos'][action['index'] + 1:]
        return toolz.assoc(state, 'todos', todos)
    else:
        return state

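# A minimal usage sketch for the reducer above (not part of the original code).
# `ActionTypes` is assumed to expose ADD_TODO / COMPLETE_TODO constants; the
# stand-in class below is only for illustration. Each dispatch returns a fresh
# state dict because toolz.assoc never mutates its input.
import toolz

class ActionTypes:
    ADD_TODO = 'ADD_TODO'
    COMPLETE_TODO = 'COMPLETE_TODO'

state = {'todos': ()}
state = todo_app(state, {'type': ActionTypes.ADD_TODO, 'text': 'write docs'})
state = todo_app(state, {'type': ActionTypes.ADD_TODO, 'text': 'ship release'})
state = todo_app(state, {'type': ActionTypes.COMPLETE_TODO, 'index': 0})
assert state == {'todos': ('ship release',)}
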
def set_transaction_type_if_needed(transaction_dict: Dict[str, Any]) -> Dict[str, Any]:
    if 'type' not in transaction_dict:
        if 'gasPrice' in transaction_dict and 'accessList' in transaction_dict:
            # access list txn - type 1
            transaction_dict = assoc(transaction_dict, 'type', '0x1')
        elif 'maxFeePerGas' in transaction_dict and 'maxPriorityFeePerGas' in transaction_dict:
            # dynamic fee txn - type 2
            transaction_dict = assoc(transaction_dict, 'type', '0x2')
    return transaction_dict

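# A minimal sketch of how the helper above behaves: untyped transaction dicts
# get a 'type' field inferred from the fee fields present. The field names
# follow the EIP-1559/EIP-2930 conventions used in the function; the payloads
# below are illustrative only, and toolz.assoc is assumed to be in scope as above.
from typing import Any, Dict

dynamic_fee_txn: Dict[str, Any] = {
    'maxFeePerGas': 2_000_000_000,
    'maxPriorityFeePerGas': 1_000_000_000,
    'gas': 21000,
}
assert set_transaction_type_if_needed(dynamic_fee_txn)['type'] == '0x2'

legacy_txn: Dict[str, Any] = {'gasPrice': 1_000_000_000, 'gas': 21000}
# No access list and no type field: left untouched.
assert 'type' not in set_transaction_type_if_needed(legacy_txn)
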
def rebatch_metadata_by_experiment(metadata):
    normal, normal_rest = prioritize_normals(metadata)
    batch = metadata[0]["participant"]
    tumor_batch = [tz.assoc(x, "batch", batch) for x in metadata
                   if x["sample_type"] in PRIORITIZED_TUMOR_CODES.keys()]
    normal = [tz.assoc(normal, "batch", batch)] if normal else []
    # run each non priority normal as its own tumor sample with no control
    normal_rest = [tz.assoc(x, "batch", batch + "-" + x["sample_type"])
                   for x in normal_rest]
    normal_rest = [tz.assoc(x, "phenotype", "tumor") for x in normal_rest]
    all_batches = normal + normal_rest + tumor_batch
    return all_batches

def unify(u, v, s):
    u = transitive_get(u, s)
    v = transitive_get(v, s)
    if u == v:
        return s
    if isvar(u):
        return assoc(s, u, v)  # return a new, extended substitution
    if isvar(v):
        return assoc(s, v, u)
    return _unify(u, v, s)

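# A small usage sketch for the unify above (not part of the original code). It
# assumes the surrounding module provides a logic-variable constructor `var`,
# as in unification/logpy-style libraries.
x = var('x')
s = unify((1, x, 3), (1, 2, 3), {})
# Expected: s maps x to 2, and the input substitution {} is left untouched
# because assoc returns a new dict rather than mutating.
print(s)
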
def tls_cluster_context(worker_kwargs=None, scheduler_kwargs=None,
                        security=None, **kwargs):
    security = security or tls_only_security()
    worker_kwargs = assoc(worker_kwargs or {}, 'security', security)
    scheduler_kwargs = assoc(scheduler_kwargs or {}, 'security', security)

    with cluster(worker_kwargs=worker_kwargs,
                 scheduler_kwargs=scheduler_kwargs,
                 **kwargs) as (s, workers):
        yield s, workers

def __init__(self, handlers, max_buffer_size=MAX_BUFFER_SIZE,
             connection_limit=512, **kwargs):
    self.handlers = assoc(handlers, 'identity', self.identity)
    self.id = str(uuid.uuid1())
    self._port = None
    self.rpc = ConnectionPool(limit=connection_limit)
    super(Server, self).__init__(max_buffer_size=max_buffer_size, **kwargs)

def __init__(self, handlers, max_buffer_size=MAX_BUFFER_SIZE,
             connection_limit=512, deserialize=True, **kwargs):
    self.handlers = assoc(handlers, 'identity', self.identity)
    self.id = str(uuid.uuid1())
    self._port = None
    self._listen_streams = dict()
    self.rpc = ConnectionPool(limit=connection_limit,
                              deserialize=deserialize)
    self.deserialize = deserialize
    self.monitor = SystemMonitor()
    self.counters = None
    self.digests = None
    if hasattr(self, 'loop'):
        with ignoring(ImportError):
            from .counter import Digest
            self.digests = defaultdict(partial(Digest, loop=self.loop))

        from .counter import Counter
        self.counters = defaultdict(partial(Counter, loop=self.loop))
        pc = PeriodicCallback(self.monitor.update, 500, io_loop=self.loop)
        self.loop.add_callback(pc.start)
        if self.digests is not None:
            self._last_tick = time()
            self._tick_pc = PeriodicCallback(self._measure_tick, 20,
                                             io_loop=self.loop)
            self.loop.add_callback(self._tick_pc.start)

    self.__stopped = False
    super(Server, self).__init__(max_buffer_size=max_buffer_size, **kwargs)

async def test_unmonitored_threat(
    self,
    threat_monitor,
    threat_repo,
    occurrence_factory,
    threat_factory,
    report_threat_dto_factory,
):
    dto = report_threat_dto_factory()
    threat = threat_factory(
        name=dto.name,
        danger_level=dto.danger_level,
        location=dto.location,
        occurrences=[occurrence_factory(state="resolved").dict()],
    )
    new_threat = threat_factory(
        **assoc(
            threat.dict(),
            "occurrences",
            [occurrence_factory(state="pending").dict()],
        ),
    )
    threat_repo.upsert.return_value = threat
    threat_repo.create_pending_occurrence.return_value = new_threat
    threat_monitor.start_monitoring.return_value = new_threat

    result = await threat_service.report_threat(threat_monitor, threat_repo, dto)

    threat_repo.upsert.assert_called_once_with(dto)
    threat_repo.create_pending_occurrence.assert_called_once_with(threat)
    threat_monitor.start_monitoring.assert_called_once_with(new_threat)
    assert result == new_threat
    assert result.is_being_monitored() is True

def step_with_model(rhs, state, dt=.125, n=100):
    """Perform a number of time steps with a model"""
    for t in range(n):
        qt_dot = rhs(state)
        new_qt = state['QT'] + dt * qt_dot['QT']
        state = assoc(state, 'QT', new_qt)
        yield state

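# A minimal sketch of driving the generator above with a toy right-hand side
# (not part of the original code). The rhs here, a simple exponential decay of
# 'QT', is an illustrative stand-in for whatever model the original used, and
# toolz's assoc is assumed to be in scope as in the function above.
def toy_rhs(state):
    return {'QT': -0.1 * state['QT']}

trajectory = list(step_with_model(toy_rhs, {'QT': 1.0}, dt=0.125, n=4))
print([round(s['QT'], 4) for s in trajectory])
# Each yielded state is a fresh dict; the initial state is never mutated.
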
def __init__(self, handlers, connection_limit=512, deserialize=True,
             io_loop=None):
    self.handlers = assoc(handlers, 'identity', self.identity)
    self.id = str(uuid.uuid1())
    self._address = None
    self._listen_address = None
    self._port = None
    self._comms = {}
    self.rpc = ConnectionPool(limit=connection_limit,
                              deserialize=deserialize)
    self.deserialize = deserialize
    self.monitor = SystemMonitor()
    self.counters = None
    self.digests = None
    self.listener = None
    self.io_loop = io_loop or IOLoop.current()

    if hasattr(self, 'loop'):  # XXX?
        with ignoring(ImportError):
            from .counter import Digest
            self.digests = defaultdict(partial(Digest, loop=self.loop))

        from .counter import Counter
        self.counters = defaultdict(partial(Counter, loop=self.loop))
        pc = PeriodicCallback(self.monitor.update, 500, io_loop=self.loop)
        self.loop.add_callback(pc.start)
        if self.digests is not None:
            self._last_tick = time()
            self._tick_pc = PeriodicCallback(self._measure_tick, 20,
                                             io_loop=self.loop)
            self.loop.add_callback(self._tick_pc.start)

    self.__stopped = False

def _normalize_arg(other, constants):
    """Get the name to use to build the string and turn the value into
    something that can go into the ast. If needed this will functionally
    update the constants.

    Parameters
    ----------
    other : any
        The object to normalize.
    constants : dict[str -> any]
        The constant namespace.

    Returns
    -------
    othername : str
        The name to use in the ``_name`` of the lambda.
    other : any
        The normalized value.
    constants : dict[str -> any]
        The potentially updated constants.
    """
    if not isinstance(other, placeholder):
        othername = repr(other)
        name = '_' + uuid4().hex
        constants = assoc(constants, name, other)
        other = ast.Name(id=name, ctx=ast.Load())
    elif other._tree is not other:
        othername = '(%s)' % other._name
    else:
        othername = other._name

    return othername, other, constants

def groupby_many(keys: Callable[[Any], Iterable], reducer: Reducer, initial):
    """Given a `keys` function, that maps an element into multiple keys,
    transduces the collection into a dictionary of key to group of matching elements.

    >>> transducer.transduce(
        transducer.groupby_many(
            lambda x: ("even",) if x % 2 == 0 else ("odd",),
            lambda s, x: (*s, x),
            (),
        ),
        lambda s, _: s,
        {},
        [1, 2, 3, 4, 5],
    )
    {"even": (2, 4), "odd": (1, 3, 5)}
    """
    return functional_generic.compose(
        mapcat(
            functional_generic.compose_left(
                functional_generic.juxt(keys, functional.wrap_tuple),
                sync.star(itertools.product),
            ),
        ),
        lambda step: lambda s, x: step(
            toolz.assoc(s, x[0], reducer(s.get(x[0], initial), x[1])),
            x,
        ),
    )

async def run_traffic_jam(nsends, nbytes):
    # This test eats `nsends * nbytes` bytes in RAM
    np = pytest.importorskip("numpy")
    from distributed.protocol import to_serialize

    data = bytes(np.random.randint(0, 255, size=(nbytes,)).astype("u1").data)
    async with EchoServer() as e:
        comm = await connect(e.address)
        b = BatchedSend(interval=0.01)
        b.start(comm)

        msg = {"x": to_serialize(data)}
        for i in range(nsends):
            b.send(assoc(msg, "i", i))
            if np.random.random() > 0.5:
                await asyncio.sleep(0.001)

        results = []
        count = 0
        while len(results) < nsends:
            # If this times out then I think it's a backpressure issue
            # Somehow we're able to flood the socket so that the receiving end
            # loses some of our messages
            L = await asyncio.wait_for(comm.read(), 5)
            count += 1
            results.extend(r["i"] for r in L)

        assert count == b.batch_count == e.count
        assert b.message_count == nsends
        assert results == list(range(nsends))

        await comm.close()  # external closing
        await b.close()

def run_traffic_jam(nsends, nbytes):
    # This test eats `nsends * nbytes` bytes in RAM
    np = pytest.importorskip('numpy')
    from distributed.protocol import to_serialize

    data = bytes(np.random.randint(0, 255, size=(nbytes,)).astype('u1').data)
    with echo_server() as e:
        comm = yield connect(e.address)
        b = BatchedSend(interval=0.01)
        b.start(comm)

        msg = {'x': to_serialize(data)}
        for i in range(nsends):
            b.send(assoc(msg, 'i', i))
            if np.random.random() > 0.5:
                yield gen.sleep(0.001)

        results = []
        count = 0
        while len(results) < nsends:
            # If this times out then I think it's a backpressure issue
            # Somehow we're able to flood the socket so that the receiving end
            # loses some of our messages
            L = yield gen.with_timeout(timedelta(seconds=5), comm.read())
            count += 1
            results.extend(r['i'] for r in L)

        assert count == b.batch_count == e.count
        assert b.message_count == nsends
        assert results == list(range(nsends))

        comm.close()  # external closing
        yield b.close()

def unify(u, v, s):  # no check at the moment
    """ Find substitution so that u == v while satisfying s

    >>> x = var('x')
    >>> unify((1, x), (1, 2), {})
    {~x: 2}
    """
    u = walk(u, s)
    v = walk(v, s)
    if u == v:
        return s
    if isvar(u):
        return assoc(s, u, v)
    if isvar(v):
        return assoc(s, v, u)
    return _unify(u, v, s)

def fill_kwargs(fn, args, kwargs):
    """ Read a csv file and fill up kwargs

    This normalizes kwargs against a sample file. It does the following:

    1.  If given a globstring, just use one file
    2.  Get names from csv file if not given
    3.  Identify the presence of a header
    4.  Identify dtypes
    5.  Establish column names
    6.  Switch around dtypes and column names if parse_dates is active

    Normally ``pd.read_csv`` does this for us.  However for ``dd.read_csv`` we
    need to be consistent across multiple files and don't want to do these
    heuristics each time so we use the pandas solution once, record the
    results, and then send back a fully explicit kwargs dict to send to future
    calls to ``pd.read_csv``.

    Returns
    -------
    kwargs: dict
        keyword arguments to give to pd.read_csv
    """
    kwargs = merge(csv_defaults, kwargs)
    sample_nrows = kwargs.pop('sample_nrows', 1000)
    essentials = ['columns', 'names', 'header', 'parse_dates', 'dtype']
    if set(essentials).issubset(kwargs):
        return kwargs

    # Let pandas infer on the first 100 rows
    if '*' in fn:
        filenames = sorted(glob(fn))
        if not filenames:
            raise ValueError("No files found matching name %s" % fn)
        fn = filenames[0]

    if 'names' not in kwargs:
        kwargs['names'] = csv_names(fn, **kwargs)
    if 'header' not in kwargs:
        kwargs['header'] = infer_header(fn, **kwargs)
        if kwargs['header'] is True:
            kwargs['header'] = 0

    try:
        head = pd.read_csv(fn, *args, **assoc(kwargs, 'nrows', sample_nrows))
    except StopIteration:
        head = pd.read_csv(fn, *args, **kwargs)

    if 'parse_dates' not in kwargs:
        kwargs['parse_dates'] = [col for col in head.dtypes.index
                                 if np.issubdtype(head.dtypes[col], np.datetime64)]
    if 'dtype' not in kwargs:
        kwargs['dtype'] = dict(head.dtypes)
        for col in kwargs['parse_dates']:
            del kwargs['dtype'][col]

    kwargs['columns'] = list(head.columns)

    return kwargs

def process(msg):
    try:
        result = yield self.compute(report=False, **msg)
        bstream.send(result)
    except Exception as e:
        logger.exception(e)
        bstream.send(assoc(error_message(e), 'key', msg.get('key')))

def test_data_for_app_GET(self):
    from toolz import assoc, concat
    self.req.swagger_data = {'id': 1}
    httpApps = Applications(self.req)
    response = httpApps.data_for_app_GET()
    app = Application.get(1)
    configurationkeys = app.configurationkeys
    ranges = list(concat(list(map(lambda _: _.rangeconstraints, configurationkeys))))
    app_data = app.as_dict()
    app_data = assoc(app_data, 'configurationkeys',
                     list(map(lambda _: _.as_dict(), configurationkeys)))
    app_data = assoc(app_data, 'rangeconstraints',
                     list(map(lambda _: _.as_dict(), ranges)))
    app_data = assoc(app_data, 'exclusionconstraints',
                     list(map(lambda _: _.as_dict(),
                              httpApps.get_app_exclusionconstraints(1))))
    assert response == app_data

def _get_subnet_config_w_cidr(self, network_config):
    network_cidr_base = str(network_config.get('network_cidr_base', '172.16.0.0'))
    network_cidr_size = str(network_config.get('network_cidr_size', '20'))
    first_network_address_block = str(
        network_config.get('first_network_address_block', network_cidr_base))

    ret_val = {}
    base_cidr = network_cidr_base + '/' + network_cidr_size
    net = netaddr.IPNetwork(base_cidr)

    grouped_subnet = groupby('size', self._get_subnet_config_w_az(network_config))
    subnet_groups = sorted(grouped_subnet.items())
    available_cidrs = []

    for subnet_size, subnet_configs in subnet_groups:
        newcidrs = net.subnet(int(subnet_size))
        for subnet_config in subnet_configs:
            try:
                cidr = next(newcidrs)
            except StopIteration:
                net = next(chain(*reversed(available_cidrs)))
                newcidrs = net.subnet(int(subnet_size))
                cidr = next(newcidrs)
            new_config = assoc(subnet_config, 'cidr', str(cidr))
            yield new_config
        else:
            net = next(newcidrs)
            available_cidrs.append(newcidrs)

def read_csv(fn, *args, **kwargs):
    chunkbytes = kwargs.pop('chunkbytes', 2**25)  # 32 MB
    categorize = kwargs.pop('categorize', None)
    index = kwargs.pop('index', None)
    if index and categorize is None:
        categorize = True

    kwargs = fill_kwargs(fn, args, kwargs)

    # Handle glob strings
    if '*' in fn:
        return concat([read_csv(f, *args, **kwargs) for f in sorted(glob(fn))])

    token = tokenize(os.path.getmtime(fn), args, kwargs)
    name = 'read-csv-%s-%s' % (fn, token)

    columns = kwargs.pop('columns')
    header = kwargs.pop('header')

    if 'nrows' in kwargs:  # Just create single partition
        dsk = {(name, 0): (apply, pd.read_csv, (fn,),
                           assoc(kwargs, 'header', header))}
        result = DataFrame(dsk, name, columns, [None, None])
    else:
        # Chunk sizes and numbers
        total_bytes = file_size(fn, kwargs['compression'])
        nchunks = int(ceil(total_bytes / chunkbytes))
        divisions = [None] * (nchunks + 1)

        first_read_csv = partial(pd.read_csv, *args, header=header,
                                 **dissoc(kwargs, 'compression'))
        rest_read_csv = partial(pd.read_csv, *args, header=None,
                                **dissoc(kwargs, 'compression'))

        # Create dask graph
        dsk = dict(((name, i),
                    (rest_read_csv,
                     (BytesIO,
                      (textblock, fn, i * chunkbytes, (i + 1) * chunkbytes,
                       kwargs['compression']))))
                   for i in range(1, nchunks))
        dsk[(name, 0)] = (first_read_csv,
                          (BytesIO,
                           (textblock, fn, 0, chunkbytes,
                            kwargs['compression'])))

        result = DataFrame(dsk, name, columns, divisions)

    if categorize or index:
        categories, quantiles = categories_and_quantiles(fn, args, kwargs,
                                                         index, categorize,
                                                         chunkbytes=chunkbytes)

    if categorize:
        func = partial(categorize_block, categories=categories)
        result = result.map_partitions(func, columns=columns)

    if index:
        result = set_partition(result, index, quantiles)

    return result

def test_parameterized_term_default_value(self):
    defaults = {'a': 'default for a', 'b': 'default for b'}

    class F(Factor):
        params = defaults
        inputs = (SomeDataSet.foo,)
        dtype = 'f8'
        window_length = 5

    assert_equal(F().params, defaults)
    assert_equal(F(a='new a').params, assoc(defaults, 'a', 'new a'))
    assert_equal(F(b='new b').params, assoc(defaults, 'b', 'new b'))
    assert_equal(
        F(a='new a', b='new b').params,
        {'a': 'new a', 'b': 'new b'},
    )

def decode(self, data: bytes) -> _DecodedMsgType:
    try:
        raw_decoded = cast(Dict[str, int], super().decode(data))
    except rlp.exceptions.ListDeserializationError:
        self.logger.warning("Malformed Disconnect message: %s", data)
        raise MalformedMessage(f"Malformed Disconnect message: {data}")
    return assoc(raw_decoded, "reason_name",
                 self.get_reason_name(raw_decoded["reason"]))

def run(self, *args, **kwargs):
    port = kwargs.pop('port', DEFAULT_PORT)
    self.port = port
    try:
        self.app.run(*args, port=port, **kwargs)
    except socket.error:
        print("\tOops, couldn't connect on port %d. Is it busy?" % port)
        self.run(*args, **assoc(kwargs, 'port', port + 1))

def experiments_GET(self):
    """ List all Application's Experiments including Experiments' status """
    app_id = self.request.swagger_data['appid']
    experiments = Experiment.query().join(Application)\
        .filter(Application.id == app_id).all()
    return list(map(lambda _: assoc(_.as_dict(), 'status', _.get_status()),
                    experiments))

def _get_subnet_config_w_az(self, network_config):
    az_count = int(network_config.get('az_count', 2))
    subnet_config = network_config.get('subnet_config', {})

    for subnet in subnet_config:
        for az in range(az_count):
            newsubnet = assoc(subnet, 'AZ', az)
            yield newsubnet

def fix_user_identities(event):
    field = 'user_identities'
    if field not in event:
        return None
    identities = event.pop(field)
    identities = (assoc(x, 'row_idx', event['row_idx']) for x in identities)
    return identities

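# A small sketch of what the helper above does (not part of the original code):
# it pops the nested 'user_identities' list off the event and tags each identity
# with the parent event's row_idx. The event payload below is made up for
# illustration, and toolz's assoc is assumed to be in scope as in the function.
event = {
    'row_idx': 7,
    'user_identities': [{'type': 'email'}, {'type': 'device_id'}],
}
identities = list(fix_user_identities(event))
assert identities == [
    {'type': 'email', 'row_idx': 7},
    {'type': 'device_id', 'row_idx': 7},
]
assert 'user_identities' not in event  # the field is removed from the event in place
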
def train(client, X, y, params, model_factory, sample_weight=None, **kwargs):
    data_parts = X.to_delayed()
    label_parts = y.to_delayed()
    if isinstance(data_parts, np.ndarray):
        assert data_parts.shape[1] == 1
        data_parts = data_parts.flatten().tolist()
    if isinstance(label_parts, np.ndarray):
        assert label_parts.ndim == 1 or label_parts.shape[1] == 1
        label_parts = label_parts.flatten().tolist()

    # Arrange parts into tuples. This enforces co-locality
    if sample_weight is not None:
        sample_weight_parts = sample_weight.to_delayed()
        if isinstance(sample_weight_parts, np.ndarray):
            assert (sample_weight_parts.ndim == 1 or
                    sample_weight_parts.shape[1] == 1)
            sample_weight_parts = sample_weight_parts.flatten().tolist()
        parts = list(map(delayed, zip(data_parts, label_parts, sample_weight_parts)))
    else:
        parts = list(map(delayed, zip(data_parts, label_parts)))

    parts = client.compute(parts)  # Start computation in the background
    wait(parts)

    for part in parts:
        if part.status == 'error':
            part  # trigger error locally

    key_to_part_dict = dict([(part.key, part) for part in parts])
    who_has = client.who_has(parts)
    worker_map = defaultdict(list)
    for key, workers in who_has.items():
        worker_map[first(workers)].append(key_to_part_dict[key])

    master_worker = first(worker_map)
    ncores = client.ncores()  # Number of cores per worker

    if "tree_learner" not in params or params['tree_learner'].lower() not in {
            "data", "feature", "voting"}:
        logger.warning("Parameter tree_learner not set or set to incorrect value "
                       "(%s), using 'data' as default",
                       params.get("tree_learner", None))
        params['tree_learner'] = "data"

    # Tell each worker to init the booster on the chunks/parts that it has locally
    futures_classifiers = [
        client.submit(_fit_local,
                      model_factory=model_factory,
                      params=assoc(params, 'num_threads', ncores[worker]),
                      list_of_parts=list_of_parts,
                      worker_addresses=list(worker_map.keys()),
                      local_listen_port=params.get("local_listen_port", 12400),
                      listen_time_out=params.get("listen_time_out", 120),
                      return_model=(worker == master_worker),
                      **kwargs)
        for worker, list_of_parts in worker_map.items()
    ]

    results = client.gather(futures_classifiers)
    results = [v for v in results if v]
    return results[0]

def _train(client, params, data, labels, dmatrix_kwargs={}, **kwargs):
    """
    Asynchronous version of train

    See Also
    --------
    train
    """
    # Break apart Dask.array/dataframe into chunks/parts
    data_parts = data.to_delayed()
    label_parts = labels.to_delayed()
    if isinstance(data_parts, np.ndarray):
        assert data_parts.shape[1] == 1
        data_parts = data_parts.flatten().tolist()
    if isinstance(label_parts, np.ndarray):
        assert label_parts.ndim == 1 or label_parts.shape[1] == 1
        label_parts = label_parts.flatten().tolist()

    # Arrange parts into pairs. This enforces co-locality
    parts = list(map(delayed, zip(data_parts, label_parts)))
    parts = client.compute(parts)  # Start computation in the background
    yield wait(parts)

    for part in parts:
        if part.status == 'error':
            yield part  # trigger error locally

    # Because XGBoost-python doesn't yet allow iterative training, we need to
    # find the locations of all chunks and map them to particular Dask workers
    key_to_part_dict = dict([(part.key, part) for part in parts])
    who_has = yield client.scheduler.who_has(keys=[part.key for part in parts])
    worker_map = defaultdict(list)
    for key, workers in who_has.items():
        worker_map[first(workers)].append(key_to_part_dict[key])

    ncores = yield client.scheduler.ncores()  # Number of cores per worker

    # Start the XGBoost tracker on the Dask scheduler
    host, port = parse_host_port(client.scheduler.address)
    env = yield client._run_on_scheduler(start_tracker,
                                         host.strip('/:'),
                                         len(worker_map))

    # Tell each worker to train on the chunks/parts that it has locally
    futures = [client.submit(train_part, env,
                             assoc(params, 'nthread', ncores[worker]),
                             list_of_parts, workers=worker,
                             dmatrix_kwargs=dmatrix_kwargs, **kwargs)
               for worker, list_of_parts in worker_map.items()]

    # Get the results, only one will be non-None
    results = yield client._gather(futures)
    result = [v for v in results if v][0]
    raise gen.Return(result)

def run(self, *args, **kwargs):
    """Run the server"""
    port = kwargs.pop('port', DEFAULT_PORT)
    self.port = port
    try:
        self.app.run(*args, port=port, **kwargs)
    except socket.error:
        print("\tOops, couldn't connect on port %d. Is it busy?" % port)
        self.run(*args, **assoc(kwargs, 'port', port + 1))

def compute_down(expr, ec, profiler_output=None, compute_kwargs=None,
                 odo_kwargs=None, **kwargs):
    """Compute down for blaze clients.

    Parameters
    ----------
    expr : Expr
        The expression to send to the server.
    ec : Client
        The blaze client to compute against.
    namespace : dict[Symbol -> any], optional
        The namespace to compute the expression in. This will be amended to
        include that data for the server. By default this will just be the
        client mapping to the server's data.
    compute_kwargs : dict, optional
        Extra kwargs to pass to compute on the server.
    odo_kwargs : dict, optional
        Extra kwargs to pass to odo on the server.
    profile : bool, optional
        Should blaze server run cProfile over the computation of the
        expression and the serialization of the response.
    profiler_output : file-like object, optional
        A file like object to hold the profiling output from the server.
        If this is not passed then the server will write the data to the
        server's filesystem
    """
    from .server import to_tree

    kwargs = keymap(u8, kwargs)

    tree = to_tree(expr)
    serial = ec.serial
    if profiler_output is not None:
        kwargs[u'profile'] = True
        kwargs[u'profiler_output'] = ':response'

    kwargs[u'compute_kwargs'] = keymap(u8, compute_kwargs or {})
    kwargs[u'odo_kwargs'] = keymap(u8, odo_kwargs or {})

    r = post(
        ec,
        '/compute',
        data=serial.dumps(assoc(kwargs, u'expr', tree)),
        auth=ec.auth,
        headers=mimetype(serial),
    )

    if not ok(r):
        raise ValueError("Bad response: %s" % reason(r))
    response = serial.loads(content(r))
    if profiler_output is not None:
        profiler_output.write(response[u'profiler_output'])
    return serial.data_loads(response[u'data'])

def json_expand(json_op, key_name='json'):
    """ Convert a string json object to Python dict in an op. """
    if type(json_op) == dict and key_name in json_op and json_op[key_name]:
        try:
            return update_in(json_op, [key_name], json.loads)
        except JSONDecodeError:
            return assoc(json_op, key_name, {})

    return json_op

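# A minimal sketch of the behaviour above (not part of the original code): a
# JSON string under the 'json' key is parsed into a dict, and an unparsable
# string is replaced with an empty dict. It assumes the module imports json,
# JSONDecodeError (json.JSONDecodeError), and toolz's update_in/assoc, as the
# function does.
op = {'id': 1, 'json': '{"a": 1}'}
assert json_expand(op) == {'id': 1, 'json': {'a': 1}}

broken = {'id': 2, 'json': '{not valid json'}
assert json_expand(broken) == {'id': 2, 'json': {}}

# Ops without the key (or with an empty value) pass through untouched.
assert json_expand({'id': 3}) == {'id': 3}
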
def _ready_task(self, function=None, key=None, args=(), kwargs={}, task=None,
                who_has=None):
    who_has = who_has or {}
    diagnostics = {}
    data = {k: self.data[k] for k in who_has if k in self.data}
    who_has = {k: set(map(coerce_to_address, v)) for k, v in who_has.items()
               if k not in self.data}
    if who_has:
        try:
            logger.info("gather %d keys from peers: %s",
                        len(who_has), str(who_has))
            diagnostics['transfer-start'] = time()
            other = yield gather_from_workers(who_has)
            diagnostics['transfer-stop'] = time()
            self.data.update(other)
            yield self.center.add_keys(address=self.address, keys=list(other))
            data.update(other)
        except KeyError as e:
            logger.warn("Could not find data for %s", key)
            raise Return({'status': 'missing-data',
                          'keys': e.args,
                          'key': key})
    else:
        transfer_time = 0

    try:
        start = default_timer()
        if task is not None:
            task = loads(task)
        if function is not None:
            function = loads(function)
        if args:
            args = loads(args)
        if kwargs:
            kwargs = loads(kwargs)
        diagnostics['deserialization'] = default_timer() - start
    except Exception as e:
        logger.warn("Could not deserialize task", exc_info=True)
        raise Return(assoc(error_message(e), 'key', key))

    if task is not None:
        assert not function and not args and not kwargs
        function = execute_task
        args = (task,)

    # Fill args with data
    args2 = pack_data(args, data)
    kwargs2 = pack_data(kwargs, data)

    raise Return({'status': 'OK',
                  'function': function,
                  'args': args2,
                  'kwargs': kwargs2,
                  'diagnostics': diagnostics,
                  'key': key})

def run(self, *args, **kwargs):
    """Run the server"""
    port = kwargs.pop('port', DEFAULT_PORT)
    self.port = port
    try:
        self.app.run(*args, port=port, **kwargs)
    except socket.error:
        print("\tOops, couldn't connect on port %d. Is it busy?" % port)
        if kwargs.get('retry', True):
            # Attempt to start the server on a new port.
            self.run(*args, **assoc(kwargs, 'port', port + 1))

def _write_tool(step_dir, name, inputs, outputs):
    out_file = os.path.join(step_dir, "%s.cwl" % name)
    out = {"class": "CommandLineTool",
           "baseCommand": ["bcbio_nextgen.py", "runfn", name, "cwl"],
           "inputs": [],
           "outputs": []}
    for i, inp in enumerate(inputs):
        out["inputs"].append(
            tz.assoc(inp, "inputBinding",
                     {"prefix": "%s=" % inp["id"].replace("#", ""),
                      "separate": False,
                      "itemSeparator": ";;",
                      "position": i}))
    with open(out_file, "w") as out_handle:
        yaml.safe_dump(out, out_handle, default_flow_style=False,
                       allow_unicode=False)
    return os.path.join("steps", os.path.basename(out_file))

def set_model(filepath, k, v):
    """ Save key k and value v to json file. """
    try:
        # Read the existing model; fall back to an empty dict if the file is
        # missing or does not contain valid JSON.
        with open(filepath, 'r') as f:
            data = json.loads(f.read())
    except (IOError, OSError, ValueError):
        data = dict()
    new_data = assoc(data, k, v)
    with open(filepath, 'w+') as g:
        g.write(json.dumps(new_data))

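# A quick usage sketch for set_model above (not part of the original code),
# writing to a temporary file so it is self-contained. assoc leaves the read
# model untouched and returns the updated copy that gets written back.
import json
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    tmp.write('{"alpha": 1}')

set_model(tmp.name, 'beta', 2)
with open(tmp.name) as f:
    assert json.load(f) == {'alpha': 1, 'beta': 2}
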
def test_parameterized_term_default_value_with_not_specified(self):
    defaults = {'a': 'default for a', 'b': NotSpecified}

    class F(Factor):
        params = defaults
        inputs = (SomeDataSet.foo,)
        dtype = 'f8'
        window_length = 5

    pattern = r"F expected a keyword parameter 'b'\."
    with assert_raises_regex(TypeError, pattern):
        F()
    with assert_raises_regex(TypeError, pattern):
        F(a='new a')

    assert_equal(F(b='new b').params, assoc(defaults, 'b', 'new b'))
    assert_equal(
        F(a='new a', b='new b').params,
        {'a': 'new a', 'b': 'new b'},
    )

def fill_kwargs(fn, **kwargs):
    """ Read a csv file and fill up kwargs

    This normalizes kwargs against a sample file. It does the following:

    1.  If given a globstring, just use one file
    2.  Get names from csv file if not given
    3.  Identify the presence of a header
    4.  Identify dtypes
    5.  Establish column names
    6.  Switch around dtypes and column names if parse_dates is active

    Normally ``pd.read_csv`` does this for us.  However for ``dd.read_csv`` we
    need to be consistent across multiple files and don't want to do these
    heuristics each time so we use the pandas solution once, record the
    results, and then send back a fully explicit kwargs dict to send to future
    calls to ``pd.read_csv``.

    Returns
    -------
    columns : Index
        Column names from the sampled file.
    kwargs : dict
        Keyword arguments to give to pd.read_csv.
    """
    if 'index_col' in kwargs:
        msg = """
The index column cannot be set at dataframe creation time. Instead use
the `set_index` method on the dataframe after it is created.
"""
        raise ValueError(msg)

    kwargs = merge(csv_defaults, kwargs)
    sample_nrows = kwargs.pop('sample_nrows', 1000)
    essentials = ['columns', 'names', 'header', 'parse_dates', 'dtype']
    if set(essentials).issubset(kwargs):
        return kwargs

    # Let pandas infer on the first 100 rows
    if '*' in fn:
        filenames = sorted(glob(fn))
        if not filenames:
            raise ValueError("No files found matching name %s" % fn)
        fn = filenames[0]

    if kwargs['compression'] == 'infer':
        kwargs['compression'] = infer_compression(fn)

    if 'names' not in kwargs:
        kwargs['names'] = csv_names(fn, **kwargs)
        if 'header' not in kwargs:
            kwargs['header'] = 0
    else:
        if 'header' not in kwargs:
            kwargs['header'] = None

    kwargs = clean_kwargs(kwargs)
    try:
        head = pd.read_csv(fn, **assoc(kwargs, 'nrows', sample_nrows))
    except StopIteration:
        head = pd.read_csv(fn, **kwargs)

    if 'parse_dates' not in kwargs:
        kwargs['parse_dates'] = [col for col in head.dtypes.index
                                 if np.issubdtype(head.dtypes[col], np.datetime64)]

    new_dtype = dict(head.dtypes)
    dtype = kwargs.get('dtype', dict())
    for k, v in dict(head.dtypes).items():
        if k not in dtype:
            dtype[k] = v

    if kwargs.get('parse_dates'):
        for col in kwargs['parse_dates']:
            del dtype[col]

    kwargs['dtype'] = dtype

    return (head.columns.map(lambda s: s.strip() if isinstance(s, str) else s),
            kwargs)

def _print_python(expr, leaves=None):
    child, scope = print_python(leaves, expr._child)
    funcname = next(funcnames)
    return ('%s(%s)' % (funcname, child),
            toolz.assoc(scope, funcname, expr.func))

def __init__(self, handlers, max_buffer_size=MAX_BUFFER_SIZE, **kwargs):
    self.handlers = assoc(handlers, 'identity', self.identity)
    self.id = uuid.uuid1()
    super(Server, self).__init__(max_buffer_size=max_buffer_size, **kwargs)

def reducer(state, action):
    if action['type'] == 'ADD_TODO':
        todos = state['todos'] + (action['text'],)
        return toolz.assoc(state, 'todos', todos)
    return state

def top_then_bottom_then_top_again_etc(expr, scope, **kwargs):
    """ Compute expression against scope

    Does the following interpreter strategy:

    1.  Try compute_down on the entire expression
    2.  Otherwise compute_up from the leaves until we experience a type change
        (e.g. data changes from dict -> pandas DataFrame)
    3.  Re-optimize expression and re-pre-compute data
    4.  Go to step 1

    Examples
    --------
    >>> import numpy as np

    >>> s = symbol('s', 'var * {name: string, amount: int}')
    >>> data = np.array([('Alice', 100), ('Bob', 200), ('Charlie', 300)],
    ...                 dtype=[('name', 'S7'), ('amount', 'i4')])

    >>> e = s.amount.sum() + 1
    >>> top_then_bottom_then_top_again_etc(e, {s: data})
    601

    See Also
    --------
    bottom_up_until_type_break  -- uses this for bottom-up traversal
    top_to_bottom -- older version
    bottom_up -- older version still
    """
    # 0. Base case: expression is in dict, return associated data
    if expr in scope:
        return scope[expr]

    if not hasattr(expr, '_leaves'):
        return expr

    leaf_exprs = list(expr._leaves())
    leaf_data = [scope.get(leaf) for leaf in leaf_exprs]

    # 1. See if we have a direct computation path with compute_down
    try:
        return compute_down(expr, *leaf_data, **kwargs)
    except NotImplementedError:
        pass

    # 2. Compute from the bottom until there is a data type change
    expr2, scope2 = bottom_up_until_type_break(expr, scope, **kwargs)

    # 3. Re-optimize data and expressions
    optimize_ = kwargs.get('optimize', optimize)
    pre_compute_ = kwargs.get('pre_compute', pre_compute)
    if pre_compute_:
        scope3 = dict((e, pre_compute_(e, datum, **assoc(kwargs, 'scope', scope2)))
                      for e, datum in scope2.items())
    else:
        scope3 = scope2
    if optimize_:
        try:
            expr3 = optimize_(expr2, *[scope3[leaf] for leaf in expr2._leaves()])
            _d = dict(zip(expr2._leaves(), expr3._leaves()))
            scope4 = dict((e._subs(_d), d) for e, d in scope3.items())
        except NotImplementedError:
            expr3 = expr2
            scope4 = scope3
    else:
        expr3 = expr2
        scope4 = scope3

    # 4. Repeat
    if expr.isidentical(expr3):
        raise NotImplementedError("Don't know how to compute:\n"
                                  "type(expr): %s\n"
                                  "expr: %s\n"
                                  "data: %s" % (type(expr3), expr3, scope4))
    else:
        return top_then_bottom_then_top_again_etc(expr3, scope4, **kwargs)

def into(a, b, **kwargs):
    # TODO: handle large CSV case
    return into(a, into(pd.DataFrame, b), **assoc(kwargs, 'dshape', b.dshape))

def batch_tcga_metadata_by_participant(fns):
    metadata = [re.match(TCGA_RE, fn).groupdict() for fn in fns]
    metadata = [tz.assoc(d, "fn", fn) for d, fn in zip(metadata, fns)]
    participant = tz.groupby(lambda x: x["participant"], metadata).values()
    return participant

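# An illustrative sketch of the pattern used above (not part of the original
# code): parse each filename with a regex, attach the filename via assoc, then
# group the records by participant. The regex and filenames below are made up;
# the real TCGA_RE is defined elsewhere in the original module.
import re
import toolz as tz

TOY_RE = r"(?P<participant>TCGA-\w{2}-\w{4})-(?P<sample_type>\d{2})\.bam"
fns = ["TCGA-AB-1234-01.bam", "TCGA-AB-1234-10.bam", "TCGA-CD-5678-01.bam"]

metadata = [re.match(TOY_RE, fn).groupdict() for fn in fns]
metadata = [tz.assoc(d, "fn", fn) for d, fn in zip(metadata, fns)]
groups = tz.groupby(lambda x: x["participant"], metadata)
# Two files share a participant, one stands alone.
assert sorted(len(v) for v in groups.values()) == [1, 2]
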
def __init__(self, handlers, max_buffer_size=MAX_BUFFER_SIZE, **kwargs):
    self.handlers = assoc(handlers, 'identity', self.identity)
    self.id = str(uuid.uuid1())
    self._port = None
    self._rpcs = dict()
    super(Server, self).__init__(max_buffer_size=max_buffer_size, **kwargs)