def test_read_bytes_delimited(s3, blocksize):
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=b'\n', s3=s3)
    _, values2 = read_bytes(test_bucket_name + '/test/accounts*',
                            blocksize=blocksize, delimiter=b'foo', s3=s3)
    assert ([a.key for a in concat(values)] !=
            [b.key for b in concat(values2)])

    results = compute(*concat(values))
    res = [r for r in results if r]
    assert all(r.endswith(b'\n') for r in res)
    ourlines = b''.join(res).split(b'\n')
    testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
    assert ourlines == testlines

    # delimiter not at the end
    d = b'}'
    _, values = read_bytes(test_bucket_name + '/test/accounts*',
                           blocksize=blocksize, delimiter=d, s3=s3)
    results = compute(*concat(values))
    res = [r for r in results if r]
    # All should end in } except EOF
    assert sum(r.endswith(b'}') for r in res) == len(res) - 2
    ours = b"".join(res)
    test = b"".join(files[v] for v in sorted(files))
    assert ours == test
def compute_up(expr, data, scope=None, **kwargs):
    data = lower_column(data)
    grouper = compute(
        expr.grouper,
        scope,
        post_compute=False,
        return_type='native',
        **kwargs
    )
    app = expr.apply
    reductions = [
        compute(
            val,
            data,
            post_compute=None,
            return_type='native',
        ).label(name)
        for val, name in zip(app.values, app.fields)
    ]

    froms = list(unique(chain(get_all_froms(grouper),
                              concat(map(get_all_froms, reductions)))))
    inner_cols = list(getattr(grouper, 'inner_columns', [grouper]))
    grouper_cols = inner_cols[:]
    inner_cols.extend(concat(
        getattr(getattr(r, 'element', None), 'inner_columns', [r])
        for r in reductions
    ))
    wheres = unify_wheres([grouper] + reductions)
    sel = unify_froms(sa.select(inner_cols, whereclause=wheres), froms)
    return sel.group_by(*grouper_cols)
def test_read_bytes_delimited():
    with filetexts(files, mode='b'):
        for bs in [5, 15, 45, 1500]:
            _, values = read_bytes('.test.accounts*', blocksize=bs,
                                   delimiter=b'\n')
            _, values2 = read_bytes('.test.accounts*', blocksize=bs,
                                    delimiter=b'foo')
            assert ([a.key for a in concat(values)] !=
                    [b.key for b in concat(values2)])

            results = compute(*concat(values))
            res = [r for r in results if r]
            assert all(r.endswith(b'\n') for r in res)
            ourlines = b''.join(res).split(b'\n')
            testlines = b"".join(files[k] for k in sorted(files)).split(b'\n')
            assert ourlines == testlines

            # delimiter not at the end
            d = b'}'
            _, values = read_bytes('.test.accounts*', blocksize=bs,
                                   delimiter=d)
            results = compute(*concat(values))
            res = [r for r in results if r]
            # All should end in } except EOF
            assert sum(r.endswith(b'}') for r in res) == len(res) - 2
            ours = b"".join(res)
            test = b"".join(files[v] for v in sorted(files))
            assert ours == test
def diagnostic_yield(self, metric='completeness', cutoff=1,
                     superblock_ids=None, group_id=None, sample_ids=None):
    """Calculate diagnostic yield."""
    # extract column to filter on
    metric_column = getattr(BlockData, metric)

    # set up the base query for all blocks
    total_query = self.total_count(BlockData)

    if superblock_ids:
        # apply the superblock filter on the Block class level
        total_query = total_query.join(BlockData.parent)\
                                 .filter(Block.superblock_id.in_(superblock_ids))

    # extend base query to include only passed blocks
    pass_query = total_query.filter(metric_column >= cutoff)

    # optionally limit query
    queries = [limit_query(query, group=group_id, samples=sample_ids)
               for query in (total_query, pass_query)]

    # group multiple queries by sample ID (first column)
    metrics = groupby(get(0), concat(queries))

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    combined = (unique(concat(values)) for values in itervalues(metrics))

    # calculate diagnostic yield by simple division
    for sample_id, group_id, total, covered in combined:
        yield sample_id, group_id, (covered / total)
def scatter(kd, control, colors=['orange', 'blue'], **kwargs):
    """Show a jittered scatterplot of the measurements.

    Parameters
    ----------
    kd : list of list of float
        The list of `trf_quantify` results for all AUKB knockdown images
        in the dataset. (Each result is itself a list.)
    control : list of list of float
        The list of `trf_quantify` results for all control images in the
        dataset.
    colors : list of two matplotlib colorspecs, optional
        The colors corresponding to AUKB-KD (0) and control (1) data points
        on the scatterplot.

    Additional Parameters
    ---------------------
    **kwargs : keyword arguments
        Additional keyword arguments passed directly to ``plt.scatter``.

    Returns
    -------
    fig : matplotlib axes
        The returned value from the call to ``plt.scatter``.
    """
    xs = list(tz.concat([i + 0.2 * np.random.randn(n)
                         for i, n in enumerate(map(len, kd + control))]))
    color_vector = ([colors[0]] * sum(map(len, kd)) +
                    [colors[1]] * sum(map(len, control)))
    ys = list(tz.concat(kd + control))
    fig = plt.scatter(xs, ys, c=color_vector, **kwargs)
    plt.xlim(0, max(xs) + 1)
    plt.ylim(0, max(ys) + 1)
    return fig
def test_modification_time_read_bytes():
    with s3_context('compress', files) as s3:
        _, a = read_bytes('compress/test/accounts.*', s3=s3)
        _, b = read_bytes('compress/test/accounts.*', s3=s3)

    assert [aa._key for aa in concat(a)] == [bb._key for bb in concat(b)]

    with s3_context('compress', valmap(double, files)) as s3:
        _, c = read_bytes('compress/test/accounts.*', s3=s3)

    assert [aa._key for aa in concat(a)] != [cc._key for cc in concat(c)]
def start(self):
    self.status = 'running'
    logger.debug("Start Progress Plugin")
    self._start()
    if not self.keys or not any(v for v in self.keys.values()):
        self.stop()
    elif all(k in self.scheduler.exceptions_blame
             for k in concat(self.keys.values())):
        key = next(k for k in concat(self.keys.values())
                   if k in self.scheduler.exceptions_blame)
        self.stop(exception=True, key=key)
def compute_up(expr, args, **kwargs):
    from_objs = list(unique(concat(map(get_all_froms, args))))
    if len(from_objs) > 1:
        # TODO: how do you do this in sql? please send help
        raise ValueError('only columns from the same table can be merged')

    cols = list(unique(concat(map(get_unsafe_inner_columns, args, expr.args))))
    sel = sa.select(cols, from_obj=from_objs[0])
    where = unify_wheres(args)
    if where is not None:
        sel = sel.where(where)
    return sel
def render_tabular(api, options=None):
    """Entry point for the tabular reporter interface."""
    # determine separator
    separator = options.get('report.separator', '\t')
    human = options.get('report.human')
    panel = options.get('report.panel')
    samples = options.get('report.samples')
    group = options.get('report.group')

    # read gene panel file if it has been set
    if panel:
        superblock_ids = [line.rstrip() for line in panel]
    else:
        superblock_ids = None

    # get sample ID, group and cutoff from metadata
    sample_query = limit_query(api.samples(), group=group, samples=samples)
    metadata = ((sample.id, sample.group_id, sample.cutoff)
                for sample in sample_query)

    # get the data
    base_query = limit_query(api.average_metrics(superblock_ids=superblock_ids),
                             group=group, samples=samples)

    queries = [metadata,
               base_query,
               api.diagnostic_yield(superblock_ids=superblock_ids,
                                    group_id=group, sample_ids=samples),
               api.sex_checker(group_id=group, sample_ids=samples)]

    # group multiple queries by sample ID (first column)
    key_metrics = groupby(get(0), concat(queries))

    # get the column names dynamically from the query
    headers = concatv(['sample_id', 'group_id', 'cutoff'],
                      (column['name'] for column
                       in base_query.column_descriptions),
                      ['diagnostic yield', 'gender'])
    unique_headers = unique(headers)

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    data = (unique(concat(values)) for values in itervalues(key_metrics))

    if human:
        # export key_metrics in a more human friendly format
        return tabulate(data, unique_headers)

    # yield headers
    return '\n'.join(cons('#' + separator.join(unique_headers),
                          stringify_list(data, separator=separator)))
def compile_components(summary, schema):
    """Given a ``Summary`` object and a table schema, return 5 sub-functions.

    Parameters
    ----------
    summary : Summary
        The expression describing the aggregations to be computed.

    Returns
    -------
    A tuple of the following functions:

    ``create(shape)``
        Takes the aggregate shape, and returns a tuple of initialized numpy
        arrays.

    ``info(df)``
        Takes a dataframe, and returns preprocessed 1D numpy arrays of the
        needed columns.

    ``append(i, x, y, *aggs_and_cols)``
        Appends the ``i``th row of the table to the ``(x, y)`` bin, given the
        base arrays and columns in ``aggs_and_cols``. This does the bulk of
        the work.

    ``combine(base_tuples)``
        Combine a list of base tuples into a single base tuple. This forms
        the reducing step in a reduction tree.

    ``finalize(aggs)``
        Given a tuple of base numpy arrays, returns the finalized ``dynd``
        array.
    """
    paths, reds = zip(*preorder_traversal(summary))

    # List of base reductions (actually computed)
    bases = list(unique(concat(r._bases for r in reds)))
    dshapes = [b.out_dshape(schema) for b in bases]
    # List of tuples of (append, base, input columns, temps)
    calls = [_get_call_tuples(b, d) for (b, d) in zip(bases, dshapes)]
    # List of unique column names needed
    cols = list(unique(concat(pluck(2, calls))))
    # List of temps needed
    temps = list(pluck(3, calls))

    create = make_create(bases, dshapes)
    info = make_info(cols)
    append = make_append(bases, cols, calls)
    combine = make_combine(bases, dshapes, temps)
    finalize = make_finalize(bases, summary, schema)

    return create, info, append, combine, finalize
def test_chunk_datetime():
    data = [[1, 'Alice', 100, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [2, 'Bob', 200, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [3, 'Alice', -300, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [4, 'Charlie', 400, datetime.datetime(2014, 10, 1, 1, 1, 1)],
            [5, 'Edith', 200, datetime.datetime(2014, 10, 1, 1, 1, 1)]]

    t = Symbol('t', 'var * {id: int, name: string, amount: int, when: datetime}')

    c = ChunkIterable(data, chunksize=2)
    assert list(concat(compute(t.when.day, c))) == [1] * 5
    assert list(concat(compute(t.when.date, c))) == \
        [datetime.date(2014, 10, 1)] * 5
def test_deterministic_key_names(hdfs):
    data = b'abc\n' * int(1e3)
    fn = '%s/file' % basedir

    with hdfs.open(fn, 'wb', replication=1) as fil:
        fil.write(data)

    _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n', sample=False)
    _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c', sample=False)

    assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
    assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def test_join():
    cities = TableSymbol('cities', schema='{id: int, city: string}')
    j = join(t, cities, 'id')

    city_data = [[1, 'NYC'], [1, 'Chicago'], [5, 'Paris']]

    assert set(concat(compute(join(cities, t, 'id')[['name', 'city']],
                              {t: c, cities: city_data}))) == \
        set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))

    assert set(concat(compute(join(t, cities, 'id')[['name', 'city']],
                              {t: c, cities: city_data}))) == \
        set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))
def test_join():
    cities = symbol('cities', dshape='var * {id: int, city: string}')
    j = join(t, cities, 'id')

    city_data = [[1, 'NYC'], [1, 'Chicago'], [5, 'Paris']]

    assert set(concat(compute(j[['name', 'city']],
                              {t: c, cities: city_data}))) == \
        set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))

    assert set(concat(compute(j[['name', 'city']],
                              {t: c, cities: city_data}))) == \
        set((('Alice', 'NYC'), ('Alice', 'Chicago'), ('Edith', 'Paris')))
def test_deterministic_key_names(e, s, a, b):
    with make_hdfs() as (hdfs, basedir):
        data = b'abc\n' * int(1e3)
        fn = '%s/file' % basedir
        with hdfs.open(fn, 'wb', replication=1) as f:
            f.write(data)

        _, x = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, y = read_bytes('hdfs://%s/*' % basedir, delimiter=b'\n')
        _, z = read_bytes('hdfs://%s/*' % basedir, delimiter=b'c')

        assert [f.key for f in concat(x)] == [f.key for f in concat(y)]
        assert [f.key for f in concat(x)] != [f.key for f in concat(z)]
def deserialize(bytes, dtype, copy=False):
    if dtype == 'O':
        try:
            l = list(concat(map(msgpack.unpackb, framesplit(bytes))))
        except:
            l = list(concat(map(pickle.loads, framesplit(bytes))))

        l = decode(l)

        return np.array(l, dtype='O')
    else:
        result = np.frombuffer(bytes, dtype)
        if copy:
            result = result.copy()
        return result
def test_registered_read_bytes():
    from dask.bytes.core import read_bytes
    with filetexts(files, mode='b'):
        sample, values = read_bytes('.test.accounts.*')

        results = compute(*concat(values))
        assert set(results) == set(files.values())
def _check_for_problem_somatic_batches(items, config):
    """Identify problem batch setups for somatic calling.

    We do not support multiple tumors in a single batch and VarDict(Java)
    does not handle pooled calling, only tumor/normal.
    """
    to_check = []
    for data in items:
        data = copy.deepcopy(data)
        data["config"] = config_utils.update_w_custom(config, data)
        to_check.append(data)
    data_by_batches = collections.defaultdict(list)
    for data in to_check:
        batches = dd.get_batches(data)
        if batches:
            for batch in batches:
                data_by_batches[batch].append(data)
    for batch, items in data_by_batches.items():
        if vcfutils.get_paired(items):
            vcfutils.check_paired_problems(items)
        elif len(items) > 1:
            vcs = list(set(tz.concat([dd.get_variantcaller(data) or []
                                      for data in items])))
            if any(x.lower().startswith("vardict") for x in vcs):
                raise ValueError("VarDict does not support pooled non-tumor/normal calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
            elif any(x.lower() == "mutect" for x in vcs):
                raise ValueError("Mutect requires a 'phenotype: tumor' sample for calling, in batch %s: %s"
                                 % (batch, [dd.get_sample_name(data) for data in items]))
def aggregate_shape(leaf, expr, chunk, chunk_expr):
    """ The shape of the intermediate aggregate

    >>> leaf = Symbol('leaf', '10 * 10 * int')
    >>> expr = leaf.sum(axis=0)
    >>> chunk = Symbol('chunk', '3 * 3 * int')  # 3 does not divide 10
    >>> chunk_expr = chunk.sum(axis=0, keepdims=1)

    >>> aggregate_shape(leaf, expr, chunk, chunk_expr)
    (4, 10)
    """
    if datashape.var in concat(map(shape, [leaf, expr, chunk, chunk_expr])):
        return (datashape.var, ) * leaf.ndim

    numblocks = [int(floor(l / c)) for l, c in zip(leaf.shape, chunk.shape)]
    last_chunk_shape = [l % c for l, c in zip(leaf.shape, chunk.shape)]

    if builtins.sum(last_chunk_shape) != 0:
        last_chunk = Symbol(chunk._name,
                            DataShape(*(last_chunk_shape +
                                        [chunk.dshape.measure])))
        last_chunk_expr = chunk_expr._subs({chunk: last_chunk})
        last_chunk_shape = shape(last_chunk_expr)

    return tuple(int(floor(l / c)) * ce + lce
                 for l, c, ce, lce in zip(shape(leaf), shape(chunk),
                                          shape(chunk_expr),
                                          last_chunk_shape))
def f(c, a, b):
    keys = yield _scatter((c.ip, c.port), [1, 2, 3])
    assert merge(a.data, b.data) == \
        {k: i for k, i in zip(keys, [1, 2, 3])}

    assert set(c.who_has) == set(keys)
    assert all(len(v) == 1 for v in c.who_has.values())

    keys2, who_has, nbytes = yield scatter_to_workers([a.address, b.address],
                                                      [4, 5, 6])
    m = merge(a.data, b.data)

    for k, v in zip(keys2, [4, 5, 6]):
        assert m[k] == v

    assert isinstance(who_has, dict)
    assert set(concat(who_has.values())) == {a.address, b.address}
    assert len(who_has) == len(keys2)

    assert isinstance(nbytes, dict)
    assert set(nbytes) == set(who_has)
    assert all(isinstance(v, int) for v in nbytes.values())

    result = yield _gather((c.ip, c.port), keys2)
    assert result == [4, 5, 6]
def schema(self):
    for c in self.children:
        if not isinstance(c.schema[0], Record):
            raise TypeError("All schemas must have Record shape. Got %s" %
                            c.schema[0])

    return dshape(Record(list(concat(c.schema[0].parameters[0]
                                     for c in self.children))))
def test_current_session(self):
    regular_minutes = self.trading_calendar.minutes_for_sessions_in_range(
        self.equity_minute_bar_days[0],
        self.equity_minute_bar_days[-1]
    )

    bts_minutes = days_at_time(
        self.equity_minute_bar_days,
        time(8, 45),
        "US/Eastern"
    )

    # some other non-market-minute
    three_oh_six_am_minutes = days_at_time(
        self.equity_minute_bar_days,
        time(3, 6),
        "US/Eastern"
    )

    all_minutes = [regular_minutes, bts_minutes, three_oh_six_am_minutes]
    for minute in list(concat(all_minutes)):
        bar_data = self.create_bardata(lambda: minute)

        self.assertEqual(
            self.trading_calendar.minute_to_session_label(minute),
            bar_data.current_session
        )
def write_tables(fname, models, year):
    """
    Write all tables injected into `models` to a pandas.HDFStore file.
    If year is not None it will be used to prefix the table names so that
    multiple years can go in the same file.

    Parameters
    ----------
    fname : str
        File name for HDFStore. Will be opened in append mode and closed
        at the end of this function.
    models : list of str
        Models from which to gather injected tables for saving.
    year : int or None
        If an integer, used as a prefix along with table names for
        labeling DataFrames in the HDFStore.

    """
    models = (get_model(m) for m in toolz.unique(models))
    table_names = toolz.unique(toolz.concat(m._tables_used() for m in models))
    tables = (get_table(t) for t in table_names)

    key_template = '{}/{{}}'.format(year) if year is not None else '{}'

    with pd.get_store(fname, mode='a') as store:
        for t in tables:
            store[key_template.format(t.name)] = t.to_frame()
def test_repeat():
    x = np.random.random((10, 11, 13))
    d = da.from_array(x, chunks=(4, 5, 3))

    repeats = [1, 2, 5]
    axes = [-3, -2, -1, 0, 1, 2]

    for r in repeats:
        for a in axes:
            assert_eq(x.repeat(r, axis=a), d.repeat(r, axis=a))

    assert_eq(d.repeat(2, 0), da.repeat(d, 2, 0))

    with pytest.raises(NotImplementedError):
        da.repeat(d, np.arange(10))

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2, None)

    with pytest.raises(NotImplementedError):
        da.repeat(d, 2)

    for invalid_axis in [3, -4]:
        with pytest.raises(ValueError):
            da.repeat(d, 2, axis=invalid_axis)

    x = np.arange(5)
    d = da.arange(5, chunks=(2,))

    assert_eq(x.repeat(3), d.repeat(3))

    for r in [1, 2, 3, 4]:
        assert all(concat(d.repeat(r).chunks))
def batch(sel, chunksize=10000, bind=None):
    """Execute `sel`, streaming row at a time and fetching from the database
    in batches of size `chunksize`.

    Parameters
    ----------
    sel : sa.sql.Selectable
        Selectable to execute
    chunksize : int, optional, default 10000
        Number of rows to fetch from the database
    """

    def rowterator(sel, chunksize=chunksize):
        with getbind(sel, bind).connect() as conn:
            result = conn.execute(sel)
            yield result.keys()

            for rows in iter_except(curry(result.fetchmany, size=chunksize),
                                    sa.exc.ResourceClosedError):
                if rows:
                    yield rows
                else:
                    return

    terator = rowterator(sel)
    return next(terator), concat(terator)
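# Hedged consumption sketch for batch() above: `sel` is assumed to be any
# SQLAlchemy selectable with a resolvable bind, and `process` is a
# hypothetical placeholder for user code. The first return value is the
# sequence of column names; the second is a lazy stream of individual rows,
# fetched from the database `chunksize` rows at a time and flattened by
# toolz.concat.
#
#     columns, rows = batch(sel, chunksize=500)
#     for row in rows:
#         process(row)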
def init_state(self, network):
    super(ANRATNode, self).init_state(network)
    inits = list(toolz.concat(network.find_hyperparameters(["inits"], [])))
    # setting initial lambda to 5 instead of 10, because 10 is too large
    # for the default parameters
    # TODO might also want to add clipping to cap the value of lambda
    initial_lambda = network.find_hyperparameter(["anrat_initial_lambda"], 5)
    if ANRAT_USE_LOG_LAMBDA:
        initial_lambda = np.log(initial_lambda)

    lambda_vw = network.create_vw(
        name="lambda",
        is_shared=True,
        shape=(),
        tags={"parameter"},
        inits=inits + [treeano.inits.ConstantInit(initial_lambda)],
    )
    p = network.find_hyperparameter(["nrae_p"], 2)
    q = network.find_hyperparameter(["nrae_q"], 2)
    r = network.find_hyperparameter(["anrat_r"], 1)
    alpha = network.find_hyperparameter(["anrat_alpha", "alpha"], 0.1)
    i32_target = network.find_hyperparameter(["i32_target"], False)
    lambda_var = lambda_vw.variable
    if ANRAT_USE_LOG_LAMBDA:
        lambda_var = T.exp(lambda_var)
    cost_function = functools.partial(_ANRAT,
                                      lambda_=lambda_var,
                                      p=p,
                                      q=q,
                                      r=r,
                                      alpha=alpha,
                                      i32_target=i32_target)
    network.set_hyperparameter(self.name + "_elementwise",
                               "cost_function",
                               cost_function)
def compute_down(expr, data, chunksize=2**20, map=map, **kwargs):
    leaf = expr._leaves()[0]

    # If the bottom expression is a projection or field then want to do
    # compute_up first
    children = set(e for e in expr._traverse()
                   if isinstance(e, Expr)
                   and any(i is expr._leaves()[0] for i in e._inputs))

    if len(children) == 1 and isinstance(first(children),
                                         (Field, Projection)):
        raise NotImplementedError()

    chunk = symbol('chunk', chunksize * leaf.schema)
    (chunk, chunk_expr), (agg, agg_expr) = split(leaf, expr, chunk=chunk)

    data_parts = partitions(data, chunksize=(chunksize,))

    parts = list(map(curry(compute_chunk, data, chunk, chunk_expr),
                     data_parts))

    if isinstance(parts[0], np.ndarray):
        intermediate = np.concatenate(parts)
    elif isinstance(parts[0], pd.DataFrame):
        intermediate = pd.concat(parts)
    elif isinstance(parts[0], Iterable):
        intermediate = list(concat(parts))
    else:
        raise TypeError(
            "Don't know how to concatenate objects of type %s" %
            type(parts[0]))

    return compute(agg_expr, {agg: intermediate})
def load_adjusted_array(self, columns, dates, assets, mask):
    return dict(
        concat(map(
            partial(self._load_dataset, dates, assets, mask),
            itervalues(groupby(getdataset, columns))
        ))
    )
def scatter_to_workers(center, ncores, data, key=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    center = coerce_to_rpc(center)
    if key is None:
        key = str(uuid.uuid1())

    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        ncores = {worker: 1 for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ("%s-%d" % (key, i) for i in count(0))

    L = list(zip(cycle(workers), names, data))
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}

    yield [rpc(ip=w_ip, port=w_port).update_data(data=v, close=True)
           for (w_ip, w_port), v in d.items()]

    result = [RemoteData(b, center.ip, center.port, result=c)
              for a, b, c in L]

    raise Return(result)
def compute(self, **kwargs):
    results = self.get(self.dask, self._keys(), **kwargs)
    if isinstance(results[0], Iterable):
        results = concat(results)
    if not isinstance(results, Iterator):
        results = iter(results)
    return results
def aconcat(seqs):
    """Like `toolz.concat`, but it returns an array instead of an iterator."""
    return np.array(list(toolz.concat(seqs)))
def compute_broadcast(expr, *data, **kwargs):
    expr_inds = tuple(range(ndim(expr)))[::-1]
    func = get_numba_ufunc(expr)
    return atop(func,
                expr_inds,
                *concat((dat, tuple(range(ndim(dat))[::-1]))
                        for dat in data))
def get_all_froms(function):
    return list(unique(concat(map(get_all_froms, function.clauses.clauses))))
def slice_with_bool_dask_array(x, index):
    """ Slice x with one or more dask arrays of bools

    This is a helper function of `Array.__getitem__`.

    Parameters
    ----------
    x: Array
    index: tuple with as many elements as x.ndim, among which there are
           one or more Array's with dtype=bool

    Returns
    -------
    tuple of (sliced x, new index)

    where the new index is the same as the input, but with slice(None)
    replaced to the original slicer when a filter has been applied.

    Note: The sliced x will have nan chunks on the sliced axes.
    """
    from .core import Array, atop, elemwise

    out_index = [slice(None)
                 if isinstance(ind, Array) and ind.dtype == bool
                 else ind
                 for ind in index]

    if len(index) == 1 and index[0].ndim == x.ndim:
        y = elemwise(getitem, x, *index, dtype=x.dtype)
        name = 'getitem-' + tokenize(x, index)
        dsk = {(name, i): k
               for i, k in enumerate(core.flatten(y.__dask_keys__()))}
        chunks = ((np.nan, ) * y.npartitions, )
        return (Array(sharedict.merge(y.dask, (name, dsk)),
                      name, chunks, x.dtype),
                out_index)

    if any(isinstance(ind, Array) and ind.dtype == bool and ind.ndim != 1
           for ind in index):
        raise NotImplementedError(
            "Slicing with dask.array of bools only permitted when "
            "the indexer has only one dimension or when "
            "it has the same dimension as the sliced "
            "array")

    indexes = [ind
               if isinstance(ind, Array) and ind.dtype == bool
               else slice(None)
               for ind in index]

    arginds = []
    i = 0
    for ind in indexes:
        if isinstance(ind, Array) and ind.dtype == bool:
            new = (ind, tuple(range(i, i + ind.ndim)))
            i += x.ndim
        else:
            new = (slice(None), None)
            i += 1
        arginds.append(new)

    arginds = list(concat(arginds))

    out = atop(getitem_variadic, tuple(range(x.ndim)),
               x, tuple(range(x.ndim)),
               *arginds,
               dtype=x.dtype)

    chunks = []
    for ind, chunk in zip(index, out.chunks):
        if isinstance(ind, Array) and ind.dtype == bool:
            chunks.append((np.nan, ) * len(chunk))
        else:
            chunks.append(chunk)
    out._chunks = tuple(chunks)
    return out, tuple(out_index)
def assert_balanced(inp, expected, c, s, *workers):
    steal = s.extensions["stealing"]
    steal._pc.stop()

    counter = itertools.count()
    tasks = list(concat(inp))
    data_seq = itertools.count()

    futures = []
    for w, ts in zip(workers, inp):
        for t in sorted(ts, reverse=True):
            if t:
                [dat] = yield c.scatter([next(data_seq)], workers=w.address)
                ts = s.tasks[dat.key]
                # Ensure scheduler state stays consistent
                old_nbytes = ts.nbytes
                ts.nbytes = s.bandwidth * t
                for ws in ts.who_has:
                    ws.nbytes += ts.nbytes - old_nbytes
            else:
                dat = 123
            s.task_duration[str(int(t))] = 1
            i = next(counter)
            f = c.submit(
                func,
                dat,
                key="%d-%d" % (int(t), i),
                workers=w.address,
                allow_other_workers=True,
                pure=False,
                priority=-i,
            )
            futures.append(f)

    while len(s.rprocessing) < len(futures):
        yield gen.sleep(0.001)

    for i in range(10):
        steal.balance()

        while steal.in_flight:
            yield gen.sleep(0.001)

        result = [
            sorted([int(key_split(k)) for k in s.processing[w.address]],
                   reverse=True)
            for w in workers
        ]

        result2 = sorted(result, reverse=True)
        expected2 = sorted(expected, reverse=True)

        if config.get("pdb-on-err"):
            if result2 != expected2:
                import pdb

                pdb.set_trace()

        if result2 == expected2:
            return

    raise Exception("Expected: {}; got: {}".format(str(expected2), str(result2)))
def everything_but(k, d):
    """
    Return iterator of all values in d except the values in k.
    """
    assert k in d
    return concat(keyfilter(lambda x: x != k, d).values())
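# Minimal illustration of everything_but, assuming the same imports as the
# function above (`from toolz import concat, keyfilter`); the dictionary here
# is made up. The iterator yields the values stored under every key except
# `k`:
#
# >>> d = {'a': [1], 'b': [2, 3], 'c': [4]}
# >>> sorted(everything_but('a', d))
# [2, 3, 4]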
def read_text(urlpath, blocksize=None, compression='infer',
              encoding=system_encoding, errors='strict',
              linedelimiter=os.linesep, collection=True,
              storage_options=None):
    """ Read lines from text files

    Parameters
    ----------
    urlpath: string or list
        Absolute or relative filepath, URL (may include protocols like
        ``s3://``), globstring, or a list of aforementioned strings.
    blocksize: None or int
        Size (in bytes) to cut up larger files.  Streams by default.
    compression: string
        Compression format like 'gzip' or 'xz'.  Defaults to 'infer'
    encoding: string
    errors: string
    linedelimiter: string
    collection: bool, optional
        Return dask.bag if True, or list of delayed values if false
    storage_options: dict
        Extra options that make sense to a particular storage connection, e.g.
        host, port, username, password, etc.

    Examples
    --------
    >>> b = read_text('myfiles.1.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('myfiles.*.txt.gz')  # doctest: +SKIP
    >>> b = read_text('s3://bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('s3://key:secret@bucket/myfiles.*.txt')  # doctest: +SKIP
    >>> b = read_text('hdfs://namenode.example.com/myfiles.*.txt')  # doctest: +SKIP

    Parallelize a large file by providing the number of uncompressed bytes to
    load into each partition.

    >>> b = read_text('largefile.txt', blocksize=1e7)  # doctest: +SKIP

    Returns
    -------
    dask.bag.Bag if collection is True or list of Delayed lists otherwise

    See Also
    --------
    from_sequence: Build bag from Python sequence
    """
    if isinstance(urlpath, (tuple, list, set)):
        blocks = sum([read_text(fn, blocksize=blocksize,
                                compression=compression,
                                encoding=encoding, errors=errors,
                                linedelimiter=linedelimiter,
                                collection=False,
                                storage_options=storage_options)
                      for fn in urlpath], [])
    else:
        if blocksize is None:
            files = open_text_files(urlpath, encoding=encoding, errors=errors,
                                    compression=compression,
                                    **(storage_options or {}))
            blocks = [delayed(list, pure=True)(delayed(file_to_blocks)(file))
                      for file in files]
        else:
            _, blocks = read_bytes(urlpath, delimiter=linedelimiter.encode(),
                                   blocksize=blocksize, sample=False,
                                   compression=compression,
                                   **(storage_options or {}))
            if isinstance(blocks[0], (tuple, list)):
                blocks = list(concat(blocks))
            blocks = [delayed(decode)(b, encoding, errors) for b in blocks]

    if not blocks:
        raise ValueError("No files found", urlpath)

    if not collection:
        return blocks
    else:
        return from_delayed(blocks)
def physical_tables_join(join):
    # Physical roots of Join nodes are the unique physical roots of their
    # left and right TableNodes.
    func = compose(physical_tables, methodcaller('op'))
    return list(unique(concat(map(func, (join.left, join.right)))))
def everything_but(k, d):
    """
    Return iterator of all values in d except the values in k.
    """
    assert k in d
    return concat(itervalues(keyfilter(ne(k), d)))
def physical_tables_node(node):
    # Iterative case. Any other Node's physical roots are the unique physical
    # roots of that Node's root tables.
    tables = toolz.concat(map(physical_tables, node.root_tables()))
    return list(toolz.unique(tables, key=id))
def powerset(values):
    """
    Return the power set (i.e., the set of all subsets) of entries in
    `values`.
    """
    return concat(combinations(values, i) for i in range(len(values) + 1))
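# A small usage sketch for powerset, assuming the imports used above
# (`from itertools import combinations` and `from toolz import concat`);
# the result is a lazy iterator of tuples, smallest subsets first.
#
# >>> list(powerset([1, 2, 3]))
# [(), (1,), (2,), (3,), (1, 2), (1, 3), (2, 3), (1, 2, 3)]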
def distinct_roots(*expressions):
    # TODO: move to analysis
    roots = toolz.concat(expr.op().root_tables() for expr in expressions)
    return list(toolz.unique(roots))
def distinct_roots(*expressions):
    roots = toolz.concat(expression._root_tables()
                         for expression in expressions)
    return list(toolz.unique(roots, key=id))
def __iter__(self):
    return toolz.unique(toolz.concat(self.layers.values()))
def lconcat(seqs):
    """Like `toolz.concat`, but it returns a list instead of an iterator."""
    return list(toolz.concat(seqs))
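# Hedged usage sketch for lconcat (and the array analogue aconcat earlier):
# it flattens one level of nesting and materializes the result eagerly.
#
# >>> lconcat([[1, 2], [3], [4, 5]])
# [1, 2, 3, 4, 5]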
def elemwise_array(expr, *data, **kwargs):
    leaves = expr._inputs
    expr_inds = tuple(range(ndim(expr)))[::-1]
    return atop(curry(compute_it, expr, leaves, **kwargs),
                expr_inds,
                *concat((dat, tuple(range(ndim(dat))[::-1]))
                        for dat in data))
def morph_counts_fastest_version(self, words):
    # Word list to list of all morphisms
    word_counts = Counter(
        word for word in toolz.concat(map(self.word_tokenizer, words)))
    # print("words_counts: ")
    # print(word_counts)

    print("")
    print("Unique number words: " + str(len(set(words))))
    print("Total number of words: " + str(len(words)))
    print("")

    unique_words_set = set(words)
    unique_words = list(unique_words_set)

    frog = Frog(FrogOptions(tok=True, lemma=True, morph=True,
                            daringmorph=False, mwu=False, chunking=False,
                            ner=False, parser=False))

    batch_size = 400
    morphisms = []
    print_batch_number = 1
    start_time = time.time()
    total_batch_number = math.ceil(len(unique_words) / batch_size)
    total_process_time = 0
    total_getting_morphisms_time = 0

    for i in range(0, len(unique_words), batch_size):
        t0 = time.time()
        words_batch = unique_words[i:i + batch_size]
        words_batch_string = ' '.join(words_batch)
        output = frog.process(words_batch_string)
        process_time = time.time() - t0

        t1 = time.time()
        for j in range(0, len(words_batch) - 1):
            current_word = output[j].get("text")
            morphisms_word = output[j].get("morph")
            morphisms_word_list = morphisms_word.replace('[', '').split(']')
            current_word_count = word_counts[current_word]
            # Currently NO WHOLE WORDS IN COUNT
            if len(morphisms_word_list) > 2:
                morphisms += morphisms_word_list * current_word_count

        total_batch_length = len(words_batch)
        print("batch" + " (" + str(batch_size) + " words): " +
              str(print_batch_number) + " of " + str(total_batch_number))
        print_batch_number += 1
        getting_morphisms_time = time.time() - t1

        total_process_time += process_time
        total_getting_morphisms_time += getting_morphisms_time

    print("")
    print("Total number of words: ")
    print(len(words))
    print("")
    print("Unique number words: ")
    print(len(set(words)))
    print("")
    print("Total Process Time:")
    print(self.format_time(total_process_time))
    print("")
    print("Total Getting Morphisms Time: ")
    print(self.format_time(total_getting_morphisms_time))
    print("")
    print("Total Time:")
    print(self.format_time(time.time() - start_time))
    print("")

    # Remove the empty strings
    morphisms = list(filter(None, morphisms))

    # Make a counter of all morphisms
    morph_counts = Counter(morphisms)

    with open('Old/morph_counts.pickle', 'wb') as outputfile:
        pickle.dump(morph_counts, outputfile)

    return morph_counts
def test_default_calendars(self):
    # concat chains the iterables together
    for name in concat([_default_calendar_factories,
                        _default_calendar_aliases]):
        self.assertIsNotNone(get_calendar(name),
                             "get_calendar(%r) returned None" % name)
def get_assets(self):
    assets = [directory.walk() for directory in self._root_dirs]
    self.assets = sorted(toolz.unique(toolz.concat(assets)))
    return self
def atop(func, out_ind, *args, **kwargs):
    """ Tensor operation: Generalized inner and outer products

    A broad class of blocked algorithms and patterns can be specified with a
    concise multi-index notation.  The ``atop`` function applies an in-memory
    function across multiple blocks of multiple inputs in a variety of ways.
    Many dask.array operations are special cases of atop including
    elementwise, broadcasting, reductions, tensordot, and transpose.

    Parameters
    ----------
    func : callable
        Function to apply to individual tuples of blocks
    out_ind : iterable
        Block pattern of the output, something like 'ijk' or (1, 2, 3)
    *args : sequence of Array, index pairs
        Sequence like (x, 'ij', y, 'jk', z, 'i')
    **kwargs : dict
        Extra keyword arguments to pass to function
    dtype : np.dtype
        Datatype of resulting array.
    concatenate : bool, keyword only
        If true concatenate arrays along dummy indices, else provide lists
    adjust_chunks : dict
        Dictionary mapping index to function to be applied to chunk sizes
    new_axes : dict, keyword only
        New indexes and their dimension lengths

    Examples
    --------
    2D embarrassingly parallel operation from two arrays, x, and y.

    >>> z = atop(operator.add, 'ij', x, 'ij', y, 'ij', dtype='f8')  # z = x + y  # doctest: +SKIP

    Outer product multiplying x by y, two 1-d vectors

    >>> z = atop(operator.mul, 'ij', x, 'i', y, 'j', dtype='f8')  # doctest: +SKIP

    z = x.T

    >>> z = atop(np.transpose, 'ji', x, 'ij', dtype=x.dtype)  # doctest: +SKIP

    The transpose case above is illustrative because it does same
    transposition both on each in-memory block by calling ``np.transpose``
    and on the order of the blocks themselves, by switching the order of the
    index ``ij -> ji``.

    We can compose these same patterns with more variables and more complex
    in-memory functions

    z = X + Y.T

    >>> z = atop(lambda x, y: x + y.T, 'ij', x, 'ij', y, 'ji', dtype='f8')  # doctest: +SKIP

    Any index, like ``i`` missing from the output index is interpreted as a
    contraction (note that this differs from Einstein convention; repeated
    indices do not imply contraction.)  In the case of a contraction the
    passed function should expect an iterable of blocks on any array that
    holds that index.  To receive arrays concatenated along contracted
    dimensions instead pass ``concatenate=True``.

    Inner product multiplying x by y, two 1-d vectors

    >>> def sequence_dot(x_blocks, y_blocks):
    ...     result = 0
    ...     for x, y in zip(x_blocks, y_blocks):
    ...         result += x.dot(y)
    ...     return result

    >>> z = atop(sequence_dot, '', x, 'i', y, 'i', dtype='f8')  # doctest: +SKIP

    Add new single-chunk dimensions with the ``new_axes=`` keyword, including
    the length of the new dimension.  New dimensions will always be in a
    single chunk.

    >>> def f(x):
    ...     return x[:, None] * np.ones((1, 5))

    >>> z = atop(f, 'az', x, 'a', new_axes={'z': 5}, dtype=x.dtype)  # doctest: +SKIP

    If the applied function changes the size of each chunk you can specify
    this with a ``adjust_chunks={...}`` dictionary holding a function for each
    index that modifies the dimension size in that index.

    >>> def double(x):
    ...     return np.concatenate([x, x])

    >>> y = atop(double, 'ij', x, 'ij',
    ...          adjust_chunks={'i': lambda n: 2 * n}, dtype=x.dtype)  # doctest: +SKIP

    Include literals by indexing with None

    >>> y = atop(add, 'ij', x, 'ij', 1234, None, dtype=x.dtype)  # doctest: +SKIP

    See Also
    --------
    top - dict formulation of this function, contains most logic
    """
    out = kwargs.pop('name', None)      # May be None at this point
    token = kwargs.pop('token', None)
    dtype = kwargs.pop('dtype', None)
    adjust_chunks = kwargs.pop('adjust_chunks', None)
    new_axes = kwargs.get('new_axes', {})

    from .core import Array, unify_chunks, normalize_arg

    if dtype is None:
        raise ValueError("Must specify dtype of output array")

    chunkss, arrays = unify_chunks(*args)
    for k, v in new_axes.items():
        chunkss[k] = (v, )
    arginds = list(zip(arrays, args[1::2]))

    for arg, ind in arginds:
        if hasattr(arg, 'ndim') and hasattr(ind, '__len__') and arg.ndim != len(ind):
            raise ValueError("Index string %s does not match array dimension %d"
                             % (ind, arg.ndim))

    numblocks = {a.name: a.numblocks for a, ind in arginds if ind is not None}
    argindsstr = list(toolz.concat([(normalize_arg(a) if ind is None else a.name, ind)
                                    for a, ind in arginds]))
    # Finish up the name
    if not out:
        out = '%s-%s' % (token or utils.funcname(func).strip('_'),
                         base.tokenize(func, out_ind, argindsstr, dtype,
                                       **kwargs))

    kwargs2 = {k: normalize_arg(v) for k, v in kwargs.items()}
    dsk = _top(func, out, out_ind, *argindsstr, numblocks=numblocks, **kwargs2)
    dsks = [a.dask for a, ind in arginds if ind is not None]

    chunks = [chunkss[i] for i in out_ind]
    if adjust_chunks:
        for i, ind in enumerate(out_ind):
            if ind in adjust_chunks:
                if callable(adjust_chunks[ind]):
                    chunks[i] = tuple(map(adjust_chunks[ind], chunks[i]))
                elif isinstance(adjust_chunks[ind], numbers.Integral):
                    chunks[i] = tuple(adjust_chunks[ind] for _ in chunks[i])
                elif isinstance(adjust_chunks[ind], (tuple, list)):
                    chunks[i] = tuple(adjust_chunks[ind])
                else:
                    raise NotImplementedError(
                        "adjust_chunks values must be callable, int, or tuple")
    chunks = tuple(chunks)

    return Array(sharedict.merge((out, dsk), *dsks,
                                 dependencies={out: {a.name for a, ind in arginds
                                                     if ind is not None}}),
                 out, chunks, dtype=dtype)
def _():
    return concat(convert(chunks(pd.DataFrame), csv, **kwargs)
                  for csv in csvs)
def get_unsafe_inner_columns(f):
    unique_columns = unique(concat(map(get_unsafe_inner_columns, f.clauses)))
    lowered = [x.label(getattr(x, 'name', None)) for x in unique_columns]
    return [getattr(sa.func, f.name)(*lowered)]
def test_registered(s3):
    sample, values = read_bytes('s3://%s/test/accounts.*.json' %
                                test_bucket_name)

    results = compute(*concat(values))
    assert set(results) == set(files.values())
def get_all_froms(colelement):
    return list(unique(concat(map(get_all_froms, colelement.get_children()))))
def inputs(self):
    return tuple(unique(concat(v.inputs for v in self.values)))
def __init__(self, host='127.0.0.1', http_port=9786, bokeh_port=8787,
             scheduler_address='tcp://127.0.0.1:8786',
             bokeh_whitelist=[], log_level=logging_level,
             show=False, prefix=None, use_xheaders=False, quiet=True):
    self.port = bokeh_port
    ip = socket.gethostbyname(host)

    hosts = ['localhost', '127.0.0.1', ip, host]
    with ignoring(Exception):
        hosts.append(socket.gethostbyname(ip))
    with ignoring(Exception):
        hosts.append(socket.gethostbyname(socket.gethostname()))
    hosts = ['%s:%d' % (h, bokeh_port) for h in hosts]
    hosts.append("*")
    hosts.extend(map(str, bokeh_whitelist))

    args = ([sys.executable, '-m', 'bokeh', 'serve'] + paths +
            ['--check-unused-sessions=50',
             '--unused-session-lifetime=1',
             '--allow-websocket-origin=*',
             '--port', str(bokeh_port)])
    if bokeh.__version__ <= '0.12.4':
        args += sum([['--host', h] for h in hosts], [])

    if prefix:
        args.extend(['--prefix', prefix])

    if show:
        args.append('--show')

    if use_xheaders:
        args.append('--use-xheaders')

    if log_level in ('debug', 'info', 'warning', 'error', 'critical'):
        args.extend(['--log-level', log_level])

    bokeh_options = {'host': host,
                     'http-port': http_port,
                     'scheduler-address': scheduler_address,
                     'bokeh-port': bokeh_port}
    args.extend(['--args'] + list(map(str, concat(bokeh_options.items()))))

    import subprocess
    process = subprocess.Popen(args)
    self.process = process

    @atexit.register
    def cleanup_process():
        try:
            process.terminate()
        except OSError:
            pass

    if not quiet:
        logger.info("Web UI: http://%s:%d/status/" % (ip, bokeh_port))
plt.show()

###############################################################################
# Calculating variances

# array of features
xa = np.vstack([list(toolz.pluck('x', ep)) for ep in episodes])

# get array of returns
ga = np.hstack([episode_return(ep) for ep in episodes])

# returns multiplied by feature vector
gx = xa.T * ga

# per-feature variance (is this right?)
va = np.var(gx, axis=1)

# Delta squared for variance
deltas = np.array(list(toolz.pluck('delta', toolz.concat(episodes))))
dsqa = deltas**2

# delta-squared return
dsqret = calculate_return(dsqa, lpluck('gm', concat(episodes)))

# multiplying by features
dsqrx = xa.T * dsqret

# averaging for per-feature delta-squared return (is this right?)
dsvar_w = np.mean(dsqrx, axis=1)

# least squares solution (this appears to have some sort of issue)
# dsvar_w, res, *_ = np.linalg.lstsq(xa, dsqret)

# make heatmap
traj = np.array(list(toolz.pluck('obs', episodes[-1])))
def sumDigits(ints: PVector[int]) -> int:
    return last(accumulate(add, concat(map(lambda c: toDigits(c), ints))))
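# Illustrative sketch only: toDigits is not shown here, so this assumes it
# splits an integer into its decimal digits (e.g. toDigits(345) == [3, 4, 5]).
# Under that assumption, concat flattens all digits, accumulate(add, ...)
# yields the running sums, and last() keeps the final total:
#
# >>> sumDigits(pvector([12, 345]))   # digits 1, 2, 3, 4, 5
# 15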
def physical_tables_node(node):
    # Iterative case. Any other Node's physical roots are the unique physical
    # roots of that Node's root tables.
    return list(unique(concat(map(physical_tables, node.root_tables()))))
def subterms(expr):
    return concat([[expr], concat(map(subterms, expr._inputs))])