def setup(self):
    keys = self.keys

    while not keys.issubset(self.scheduler.tasks):
        yield gen.sleep(0.05)

    tasks = [self.scheduler.tasks[k] for k in keys]

    self.keys = None

    self.scheduler.add_plugin(self)  # subtle race condition here
    self.all_keys, errors = dependent_keys(tasks, complete=self.complete)
    if not self.complete:
        self.keys = self.all_keys.copy()
    else:
        self.keys, _ = dependent_keys(tasks, complete=False)
    self.all_keys.update(keys)
    self.keys |= errors & self.all_keys

    if not self.keys:
        self.stop(exception=None, key=None)

    # Group keys by func name
    self.keys = valmap(set, groupby(self.func, self.keys))
    self.all_keys = valmap(set, groupby(self.func, self.all_keys))
    for k in self.all_keys:
        if k not in self.keys:
            self.keys[k] = set()

    for k in errors:
        self.transition(k, None, 'erred', exception=True)
    logger.debug("Set up Progress keys")
def setup(self, keys, complete):
    errors = Progress.setup(self, keys, complete)

    # Group keys by func name
    self.keys = valmap(set, groupby(self.func, self.keys))
    self.all_keys = valmap(set, groupby(self.func, self.all_keys))
    for k in self.all_keys:
        if k not in self.keys:
            self.keys[k] = set()

    logger.debug("Set up Progress keys")
    return errors
def scatter_to_workers(center, ncores, data, key=None, report=True):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    if isinstance(center, str):
        ip, port = center.split(':')
    elif isinstance(center, rpc):
        ip, port = center.ip, center.port
    elif isinstance(center, tuple):
        ip, port = center
    else:
        raise TypeError("Bad type for center")

    if key is None:
        key = str(uuid.uuid1())

    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        k = len(data) // len(ncores)
        ncores = {worker: k for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    in_type = type(data)
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ('%s-%d' % (key, i) for i in count(0))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}

    out = yield All([rpc(ip=w_ip, port=w_port).update_data(data=v, close=True, report=report)
                     for (w_ip, w_port), v in d.items()])
    nbytes = merge([o[1]['nbytes'] for o in out])

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    result = [RemoteData(b, ip, port, result=c) for a, b, c in L]
    if in_type is dict:
        result = dict(zip(names, result))

    raise Return((result, who_has, nbytes))
def _run_cnvkit_shared_orig(inputs, backgrounds):
    """Original CNVkit implementation with full normalization and segmentation.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base, out_base_old = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir, cur_input)
        if utils.file_exists(out_base_old + ".cns"):
            out_base = out_base_old
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base})
    if not utils.file_exists(ckouts[0]["cns"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        samples_to_run = list(zip(["background"] * len(backgrounds), backgrounds)) + \
                         list(zip(["evaluate"] * len(inputs), inputs))
        # New style shared SV bins
        if tz.get_in(["depth", "bins", "target"], inputs[0]):
            target_bed = tz.get_in(["depth", "bins", "target"], inputs[0])
            antitarget_bed = tz.get_in(["depth", "bins", "antitarget"], inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_general_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Back compatible with pre-existing runs
        else:
            target_bed, antitarget_bed = _get_original_targets(inputs[0])
            raw_coverage_cnns = reduce(operator.add,
                                       [_get_original_coverage(cdata, itype) for itype, cdata in samples_to_run])
        # Currently metrics not calculated due to speed and needing re-evaluation
        # We could re-enable with larger truth sets to evaluate background noise
        # But want to reimplement in a more general fashion as part of normalization
        if False:
            coverage_cnns = reduce(operator.add,
                                   [_cnvkit_metrics(cnns, target_bed, antitarget_bed, cov_interval,
                                                    inputs + backgrounds)
                                    for cnns in tz.groupby("bam", raw_coverage_cnns).values()])
            background_cnn = cnvkit_background(_select_background_cnns(coverage_cnns),
                                               background_cnn, inputs, target_bed, antitarget_bed)
        else:
            coverage_cnns = raw_coverage_cnns
            background_cnn = cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                               background_cnn, inputs, target_bed, antitarget_bed)
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs, ckouts)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        [_cnvkit_segment(cnr, cov_interval, data, inputs + backgrounds) for cnr, data in fixed_cnrs]
    return ckouts
def _run_cnvkit_shared(inputs, backgrounds):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    work_dir = _sv_workdir(inputs[0])
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_name = dd.get_sample_name(backgrounds[0]) if backgrounds else "flat"
    background_cnn = os.path.join(raw_work_dir, "%s_background.cnn" % (background_name))
    ckouts = []
    for cur_input in inputs:
        cur_raw_work_dir = utils.safe_makedir(os.path.join(_sv_workdir(cur_input), "raw"))
        out_base = _bam_to_outbase(dd.get_align_bam(cur_input), cur_raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        cov_interval = dd.get_coverage_interval(inputs[0])
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, inputs[0], work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, inputs[0])
        parallel = {"type": "local", "cores": dd.get_cores(inputs[0]), "progs": ["cnvkit"]}
        pct_coverage = (pybedtools.BedTool(raw_target_bed).total_coverage() /
                        float(pybedtools.BedTool(access_bed).total_coverage())) * 100.0
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     pct_coverage, raw_work_dir, inputs[0])
        split_beds = _split_bed(target_bed, inputs[0]) + _split_bed(antitarget_bed, inputs[0])
        samples_to_run = zip(["background"] * len(backgrounds), backgrounds) + \
                         zip(["evaluate"] * len(inputs), inputs)
        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(cdata, bed, itype) for itype, cdata in samples_to_run for bed in split_beds],
                                   inputs[0]["config"], parallel)
        raw_coverage_cnns = _merge_coverage(split_cnns, inputs[0])
        coverage_cnns = run_multicore(_cnvkit_metrics,
                                      [(cnns, target_bed, antitarget_bed, cov_interval, inputs + backgrounds)
                                       for cnns in tz.groupby("bam", raw_coverage_cnns).values()],
                                      inputs[0]["config"], parallel)
        background_cnn = _cnvkit_background(_select_background_cnns(coverage_cnns),
                                            background_cnn, target_bed, antitarget_bed, inputs[0])
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, inputs + backgrounds)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   inputs[0]["config"], parallel)
        run_multicore(_cnvkit_segment,
                      [(cnr, cov_interval, data) for cnr, data in fixed_cnrs],
                      inputs[0]["config"], parallel)
    return ckouts
def fuzzy_equity_ownership_by_district(self, district_code):
    """Find the set of equities for a given district (postal) code."""
    equities = self.retrieve_type_assets('equity')
    district_equities = groupby(lambda x: x.district, equities)
    return district_equities[district_code]
def pip_dict():
    from pkg_resources import working_set
    from toolz import groupby

    first = lambda x: x[0].upper()
    ws = working_set.by_key
    WS = groupby(first, ws)
    return ws, WS
def update_stats(matches: List[PlayerMatch]):
    """
    Update the statistics file based on the played matches.
    """
    with open(DIR / '../data/stats.json', 'w') as f:
        stats = [{
            'name': name,
            'played': len(games),
            'red': games | filterwith(lambda x: x['team'] == 'Red') | to(count),
            'blue': games | filterwith(lambda x: x['team'] == 'Blue') | to(count),
            'wins': games | filterwith(lambda x: x['points'] == 3) | to(count),
            'draw': games | filterwith(lambda x: x['points'] == 1) | to(count),
            'lose': games | filterwith(lambda x: x['points'] == 0) | to(count),
            'points': sum(x['points'] | to(int) for x in games),
            'pointsPerGame': round(
                (sum(x['points'] | to(int) for x in games) | to(float)) / len(games),
                1,
            ),
        } for name, games in groupby(lambda x: x['name'], matches).items()]

        json.dump(sorted(stats, key=lambda x: x['pointsPerGame'], reverse=True), f, indent=2)
def bigfoot_map(sightings):
    classifications = groupby('classification', sightings)
    return {
        "data": [
            {
                "type": "scattermapbox",
                "lat": listpluck("latitude", class_sightings),
                "lon": listpluck("longitude", class_sightings),
                "text": listpluck("title", class_sightings),
                "mode": "markers",
                "name": classification,
                "marker": {
                    "size": 3,
                    "opacity": 1.0
                }
            }
            for classification, class_sightings in classifications.items()
        ],
        "layout": {
            "autosize": True,
            "hovermode": "closest",
            "mapbox": {
                "accesstoken": os.environ.get("MAPBOX_KEY"),
                "bearing": 0,
                "center": {
                    "lat": 40,
                    "lon": -98.5
                },
                "pitch": 0,
                "zoom": 2,
                "style": "outdoors"
            }
        }
    }
def bigfoot_by_year(sightings):
    # Create a dict mapping the
    # classification -> [(year, count), (year, count) ... ]
    sightings_by_year = {
        classification: sorted(
            list(
                # Group by year -> count.
                countby(sighting_year, class_sightings).items()),
            # Sort by year.
            key=first)
        for classification, class_sightings in groupby('classification', sightings).items()
    }

    # Build the plot with a dictionary.
    return {
        "data": [
            {
                "type": "scatter",
                "mode": "lines+markers",
                "name": classification,
                "x": listpluck(0, class_sightings_by_year),
                "y": listpluck(1, class_sightings_by_year)
            }
            for classification, class_sightings_by_year in sightings_by_year.items()
        ],
        "layout": {
            "title": "Sightings by Year",
            "showlegend": False
        }
    }
def get_batch_no_details(warehouse, include_batch_price=0):
    extra_fields = (
        "pb_price_based_on, pb_rate, pb_discount," if include_batch_price else ""
    )
    batches = frappe.db.sql(
        """
            SELECT
                name, item, expiry_date, {extra_fields}
                (
                    SELECT SUM(actual_qty)
                    FROM `tabStock Ledger Entry`
                    WHERE batch_no=b.name AND
                        item_code=b.item AND
                        warehouse=%(warehouse)s
                ) as qty
            FROM `tabBatch` AS b
            WHERE IFNULL(expiry_date, '4000-10-10') >= CURDATE()
            ORDER BY expiry_date
        """.format(
            extra_fields=extra_fields
        ),
        values={"warehouse": warehouse},
        as_dict=1,
    )
    return groupby("item", filter(lambda x: x.get("qty"), batches))
def load_adjusted_array(self, columns, dates, assets, mask):
    return dict(
        concat(map(
            partial(self._load_dataset, dates, assets, mask),
            itervalues(groupby(getdataset, columns))
        ))
    )
def split_next_and_previous_event_columns(self, requested_columns):
    """
    Split requested columns into columns that should load the next known
    value and columns that should load the previous known value.

    Parameters
    ----------
    requested_columns : iterable[BoundColumn]

    Returns
    -------
    next_cols, previous_cols : iterable[BoundColumn], iterable[BoundColumn]
        ``requested_columns``, partitioned into sub-sequences based
        on whether the column should produce values from the next event or
        the previous event
    """
    def next_or_previous(c):
        if c in self.next_value_columns:
            return "next"
        elif c in self.previous_value_columns:
            return "previous"
        raise ValueError("{c} not found in next_value_columns "
                         "or previous_value_columns".format(c=c))

    groups = groupby(next_or_previous, requested_columns)
    return groups.get("next", ()), groups.get("previous", ())
def test_groupby_tasks_3():
    func = lambda x: x % 10
    b = db.range(20, npartitions=5).groupby(func, shuffle='tasks', max_branch=2)
    result = b.compute(scheduler='sync')
    assert dict(result) == groupby(func, range(20))
def format_website(self):
    # jira category => website category mapping
    categories = {
        'New Feature': 'feature',
        'Improvement': 'feature',
        'Wish': 'feature',
        'Task': 'feature',
        'Test': 'bug',
        'Bug': 'bug',
        'Sub-task': 'feature'
    }
    # website category => section title (keys must match the categories above)
    titles = {
        'feature': 'New Features and Improvements',
        'bug': 'Bug Fixes'
    }

    issues_by_category = toolz.groupby(
        lambda issue: categories[issue.fields.issuetype.name],
        self.issues
    )

    out = StringIO()

    for category in ('feature', 'bug'):
        title = titles[category]
        issues = issues_by_category[category]
        issues.sort(key=lambda x: x.key)

        out.write(md('## {}\n\n', title))
        for issue in issues:
            link = md('[{0}]({1}/browse/{0})', issue.key, self.server)
            out.write(md('* {} - {}\n', link, issue.fields.summary))
        out.write('\n')

    return out.getvalue()
def diagnostic_yield(self, metric='completeness', cutoff=1,
                     superblock_ids=None, group_id=None, sample_ids=None):
    """Calculate diagnostic yield."""
    # extract column to filter on
    metric_column = getattr(BlockData, metric)

    # set up the base query for all blocks
    total_query = self.total_count(BlockData)

    if superblock_ids:
        # apply the superblock filter on the Block class level
        total_query = total_query.join(BlockData.parent)\
                                 .filter(Block.superblock_id.in_(superblock_ids))

    # extend base query to include only passed blocks
    pass_query = total_query.filter(metric_column >= cutoff)

    # optionally limit query
    queries = [limit_query(query, group=group_id, samples=sample_ids)
               for query in (total_query, pass_query)]

    # group multiple queries by sample ID (first column)
    metrics = groupby(get(0), concat(queries))

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    combined = (unique(concat(values)) for values in itervalues(metrics))

    # calculate diagnostic yield by simple division
    for sample_id, group_id, total, covered in combined:
        yield sample_id, group_id, (covered / total)
def collections_to_dsk(collections, optimize_graph=True, **kwargs):
    """
    Convert many collections into a single dask graph, after optimization
    """
    optimizations = kwargs.pop("optimizations", None) or config.get("optimizations", [])

    if optimize_graph:
        groups = groupby(optimization_function, collections)
        groups = {opt: _extract_graph_and_keys(val)
                  for opt, val in groups.items()}

        for opt in optimizations:
            groups = {k: (opt(dsk, keys), keys)
                      for k, (dsk, keys) in groups.items()}

        dsk = merge(*map(
            ensure_dict,
            [opt(dsk, keys, **kwargs) for opt, (dsk, keys) in groups.items()],
        ))
    else:
        dsk, _ = _extract_graph_and_keys(collections)

    return dsk
def _merge_coverage(cnns, data):
    """Merge split CNN outputs into final consolidated output.
    """
    out = []
    for (out_file, _), members in tz.groupby(lambda x: (x["final_out"], x["bed_orig"]), cnns).items():
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                with open(tx_out_file, "w") as out_handle:
                    for i, in_file in enumerate([x["file"] for x in
                                                 sorted(members, key=lambda x: x["bed_i"])]):
                        with open(in_file) as in_handle:
                            header = in_handle.readline()
                            if i == 0:
                                out_handle.write(header)
                            for line in in_handle:
                                out_handle.write(line)
        base = copy.deepcopy(members[0])
        base = tz.dissoc(base, "final_out", "bed_i", "bed_orig")
        base["file"] = out_file
        out.append(base)
    return out
def _get_subnet_config_w_cidr(self, network_config):
    network_cidr_base = str(network_config.get('network_cidr_base', '172.16.0.0'))
    network_cidr_size = str(network_config.get('network_cidr_size', '20'))

    base_cidr = network_cidr_base + '/' + network_cidr_size
    net = netaddr.IPNetwork(base_cidr)

    grouped_subnet = groupby('size', self._get_subnet_config_w_az(network_config))
    subnet_groups = sorted(grouped_subnet.items())
    available_cidrs = []

    for subnet_size, subnet_configs in subnet_groups:
        newcidrs = net.subnet(int(subnet_size))

        for subnet_config in subnet_configs:
            try:
                cidr = newcidrs.next()
            except StopIteration:
                # net = chain(*reversed(available_cidrs)).next()
                newcidrs = net.subnet(int(subnet_size))
                cidr = newcidrs.next()
            new_config = assoc(subnet_config, 'cidr', str(cidr))
            yield new_config
        else:
            net = newcidrs.next()
            available_cidrs.append(newcidrs)
def draw(processed, filename,
         get_mean=lambda x: x.score_mean,
         get_var=lambda x: x.score_var,
         show=False):
    # see runner :)
    # this is shitty and boring, but it works atm..
    plt.figure(figsize=(15, 5))
    for oneline in toolz.groupby(lambda x: x.samplerid, processed).values():
        oneline.sort(key=lambda x: x.size)
        sizes = [x.size for x in oneline]
        y_values = np.array([get_mean(x) for x in oneline])
        y_variances = np.array([get_var(x) for x in oneline])
        col = getcol(oneline[0])
        plt.fill_between(sizes,
                         y_values + y_variances,
                         y_values - y_variances,
                         facecolor=col,
                         alpha=0.15,
                         linewidth=0,
                         label='%s' % samplerid_to_samplername(oneline[0].samplerid))
        plt.plot(sizes, y_values, color=col)
    plt.legend(loc=4)
    plt.savefig(filename)
    if show:
        plt.show()
def fuzzy_symbol_ownership_by_broker(self, broke_id):
    """Find the equity codes associated with a given lead underwriter (broker)."""
    equities = self.retrieve_type_assets('equity')
    broker_equity_mappings = groupby(lambda x: x.broker, equities)
    return broker_equity_mappings[broke_id]
def retrieve_asset(self, sids):
    """
    Retrieve asset types for a list of sids.

    Parameters
    ----------
    sids : list[int]

    Returns
    -------
    types : dict[sid -> str or None]
        Asset types for the provided sids.
    """
    dct = groupby(lambda x: x.sid, chain(*(self._asset_type_cache.values())))
    print('dct', dct)
    found = set()
    if len(sids):
        for sid in sids:
            try:
                asset = dct[sid][0]
                found.add(asset)
            except KeyError:
                raise NotImplementedError('missing code : %s' % sid)
    return found
def split_next_and_previous_event_columns(self, requested_columns):
    """
    Split requested columns into columns that should load the next known
    value and columns that should load the previous known value.

    Parameters
    ----------
    requested_columns : iterable[BoundColumn]

    Returns
    -------
    next_cols, previous_cols : iterable[BoundColumn], iterable[BoundColumn]
        ``requested_columns``, partitioned into sub-sequences based
        on whether the column should produce values from the next event or
        the previous event
    """
    def next_or_previous(c):
        if c in self.next_value_columns:
            return 'next'
        elif c in self.previous_value_columns:
            return 'previous'
        raise ValueError("{c} not found in next_value_columns "
                         "or previous_value_columns".format(c=c))

    groups = groupby(next_or_previous, requested_columns)
    return groups.get('next', ()), groups.get('previous', ())
def load_adjusted_array(self, columns, dates, assets, mask):
    return merge(
        self.pool.imap_unordered(
            partial(self._load_dataset, dates, assets, mask),
            itervalues(groupby(getdataset, columns)),
        ),
    )
def set_params(self, **params):
    d = groupby(0, [(k.split('__')[0], k.split('__', 1)[1], v)
                    for k, v in params.items()])
    d = {k: {a: b for _, a, b in v} for k, v in d.items()}
    steps = [(name, set_params(est, **d[name]) if name in d else est)
             for name, est in self.steps]
    return Pipeline(steps)
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    groups = groupby(attrgetter('_optimize'), args)

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = args[0]._default_get
        if not all(a._default_get == get for a in args):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = merge([opt(merge([v.dask for v in val]), [v._keys() for v in val])
                 for opt, val in groups.items()])
    keys = [arg._keys() for arg in args]
    results = get(dsk, keys, **kwargs)

    return tuple(a._finalize(a, r) for a, r in zip(args, results))
def scatter_to_workers(center, ncores, data, key=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    center = coerce_to_rpc(center)
    if key is None:
        key = str(uuid.uuid1())

    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        ncores = {worker: 1 for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = ("%s-%d" % (key, i) for i in count(0))

    L = list(zip(cycle(workers), names, data))
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}

    yield [rpc(ip=w_ip, port=w_port).update_data(data=v, close=True)
           for (w_ip, w_port), v in d.items()]

    result = [RemoteData(b, center.ip, center.port, result=c)
              for a, b, c in L]

    raise Return(result)
def _get_subnet_config_w_cidr(self, network_config):
    network_cidr_base = str(network_config.get('network_cidr_base', '172.16.0.0'))
    network_cidr_size = str(network_config.get('network_cidr_size', '20'))
    first_network_address_block = str(network_config.get('first_network_address_block', network_cidr_base))

    ret_val = {}
    base_cidr = network_cidr_base + '/' + network_cidr_size
    net = netaddr.IPNetwork(base_cidr)

    grouped_subnet = groupby('size', self._get_subnet_config_w_az(network_config))
    subnet_groups = sorted(grouped_subnet.items())
    available_cidrs = []

    for subnet_size, subnet_configs in subnet_groups:
        newcidrs = net.subnet(int(subnet_size))

        for subnet_config in subnet_configs:
            try:
                cidr = newcidrs.next()
            except StopIteration as e:
                net = chain(*reversed(available_cidrs)).next()
                newcidrs = net.subnet(int(subnet_size))
                cidr = newcidrs.next()
            new_config = assoc(subnet_config, 'cidr', str(cidr))
            yield new_config
        else:
            net = newcidrs.next()
            available_cidrs.append(newcidrs)
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                    for bam in test_bams + background_bams
                                    for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
                                   data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment,
                                    [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    groups = groupby(attrgetter('_optimize'), args)

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = args[0]._default_get
        if not all(a._default_get == get for a in args):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    dsk = merge([opt(merge([v.dask for v in val]), [v._keys() for v in val])
                 for opt, val in groups.items()])
    keys = [arg._keys() for arg in args]
    results = get(dsk, keys, **kwargs)

    return tuple(a._finalize(a, r) for a, r in zip(args, results))
async def scatter_to_workers(nthreads, data, rpc=rpc, report=True, serializers=None):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  nthreads should be a dictionary mapping
    worker identities to numbers of cores.

    See scatter for parameter docstring
    """
    assert isinstance(nthreads, dict)
    assert isinstance(data, dict)

    workers = list(concat([w] * nc for w, nc in nthreads.items()))
    names, data = list(zip(*data.items()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {worker: {key: value for _, key, value in v} for worker, v in d.items()}

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = await All([rpcs[address].update_data(data=v, report=report, serializers=serializers)
                         for address, v in d.items()])
    finally:
        for r in rpcs.values():
            await r.close_rpc()

    nbytes = merge(o["nbytes"] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    return (names, who_has, nbytes)
def persist(self, collections):
    """ Persist dask collections on cluster

    Starts computation of the collection on the cluster in the background.
    Provides a new dask collection that is semantically identical to the
    previous one, but now based off of futures currently in execution.

    Parameters
    ----------
    collections: sequence or single dask object
        Collections like dask.array or dataframe or dask.value objects

    Returns
    -------
    List of collections, or single collection, depending on type of input.

    Examples
    --------
    >>> xx = executor.persist(x)  # doctest: +SKIP
    >>> xx, yy = executor.persist([x, y])  # doctest: +SKIP

    See Also
    --------
    Executor.compute
    """
    if isinstance(collections, (tuple, list, set, frozenset)):
        singleton = False
    else:
        singleton = True
        collections = [collections]

    assert all(isinstance(c, Base) for c in collections)

    groups = groupby(lambda x: x._optimize, collections)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])

    d = {k: unpack_remotedata(v) for k, v in dsk.items()}
    dsk2 = {k: v[0] for k, v in d.items()}
    dependencies = {k: v[1] for k, v in d.items()}

    for k, v in dsk2.items():
        dependencies[k] |= set(_deps(dsk, v))

    names = list({k for c in collections for k in flatten(c._keys())})

    self._send_to_scheduler({'op': 'update-graph',
                             'tasks': valmap(dumps_task, dsk2),
                             'dependencies': dependencies,
                             'keys': names,
                             'client': self.id})
    result = [redict_collection(c, {k: Future(k, self)
                                    for k in flatten(c._keys())})
              for c in collections]
    if singleton:
        return first(result)
    else:
        return result
def keep_latest_sample(batch):
    groups = tz.groupby(lambda x: (x["participant"], x["sample_type"]), batch).values()
    keep = [sorted(group, key=lambda x: int(x["version"]), reverse=True)
            for group in groups]
    return [x[0] for x in keep]
def scatter_to_workers(ncores, data, report=True, serialize=True):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        k = len(data) // len(ncores)
        ncores = {coerce_to_address(worker): k for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    in_type = type(data)
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = []
        for x in data:
            try:
                names.append(tokenize(x))
            except:
                names.append(str(uuid.uuid1()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {worker: {key: dumps(value) if serialize else value
                  for _, key, value in v}
         for worker, v in d.items()}

    out = yield All([rpc(address).update_data(data=v, close=True, report=report)
                     for address, v in d.items()])
    nbytes = merge(o['nbytes'] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    raise Return((names, who_has, nbytes))
def scatter_to_workers(ncores, data, report=True, serialize=True):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        k = len(data) // len(ncores)
        ncores = {coerce_to_address(worker): k for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = []
        for x in data:
            try:
                names.append(tokenize(x))
            except:
                names.append(str(uuid.uuid1()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {worker: {key: dumps(value) if serialize else value
                  for _, key, value in v}
         for worker, v in d.items()}

    rpcs = {addr: rpc(addr) for addr in d}
    try:
        out = yield All([rpcs[address].update_data(data=v, close=True, report=report)
                         for address, v in d.items()])
    finally:
        for r in rpcs.values():
            r.close_rpc()

    nbytes = merge(o['nbytes'] for o in out)

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    raise Return((names, who_has, nbytes))
def colections(item):
    sun = {key: 0 for key in timelst}
    # group actions by day (DS field) and count the plays for each day
    count_dict = {key: len(values)
                  for key, values in toolz.groupby(lambda x: x[-1], item).items()}
    sun.update(count_dict)
    return sun
def group_by(df, s):
    # s = re.sub("\s*", "", s).split(',')
    keys = zip(*select(df, s).values())
    y = zip(keys, range(10000))
    grouped_keys = groupby(itemgetter(0), y)
    for g in grouped_keys:
        index = g[1]
        yield filter_index(df, index)
def test_rule_match_success_daytime_below_min_minute_within_daytime_range(pg_conn):
    rule_1 = _create_auth_rule(pg_conn, 1, 'sys/none')
    patch_auth_rule(pg_conn, rule_1, hour0=0, minute0=10, hour1=3, minute1=20)
    proc_1 = create_process(pg_conn, started_at=dt(2018, 7, 15, 2, 5))

    processes_matches = retrieve_processes_rule_matches(pg_conn)
    grouped = toolz.groupby('proc_id', processes_matches)
    assert _proc_matches(grouped, proc_1) == [rule_1]
def compute(self, *args, **kwargs):
    """ Compute dask collections on cluster

    Parameters
    ----------
    args: iterable of dask objects
        Collections like dask.array or dataframe or dask.value objects
    sync: bool (optional)
        Returns Futures if False (default) or concrete values if True

    Returns
    -------
    Tuple of Futures or concrete values

    Examples
    --------
    >>> from dask import do, value
    >>> from operator import add
    >>> x = dask.do(add)(1, 2)
    >>> y = dask.do(add)(x, x)
    >>> xx, yy = executor.compute(x, y)  # doctest: +SKIP
    >>> xx  # doctest: +SKIP
    <Future: status: finished, key: add-8f6e709446674bad78ea8aeecfee188e>
    >>> xx.result()  # doctest: +SKIP
    3
    >>> yy.result()  # doctest: +SKIP
    6
    """
    sync = kwargs.pop('sync', False)
    assert not kwargs
    if sync:
        return dask.compute(*args, get=self.get)

    variables = [a for a in args if isinstance(a, Base)]

    groups = groupby(lambda x: x._optimize, variables)
    dsk = merge([opt(merge([v.dask for v in val]),
                     [v._keys() for v in val])
                 for opt, val in groups.items()])
    names = ['finalize-%s' % tokenize(v) for v in variables]
    dsk2 = {name: (v._finalize, v, v._keys())
            for name, v in zip(names, variables)}

    self.loop.add_callback(self.scheduler_queue.put_nowait,
                           {'op': 'update-graph',
                            'dsk': merge(dsk, dsk2),
                            'keys': names})

    i = 0
    futures = []
    for arg in args:
        if isinstance(arg, Base):
            futures.append(Future(names[i], self))
            i += 1
        else:
            futures.append(arg)

    return futures
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If the object is a dask collection, it's
        computed and the result is returned. Otherwise it's passed through
        unchanged.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    if kwargs.get('optimize_graph', True):
        groups = groupby(attrgetter('_optimize'), variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val], **kwargs)
                     for opt, val in groups.items()])
    else:
        dsk = merge(var.dask for var in variables)

    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
def split_by(args: dict, left_keys: List[str]):
    """Split into two dictionaries (left is whitelist and right is remaining)"""
    result = {k: dict(v)
              for k, v in groupby(lambda pair: pair[0] in left_keys, args.items()).items()}
    return (result.get(True, {}), result.get(False, {}))
def render_tabular(api, options=None):
    """Entry point for the tabular reporter interface."""
    # determine separator
    separator = options.get('report.separator', '\t')
    human = options.get('report.human')
    panel = options.get('report.panel')
    samples = options.get('report.samples')
    group = options.get('report.group')

    # read gene panel file if it has been set
    if panel:
        superblock_ids = [line.rstrip() for line in panel]
    else:
        superblock_ids = None

    # get sample ID, group and cutoff from metadata
    sample_query = limit_query(api.samples(), group=group, samples=samples)
    metadata = ((sample.id, sample.group_id, sample.cutoff)
                for sample in sample_query)

    # get the data
    base_query = limit_query(api.average_metrics(superblock_ids=superblock_ids),
                             group=group,
                             samples=samples)

    queries = [metadata,
               base_query,
               api.diagnostic_yield(superblock_ids=superblock_ids,
                                    group_id=group, sample_ids=samples),
               api.sex_checker(group_id=group, sample_ids=samples)]

    # group multiple queries by sample ID (first column)
    key_metrics = groupby(get(0), concat(queries))

    # get the column names dynamically from the query
    headers = concatv(['sample_id', 'group_id', 'cutoff'],
                      (column['name'] for column
                       in base_query.column_descriptions),
                      ['diagnostic yield', 'gender'])

    unique_headers = unique(headers)

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    data = (unique(concat(values)) for values in itervalues(key_metrics))

    if human:
        # export key_metrics in a more human friendly format
        return tabulate(data, unique_headers)

    # yield headers
    return '\n'.join(cons('#' + separator.join(unique_headers),
                          stringify_list(data, separator=separator)))
def _get_child_table_rows(query, docs):
    if not docs:
        return {}
    return groupby(
        "parent",
        frappe.db.sql(
            query,
            values={"docnames": [x.get("name") for x in docs]},
            as_dict=1,
        ),
    )
def _prompt_app(default_app: Optional[str], apps: List[dict]) -> dict:
    apps_by_name = groupby("name", apps)
    if default_app and default_app in apps_by_name:
        name = default_app
    else:
        name = questionary.autocomplete(message="Choose app:", choices=apps_by_name).ask()
    return first(apps_by_name[name])
def to_lists(self):
    getlevel = lambda x: x[1]
    if not self.head:
        return
    pairs = self._to_lists(Queue([(self.head, 0)]))
    for _, node_level_pairs in tz.groupby(getlevel, pairs).items():
        yield [node for node, _ in node_level_pairs]
def partition(grouper, sequence, npartitions, p, nelements=2**20):
    """ Partition a bag along a grouper, store partitions on disk """
    for block in partition_all(nelements, sequence):
        d = groupby(grouper, block)
        d2 = defaultdict(list)
        for k, v in d.items():
            d2[abs(hash(k)) % npartitions].extend(v)
        p.append(d2)
    return p
def __init__(self, restrictions):
    # A dict mapping each asset to its restrictions, which are sorted by
    # ascending order of effective_date
    self._restrictions_by_asset = {
        asset: sorted(restrictions_for_asset, key=lambda x: x.effective_date)
        for asset, restrictions_for_asset
        in groupby(lambda x: x.asset, restrictions).items()
    }
def compute(*args, **kwargs):
    """Compute several dask collections at once.

    Parameters
    ----------
    args : object
        Any number of objects. If the object is a dask collection, it's
        computed and the result is returned. Otherwise it's passed through
        unchanged.
    get : callable, optional
        A scheduler ``get`` function to use. If not provided, the default is
        to check the global settings first, and then fall back to defaults for
        the collections.
    optimize_graph : bool, optional
        If True [default], the optimizations for each collection are applied
        before computation. Otherwise the graph is run as is. This can be
        useful for debugging.
    kwargs
        Extra keywords to forward to the scheduler ``get`` function.

    Examples
    --------
    >>> import dask.array as da
    >>> a = da.arange(10, chunks=2).sum()
    >>> b = da.arange(10, chunks=2).mean()
    >>> compute(a, b)
    (45, 4.5)
    """
    variables = [a for a in args if isinstance(a, Base)]
    if not variables:
        return args

    get = kwargs.pop('get', None) or _globals['get']

    if not get:
        get = variables[0]._default_get
        if not all(a._default_get == get for a in variables):
            raise ValueError("Compute called on multiple collections with "
                             "differing default schedulers. Please specify a "
                             "scheduler `get` function using either "
                             "the `get` kwarg or globally with `set_options`.")

    if kwargs.get('optimize_graph', True):
        groups = groupby(attrgetter('_optimize'), variables)
        dsk = merge([opt(merge([v.dask for v in val]),
                         [v._keys() for v in val], **kwargs)
                     for opt, val in groups.items()])
    else:
        dsk = merge(var.dask for var in variables)

    keys = [var._keys() for var in variables]
    results = get(dsk, keys, **kwargs)

    results_iter = iter(results)
    return tuple(a if not isinstance(a, Base)
                 else a._finalize(next(results_iter))
                 for a in args)
def get_addons(self, event_slug):
    event = Event.objects.get(slug=event_slug)
    addons = frozenset(TicketAddOnType.objects.filter(tickettypes__event=event))
    return {'addon_groups': groupby(lambda x: x.group, addons)}
def __init__(self, restrictions):
    # A dict mapping each asset to its restrictions, which are sorted by
    # ascending order of effective_date
    self._restrictions_by_asset = {
        asset: sorted(
            restrictions_for_asset,
            key=lambda x: x.effective_date
        )
        for asset, restrictions_for_asset
        in iteritems(groupby(lambda x: x.asset, restrictions))
    }
def _run_cnvkit_shared(items, test_bams, background_bams, work_dir, background_name=None):
    """Shared functionality to run CNVkit, parallelizing over multiple BAM files.
    """
    raw_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
    background_cnn = os.path.join(raw_work_dir,
                                  "%s_background.cnn" % (background_name if background_name else "flat"))
    ckouts = []
    for test_bam in test_bams:
        out_base = _bam_to_outbase(test_bam, raw_work_dir)
        ckouts.append({"cnr": "%s.cnr" % out_base,
                       "cns": "%s.cns" % out_base,
                       "back_cnn": background_cnn})
    if not utils.file_exists(ckouts[0]["cnr"]):
        data = items[0]
        cov_interval = dd.get_coverage_interval(data)
        raw_target_bed, access_bed = _get_target_access_files(cov_interval, data, work_dir)
        # bail out if we ended up with no regions
        if not utils.file_exists(raw_target_bed):
            return {}
        raw_target_bed = annotate.add_genes(raw_target_bed, data)
        parallel = {"type": "local", "cores": dd.get_cores(data), "progs": ["cnvkit"]}
        target_bed, antitarget_bed = _cnvkit_targets(raw_target_bed, access_bed, cov_interval,
                                                     raw_work_dir, data)

        def _bam_to_itype(bam):
            return "background" if bam in background_bams else "evaluate"

        split_cnns = run_multicore(_cnvkit_coverage,
                                   [(bam, bed, _bam_to_itype(bam), raw_work_dir, data)
                                    for bam in test_bams + background_bams
                                    for bed in _split_bed(target_bed, data) + _split_bed(antitarget_bed, data)],
                                   data["config"], parallel)
        coverage_cnns = _merge_coverage(split_cnns, data)
        background_cnn = _cnvkit_background([x["file"] for x in coverage_cnns if x["itype"] == "background"],
                                            background_cnn, target_bed, antitarget_bed, data)
        fixed_cnrs = run_multicore(_cnvkit_fix,
                                   [(cnns, background_cnn, data)
                                    for cnns in tz.groupby("bam", [x for x in coverage_cnns
                                                                   if x["itype"] == "evaluate"]).values()],
                                   data["config"], parallel)
        called_segs = run_multicore(_cnvkit_segment,
                                    [(cnr, cov_interval, data) for cnr in fixed_cnrs],
                                    data["config"], parallel)
    return ckouts
def index():
    events = Event.query.filter(Event.end > datetime.now()).order_by(Event.start).all()

    def get_month(event):
        year = event.start.year
        month = event.start.month
        return date(year, month, 1)

    groups = sorted(groupby(get_month, events).items())
    ctx = {"groups": groups}
    return render_template("calendar/index.html", **ctx)
def collect(grouper, npartitions, group, pbags):
    """ Collect partitions from disk and yield k,v group pairs """
    from pbag import PBag
    pbags = list(take(npartitions, pbags))
    result = defaultdict(list)
    for pb in pbags:
        part = pb.get_partition(group)
        groups = groupby(grouper, part)
        for k, v in groups.items():
            result[k].extend(v)
    return list(result.items())
def _as_completed(fs, queue):
    groups = groupby(lambda f: f.key, fs)
    firsts = [v[0] for v in groups.values()]
    wait_iterator = gen.WaitIterator(*[f.event.wait() for f in firsts])

    while not wait_iterator.done():
        result = yield wait_iterator.next()
        # TODO: handle case of restarted futures
        future = firsts[wait_iterator.current_index]
        for f in groups[future.key]:
            queue.put_nowait(f)
def scatter_to_workers(ncores, data, report=True):
    """ Scatter data directly to workers

    This distributes data in a round-robin fashion to a set of workers based
    on how many cores they have.  ncores should be a dictionary mapping worker
    identities to numbers of cores.

    See scatter for parameter docstring
    """
    if isinstance(ncores, Iterable) and not isinstance(ncores, dict):
        k = len(data) // len(ncores)
        ncores = {worker: k for worker in ncores}

    workers = list(concat([w] * nc for w, nc in ncores.items()))
    in_type = type(data)
    if isinstance(data, dict):
        names, data = list(zip(*data.items()))
    else:
        names = []
        for x in data:
            try:
                names.append(tokenize(x))
            except:
                names.append(str(uuid.uuid1()))

    worker_iter = drop(_round_robin_counter[0] % len(workers), cycle(workers))
    _round_robin_counter[0] += len(data)

    L = list(zip(worker_iter, names, data))
    d = groupby(0, L)
    d = {k: {b: c for a, b, c in v} for k, v in d.items()}

    out = yield All([rpc(ip=w_ip, port=w_port).update_data(data=v, close=True, report=report)
                     for (w_ip, w_port), v in d.items()])
    nbytes = merge([o[1]['nbytes'] for o in out])

    who_has = {k: [w for w, _, _ in v] for k, v in groupby(1, L).items()}

    raise Return((names, who_has, nbytes))
def render_tabular(api, options=None):
    """Entry point for the tabular reporter interface."""
    # determine separator
    separator = options.get('report.separator', '\t')
    human = options.get('report.human')
    panel = options.get('report.panel')
    samples = options.get('report.samples')
    group = options.get('report.group')

    # read gene panel file if it has been set
    if panel:
        superblock_ids = [line.rstrip() for line in panel]
    else:
        superblock_ids = None

    # get sample ID, group and cutoff from metadata
    sample_query = limit_query(api.samples(), group=group, samples=samples)
    metadata = ((sample.id, sample.group_id, sample.cutoff)
                for sample in sample_query)

    # get the data
    base_query = limit_query(api.average_metrics(superblock_ids=superblock_ids),
                             group=group,
                             samples=samples)

    queries = [metadata,
               base_query,
               api.diagnostic_yield(superblock_ids=superblock_ids,
                                    group_id=group, sample_ids=samples),
               api.sex_checker(group_id=group, sample_ids=samples)]

    # group multiple queries by sample ID (first column)
    key_metrics = groupby(get(0), concat(queries))

    # get the column names dynamically from the query
    headers = concatv(['sample_id', 'group_id', 'cutoff'],
                      (column['name'] for column
                       in base_query.column_descriptions),
                      ['diagnostic yield', 'gender'])

    unique_headers = unique(headers)

    # iterate over all values, concat different query results, and keep
    # only the unique values (excluding second sample_id)
    data = (unique(concat(values)) for values in itervalues(key_metrics))

    if human:
        # export key_metrics in a more human friendly format
        return tabulate(data, unique_headers)

    # yield headers
    return '\n'.join(cons('#' + separator.join(unique_headers),
                          stringify_list(data, separator=separator)))
def ordering(signatures):
    """ A sane ordering of signatures to check, first to last

    Topological sort of edges as given by ``edge`` and ``supercedes``
    """
    signatures = list(map(tuple, signatures))
    edges = [(a, b) for a in signatures for b in signatures if edge(a, b)]
    edges = groupby(first, edges)
    for s in signatures:
        if s not in edges:
            edges[s] = []
    edges = dict((k, [b for a, b in v]) for k, v in edges.items())
    return _toposort(edges)