def __init__(self, server, sizing_mode="stretch_both", **kwargs):
    """Create one figure per server digest and counter and lay them out.

    Parameters
    ----------
    server : object
        Supplies ``digests`` and ``counters``, the names to build figures for.
    sizing_mode : str
        Forwarded to every ``row`` / ``column`` created here.
    **kwargs
        Accepted but not used by this constructor.
    """
    self.server = server
    self.sizing_mode = sizing_mode
    self.digest_figures = {}
    self.digest_sources = {}
    self.counter_figures = {}
    self.counter_sources = {}

    # ``digests`` may be falsy (nothing to plot); counters are always iterated.
    if self.server.digests:
        for digest_name in self.server.digests:
            self.add_digest_figure(digest_name)
    for counter_name in self.server.counters:
        self.add_counter_figure(counter_name)

    by_name = merge(self.digest_figures, self.counter_figures)
    ordered = [by_name[key] for key in sorted(by_name)]

    if len(ordered) > 5:
        # More than five figures: pair them up, two per row.
        paired_rows = [
            row(*pair, sizing_mode=sizing_mode)
            for pair in partition_all(2, ordered)
        ]
        self.root = column(*paired_rows, sizing_mode=sizing_mode)
    else:
        self.root = column(ordered, sizing_mode=sizing_mode)
async def test_broken_worker_during_computation(c, s, a, b):
    """Kill the nanny's worker twice mid-computation; the sum must still be right."""
    s.allowed_failures = 100
    n = await Nanny(s.address, nthreads=2, loop=s.loop)

    # Give the third worker at most five seconds to register.
    deadline = time() + 5
    while len(s.nthreads) < 3:
        await asyncio.sleep(0.01)
        assert time() < deadline

    N = 256
    expected_result = N * (N + 1) // 2

    # Pairwise tree-sum of inc(0..N-1): every level halves the future count.
    level = 0
    futures = c.map(
        inc, range(N), key=["inc-%d-%d" % (level, j) for j in range(N)]
    )
    while len(futures) > 1:
        level += 1
        operand_columns = zip(*partition_all(2, futures))
        futures = c.map(
            slowadd,
            *operand_columns,
            key=["add-%d-%d" % (level, j) for j in range(len(futures) // 2)],
        )

    await asyncio.sleep(random.random() / 20)
    # The comm will be closed abruptly when the worker process exits.
    with suppress(CommClosedError):
        await c._run(os._exit, 1, workers=[n.worker_address])

    await asyncio.sleep(random.random() / 20)
    while len(s.workers) < 3:
        await asyncio.sleep(0.01)

    # Perhaps the new worker can't be contacted yet.
    with suppress(CommClosedError, EnvironmentError):
        await c._run(os._exit, 1, workers=[n.worker_address])

    [result] = await c.gather(futures)
    assert isinstance(result, int)
    assert result == expected_result

    await n.close()
def _reduction( ph: PartitionedHistogram, split_every: int | None = None, ) -> AggHistogram: if split_every is None: split_every = 4 if split_every is False: split_every = ph.npartitions token = tokenize(ph, sum, split_every) name = f"hist-aggregate-{token}" k = ph.npartitions b = ph.name d = 0 dsk = {} while k > split_every: c = f"{name}{d}" for i, inds in enumerate(partition_all(split_every, range(k))): dsk[(c, i)] = ( empty_safe_aggregate, sum, [(b, j) for j in inds], False, ) k = i + 1 b = c d += 1 dsk[(name, 0)] = ( empty_safe_aggregate, sum, [(b, j) for j in range(k)], True, ) dsk[name] = dsk.pop((name, 0)) # type: ignore g = HighLevelGraph.from_collections(name, dsk, dependencies=[ph]) return AggHistogram(g, name, histref=ph.histref)
def test_text_blocks_to_pandas_blocked(reader, files):
    """Files cut into two-line blocks must still parse to ``expected``."""
    header = files["2014-01-01.csv"].split(b"\n")[0] + b"\n"

    # One list of blocks per file; every block holds at most two lines.
    blocks = [
        [b"\n".join(pair) for pair in partition_all(2, files[fname].split(b"\n"))]
        for fname in sorted(files)
    ]

    full = text_blocks_to_pandas(reader, blocks, header, expected.head(), {})
    assert_eq(
        full.compute().reset_index(drop=True),
        expected.reset_index(drop=True),
        check_dtype=False,
    )

    # Same data again, restricted to two columns through ``usecols``.
    expected2 = expected[["name", "id"]]
    subset = text_blocks_to_pandas(
        reader, blocks, header, expected2.head(), {"usecols": ["name", "id"]}
    )
    assert_eq(
        subset.compute().reset_index(drop=True),
        expected2.reset_index(drop=True),
        check_dtype=False,
    )
def partial_reduce(
    func, x, split_every, keepdims=False, dtype=None, name=None, reduced_meta=None
):
    """Partial reduction across multiple axes.

    Parameters
    ----------
    func : function
    x : Array
    split_every : dict
        Maximum reduction block sizes in each dimension.
    keepdims : bool
        When false, the axes listed in ``split_every`` are dropped from the
        output keys and chunks.
    dtype : optional
        Forwarded to ``Array`` or applied to the computed meta.
    name : str, optional
        Prefix of the output name; defaults to ``funcname(func)``.
    reduced_meta : optional
        When given, ``func`` is applied to it to derive the output meta.

    Examples
    --------
    Reduce across axis 0 and 2, merging a maximum of 1 block in the 0th
    dimension, and 3 blocks in the 2nd dimension:

    >>> partial_reduce(np.min, x, {0: 1, 2: 3})    # doctest: +SKIP
    """
    prefix = name or funcname(func)
    name = prefix + "-" + tokenize(func, x, split_every, keepdims, dtype)

    # Group the block indices of every axis; axes missing from split_every
    # get groups of a single block.
    parts = [
        list(partition_all(split_every.get(axis, 1), range(nblocks)))
        for axis, nblocks in enumerate(x.numblocks)
    ]
    keys = product(*(range(len(groups)) for groups in parts))

    out_chunks = []
    for axis, axis_chunks in enumerate(x.chunks):
        if axis in split_every:
            out_chunks.append(
                tuple(1 for _ in partition_all(split_every[axis], axis_chunks))
            )
        else:
            out_chunks.append(axis_chunks)

    if not keepdims:
        out_axis = [axis for axis in range(x.ndim) if axis not in split_every]

        def keep_unreduced(seq):
            return get(out_axis, seq)

        keys = map(keep_unreduced, keys)
        out_chunks = list(keep_unreduced(out_chunks))

    dsk = {}
    for out_key, groups in zip(keys, product(*parts)):
        # Axes whose group holds one block are fixed; the rest are iterated.
        decided = {
            axis: group[0] for axis, group in enumerate(groups) if len(group) == 1
        }
        dummy = {
            axis: group for axis, group in enumerate(groups) if axis not in decided
        }
        blocks = lol_tuples((x.name,), range(x.ndim), decided, dummy)
        dsk[(name,) + out_key] = (func, blocks)
    graph = HighLevelGraph.from_collections(name, dsk, dependencies=[x])

    meta = x._meta
    if reduced_meta is not None:
        try:
            meta = func(reduced_meta, computing_meta=True)
        except TypeError:
            # no meta keyword argument exists for func, and it isn't required
            meta = func(reduced_meta)
        except ValueError:
            # when no work can be computed on the empty array
            # (e.g., func is a ufunc)
            pass

    # some functions can't compute empty arrays (those for which reduced_meta
    # fall into the ValueError exception) and we have to rely on reshaping
    # the array according to len(out_chunks)
    out_ndim = len(out_chunks)
    if is_arraylike(meta) and meta.ndim != out_ndim:
        if out_ndim == 0:
            meta = meta.sum()
        else:
            meta = meta.reshape((0,) * out_ndim)

    if np.isscalar(meta):
        return Array(graph, name, out_chunks, dtype=dtype)

    with ignoring(AttributeError):
        meta = meta.astype(dtype)
    return Array(graph, name, out_chunks, meta=meta)
def reduction(
    args,
    chunk=None,
    aggregate=None,
    combine=None,
    meta=None,
    token=None,
    chunk_kwargs=None,
    aggregate_kwargs=None,
    combine_kwargs=None,
    split_every=None,
    **kwargs,
):
    """Generic tree reduction operation.

    Parameters
    ----------
    args :
        Positional arguments for the `chunk` function. All `dask.dataframe`
        objects should be partitioned and indexed equivalently.
    chunk : function [block-per-arg] -> block
        Function to operate on each block of data
    aggregate : function list-of-blocks -> block
        Function to operate on the list of results of chunk
    combine : function list-of-blocks -> block, optional
        Function to operate on intermediate lists of results of chunk
        in a tree-reduction. If not provided, defaults to aggregate.
    $META
    token : str, optional
        The name to use for the output keys.
    chunk_kwargs : dict, optional
        Keywords for the chunk function only.
    aggregate_kwargs : dict, optional
        Keywords for the aggregate function only.
    combine_kwargs : dict, optional
        Keywords for the combine function only.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used,
        and all intermediates will be concatenated and passed to ``aggregate``.
        Default is 8.
    kwargs :
        All remaining keywords will be passed to ``chunk``, ``aggregate``, and
        ``combine``.

    Raises
    ------
    ValueError
        If ``combine_kwargs`` is given without ``combine``, if the frame
        arguments disagree on their number of partitions, or if
        ``split_every`` is not an integer >= 2.
    """
    # Merge into fresh dicts so the caller's keyword dictionaries are never
    # mutated.
    chunk_kwargs = {**(chunk_kwargs or {}), **kwargs}
    aggregate_kwargs = {**(aggregate_kwargs or {}), **kwargs}

    if combine is None:
        if combine_kwargs:
            raise ValueError("`combine_kwargs` provided with no `combine`")
        combine = aggregate
        combine_kwargs = aggregate_kwargs
    else:
        combine_kwargs = {**(combine_kwargs or {}), **kwargs}

    if not isinstance(args, (tuple, list)):
        args = [args]

    npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)}
    if len(npartitions) > 1:
        raise ValueError("All arguments must have same number of partitions")
    npartitions = npartitions.pop()

    if split_every is None:
        split_every = 8
    elif split_every is False:
        split_every = npartitions
    elif not isinstance(split_every, int) or split_every < 2:
        # Type check first: comparing a non-number with 2 would raise
        # TypeError instead of the intended ValueError.
        raise ValueError("split_every must be an integer >= 2")

    token_key = tokenize(
        token or (chunk, aggregate),
        meta,
        args,
        chunk_kwargs,
        aggregate_kwargs,
        combine_kwargs,
        split_every,
    )

    # Chunk
    a = f"{token or funcname(chunk)}-chunk-{token_key}"
    if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs:
        dsk = {
            (a, 0, i): (chunk, key)
            for i, key in enumerate(args[0].__dask_keys__())
        }
    else:
        # Use the validated common partition count; args[0] is not
        # necessarily a frame.
        dsk = {
            (a, 0, i): (
                apply,
                chunk,
                [(x._name, i) if isinstance(x, _Frame) else x for x in args],
                chunk_kwargs,
            )
            for i in range(npartitions)
        }

    # Combine
    b = f"{token or funcname(combine)}-combine-{token_key}"
    k = npartitions
    depth = 0
    while k > split_every:
        for part_i, inds in enumerate(partition_all(split_every, range(k))):
            conc = (list, [(a, depth, i) for i in inds])
            dsk[(b, depth + 1, part_i)] = (
                (apply, combine, [conc], combine_kwargs)
                if combine_kwargs
                else (combine, conc)
            )
        k = part_i + 1
        a = b
        depth += 1

    # Aggregate
    b = f"{token or funcname(aggregate)}-agg-{token_key}"
    conc = (list, [(a, depth, i) for i in range(k)])
    if aggregate_kwargs:
        dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs)
    else:
        dsk[(b, 0)] = (aggregate, conc)

    if meta is None:
        # Derive the output meta by emulating chunk then aggregate.
        meta_chunk = _emulate(apply, chunk, args, chunk_kwargs)
        meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs)
    meta = dask_make_meta(meta)

    graph = HighLevelGraph.from_collections(b, dsk, dependencies=args)
    return dd.core.new_dd_object(graph, b, meta, (None, None))
def categorize(df, columns=None, index=None, split_every=None, **kwargs):
    """Convert columns of the DataFrame to category dtype.

    Parameters
    ----------
    columns : list, optional
        A list of column names to convert to categoricals. By default any
        column with an object dtype is converted to a categorical, and any
        unknown categoricals are made known.
    index : bool, optional
        Whether to categorize the index. By default, object indices are
        converted to categorical, and unknown categorical indices are made
        known. Set True to always categorize the index, False to never.
    split_every : int, optional
        Group partitions into groups of this size while performing a
        tree-reduction. If set to False, no tree-reduction will be used.
        Default is 16.
    kwargs
        Keyword arguments are passed on to compute.
    """
    meta = df._meta

    if columns is None:
        columns = list(meta.select_dtypes(["object", "category"]).columns)
    elif is_scalar(columns):
        columns = [columns]

    # Keep only columns that are not already categoricals with known categories.
    columns = [
        col
        for col in columns
        if not is_categorical_dtype(meta[col]) or not has_known_categories(meta[col])
    ]

    if index is not False:
        if is_categorical_dtype(meta.index):
            index = not has_known_categories(meta.index)
        elif index is None:
            index = meta.index.dtype == object

    # Nothing to do
    if not len(columns) and index is False:
        return df

    if split_every is None:
        split_every = 16
    elif split_every is False:
        split_every = df.npartitions
    elif not isinstance(split_every, Integral) or split_every < 2:
        raise ValueError("split_every must be an integer >= 2")

    token = tokenize(df, columns, index, split_every)

    # Leaf layer: collect the categories of every partition.
    source = "get-categories-chunk-" + token
    dsk = {
        (source, part): (_get_categories, key, columns, index)
        for part, key in enumerate(df.__dask_keys__())
    }

    # Intermediate layers: merge groups of ``split_every`` results until few
    # enough remain for the final aggregation.
    prefix = "get-categories-agg-" + token
    remaining = df.npartitions
    depth = 0
    while remaining > split_every:
        layer = prefix + str(depth)
        groups = list(partition_all(split_every, range(remaining)))
        for group_no, members in enumerate(groups):
            dsk[(layer, group_no)] = (
                _get_categories_agg,
                [(source, part) for part in members],
            )
        remaining = len(groups)
        source = layer
        depth += 1

    dsk[(prefix, 0)] = (
        _get_categories_agg,
        [(source, part) for part in range(remaining)],
    )
    dsk.update(df.dask)

    # Compute the categories
    categories, index = compute_as_if_collection(
        df.__class__, dsk, (prefix, 0), **kwargs
    )

    # some operations like get_dummies() rely on the order of categories
    categories = {col: values.sort_values() for col, values in categories.items()}

    # Categorize each partition
    return df.map_partitions(_categorize_block, categories, index)
def _correct_errors(ra, err_rate, p_value=0.05):
    """Correct RMT errors per cell group in parallel on a local Dask cluster.

    Side effects visible in this function: sets two ``dask.config`` options,
    writes ``pre-correction-ra.pickle`` (when broadcasting is disabled) and
    ``dask-report.html`` to the working directory, and updates
    ``ra.data["rmt"]`` / ``ra.data["status"]`` in place.

    Parameters
    ----------
    ra :
        Read array; must provide ``group_indices_by_cell()``, ``data`` and
        ``filter_codes``.
    err_rate :
        Forwarded unchanged to ``_correct_errors_by_cell_group_chunks``.
    p_value : float
        Forwarded unchanged to ``_correct_errors_by_cell_group_chunks``.

    Returns
    -------
    pandas.DataFrame
        Columns ``CB``, ``UR``, ``UB``: the cell, the pre-correction rmt and
        the rmt of the donor read, one row per distinct combination.

    Raises
    ------
    Exception
        If any submitted task is reported as not completed.
    """
    # True: use Dask's broadcast (ra transfer via inproc/tcp)
    # False: each worker reads the pickled ra from disk
    use_dask_broadcast = False

    log.debug(
        "Available CPU / RAM: {} / {} GB".format(
            _get_cpu_count(), int(_get_available_memory() / 1024 ** 3)
        ),
        module_name="rmt_correction",
    )

    n_workers = _calc_max_workers(ra)

    log.debug(
        "Estimated optimum n_workers: {}".format(n_workers),
        module_name="rmt_correction",
    )

    if int(os.environ.get("SEQC_MAX_WORKERS", 0)) > 0:
        n_workers = int(os.environ.get("SEQC_MAX_WORKERS"))
        log.debug(
            "n_workers overridden with SEQC_MAX_WORKERS: {}".format(n_workers),
            module_name="rmt_correction",
        )

    # configure dask.distributed
    # memory_terminate_fraction doesn't work for some reason
    # https://github.com/dask/distributed/issues/3519
    # https://docs.dask.org/en/latest/setup/single-distributed.html#localcluster
    # https://docs.dask.org/en/latest/scheduling.html#local-threads
    worker_kwargs = {
        "n_workers": n_workers,
        "threads_per_worker": 1,
        "processes": True,
        "memory_limit": "64G",
        "memory_target_fraction": 0.95,
        "memory_spill_fraction": 0.99,
        "memory_pause_fraction": False,
    }

    # do not kill worker at 95% memory level
    dask.config.set({"distributed.worker.memory.terminate": False})
    dask.config.set({"distributed.scheduler.allowed-failures": 50})

    # setup Dask distributed client
    cluster = LocalCluster(**worker_kwargs)
    client = Client(cluster)

    # Everything that uses the client sits in try/finally so the workers are
    # shut down even when grouping, submission or result collection fails.
    try:
        # debug message
        log.debug(
            "Dask processes={} threads={}".format(
                len(client.nthreads().values()),
                np.sum(list(client.nthreads().values())),
            ),
            module_name="rmt_correction",
        )
        log.debug(
            "Dask worker_kwargs "
            + " ".join([f"{k}={v}" for k, v in worker_kwargs.items()]),
            module_name="rmt_correction",
        )
        log.debug(
            "Dask Dashboard=" + client.dashboard_link, module_name="rmt_correction"
        )

        # group by cells (same cell barcodes as one group)
        log.debug("Grouping...", module_name="rmt_correction")
        indices_grouped_by_cells = ra.group_indices_by_cell()

        if use_dask_broadcast:
            # send readarray in advance to all workers (i.e. broadcast=True)
            # this way, we reduce the serialization time
            log.debug("Scattering ReadArray...", module_name="rmt_correction")
            [future_ra] = client.scatter([ra], broadcast=True)
        else:
            # write ra to pickle which will be used later to parallel
            # process rmt correction
            with open("pre-correction-ra.pickle", "wb") as fout:
                pickle.dump(ra, fout, protocol=4)

        # correct errors per cell group in parallel
        log.debug("Submitting jobs to Dask...", module_name="rmt_correction")
        with performance_report(filename="dask-report.html"):
            futures = []

            # distribute chunks to workers evenly
            n_chunks = math.ceil(len(indices_grouped_by_cells) / n_workers)
            chunks = partition_all(n_chunks, indices_grouped_by_cells)

            for chunk in tqdm(chunks, disable=None):
                future = client.submit(
                    _correct_errors_by_cell_group_chunks,
                    future_ra if use_dask_broadcast else None,
                    chunk,
                    err_rate,
                    p_value,
                )
                futures.append(future)

            # wait until all done
            log.debug(
                "Waiting untill all tasks complete...", module_name="rmt_correction"
            )
            completed, not_completed = wait(futures)

        # Any unfinished task is an error (the check used to tolerate one).
        if not_completed:
            raise Exception("There are uncompleted tasks!")

        # gather the results and release
        log.debug(
            "Collecting the task results from the workers...",
            module_name="rmt_correction",
        )

        results = []
        for future in tqdm(completed, disable=None):
            # this returns a list of lists
            # len(result) should be the number of chunks e.g. 50
            result = future.result()

            # remove empty lists, then aggregate and release
            results.extend([entry for entry in result if len(entry) > 0])
            future.release()

        # clean up
        del futures
        del completed
        del not_completed
    finally:
        client.shutdown()
        client.close()

    # iterate through the list of returned read indices and donor rmts
    # create a mapping table of pre-/post-correction
    # NOTE: this pass must finish before ra.data["rmt"] is modified below,
    # otherwise already-corrected values would be recorded as originals.
    mapping = set()
    for result in results:
        for idx, idx_corrected_rmt in result:
            # a set ignores duplicates, so no membership test is needed
            mapping.add(
                (
                    ra.data["cell"][idx],
                    ra.data["rmt"][idx],
                    ra.data["rmt"][idx_corrected_rmt],
                )
            )

    # iterate through the list of returned read indices and donor rmts
    # actually, update the read array object with corrected UMI
    for result in results:
        for idx, idx_corrected_rmt in result:
            # skip if the donor is already marked as rmt error
            if ra.data["status"][idx_corrected_rmt] & ra.filter_codes["rmt_error"]:
                continue

            # correct
            ra.data["rmt"][idx] = ra.data["rmt"][idx_corrected_rmt]

            # report error
            ra.data["status"][idx] |= ra.filter_codes["rmt_error"]

    return pd.DataFrame(mapping, columns=["CB", "UR", "UB"])
def npoclass(inputs, gpu_core=True, model_path=None, ntee_type='bc', n_jobs=4,
             backend='multiprocessing', batch_size_dl=64, verbose=1):
    """Predict NTEE labels for one string or a list of strings.

    The model, tokenizer and label encoder are loaded from ``model_path`` on
    first use and cached in module globals for later calls.

    Parameters
    ----------
    inputs : str or list of str
        Text(s) to classify. An empty list yields an empty result.
    gpu_core : bool
        Run on CUDA when it is available; otherwise the CPU is used.
    model_path : str
        Directory holding the pretrained model, tokenizer and the label
        encoder pickle. Required.
    ntee_type : {'bc', 'mg'}
        Broad category or major group.
    n_jobs : int
        Number of jobs for the 'multiprocessing' backend.
    backend : {'multiprocessing', 'sequential', 'dask'}
        How a list of inputs is tokenized; not consulted for a single
        string. 'dask' reads a module-level ``client``.
    batch_size_dl : int
        Batch size of the prediction ``DataLoader``.
    verbose : int
        Forwarded to joblib's ``Parallel``.

    Returns
    -------
    list of dict
        One dict per input with keys 'recommended', 'confidence' and
        'probabilities'.

    Raises
    ------
    ValueError
        Unknown ``ntee_type``, missing ``model_path`` or unknown ``backend``.
    TypeError
        ``inputs`` is neither a ``str`` nor a ``list``.
    """
    import os

    # ---- Validate arguments before doing anything expensive. ----
    if ntee_type == 'bc':
        le_file_name = 'le_broad_cat.pkl'
    elif ntee_type == 'mg':
        le_file_name = 'le_major_group.pkl'
    else:
        raise ValueError(
            "ntee_type must be 'bc' (broad category) or 'mg' (major group)")
    if model_path is None:
        raise ValueError(
            "Make sure model files/path are correct. Please download from "
            "https://jima.me/open/npoclass_model_%s.zip, unzip, and specifiy "
            "model_path (default set to None)." % ntee_type)

    single_input = isinstance(inputs, str)
    if not single_input:
        if not isinstance(inputs, list):
            raise TypeError("inputs must be a str or a list of str")
        if backend not in ('multiprocessing', 'sequential', 'dask'):
            raise ValueError(
                "backend must be 'multiprocessing', 'sequential' or 'dask'")
        if not inputs:
            return []

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Read model and label encoder, if not read.
    # NOTE(review): the cache is not keyed by model_path / ntee_type, so a
    # later call with different values reuses the first model - confirm this
    # is intended.
    global model_loaded, tokenizer_loaded, label_encoder
    module_state = globals()
    already_loaded = all(
        module_state.get(cached_name)
        for cached_name in ('model_loaded', 'tokenizer_loaded', 'label_encoder')
    )
    if not already_loaded:
        # Load a pretrained model and tokenizer.
        model_loaded = BertForSequenceClassification.from_pretrained(model_path)
        tokenizer_loaded = BertTokenizer.from_pretrained(model_path)
        # Read label encoder. join() works with or without a trailing slash.
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point model_path at directories you trust.
        with open(os.path.join(model_path, le_file_name), 'rb') as label_encoder_pkl:
            label_encoder = pickle.load(label_encoder_pkl)

    # Select acceleration method.
    if gpu_core and torch.cuda.is_available():
        print('There are %d GPU(s) available.' % torch.cuda.device_count(),
              'Using GPU:', torch.cuda.get_device_name(0))
        torch.cuda.manual_seed_all(seed_val)
        device = torch.device('cuda')
        model_loaded.cuda()
    else:
        print('No GPU acceleration available or gpu_core=False, using CPU.')
        device = torch.device('cpu')
        model_loaded.cpu()

    print('Encoding inputs ...')
    sleep(.5)  # Pause briefly for better printing results.

    # Encode inputs.
    # Define as global, otherwise cannot pickle or very slow.
    global func_encode_string, func_encode_string_batch

    def func_encode_string(text_string):
        encoded_dict = tokenizer_loaded.encode_plus(
            text_string,
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            truncation='longest_first',
            padding='max_length',  # Max length accepted by model.
            return_attention_mask=True,  # Construct attn. masks.
            return_tensors='pt',  # Return pytorch tensors.
        )
        return encoded_dict

    def func_encode_string_batch(text_strings):
        return [func_encode_string(text_string) for text_string in text_strings]

    # Tokenize all of the sentences and map the tokens to their word IDs.
    if single_input:
        encoded_outputs = [func_encode_string(inputs)]
    elif backend == 'multiprocessing':
        # Multiprocessing is faster than loky in processing large objects.
        encoded_outputs = Parallel(
            n_jobs=n_jobs, backend="multiprocessing", batch_size='auto',
            verbose=verbose)(
                delayed(func_encode_string)(text_string)
                for text_string in inputs)
    elif backend == 'sequential':
        encoded_outputs = [
            func_encode_string(text_string) for text_string in tqdm(inputs)
        ]
    else:  # backend == 'dask'
        with joblib.parallel_backend('dask'):
            # One chunk of inputs per dask worker.
            n_workers = len(client.scheduler_info()['workers'])
            string_chunks = partition_all(
                math.ceil(len(inputs) / n_workers), inputs)
            encoded_batches = Parallel(
                n_jobs=-1, batch_size='auto', verbose=verbose)(
                    delayed(func_encode_string_batch)(text_strings)
                    for text_strings in string_chunks)
        encoded_outputs = itertools.chain.from_iterable(encoded_batches)

    # Split every encoding into its token ids and its attention mask (the
    # mask simply differentiates padding from non-padding).
    encoded_outputs = list(encoded_outputs)
    input_ids = [encoded['input_ids'] for encoded in encoded_outputs]
    attention_masks = [encoded['attention_mask'] for encoded in encoded_outputs]

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)

    # Prepare dataloader for efficient calculation.
    pred_data = TensorDataset(input_ids, attention_masks)
    pred_sampler = SequentialSampler(pred_data)
    pred_dataloader = DataLoader(pred_data, sampler=pred_sampler,
                                 batch_size=batch_size_dl)

    # Start prediction.
    model_loaded.eval()
    logits_all = []
    print('Predicting categories ...')
    sleep(.5)  # Pause briefly for better printing results.
    for batch in tqdm(pred_dataloader, mininterval=10):
        # Add batch to the pre-chosen device
        b_input_ids, b_input_mask = tuple(t.to(device) for t in batch)
        with torch.no_grad():
            outputs = model_loaded(b_input_ids, token_type_ids=None,
                                   attention_mask=b_input_mask)
        logits_all.extend(outputs[0].tolist())

    # Calculate probabilities of logits.
    logits_prob = tf.nn.sigmoid(logits_all).numpy().tolist()
    # Find the positions of max values in logits.
    logits_max = np.argmax(logits_prob, axis=1)
    # Transfer to labels.
    logits_labels = label_encoder.inverse_transform(logits_max)

    # Compile results to be returned.
    class_names = label_encoder.classes_
    result_list = []
    for label, probabilities, best in zip(logits_labels, logits_prob, logits_max):
        conf_prob = probabilities[best]
        if conf_prob >= .99:
            confidence = 'high (>=.99)'
        elif conf_prob >= .95:
            confidence = 'medium (<.99|>=.95)'
        else:
            confidence = 'low (<.95)'
        result_list.append({
            'recommended': label,
            'confidence': confidence,
            'probabilities': dict(zip(class_names, probabilities)),
        })

    return result_list