def partial_reduce(func, x, split_every, keepdims=False, dtype=None, name=None):
    """Partial reduction across multiple axes.

    Parameters
    ----------
    func : function
        Stored as the callable of every output task; each task is
        ``(func, nested_keys)`` where ``nested_keys`` comes from ``lol_tuples``.
    x : Array
    split_every : dict
        Maximum reduction block sizes in each dimension.
    keepdims : bool
        When false, axes listed in ``split_every`` are dropped from the
        output keys and chunks.
    dtype : passed through to the resulting ``Array``.
    name : str, optional
        Output name; defaults to ``'p_reduce-'`` plus a token of the inputs.

    Example
    -------
    Reduce across axis 0 and 2, merging a maximum of 1 block in the 0th
    dimension, and 3 blocks in the 2nd dimension:

    >>> partial_reduce(np.min, x, {0: 1, 2: 3})    # doctest: +SKIP
    """
    if not name:
        name = 'p_reduce-' + tokenize(func, x, split_every, keepdims, dtype)

    # Group the block indices of every axis; axes not in split_every keep
    # one block per group.
    parts = []
    for axis, nblocks in enumerate(x.numblocks):
        group_size = split_every.get(axis, 1)
        parts.append(list(partition_all(group_size, range(nblocks))))
    keys = product(*(range(len(groups)) for groups in parts))

    out_chunks = []
    for axis, sizes in enumerate(x.chunks):
        if axis in split_every:
            out_chunks.append(
                tuple(1 for _ in partition_all(split_every[axis], sizes)))
        else:
            out_chunks.append(sizes)

    if not keepdims:
        out_axis = [axis for axis in range(x.ndim) if axis not in split_every]

        def pick(seq):
            return get(out_axis, seq)

        keys = map(pick, keys)
        out_chunks = list(pick(out_chunks))

    dsk = {}
    for key, blocks in zip(keys, product(*parts)):
        # Axes whose group holds a single block are fixed; the rest are
        # expanded by lol_tuples.
        decided = {axis: ids[0] for axis, ids in enumerate(blocks)
                   if len(ids) == 1}
        dummy = {axis: ids for axis, ids in enumerate(blocks)
                 if axis not in decided}
        nested = lol_tuples((x.name,), range(x.ndim), decided, dummy)
        dsk[(name,) + key] = (func, nested)
    return Array(merge(dsk, x.dask), name, out_chunks, dtype=dtype)
def cb_filter(fastq, bc1, bc2, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.

    fastq: input handed to stream_fastq.
    bc1, bc2: iterables of barcode strings (whitespace is stripped); bc2 may
        be falsy, in which case it is passed through unchanged.
    cores: number of worker processes.
    nedit: 0 selects exact_barcode_filter; any other value selects
        correcting_barcode_filter with MutationHash lookups built from nedit.

    Surviving reads are written to stdout.
    '''
    bc1 = set(cb.strip() for cb in bc1)
    if bc2:
        bc2 = set(cb.strip() for cb in bc2)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = MutationHash(bc2, nedit) if bc2 else None
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash)

    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, stream_fastq(fastq))
        # Hand the pool one chunk per core at a time so the whole file is
        # never materialized.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(filter_cb, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes, even if filtering fails.
        p.close()
        p.join()
def big_kmeans(docs, k, batch_size=1000, n_features=(2**20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.

    With single_pass=True each batch is labelled right after the model has
    been updated with it, so early batches are labelled by a less-trained
    model than later ones.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        # Fetch only the current batch (previously every batch fetched all
        # of ``docs``).
        X = v.transform(map(fetch, batch))
        # partial_fit updates the model incrementally; fit_predict would
        # re-fit from scratch on every batch.
        km.partial_fit(X)
        if single_pass:
            labels.extend(km.predict(X).tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            X = v.transform(map(fetch, batch))
            labels.extend(km.predict(X).tolist())

    return group_clusters(docs, labels)
def export_historical(start, end, export_chunk_func, export_snapshot_func,
                      data_query, generate_snapshot_range_func=None):
    """Export snapshots for each interval-map entry in parallel batches.

    For every entry of ``_get_interval_map(start)`` the snapshot range up to
    ``end`` is generated, split into chunks of ``CHUNK_SIZE`` and dispatched
    ``POOL_SIZE`` chunks at a time to ``export_chunk_func``. Processing stops
    after the entry whose resolution equals the ``RESOLUTION`` environment
    variable (``"1h"`` when unset or not present in the interval map).
    """
    make_snapshots = generate_snapshot_range_func
    if make_snapshots is None:
        make_snapshots = _generate_snapshot_range

    interval_map = _get_interval_map(start)
    known_resolutions = [item['resolution'] for item in interval_map]

    # default resolution is hourly
    resolution = os.environ.get("RESOLUTION", "1h")
    if resolution not in known_resolutions:
        resolution = "1h"

    for entry in interval_map:
        snapshots = make_snapshots(
            entry["start"], end, entry["interval"], data_query)
        batches = partition_all(POOL_SIZE,
                                partition_all(CHUNK_SIZE, snapshots))
        for batch in batches:
            logger.info("starting new pool with %d workers", POOL_SIZE)
            runner = Parallel(n_jobs=POOL_SIZE, backend="multiprocessing",
                              verbose=100)
            runner(delayed(export_chunk_func)(chunk, export_snapshot_func)
                   for chunk in batch)

        # if we reached the final resolution we're done
        if entry['resolution'] == resolution:
            break
def fastqtransform(transform, fastq1, fastq2, separate_cb, demuxed_cb,
                   dual_index, cores, min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.

    transform: path to a JSON file with 'read1' (and, when fastq2 is given,
        'read2') regular expressions.
    fastq1, fastq2: fastq paths; a path ending in 'gz' is read through gzip.
        fastq2 may be falsy for single-end input.
    separate_cb, dual_index: select how CB1/CB2 are rendered or merged.
    demuxed_cb: when truthy, overrides the cell barcode of every read.
    cores: number of worker processes.
    min_length: reads with a shorter sequence are dropped.
    '''
    if dual_index and separate_cb:
        read_template = '{name}:CELL_{CB1}-{CB2}:UMI_{MB}\n{seq}\n+\n{qual}\n'
    else:
        read_template = '{name}:CELL_{CB}:UMI_{MB}\n{seq}\n+\n{qual}\n'

    # Close the transform file instead of leaking the handle.
    with open(transform) as transform_fh:
        transform = json.load(transform_fh)
    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None

    open_handles = []  # raw file objects to close on exit
    p = None
    try:
        fastq1_fh = open(fastq1)
        open_handles.append(fastq1_fh)
        if fastq1.endswith('gz'):
            fastq1_fh = gzip.GzipFile(fileobj=fastq1_fh)
        fastq_file1 = stream_fastq(fastq1_fh)

        if fastq2:
            fastq2_fh = open(fastq2)
            open_handles.append(fastq2_fh)
            if fastq2.endswith('gz'):
                fastq2_fh = gzip.GzipFile(fileobj=fastq2_fh)
            fastq_file2 = stream_fastq(fastq2_fh)
        else:
            # Pair every read1 with None when there is no second file.
            fastq_file2 = itertools.cycle((None, ))

        transform = partial(transformer, read1_regex=read1_regex,
                            read2_regex=read2_regex, paired=fastq2)
        p = multiprocessing.Pool(cores)

        chunks = tz.partition_all(
            10000, itertools.izip(fastq_file1, fastq_file2))
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(transform, list(bigchunk)):
                for read1_dict in chunk:
                    if dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = (read1_dict['CB1'] +
                                                read1_dict['CB2'])

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                    if len(read1_dict['seq']) >= min_length:
                        sys.stdout.write(read_template.format(**read1_dict))
    finally:
        # Release worker processes and input files even on failure.
        if p is not None:
            p.close()
            p.join()
        for handle in open_handles:
            handle.close()
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.

    fastq: input handed to stream_fastq.
    cores: number of worker processes.

    Reads returned by umi_filter are written to stdout.
    '''
    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, stream_fastq(fastq))
        # One chunk per core at a time keeps memory bounded.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(umi_filter, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes.
        p.close()
        p.join()
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.

    fastq: input handed to read_fastq.
    cores: number of worker processes.

    Reads returned by umi_filter are written to stdout.
    '''
    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, read_fastq(fastq))
        # One chunk per core at a time keeps memory bounded.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(umi_filter, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes.
        p.close()
        p.join()
def fastqtransform(transform, fastq1, fastq2, separate_cb, demuxed_cb,
                   dual_index, cores, min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.

    transform: path to a JSON file with 'read1' (and, when fastq2 is given,
        'read2') regular expressions.
    fastq1, fastq2: fastq paths; a path ending in 'gz' is read through gzip.
        fastq2 may be falsy for single-end input.
    separate_cb, dual_index: select how CB1/CB2 are rendered or merged.
    demuxed_cb: when truthy, overrides the cell barcode of every read.
    cores: number of worker processes.
    min_length: reads with a shorter sequence are dropped.
    '''
    if dual_index and separate_cb:
        read_template = '{name}:CELL_{CB1}-{CB2}:UMI_{MB}\n{seq}\n+\n{qual}\n'
    else:
        read_template = '{name}:CELL_{CB}:UMI_{MB}\n{seq}\n+\n{qual}\n'

    # Close the transform file instead of leaking the handle.
    with open(transform) as transform_fh:
        transform = json.load(transform_fh)
    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None

    open_handles = []  # raw file objects to close on exit
    p = None
    try:
        fastq1_fh = open(fastq1)
        open_handles.append(fastq1_fh)
        if fastq1.endswith('gz'):
            fastq1_fh = gzip.GzipFile(fileobj=fastq1_fh)
        fastq_file1 = stream_fastq(fastq1_fh)

        if fastq2:
            fastq2_fh = open(fastq2)
            open_handles.append(fastq2_fh)
            if fastq2.endswith('gz'):
                fastq2_fh = gzip.GzipFile(fileobj=fastq2_fh)
            fastq_file2 = stream_fastq(fastq2_fh)
        else:
            # Pair every read1 with None when there is no second file.
            fastq_file2 = itertools.cycle((None,))

        transform = partial(transformer, read1_regex=read1_regex,
                            read2_regex=read2_regex, paired=fastq2)
        p = multiprocessing.Pool(cores)

        chunks = tz.partition_all(
            10000, itertools.izip(fastq_file1, fastq_file2))
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(transform, list(bigchunk)):
                for read1_dict in chunk:
                    if dual_index:
                        if not separate_cb:
                            read1_dict['CB'] = (read1_dict['CB1'] +
                                                read1_dict['CB2'])

                    if demuxed_cb:
                        read1_dict['CB'] = demuxed_cb

                    # Deal with spaces in read names
                    read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                    if len(read1_dict['seq']) >= min_length:
                        sys.stdout.write(read_template.format(**read1_dict))
    finally:
        # Release worker processes and input files even on failure.
        if p is not None:
            p.close()
            p.join()
        for handle in open_handles:
            handle.close()
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.

    fastq: input handed to stream_fastq.
    cores: number of worker processes.

    Reads returned by append_uids are written to stdout.
    '''
    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, stream_fastq(fastq))
        # One chunk per core at a time keeps memory bounded.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(append_uids, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes.
        p.close()
        p.join()
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.

    fastq: input handed to read_fastq.
    cores: number of worker processes.

    Reads returned by append_uids are written to stdout.
    '''
    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, read_fastq(fastq))
        # One chunk per core at a time keeps memory bounded.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(append_uids, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes.
        p.close()
        p.join()
def apply(self, lfs, Xs, block_size=None):
    """Run ``LabelingServer.worker`` over ``Xs`` and regroup the result.

    :param lfs: first argument bound to ``LabelingServer.worker``.
    :param Xs: sequence of sized collections; its grouping is restored in
        the return value.
    :param block_size: items per job. ``None`` derives a size from the total
        item count and ``self.num_workers``; a falsy value other than
        ``None`` sends each element of ``Xs`` as its own job.
    :return: list with one row-slice of the stacked matrix per element of
        ``Xs``.
    """
    if block_size is None:
        total = np.sum([len(x) for x in Xs])
        block_size = int(np.ceil(total / self.num_workers))
        logger.info("auto block size %s", block_size)

    if block_size:
        flat = itertools.chain.from_iterable(Xs)
        blocks = list(partition_all(block_size, flat))
    else:
        blocks = Xs

    lens = np.unique([len(x) for x in blocks])
    logger.info("Partitioned into %s blocks %s sizes ", len(blocks), lens)

    run = delayed(partial(LabelingServer.worker, lfs))
    jobs = (run(batch) for batch in blocks)
    L = sparse.vstack(self.client(jobs))

    # merge matrix blocks
    Ls = []
    offset = 0
    for n in [len(x) for x in Xs]:
        Ls.append(L[offset:offset + n].copy())
        offset += n
    return Ls
def get_median_rank_recommended_items(train_interactions, eval_users,
                                      item_popularities, n_users_in_chunk):
    """Return the mean over users of the median popularity of their
    recommended items.

    Recommendations (k=50) are requested from a ``Triplet`` model in chunks
    of ``n_users_in_chunk`` users; each recommended item id is looked up in
    ``item_popularities`` and the per-row median is averaged.
    NOTE(review): ``params`` is read from the enclosing module scope.
    """
    train_interactions = sp.lil_matrix(train_interactions)
    rows = train_interactions.rows
    max_train_interaction_count = max(len(row) for row in rows)

    # Only users with at least one training interaction are included.
    train_user_items = {}
    for u in eval_users:
        if rows[u]:
            train_user_items[u] = set(rows[u])

    recommended_items = []
    with tf.Session() as sess:
        model = Triplet(sess=sess, params=params)
        logger.info('Get recommended items for users in evaluation')
        for user_chunk in toolz.partition_all(n_users_in_chunk, eval_users):
            chunk_items = model.get_recommended_items(
                user_chunk, train_user_items, max_train_interaction_count,
                k=50)
            recommended_items = recommended_items + chunk_items

    logger.info('Get rank for recommended items')
    item_ranks = [[item_popularities[iid] for iid in iids]
                  for iids in recommended_items]
    median_ranks = np.median(item_ranks, axis=1)
    tf.reset_default_graph()
    return np.mean(median_ranks)
def main(args):
    """Convert the files in ``args.inputdir`` into batched TSV files.

    Files are grouped ``args.batch_size`` at a time; each batch is written
    to ``{args.outputdir}/{args.batch_size}.{i}.tsv`` as (doc_name, text)
    rows with newlines and tabs escaped. Does nothing (beyond printing a
    message) when the output directory does not exist.
    """
    filelist = glob.glob(f'{args.inputdir}/*')
    batches = partition_all(args.batch_size, filelist)
    print(f'Documents: {len(filelist)}')

    if not os.path.exists(args.outputdir):
        print("output dir does not exist")
        return

    for i, batch in enumerate(batches):
        data = []
        for fpath in batch:
            # Document name is the file name up to its first dot.
            doc_name = fpath.split("/")[-1].split(".")[0]
            # Close each input file promptly instead of leaking the handle.
            with open(fpath, 'r') as fh:
                text = fh.read()
            if args.fmt == 'mimic':
                text = mimic_preprocessing(text) if args.preprocess == 'mimic' else text
            # escape whitespace
            text = text.replace('\n', '\\n').replace('\t', '\\t')
            data.append((doc_name, text))
        outfpath = f'{args.outputdir}/{args.batch_size}.{i}.tsv'
        print(outfpath)
        save_tsv(data, outfpath)
def apply(self,
          pipeline: Dict[str, float],
          documents: List[List[Document]],
          block_size: Union[str, int] = 'auto'):
    """Run ``TaggerPipelineServer.worker`` over all documents in blocks.

    :param pipeline: first argument bound to the worker.
    :param documents: list of document lists; the grouping is restored in
        the return value.
    :param block_size: items per job; ``'auto'`` derives it from the total
        item count and ``self.num_workers``; a falsy value uses each inner
        list as one job.
    :return: list of result lists, one per entry of ``documents``.
    """
    flat = itertools.chain.from_iterable(documents)
    if block_size == 'auto':
        num_items = np.sum([len(x) for x in documents])
        block_size = int(np.ceil(num_items / self.num_workers))
        print(f'auto block size={block_size}')

    if block_size:
        blocks = list(partition_all(block_size, flat))
    else:
        blocks = documents
    print(
        f"Partitioned into {len(blocks)} blocks, {np.unique([len(x) for x in blocks])} sizes"
    )

    run = delayed(partial(TaggerPipelineServer.worker, pipeline))
    jobs = (run(batch) for batch in blocks)
    results = list(itertools.chain.from_iterable(self.client(jobs)))

    # Cut the flat result list back into the original grouping.
    grouped = []
    offset = 0
    for n in [len(x) for x in documents]:
        grouped.append(results[offset:offset + n].copy())
        offset += n
    return grouped
def apply(self, lfs, Xs, block_size=None):
    """Run ``LabelingServer.worker`` over ``Xs`` and regroup the result.

    :param lfs: first argument bound to ``LabelingServer.worker``.
    :param Xs: sequence of sized collections; its grouping is restored in
        the return value.
    :param block_size: items per job; ``'auto'`` derives it from the total
        item count and ``self.num_workers``; a falsy value sends each
        element of ``Xs`` as its own job.
    :return: list with one row-slice of the stacked matrix per element of
        ``Xs``.
    """
    if block_size == 'auto':
        total = np.sum([len(x) for x in Xs])
        block_size = int(np.ceil(total / self.num_workers))
        if self.verbose:
            print(f'auto block size={block_size}')

    if block_size:
        flat = itertools.chain.from_iterable(Xs)
        blocks = list(partition_all(block_size, flat))
    else:
        blocks = Xs

    if self.verbose:
        sizes = np.unique([len(x) for x in blocks])
        print(f"Partitioned into {len(blocks)} blocks, {sizes} sizes")

    run = delayed(partial(LabelingServer.worker, lfs))
    jobs = (run(batch) for batch in blocks)
    L = sparse.vstack(self.client(jobs))

    # merge matrix blocks
    Ls = []
    offset = 0
    for n in [len(x) for x in Xs]:
        Ls.append(L[offset:offset + n].copy())
        offset += n
    return Ls
def _update_batch(cls, tuples, trx=True):
    """Fetch posts from steemd in batches of 1000 and write them.

    Each tuple is read as ``(url, post_id, level)`` where ``url`` is split
    on ``'/'`` before being passed to ``get_content_batch``. Posts whose
    ``author`` field is falsy are skipped with a warning. ``trx`` is
    forwarded to ``cls._batch_queries``.
    """
    steemd = SteemClient.instance()
    timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'])
    tuples = sorted(tuples, key=lambda x: x[1])  # enforce ASC id's

    for tups in partition_all(1000, tuples):
        timer.batch_start()

        post_args = [tup[0].split('/') for tup in tups]
        posts = steemd.get_content_batch(post_args)

        buffer = []
        for tup, post in zip(tups, posts):
            pid, level = tup[1], tup[2]
            if not post['author']:
                print("WARNING: ignoring deleted post {}".format(pid))
            else:
                buffer.append(cls._sql(pid, post, level=level))
            # The last id is advanced for deleted posts as well.
            cls._bump_last_id(pid)

        timer.batch_lap()
        cls._batch_queries(buffer, trx)

        timer.batch_finish(len(posts))
        if len(tuples) >= 1000:
            print(timer.batch_status())
def apply(self, lfs, Xs, block_size=None):
    """Run ``SequenceLabelingServer.worker`` over ``Xs`` and regroup.

    :param lfs: first argument bound to ``SequenceLabelingServer.worker``.
    :param Xs: sequence of sized collections; its grouping is restored in
        the return value.
    :param block_size: items per job. ``None`` derives a size from the total
        item count and ``self.num_workers``; a falsy value other than
        ``None`` sends each element of ``Xs`` as its own job.
    :return: list with one row-slice of the stacked array per element of
        ``Xs``.
    """
    if block_size is None:
        total = np.sum([len(x) for x in Xs])
        block_size = int(np.ceil(total / self.num_workers))
        print(f'auto block size={block_size}')

    if block_size:
        flat = itertools.chain.from_iterable(Xs)
        blocks = list(partition_all(block_size, flat))
    else:
        blocks = Xs

    print(f"Partitioned into {len(blocks)} blocks, "
          f"{np.unique([len(x) for x in blocks])} sizes")

    run = delayed(partial(SequenceLabelingServer.worker, lfs))
    jobs = (run(batch) for batch in blocks)
    L = np.vstack(self.client(jobs))

    # merge matrix blocks
    Ls = []
    offset = 0
    for n in [len(x) for x in Xs]:
        Ls.append(L[offset:offset + n].copy())
        offset += n
    return Ls
def main_wrapper(data_dir='/data', sensor='mynt_eye', intention_type='dlm'):
    """Dump images and labels from the ``*.bag`` files in ``data_dir``.

    Recreates ``<data_dir>/data`` (removing any existing directory), makes
    one sub-directory per RGB/depth topic of ``sensor`` (plus
    ``intention_img`` when ``intention_type == 'lpe'``), then writes every
    parsed frame's images as ``<frame>.jpg`` and one row per frame to
    ``label.txt``.

    Mutates the module-level ``TOPICS``, ``TOPICS_IDX`` and ``IMG_TOPICS``.
    Raises AssertionError when ``sensor`` is not in ``SENSORS``.
    """
    global IMG_TOPICS
    global TOPICS
    global TOPICS_IDX
    assert (sensor in SENSORS), f'Must be valid sensor in {SENSORS}'
    RGBS = SENSOR_TOPIC[sensor]['RGBS']
    DEPTHS = SENSOR_TOPIC[sensor]['DEPTHS']

    bagfns = glob.glob(data_dir + '/*.bag')
    print('bags:', bagfns)
    bags = chain(*[parse_bag(bagfn, intention_type) for bagfn in bagfns])
    it = ThreadedGenerator(bags, queue_maxsize=6500)

    # make dirs for images
    gendir = 'data'
    out_dir = osp.join(data_dir, gendir)
    if os.path.exists(out_dir) and os.path.isdir(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)
    with open(osp.join(out_dir, 'README.txt'), 'w+') as f:
        f.write('THIS DATA IS PARSED TO SERVE PYTORCH MODEL')

    # topic_save_path[i] is the output directory of TOPICS[i].
    topic_save_path = []
    for idx, rgb_topic in enumerate(RGBS):
        fn = osp.join(out_dir, f'rgb_{idx}')
        os.mkdir(fn)
        TOPICS.append(rgb_topic)
        TOPICS_IDX[rgb_topic] = len(TOPICS) - 1
        topic_save_path.append(fn)
    for idx, depth_topic in enumerate(DEPTHS):
        fn = osp.join(out_dir, f'depth_{idx}')
        os.mkdir(fn)
        TOPICS.append(depth_topic)
        TOPICS_IDX[depth_topic] = len(TOPICS) - 1
        topic_save_path.append(fn)

    if intention_type == 'lpe':
        # For 'lpe' the intention topic is itself saved as an image.
        fn = osp.join(out_dir, 'intention_img')
        os.mkdir(fn)
        TOPICS.append(INTENTION)
        TOPICS_IDX[INTENTION] = len(TOPICS) - 1
        topic_save_path.append(fn)
        IMG_TOPICS = TOPICS[:]
    else:
        IMG_TOPICS = TOPICS[:]
        TOPICS.append(INTENTION)
    TOPICS.append(CONTROL)

    # Context manager closes (and flushes) label.txt; newline='' is what
    # the csv module requires of files handed to csv.writer.
    with open(osp.join(out_dir, 'label.txt'), 'w', newline='') as f:
        labelwriter = csv.writer(f,
                                 delimiter=' ',
                                 quotechar='|',
                                 quoting=csv.QUOTE_MINIMAL)
        labelwriter.writerow([
            'frame', 'intention_type', 'current_velocity',
            'steering_wheel_angle', 'dlm'
        ])
        for chunk in partition_all(CHUNK_SIZE, tqdm(it)):
            for c in chunk:
                for idx, fn in enumerate(topic_save_path):
                    cv2.imwrite(osp.join(fn, f'{c.frame}.jpg'), c.imgs[idx])
                labelwriter.writerow(
                    [c.frame, intention_type, c.vel, c.steer, c.dlm])
def _lookup_most_recent_symbols(self, sids):
    """Map each sid to a dict of its ``symbol_columns`` values.

    Queries ``equity_symbol_mappings`` in groups of at most
    ``SQLITE_MAX_VARIABLE_NUMBER`` sids, ordered by ``end_date`` descending
    and grouped by sid.

    Raises
    ------
    EquitiesNotFound
        When the number of sids found differs from ``len(sids)``.
    """
    cols = self.equity_symbol_mappings.c
    selected = (cols.sid,) + tuple(map(op.getitem(cols), symbol_columns))

    symbols = {}
    for sid_group in partition_all(SQLITE_MAX_VARIABLE_NUMBER, sids):
        query = sa.select(
            selected,
        ).where(
            cols.sid.in_(map(int, sid_group)),
        ).order_by(
            cols.end_date.desc(),
        ).group_by(
            cols.sid,
        )
        for row in self.engine.execute(query).fetchall():
            symbols[row.sid] = {c: row[c] for c in symbol_columns}

    if len(symbols) != len(sids):
        raise EquitiesNotFound(
            sids=set(sids) - set(symbols),
            plural=True,
        )
    return symbols
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    """Convert ``seq`` lazily into ``chunks(pd.DataFrame)``.

    The sequence is cut into pieces of ``chunksize`` items and each piece is
    converted to a DataFrame. The first piece is converted eagerly; an empty
    sequence yields a single DataFrame converted from ``[]``. The
    ``add_index`` keyword selects ``_add_index`` over ``_ignore_index``.
    """
    pieces = partition_all(chunksize, seq)
    mkindex = _add_index if kwargs.get('add_index') else _ignore_index

    try:
        first = next(pieces)
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        head, start = mkindex(convert(pd.DataFrame, first, **kwargs), 0)

        def _():
            n = start  # running value threaded through mkindex
            yield head
            for piece in pieces:
                frame, n = mkindex(
                    convert(pd.DataFrame, piece, **kwargs), n)
                yield frame
    return chunks(pd.DataFrame)(_)
def evaluate_AUC(self, data1):
    """Return the fraction of (positive, sampled negative) pairs in which
    the positive example scores higher.

    ``data1`` is a DataFrame-like object; its first column is dropped. Rows
    are processed 600 at a time and 50 negatives are sampled per positive
    row.
    """
    data = data1.values[:, 1:]
    num_example = len(data)  # number of samples to evaluate
    score = []
    for user_chunk in toolz.partition_all(600, list(range(num_example))):
        pos = data[list(user_chunk)]
        # Sample 50 negatives per positive row.
        sampled = self.sample_negative(pos, 50)
        # Repeat every positive row 50 times, then overwrite column 1 with
        # the sampled values to form the negative set (50x the positives).
        neg = np.tile(np.expand_dims(copy.deepcopy(pos), axis=1), [1, 50, 1])
        neg = neg.reshape([-1, pos.shape[1]])
        neg[:, 1] = sampled.reshape([-1])

        # Feed dicts for negative and positive scoring; which placeholders
        # are fed depends on the context/time flags.
        if self.context and self.time:
            feed_dict_neg = {
                self.model.Pos: neg[:, :2],
                self.model.Fea: neg[:, 2:-self.time_dimension],
                self.model.Tim: neg[:, -self.time_dimension:],
            }
            feed_dict_pos = {
                self.model.Pos: pos[:, :2],
                self.model.Fea: pos[:, 2:-self.time_dimension],
                self.model.Tim: pos[:, -self.time_dimension:],
            }
        elif self.context and not self.time:
            feed_dict_neg = {
                self.model.Pos: neg[:, :2],
                self.model.Fea: neg[:, 2:],
            }
            feed_dict_pos = {
                self.model.Pos: pos[:, :2],
                self.model.Fea: pos[:, 2:],
            }
        elif not self.context and self.time:
            feed_dict_neg = {
                self.model.Pos: neg[:, :2],
                self.model.Tim: neg[:, -self.time_dimension:],
            }
            feed_dict_pos = {
                self.model.Pos: pos[:, :2],
                self.model.Tim: pos[:, -self.time_dimension:],
            }
        # NOTE(review): with neither context nor time set, no feed dict is
        # defined and the next line fails — confirm that case is unsupported.

        batch_out_pos = self.model.sess.run(
            self.model.PositiveFeadback, feed_dict=feed_dict_pos)
        self.neg_score = self.model.sess.run(
            self.model.PositiveFeadback, feed_dict=feed_dict_neg)
        # Repeat each positive score 50 times to line up with its negatives.
        self.pos_score = np.reshape(
            np.tile(np.expand_dims(batch_out_pos, axis=1), [1, 50, 1]),
            [-1, 1])

        wins = np.reshape(self.pos_score > self.neg_score, [-1])
        score.extend(wins.tolist())
    return np.mean(score)
def __init__(self, server, sizing_mode='stretch_both', **kwargs):
    """Build one figure per digest and counter of ``server``.

    Figures are sorted by name. Up to five are stacked in a single column;
    more are laid out in rows of two inside a column.
    """
    self.server = server
    self.sizing_mode = sizing_mode
    self.counter_figures = {}
    self.counter_sources = {}
    self.digest_figures = {}
    self.digest_sources = {}

    if self.server.digests:
        for name in self.server.digests:
            self.add_digest_figure(name)
    for name in self.server.counters:
        self.add_counter_figure(name)

    by_name = merge(self.digest_figures, self.counter_figures)
    figures = [by_name[key] for key in sorted(by_name)]

    if len(figures) > 5:
        rows = [row(*pair, sizing_mode=sizing_mode)
                for pair in partition_all(2, figures)]
        self.root = column(*rows, sizing_mode=sizing_mode)
    else:
        self.root = column(figures, sizing_mode=sizing_mode)
def uuid_initializer(uuid_):
    """Format ``uuid_.bytes`` as a brace-enclosed list of 16-bit hex words.

    Consecutive byte pairs are emitted with the second byte of each pair as
    the high byte.
    """
    words = []
    for low, high in toolz.partition_all(2, uuid_.bytes):
        words.append('0x{:02x}{:02x}'.format(high, low))
    body = ', '.join(words)
    return '{{{}}}'.format(body)
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    """Convert ``seq`` into ``chunks(pd.DataFrame)``.

    Without ``add_index`` every piece of ``chunksize`` items becomes a
    deferred conversion (a ``partial``). With ``add_index`` the pieces are
    converted one after another so the index can be threaded through
    ``_add_index``. An empty sequence yields a single DataFrame converted
    from ``[]`` in both modes.
    """
    pieces = partition_all(chunksize, seq)

    if not kwargs.get('add_index', False):
        # Simple, we can dispatch to dask...
        def to_frame(d):
            return convert(pd.DataFrame, d, **kwargs)

        data = [partial(to_frame, piece) for piece in pieces]
        if not data:
            data = [convert(pd.DataFrame, [], **kwargs)]
        return chunks(pd.DataFrame)(data)

    # TODO: Decide whether we should support the `add_index` flag at all.
    # If so, we need to post-process the converted DataFrame objects
    # sequentially, so we can't parallelize the process.
    try:
        first = next(pieces)
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        head, start = _add_index(convert(pd.DataFrame, first, **kwargs), 0)

        def _():
            n = start
            yield head
            for piece in pieces:
                frame, n = _add_index(
                    convert(pd.DataFrame, piece, **kwargs), n)
                yield frame
    return chunks(pd.DataFrame)(_)
def partition_map(n: int,
                  func: Any,
                  its: Iterable[Any],
                  name: str = 'compute') -> Iterable[Any]:
    """
    Partition sequence into lumps of size `n`, then construct dask delayed
    computation evaluating to:

    [func(x) for x in its[0:1n]],
    [func(x) for x in its[n:2n]],
    ...
    [func(x) for x in its[]],

    This is a generator: one delayed object is yielded per lump.

    :param n: number of elements to process in one go
    :param func: Function to apply (non-dask)
    :param its: Values to feed to fun
    :param name: How the computation should be named in dask visualizations
    """
    def lump_proc(dd):
        return [func(d) for d in dd]

    proc = dask.delayed(lump_proc, nout=1, pure=True)
    # Randomized prefixes for the data and compute keys; the lump index is
    # appended to each.
    data_prefix = _randomize('data_' + name)
    task_prefix = _randomize(name)

    for idx, items in enumerate(toolz.partition_all(n, its)):
        suffix = str(idx)
        lump = dask.delayed(items,
                            pure=True,
                            traverse=False,
                            name=data_prefix + suffix)
        yield proc(lump, dask_key_name=task_prefix + suffix)
def test_broken_worker_during_computation(c, s, a, b):
    """Terminate a nanny's process twice during a reduction and check the
    result can still be gathered."""
    nanny = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    nanny.start(0)

    # Wait (at most 5 seconds) until the scheduler knows three workers.
    deadline = time() + 5
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < deadline

    futures = c.map(inc, range(256))
    for _ in range(8):
        # Pairwise-add neighbouring futures, halving the list each round.
        futures = c.map(add, *zip(*partition_all(2, futures)))

    from random import random
    for _ in range(2):
        yield gen.sleep(random() / 2)
        with ignoring(OSError):
            nanny.process.terminate()

    result = yield c._gather(futures)
    assert isinstance(result[0], int)

    yield nanny._close()
def _get_all_nosetest_xmls(build):
    """
    Return list of nosetest XMLs for all runs (until first 404 is encountered).

    Runs are fetched concurrently in chunks of 30. Requires the GOCD_USER
    and GOCD_PASSWORD environment variables. Errors raised while fetching
    (including an unexpected 404 for the first run) propagate to the caller.
    Closes the asyncio event loop once a 404 has been seen.
    """
    run_path = '{build}/{stage}/1/{job}-runInstance-{run}'
    url = ('https://{user}:{password}@{host}/go/files/' + run_path +
           '/test-results/nosetests.xml')
    context = {
        'host': GOCD_HOST,
        'user': os.getenv('GOCD_USER'),
        'password': os.getenv('GOCD_PASSWORD'),
        'build': build,
    }
    context.update(_get_pipeline_data(build))
    assert context['user'], 'Missing environment variable GOCD_USER'
    assert context['password'], 'Missing environment variable GOCD_PASSWORD'

    # URLs are of the form /something/anotherthing/<run>
    # We do not know the set of valid URLs. I.e., we do not know the maximum
    # valid value of `run` after which all URLs will be 404s. Instead, we keep
    # trying increasing values of `run` until we start getting 404s. Since we
    # do not know the maximum valid value of run a priori, we cannot create
    # tasks for fetching all valid URLs. Instead we create an initial chunk of
    # tasks, run these concurrently, and then if we haven't started
    # encountering 404s, move on to the next chunk.
    xmls = []
    seen_404 = False

    async def get_xml(run):
        """Return the XML bytes for `run`, or None when it is a 404."""
        nonlocal seen_404
        # Per-task copy: concurrent tasks must not rebind the shared context.
        run_context = dict(context, run=run)
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            resp = await session.get(url.format(**run_context))
            if resp.status == 404:
                # We expect to see 404s eventually, but a 404 for the first
                # value of `run` probably means that something is wrong.
                assert run > 1, (
                    'Unexpected 404: %s' %
                    url.format(**dict(run_context, password='******')))
                seen_404 = True
                return None
            resp.raise_for_status()
            return await resp.content.read()

    async def get_chunk(runs):
        # gather propagates task exceptions (asyncio.wait left them
        # unretrieved) and returns results in run order.
        return await asyncio.gather(*(get_xml(run) for run in runs))

    chunk_size = 30
    ioloop = asyncio.get_event_loop()
    for run_chunk in toolz.partition_all(chunk_size, itertools.count(1)):
        results = ioloop.run_until_complete(get_chunk(run_chunk))
        xmls.extend(data for data in results if data is not None)
        if seen_404:
            ioloop.close()
            break
    return xmls
def test_broken_worker_during_computation(c, s, a, b):
    """Kill the nanny's worker twice (via ``os._exit``) during a reduction
    and check the result can still be gathered."""
    s.allowed_failures = 100
    nanny = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    nanny.start(0)

    # Wait (at most 5 seconds) until the scheduler knows three workers.
    deadline = time() + 5
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < deadline

    futures = c.map(inc, range(256))
    for _ in range(8):
        # Pairwise-add neighbouring futures, halving the list each round.
        futures = c.map(add, *zip(*partition_all(2, futures)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError):  # comm will be closed abruptly
        yield c._run(os._exit, 1, workers=[nanny.worker_address])

    yield gen.sleep(random() / 2)
    # perhaps new worker can't be contacted yet
    with ignoring(CommClosedError, EnvironmentError):
        yield c._run(os._exit, 1, workers=[nanny.worker_address])

    result = yield c._gather(futures)
    assert isinstance(result[0], int)

    yield nanny._close()
def uuid_initializer(uuid_):
    """Format ``uuid_.bytes`` as a brace-enclosed list of 16-bit hex words.

    Consecutive byte pairs are emitted with the second byte of each pair as
    the high byte.
    """
    byte_pairs = toolz.partition_all(2, uuid_.bytes)
    words = ["0x{:02x}{:02x}".format(high, low) for low, high in byte_pairs]
    return "{{{}}}".format(", ".join(words))
def append_iterator_to_table(t, rows, dshape=None, **kwargs):
    """Insert ``rows`` into the SQL table ``t`` in batches of 1000.

    Rows may be dicts, or tuples/lists which are zipped with column names
    taken from ``dshape`` (when its measure is a Record) or discovered from
    the table.

    Returns ``t`` in every case, including when ``rows`` is empty.

    Raises ValueError when the column names in ``dshape`` differ from the
    table's column names.
    """
    assert not isinstance(t, type)
    rows = iter(rows)

    # We see if the sequence is of tuples or dicts
    # If tuples then we coerce them to dicts
    try:
        row = next(rows)
    except StopIteration:
        # Nothing to insert; still return the table like the non-empty path.
        return t
    rows = chain([row], rows)
    if isinstance(row, (tuple, list)):
        if dshape and isinstance(dshape.measure, datashape.Record):
            names = dshape.measure.names
            if set(names) != set(discover(t).measure.names):
                raise ValueError("Column names of incoming data don't match "
                                 "column names of existing SQL table\n"
                                 "Names in SQL table: %s\n"
                                 "Names from incoming data: %s\n" %
                                 (discover(t).measure.names, names))
        else:
            names = discover(t).measure.names
        rows = (dict(zip(names, row)) for row in rows)

    engine = t.bind
    with engine.connect() as conn:
        for chunk in partition_all(1000, rows):  # TODO: 1000 is hardcoded
            conn.execute(t.insert(), chunk)

    return t
def chunked_persist(data, n_concurrent, client, verbose=False):
    """
    Force limited concurrency when persisting a large collection.

    This is useful to control memory usage when operating close to capacity.

    Sometimes `client.persist(data)` will run out of memory, not because
    fully-realized data is large, but because of intermediate data memory
    requirements. This is particularly common when using local dask cluster
    with only one worker.

    This function forces evaluation order of the dask graph to control peak
    memory usage.

    Say you have a largish task graph of 10x10 top-level sub-tasks, you have
    enough memory to process 5 sub-tasks concurrently, but Dask might decide
    to schedule more than that and will cause worker restarts due to out of
    memory errors. With this function you can force dask scheduler to
    persist this collection in batches of 5 concurrent sub-tasks, keeping
    the computation within the memory budget.
    """
    blocks = data.to_delayed().ravel()

    # Every persisted group is collected here while the loop runs.
    persisted = []
    for group in partition_all(n_concurrent, blocks):
        group = client.persist(group)
        dask_wait(group)  # block until this group has finished
        persisted.extend(group)
        if verbose:
            print(".", end="")

    # at this point it should be almost no-op
    return client.persist(data)
def from_checkpoints(self, chunk_size=1000):
    """Initial sync strategy: read from blocks on disk.

    This methods scans for files matching ./checkpoints/*.json.lst
    and uses them for hive's initial sync. Each line must contain
    exactly one block in JSON format.

    Files are visited in ascending order of the integer prefix of their
    file name; a file is loaded only when that number exceeds the current
    head block.
    """
    # pylint: disable=no-self-use
    last_block = Blocks.head_num()

    def tuplize(path):
        return [int(path.split('/')[-1].split('.')[0]), path]

    basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
    files = glob.glob(basedir + "/checkpoints/*.json.lst")
    checkpoints = sorted(map(tuplize, files), key=lambda f: f[0])

    last_read = 0
    for num, path in checkpoints:
        if num > last_block:
            log.info("[SYNC] Load %s. Last block: %d", path, last_block)
            with open(path) as f:
                # each line in file represents one block
                # we can skip the blocks we already have
                skip_lines = last_block - last_read
                remaining = drop(skip_lines, f)
                for lines in partition_all(chunk_size, remaining):
                    Blocks.process_multi(map(json.loads, lines), True)
            last_block = num
        last_read = num
def _split_bed(bed_input, data):
    """Split BED file into sections for processing, allowing better multicore usage.

    Each section holds up to 100000 lines and is written to
    ``<base>-<index><ext>`` unless that file is already up to date. Returns
    a list of dicts describing the sections; for an empty input the list
    holds a single entry pointing at the input itself.
    """
    split_lines = 100000
    # Peel off up to two extensions so a double suffix stays at the end of
    # the generated file names.
    base, last_ext = os.path.splitext(bed_input)
    base, inner_ext = os.path.splitext(base)
    ext = inner_ext + last_ext

    split_info = []
    with open(bed_input) as in_handle:
        groups = tz.partition_all(split_lines, in_handle)
        for cur_index, line_group in enumerate(groups):
            cur_file = "%s-%s%s" % (base, cur_index, ext)
            if not utils.file_uptodate(cur_file, bed_input):
                with file_transaction(data, cur_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.writelines(line_group)
            split_info.append({
                "i": cur_index,
                "orig": bed_input,
                "file": cur_file
            })
    if not split_info:  # empty input file
        split_info.append({"file": bed_input, "orig": bed_input})
    return split_info
def append_iterator_to_table(t, rows, dshape=None, bind=None, **kwargs):
    """Insert ``rows`` into the SQL table ``t`` in batches of 1000.

    The table is created first when it does not exist on the engine
    resolved by ``getbind(t, bind)``. Rows may be dicts, or tuples/lists
    which are zipped with column names taken from ``dshape`` (when its
    measure is a Record) or discovered from the table. Returns ``t``.

    Raises ValueError when the column names in ``dshape`` differ from the
    table's column names.
    """
    assert not isinstance(t, type)
    engine = getbind(t, bind)
    if not t.exists(bind=engine):
        t.create(bind=engine)

    # Peek at the first row to find out whether rows are tuples or dicts;
    # tuples are coerced to dicts below.
    rows = iter(rows)
    try:
        first = next(rows)
    except StopIteration:
        return t
    rows = chain([first], rows)

    if isinstance(first, (tuple, list)):
        dshape = dshape and datashape.dshape(dshape)
        if not (dshape and isinstance(dshape.measure, datashape.Record)):
            names = discover(t).measure.names
        else:
            names = dshape.measure.names
            if set(names) != set(discover(t).measure.names):
                raise ValueError("Column names of incoming data don't match "
                                 "column names of existing SQL table\n"
                                 "Names in SQL table: %s\n"
                                 "Names from incoming data: %s\n" %
                                 (discover(t).measure.names, names))
        rows = (dict(zip(names, row)) for row in rows)

    with engine.connect() as conn:
        for chunk in partition_all(1000, rows):  # TODO: 1000 is hardcoded
            conn.execute(t.insert(), chunk)

    return t
def into(a, b, **kwargs):
    """Load iterable ``b`` into ``a`` 1024 rows at a time.

    The first batch goes through ``into`` again; every later batch is
    transposed into columns and appended. ``a`` is flushed and returned.
    An empty ``b`` raises StopIteration from the initial ``next``.
    """
    batches = partition_all(1024, b)
    first = next(batches)
    a = into(a, first, **kwargs)
    for batch in batches:
        columns = list(zip(*batch))
        a.append(columns)
    a.flush()
    return a
def execute(file_name):
    """Load ``file_name`` and store it as a castra under
    ``./subreddit_dumps/``.

    Records are converted to DataFrames 200000 at a time; the first frame
    serves as the castra template.
    """
    records = load(file_name)
    frames = map(to_df, partition_all(200000, records))
    # peek returns the first frame together with an iterator over all frames.
    template, frames = peek(frames)

    castra = Castra('./subreddit_dumps/' + file_name + '.castra',
                    template=template,
                    categories=['distinguished', 'removal_reason'])
    castra.extend_sequence(frames, freq='3h')
def into(a, b, **kwargs):
    """Build a ``ctable`` from iterable ``b`` 1024 rows at a time.

    The first batch is transposed and each column converted to an ndarray
    to create the table (``kwargs`` go to ``ctable``); later batches are
    transposed and appended. The table is flushed and returned. An empty
    ``b`` raises StopIteration from the initial ``next``.
    """
    batches = partition_all(1024, b)
    first = next(batches)
    columns = [into(np.ndarray(0), column) for column in zip(*first)]
    a = ctable(columns, **kwargs)
    for batch in batches:
        a.append(list(zip(*batch)))
    a.flush()
    return a
def main():
    """Create one slide per group of six images, then write the app cache.

    Images are the ``.JPG`` files of ``images/`` in sorted order, paired
    with captions and a 1-based running index.
    """
    step = 6
    images = sorted(f for f in listdir('images/') if f.endswith('.JPG'))
    captions = image_captions()
    content = list(zip(images, captions, count(1)))
    size = len(content) // step

    for slide_no, group in enumerate(partition_all(step, content)):
        create_slide(slide_no, size, group)
    write_app_cache()
def iterator_to_numpy_chunks(seq, chunksize=1024, **kwargs):
    """Convert ``seq`` lazily into ``chunks(np.ndarray)``.

    The sequence is cut into pieces of ``chunksize`` items. The first piece
    is converted eagerly, the rest on demand. An empty ``seq`` raises
    StopIteration from the initial ``next``.
    """
    pieces = partition_all(chunksize, seq)
    head = convert(np.ndarray, next(pieces), **kwargs)

    def _():
        yield head
        for piece in pieces:
            yield convert(np.ndarray, piece, **kwargs)
    return chunks(np.ndarray)(_)
def into(a, b, **kwargs):
    """Load iterable ``b`` into ``a`` 1024 rows at a time.

    Only keyword arguments present in ``carray_keywords`` are kept. The
    first batch goes through ``into`` again; every later batch is
    transposed into columns and appended. ``a`` is flushed and returned.
    An empty ``b`` raises StopIteration from the initial ``next``.
    """
    kwargs = keyfilter(carray_keywords.__contains__, kwargs)
    batches = partition_all(1024, b)
    first = next(batches)
    a = into(a, first, **kwargs)
    for batch in batches:
        columns = list(zip(*batch))
        a.append(columns)
    a.flush()
    return a
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.

    fastq: input handed to read_fastq.
    bc: iterable of sample barcode strings (whitespace is stripped).
    cores: number of worker processes.
    nedit: 0 selects exact_sample_filter2; any other value selects
        correcting_sample_filter2 with a MutationHash built from nedit.

    Surviving reads are written to stdout.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2,
                            barcodehash=barcodehash)

    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, read_fastq(fastq))
        # One chunk per core at a time keeps memory bounded.
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(filter_sb, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Always release the worker processes.
        p.close()
        p.join()
def partition(grouper, sequence, npartitions, p, nelements=2**20):
    """ Partition a bag along a grouper, store partitions on disk

    ``sequence`` is consumed ``nelements`` items at a time. Within each
    block, items are grouped by ``grouper`` and each group is assigned to
    partition ``abs(hash(key)) % npartitions``. One mapping per block is
    appended to ``p``, which is returned.
    """
    for block in partition_all(nelements, sequence):
        buckets = defaultdict(list)
        for key, items in groupby(grouper, block).items():
            bucket = abs(hash(key)) % npartitions
            buckets[bucket].extend(items)
        p.append(buckets)
    return p
def cb_filter(fastq, bc1, bc2, cores):
    ''' Filters reads with non-matching barcodes

    Expects formatted fastq files.

    Parameters
    ----------
    fastq : passed to ``stream_fastq`` to obtain the reads
    bc1 : iterable of barcode strings (each one is stripped)
    bc2 : optional second iterable of barcode strings; left untouched when
        falsy
    cores : number of worker processes; also the number of 10000-read
        chunks dispatched per ``Pool.map`` call

    Surviving reads are written to ``sys.stdout``.  The worker pool is
    always shut down before returning, including when an error occurs.
    '''
    bc1 = set(cb.strip() for cb in bc1)
    if bc2:
        bc2 = set(cb.strip() for cb in bc2)
    filter_cb = partial(cb_filterer, bc1=bc1, bc2=bc2)
    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, stream_fastq(fastq))
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(filter_cb, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Every map() result has been consumed by now (or we are unwinding
        # from an error), so the workers can be stopped right away instead
        # of being leaked.
        p.terminate()
        p.join()
def iter_arrays(self, arrays_per_chunk=None):
    """Yield the arrays of this store, one at a time or in groups.

    With ``arrays_per_chunk=None`` each key in
    ``range(self.journal().numarrays())`` is fetched on its own via
    ``self.get([key])``.  With a positive value, keys are fetched in
    consecutive groups of at most that many via ``self.get(group)``.

    Raises ``ValueError`` for a value that is neither ``None`` nor bigger
    than 0.  As this is a generator, the error surfaces on first iteration.
    """
    one_by_one = arrays_per_chunk is None
    if not one_by_one and arrays_per_chunk <= 0:
        raise ValueError('arrays_per_chunk must be None or bigger than 0, it is %r' % arrays_per_chunk)
    keys = range(self.journal().numarrays())
    if one_by_one:
        for key in keys:
            yield self.get([key])
    else:
        for group in partition_all(arrays_per_chunk, keys):
            yield self.get(group)
def parallel_rebin(K, mz_axis, imzb):
    """Build a 3-D dask array whose blocks are computed by ``get_mz_images``.

    Parameters
    ----------
    K : int
        Maximum number of ``mz_axis`` entries per block along the first axis.
    mz_axis : iterable
        Values split into consecutive groups of at most ``K``.
    imzb : object
        Passed to every ``get_mz_images`` task; its ``height`` and ``width``
        give the (single-block) extent of the second and third axes.

    Returns
    -------
    ``da.Array`` named ``'x'`` with ``dtype=float``.  Its shape is printed
    before returning.
    """
    mz_axis_chunks = list(partition_all(K, mz_axis))

    # create dask array manually using tasks: one task per m/z block, with a
    # single block along each of the two remaining axes
    tasks = {('x', i, 0, 0): (get_mz_images, mz_chunk, imzb)
             for i, mz_chunk in enumerate(mz_axis_chunks)}
    chunks_mz = [len(c) for c in mz_axis_chunks]
    chunks_x = (imzb.height,)
    chunks_y = (imzb.width,)
    arr = da.Array(tasks, 'x',
                   chunks=(chunks_mz, chunks_x, chunks_y),
                   dtype=float)
    # Parenthesised form prints the same text on Python 2 and is valid
    # syntax on Python 3 (the bare ``print arr.shape`` statement is not).
    print(arr.shape)
    return arr
def append_local_file_to_hdfs(target, source, blocksize=100000, **kwargs):
    """Append the lines of a local text file to a file on HDFS.

    If listing the target path raises ``FileNotFound``, an empty file is
    created there first.  The local file at ``source.path`` is then read in
    groups of at most ``blocksize`` lines; each group is joined and appended
    to the target.  Returns ``target``.
    """
    hdfs_path = target.path.lstrip('/')
    missing = raises(FileNotFound, lambda: target.hdfs.list_dir(hdfs_path))
    if missing:
        target.hdfs.create_file(hdfs_path, '')

    with open(source.path, 'r') as handle:
        for lines in partition_all(blocksize, handle):
            payload = ''.join(lines)
            target.hdfs.append_file(hdfs_path, payload)

    return target
def nnls_frob(x, anchors):
    """Run ``nnlsm_blockpivot`` against ``anchors`` for the columns of ``x``.

    Columns of ``x`` are handled 100 at a time so that only one slice is
    materialised as a NumPy array at once.  The first element returned by
    ``nnlsm_blockpivot(anchors_array, block)`` is stored in the matching
    columns of the output.

    Returns an array of shape ``(anchors_array.shape[1], x.shape[1])``.

    NOTE(review): presumably a non-negative least-squares fit per column --
    confirm against ``nnlsm_blockpivot``.
    """
    n_columns = x.shape[1]
    basis = np.array(anchors)
    coeffs = np.zeros((basis.shape[1], n_columns))
    # work in blocks of columns so as to avoid loading everything into RAM
    column_blocks = partition_all(100, range(n_columns))
    for cols in column_blocks:
        block = np.array(x[:, cols])
        solution = nnlsm_blockpivot(basis, block)
        coeffs[:, cols] = solution[0]
    return coeffs
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    """Wrap an iterable as ``chunks(pd.DataFrame)`` of converted blocks.

    ``seq`` is consumed in blocks of at most ``chunksize`` items.  The first
    block is converted to a DataFrame immediately; the remaining blocks are
    converted lazily as the chunks are iterated.  ``**kwargs`` is forwarded
    to every ``convert`` call.  An empty ``seq`` gives
    ``chunks(pd.DataFrame)([])``.
    """
    pieces = partition_all(chunksize, seq)
    try:
        head = next(pieces)
    except StopIteration:
        return chunks(pd.DataFrame)([])

    first_frame = convert(pd.DataFrame, head, **kwargs)

    def frames():
        yield first_frame
        for piece in pieces:
            yield convert(pd.DataFrame, piece, **kwargs)

    return chunks(pd.DataFrame)(frames)
def get_parallel_regions_block(batch):
    """CWL target returning callable regions grouped into blocks.

    Regions are taken from the first sample in ``batch`` via
    ``_get_parallel_regions`` and grouped ten at a time.  Each group becomes
    a dict ``{"region_block": [...]}`` whose entries are
    ``"chrom:start-end"`` strings.
    """
    # Core information is not available here, so aim for roughly 10 regions
    # per block.
    block_size = 10

    def _as_block(region_group):
        formatted = ["%s:%s-%s" % (c, s, e) for c, s, e in region_group]
        return {"region_block": formatted}

    samples = [utils.to_single_data(d) for d in batch]
    regions = _get_parallel_regions(samples[0])
    return [_as_block(group) for group in tz.partition_all(block_size, regions)]
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes

    Expects formatted fastq files.

    Parameters
    ----------
    fastq : passed to ``detect_fastq_annotations`` and ``read_fastq``
    bc1 : barcode file, opened with ``open_gzipsafe``; one barcode per line
    bc2, bc3 : optional further barcode files; skipped when falsy
    cores : number of worker processes; also the number of 10000-read
        chunks dispatched per ``Pool.map`` call
    nedit : 0 selects ``exact_barcode_filter``; any other value builds a
        ``MutationHash`` per barcode set and uses
        ``correcting_barcode_filter``

    Surviving reads are written to ``sys.stdout``.  The worker pool is
    always shut down before returning, including when an error occurs.
    '''
    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)
    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = MutationHash(bc2, nedit) if bc2 else None
        bc3hash = MutationHash(bc3, nedit) if bc3 else None
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash,
                            re_string=re_string)

    p = multiprocessing.Pool(cores)
    try:
        chunks = tz.partition_all(10000, read_fastq(fastq))
        bigchunks = tz.partition_all(cores, chunks)
        for bigchunk in bigchunks:
            for chunk in p.map(filter_cb, list(bigchunk)):
                for read in chunk:
                    sys.stdout.write(read)
    finally:
        # Every map() result has been consumed by now (or we are unwinding
        # from an error), so the workers can be stopped right away instead
        # of being leaked.
        p.terminate()
        p.join()
def _batch_gvcfs(data, region, vrn_files, ref_file, out_file=None):
    """Combine gVCF files in batches while there are MAX_BATCH or more of them.

    Fewer than ``MAX_BATCH`` inputs are returned unchanged.  Otherwise the
    inputs are grouped ``MAX_BATCH`` at a time, each group is combined by
    ``_run_combine_gvcfs`` into ``<base>-b<index><ext>`` (derived from
    ``out_file``, which defaults to the first input), and the function
    recurses on the combined outputs.
    """
    if out_file is None:
        out_file = vrn_files[0]
    if len(vrn_files) < MAX_BATCH:
        return vrn_files

    combined = []
    for index, group in enumerate(tz.partition_all(MAX_BATCH, vrn_files)):
        base, ext = utils.splitext_plus(out_file)
        group_out = "%s-b%s%s" % (base, index, ext)
        combined.append(
            _run_combine_gvcfs(group, region, ref_file, group_out, data))
    # The batch outputs may themselves need another round of combining.
    return _batch_gvcfs(data, region, combined, ref_file)
def reduction(self, perpartition, aggregate, split_every=None, out_type=Item, name=None):
    """ Reduce collection with reduction operators

    Parameters
    ----------
    perpartition: function
        reduction to apply to each partition
    aggregate: function
        reduction to apply to the results of all partitions
    split_every: int (optional)
        Group partitions into groups of this size while performing reduction
        Defaults to 8.  ``False`` aggregates all partitions in one step.
    out_type: {Bag, Item}
        The out type of the result, Item if a single element, Bag if a list
        of elements.  Defaults to Item.
    name: str (optional)
        Prefix for the generated task names; defaults to the names of
        ``perpartition`` and ``aggregate``.

    Raises
    ------
    ValueError
        If there are several partitions and ``split_every`` is smaller
        than 2 (the aggregation tree could never shrink).

    Examples
    --------
    >>> b = from_sequence(range(10))
    >>> b.reduction(sum, sum).compute()
    45
    """
    if split_every is None:
        split_every = 8
    if split_every is False:
        split_every = self.npartitions
    if self.npartitions > 1 and split_every < 2:
        # Groups of fewer than two never reduce the number of intermediate
        # results, so the loop below would not terminate.
        raise ValueError('split_every must be at least 2, got %r'
                         % (split_every,))

    token = tokenize(self, perpartition, aggregate, split_every)
    a = '%s-part-%s' % (name or funcname(perpartition), token)
    dsk = dict(((a, i), (perpartition, (self.name, i)))
               for i in range(self.npartitions))
    k = self.npartitions
    b = a
    fmt = '%s-aggregate-%s' % (name or funcname(aggregate), token)
    depth = 0

    # Always run at least one aggregation level: with a single partition the
    # result of ``perpartition`` must still go through ``aggregate``.
    while True:
        c = fmt + str(depth)
        dsk2 = dict(((c, i), (aggregate, [(b, j) for j in inds]))
                    for i, inds in enumerate(partition_all(split_every,
                                                           range(k))))
        dsk.update(dsk2)
        k = len(dsk2)
        b = c
        depth += 1
        if k <= 1:
            break

    if out_type is Item:
        # An Item is addressed by the bare name rather than (name, 0).
        dsk[b] = dsk.pop((b, 0))
        return Item(merge(self.dask, dsk), b)
    else:
        return Bag(merge(self.dask, dsk), b, 1)
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20), single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.

    Parameters
    ----------
    docs : iterable of documents, passed one by one to ``fetch``
    k : number of clusters
    batch_size : number of documents fetched and vectorized per step
    n_features : number of hashed features for the vectorizer
    single_pass : label each batch right after the model was updated with it
        (True) or label everything in a second pass with the final model

    Returns the result of ``group_clusters(docs, labels)``.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        # Fetch only the documents of this batch (not the whole corpus).
        X = v.transform([fetch(doc) for doc in batch])
        # partial_fit updates the existing model; fit/fit_predict would
        # start over and forget all earlier batches.
        km.partial_fit(X)
        if single_pass:
            labels.extend(km.predict(X).tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            X = v.transform([fetch(doc) for doc in batch])
            labels.extend(km.predict(X).tolist())

    return group_clusters(docs, labels)
def iterator_to_numpy_chunks(seq, chunksize=1024, **kwargs):
    """Wrap an iterable as ``chunks(np.ndarray)`` of converted blocks.

    ``seq`` is consumed in blocks of at most ``chunksize`` items.  The first
    block is converted to an array immediately; later blocks are converted
    lazily.  ``**kwargs`` is forwarded to every ``convert`` call.  For an
    empty ``seq`` the chunks yield a single array converted from ``[]``.
    """
    pieces = partition_all(chunksize, seq)
    try:
        head = next(pieces)
    except StopIteration:
        # Nothing to convert: produce one chunk built from an empty list.
        def arrays():
            yield convert(np.ndarray, [], **kwargs)

        return chunks(np.ndarray)(arrays)

    head_array = convert(np.ndarray, head, **kwargs)

    def arrays():
        yield head_array
        for piece in pieces:
            yield convert(np.ndarray, piece, **kwargs)

    return chunks(np.ndarray)(arrays)
def test_blocked():
    """read_csv_from_bytes handles files that arrive as several byte blocks.

    Every file's bytes are split on newlines and regrouped two lines per
    block.  The result must equal ``expected``, both for all columns and
    when restricted to ``name`` and ``id`` through ``usecols``.
    """
    blocks = []
    for key in sorted(files):
        lines = files[key].split(b'\n')
        blocks.append([b'\n'.join(pair) for pair in partition_all(2, lines)])

    def check(want, kwargs):
        df = read_csv_from_bytes(blocks, header, want.head(), kwargs)
        eq(df.compute().reset_index(drop=True),
           want.reset_index(drop=True),
           check_dtype=False)

    check(expected, {})
    check(expected[['name', 'id']], {'usecols': ['name', 'id']})