def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                      sum_quals, split_weight, disc_weight, max_reads, debug,
                      cores, breakpoint_batch_size, ref_fasta):
    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [get_breakpoint_regions(b, sample, z) for b in breakpoints]

    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))
    if len(breakpoints_batches) != len(regions_batches):
        raise RuntimeError(
            "Batch error: breakpoint batches ({}) != region batches ({})".format(
                breakpoints_batches, regions_batches))
    logit("Number of batches to parallel process: {}".format(len(breakpoints_batches)))

    std_args = (sample.bam.filename, ref_fasta, sample.rg_to_lib,
                sample.active_libs, sample.name, split_slop, min_aligned,
                split_weight, disc_weight, max_reads, debug)

    pool = mp.Pool(processes=cores)
    results = [
        pool.apply_async(parallel_calculate_genotype, args=std_args + (b, r, i))
        for i, (b, r) in enumerate(zip(breakpoints_batches, regions_batches))
    ]
    results = [p.get() for p in results]
    logit("Finished parallel breakpoint processing")

    logit("Merging genotype results")
    merged_genotypes = {
        g['variant.id']: g
        for batch in results for g in batch['genotypes']
    }

    total_variants_skipped = sum([batch['skip-count'] for batch in results])
    total_variants_with_no_reads = sum([batch['no-read-count'] for batch in results])
    logit("Number of variants skipped (surpassed max-reads threshold): {}".format(
        total_variants_skipped))
    logit("Number of variants with no reads: {}".format(total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample, sum_quals)
    logit("All Done!")
def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                      sum_quals, split_weight, disc_weight, max_reads,
                      max_ci_dist, debug, cores, breakpoint_batch_size,
                      ref_fasta):
    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf, max_ci_dist)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [get_breakpoint_regions(b, sample, z) for b in breakpoints]

    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))
    if len(breakpoints_batches) != len(regions_batches):
        raise RuntimeError(
            "Batch error: breakpoint batches ({}) != region batches ({})".format(
                breakpoints_batches, regions_batches))
    logit("Number of batches to parallel process: {}".format(len(breakpoints_batches)))

    std_args = (sample.bam.filename, ref_fasta, sample.rg_to_lib,
                sample.active_libs, sample.name, split_slop, min_aligned,
                split_weight, disc_weight, max_reads, debug)

    pool = mp.Pool(processes=cores)
    results = [
        pool.apply_async(parallel_calculate_genotype, args=std_args + (b, r, i))
        for i, (b, r) in enumerate(zip(breakpoints_batches, regions_batches))
    ]
    results = [p.get() for p in results]
    logit("Finished parallel breakpoint processing")

    logit("Merging genotype results")
    merged_genotypes = {
        g['variant.id']: g
        for batch in results for g in batch['genotypes']
    }

    total_variants_skipped = sum([batch['skip-count'] for batch in results])
    total_variants_with_no_reads = sum([batch['no-read-count'] for batch in results])
    logit("Number of variants skipped (surpassed max-reads threshold): {}".format(
        total_variants_skipped))
    logit("Number of variants with no reads: {}".format(total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample, sum_quals)
    logit("All Done!")
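# A minimal, self-contained sketch of the batch-and-dispatch pattern used by
# genotype_parallel above: chunk the work with partition_all, submit one task
# per chunk with Pool.apply_async, then collect results in submission order.
# The `process_batch` worker here is a hypothetical stand-in for
# parallel_calculate_genotype, not the svtyper implementation.
import multiprocessing as mp
from toolz import partition_all


def process_batch(batch, batch_id):
    # placeholder worker: report which batch it received and how many items it got
    return batch_id, len(batch)


if __name__ == "__main__":
    items = range(1000)
    batches = list(partition_all(100, items))
    with mp.Pool(processes=4) as pool:
        async_results = [
            pool.apply_async(process_batch, args=(batch, i))
            for i, batch in enumerate(batches)
        ]
        # .get() blocks on each task in turn, so results keep submission order
        results = [r.get() for r in async_results]
    print(results)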
async def write_pending_messages(self) -> int:
    """Flushes the buffer, if there are items in it, to the message store.

    The return value is the number of records that were successfully synced.
    """
    if self._pending.empty():
        return 0

    total = 0
    split_n = math.ceil(self._pending.maxsize / 4.0)

    # ensure the queue is empty before returning
    while not self._pending.empty():
        pending: List[SerializedMessage] = []
        # flush the entire queue
        while not self._pending.empty():
            pending.append(await self._pending.get())

        # divide all the pending messages into like-typed instances
        partitioned = groupby(attrgetter('type'), pending)
        for _, items in partitioned.items():
            # write each group in fixed-size bundles, one transaction per bundle
            for idx, bundle in enumerate(partition_all(split_n, items)):
                async with self.connection('write_pending_messages-%d' % idx) as c:
                    async with c.transaction():
                        for msg in bundle:
                            await c.fetchrow(Procs.write_message, *msg)
                            total += 1

    # ~~ no more in pending queue
    return total
def partial_fit(self, X, y):
    for i, batch in enumerate(toolz.partition_all(self.batch_size, X)):
        print('batch {}'.format(i))
        rows = []
        response = []
        for row in batch:
            try:
                row_y = next(y)
                if self.check_response(row_y):
                    row = self._transform(row)
                    rows.append(row)
                    response.append(row_y)
            except Exception as e:
                if self.logging:
                    logging.exception(e)

        # need to shuffle data during each iteration
        shuffled_range = list(range(len(rows)))
        for _ in range(self.n_iter):
            random.shuffle(shuffled_range)
            batch_data = sp.sparse.vstack([rows[i] for i in shuffled_range])
            shuffled_response = [response[i] for i in shuffled_range]
            self.steps[-1].partial_fit(batch_data, shuffled_response, classes=[0, 1])
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []

    # Regression test: https://github.com/pycytoolz/cytoolz/issues/387
    class NoCompare(object):
        def __eq__(self, other):
            if self.__class__ == other.__class__:
                return True
            raise ValueError()

    obj = NoCompare()
    result = [(obj, obj, obj, obj), (obj, obj, obj)]
    assert list(partition_all(4, [obj] * 7)) == result
    assert list(partition_all(4, iter([obj] * 7))) == result
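# A small supplementary sketch (not part of the test suite above): partition_all
# is lazy, so it can chunk an unbounded iterator without materializing it, and
# only the final chunk may be shorter than the requested size.
import itertools
from toolz import partition_all

chunks = partition_all(3, itertools.count())    # no work happens yet
first_two = list(itertools.islice(chunks, 2))   # pull just the first two chunks
assert first_two == [(0, 1, 2), (3, 4, 5)]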
def _request_block_parts(
        self, headers: List[BlockHeader],
        request_func: Callable[[ETHPeer, List[BlockHeader]], None]) -> int:
    length = math.ceil(len(headers) / len(self.peer_pool.peers))
    batches = list(partition_all(length, headers))
    for peer, batch in zip(self.peer_pool.peers, batches):
        request_func(cast(ETHPeer, peer), batch)
    return len(batches)
async def request_nodes(self, node_keys: List[bytes]) -> None:
    batches = list(partition_all(eth.MAX_STATE_FETCH, node_keys))
    for batch in batches:
        peer = await self.get_idle_peer()
        now = time.time()
        for node_key in batch:
            self._pending_nodes[node_key] = now
        self.logger.debug("Requesting %d trie nodes to %s", len(batch), peer)
        peer.sub_proto.send_get_node_data(batch)
        self._peers_with_pending_requests[peer] = now
def get_model_preds(model: Model, texts: List[str], classes: np.ndarray) -> List[str]:
    """
    Get model predictions for multiple texts as class labels
    rather than as a 2dim matrix of prediction probabilities.
    """
    # predict in batches, otherwise memory blows UP
    results = (
        result
        for texts_pt in itertoolz.partition_all(1000, texts)
        for result in get_topn_preds_and_probs(model.predict(texts_pt), 1, classes)
    )
    return [lang for result in results for lang, _ in result]
def predict_proba(self, newX):
    all_preds = []
    for batch in toolz.partition_all(self.batch_size, newX):
        pred_rows = []
        for newrow in batch:
            newrow = self._transform(newrow)
            pred_rows.append(newrow)
        test_data = sp.sparse.vstack(pred_rows)
        all_preds.append(self.steps[-1].predict_proba(test_data))
    return np.vstack(all_preds)
async def caching_downloader(get, set, downloader, tiles, num_workers, **kw):
    """
    Download tiles from cache and missing tiles with the downloader.

    An asynchronous generator of map tiles is returned.

    The code flow is

    - caching downloader gets tile data from cache using URLs
    - the original downloader is used to download missing tile data
    - cache is updated with all existing tile data

    The cache getter function (`get` parameter) should return `None` if
    tile data is not in cache for given URL.

    :param get: Function to get a tile data from cache.
    :param set: Function to put a tile data in cache.
    :param downloader: Original tiles downloader (asyncio coroutine).
    :param tiles: Collection of tiles to fetch.
    :param num_workers: Number of workers used to connect to a map provider
        service.
    :param kw: Parameters passed to downloader coroutine.
    """
    tiles = fetch_from_cache(get, tiles)
    groups = partition_all(10, tiles)
    for tg in groups:
        missing = groupby(lambda t: t.img is None, tg)

        for t in missing.get(False, []):
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t

        result = downloader(missing.get(True, []), num_workers, **kw)
        async for t in result:
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t
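# A minimal sketch of the cache-split step above, using plain dicts instead of
# tile objects (the `records` list and its keys are illustrative assumptions):
# toolz.groupby keys each chunk by whether the image data is missing, which is
# how the downloader separates cached tiles from ones it still has to fetch.
from toolz import groupby, partition_all

records = [{"url": n, "img": b"data" if n % 2 else None} for n in range(7)]
for chunk in partition_all(3, records):
    split = groupby(lambda r: r["img"] is None, chunk)
    cached = split.get(False, [])    # already have image data
    to_fetch = split.get(True, [])   # would be handed to the real downloader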
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []
def download_twitter_data(dirpath, creds_fpath, force=False):
    """
    Download two collections of (lang, tweet_id) pairs from Twitter --
    a "uniformly sampled" collection of ~120k tweets over all languages
    and a "recall oriented" collection of ~1.5k tweets per language --
    then fetch available tweets' data from the Twitter API.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        creds_fpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://blog.twitter.com/engineering/en_us/a/2015/evaluating-language-identification-performance.html

    TODO: Ideally, use a tweet search endpoint and filter by language,
    then just iterate over all ISO-639-1 language codes.
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    url_fnames = [
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/uniformly_sampled.tsv",
            "uniformly_sampled.tsv",
        ),
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/recall_oriented.tsv",
            "recall_oriented.tsv",
        ),
    ]
    # download tweet ids first
    for url, fname in url_fnames:
        textacy.io.download_file(url, filename=fname, dirpath=dirpath, force=force)
    # download full tweets data next
    tweets_fpath = dirpath.joinpath("tweets.jsonl")
    if tweets_fpath.is_file() and force is False:
        logging.info("tweets data already downloaded to %s", tweets_fpath)
        return
    # load twitter ids data from disk
    tweet_lang_ids = []
    for fname in ["uniformly_sampled.tsv", "recall_oriented.tsv"]:
        tweet_lang_ids.extend(
            textacy.io.read_csv(
                dirpath.joinpath(fname),
                delimiter="\t",
                fieldnames=["lang", "status_id"],
                quoting=1,
            )
        )
    logging.info("loaded %s tweet ids from disk", len(tweet_lang_ids))
    # parse status ids
    status_ids = set()
    for row in tweet_lang_ids:
        try:
            status_ids.add(int(row["status_id"]))
        # there are a small handful of bad status ids, shrug
        except ValueError:
            pass
    logging.info("... of which %s had valid, unique ids", len(status_ids))
    status_ids = list(status_ids)
    # instantiate twitter api client
    with textacy.utils.to_path(creds_fpath).resolve().open(mode="rt") as f:
        creds = yaml.safe_load(f.read())
    api = twitter.Api(sleep_on_rate_limit=True, **creds)
    # get tweets data in chunks
    chunk_size = 100
    pbar = tqdm.tqdm(total=len(status_ids), unit="tweets")
    tweets = []
    try:
        for chunk_ids in itertoolz.partition_all(chunk_size, status_ids):
            chunk_tweets = api.GetStatuses(
                chunk_ids, trim_user=True, include_entities=True, map=False)
            tweets.extend(chunk_tweets)
            pbar.update(len(chunk_ids))
    except Exception:
        logging.exception("encountered an error while downloading tweets")
    finally:
        pbar.close()
    tweets = [tweet.AsDict() for tweet in tweets]
    logging.info("downloaded data for %s tweets", len(tweets))
    textacy.io.write_json(tweets, tweets_fpath, mode="wt", lines=True)