Example #1
def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned,
                      sum_quals, split_weight, disc_weight, max_reads, debug,
                      cores, breakpoint_batch_size, ref_fasta):

    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [get_breakpoint_regions(b, sample, z) for b in breakpoints]
    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(
        partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))

    if len(breakpoints_batches) != len(regions_batches):
        raise RuntimeError(
            "Batch error: breakpoint batches ({}) != region batches ({})".
            format(breakpoints_batches, regions_batches))

    logit("Number of batches to parallel process: {}".format(
        len(breakpoints_batches)))

    std_args = (sample.bam.filename, ref_fasta, sample.rg_to_lib,
                sample.active_libs, sample.name, split_slop, min_aligned,
                split_weight, disc_weight, max_reads, debug)

    pool = mp.Pool(processes=cores)
    results = [
        pool.apply_async(parallel_calculate_genotype,
                         args=std_args + (b, r, i))
        for i, (b, r) in enumerate(zip(breakpoints_batches, regions_batches))
    ]
    results = [p.get() for p in results]
    logit("Finished parallel breakpoint processing")
    logit("Merging genotype results")
    merged_genotypes = {
        g['variant.id']: g
        for batch in results for g in batch['genotypes']
    }

    total_variants_skipped = sum([batch['skip-count'] for batch in results])
    total_variants_with_no_reads = sum(
        [batch['no-read-count'] for batch in results])

    logit("Number of variants skipped (surpassed max-reads threshold): {}".
          format(total_variants_skipped))
    logit("Number of variants with no reads: {}".format(
        total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample,
                           sum_quals)
    logit("All Done!")
Example #2
def genotype_parallel(src_vcf, out_vcf, sample, z, split_slop, min_aligned, sum_quals, split_weight, disc_weight, max_reads, max_ci_dist, debug, cores, breakpoint_batch_size, ref_fasta):

    # cleanup unused library attributes
    for rg in sample.rg_to_lib:
        sample.rg_to_lib[rg].cleanup()

    # 1st pass through input vcf -- collect all the relevant breakpoints
    logit("Collecting breakpoints")
    breakpoints = collect_breakpoints(src_vcf, max_ci_dist)
    logit("Number of breakpoints/SVs to process: {}".format(len(breakpoints)))
    logit("Collecting regions")
    regions = [ get_breakpoint_regions(b, sample, z) for b in breakpoints ]
    logit("Batch breakpoints into groups of {}".format(breakpoint_batch_size))
    breakpoints_batches = list(partition_all(breakpoint_batch_size, breakpoints))
    logit("Batch regions into groups of {}".format(breakpoint_batch_size))
    regions_batches = list(partition_all(breakpoint_batch_size, regions))

    if len(breakpoints_batches) != len(regions_batches):
        raise RuntimeError("Batch error: breakpoint batches ({}) != region batches ({})".format(breakpoints_batches, regions_batches))

    logit("Number of batches to parallel process: {}".format(len(breakpoints_batches)))

    std_args = (
        sample.bam.filename,
        ref_fasta,
        sample.rg_to_lib,
        sample.active_libs,
        sample.name,
        split_slop,
        min_aligned,
        split_weight,
        disc_weight,
        max_reads,
        debug
    )

    pool = mp.Pool(processes=cores)
    results = [pool.apply_async(parallel_calculate_genotype, args=std_args + (b, r, i)) for i, (b, r) in enumerate(zip(breakpoints_batches, regions_batches))]
    results = [p.get() for p in results]
    logit("Finished parallel breakpoint processing")
    logit("Merging genotype results")
    merged_genotypes = { g['variant.id'] : g for batch in results for g in batch['genotypes'] }

    total_variants_skipped = sum([ batch['skip-count'] for batch in results ])
    total_variants_with_no_reads = sum([ batch['no-read-count'] for batch in results ])

    logit("Number of variants skipped (surpassed max-reads threshold): {}".format(total_variants_skipped))
    logit("Number of variants with no reads: {}".format(total_variants_with_no_reads))

    # 2nd pass through input vcf -- apply the calculated genotypes to the variants
    logit("Applying genotype results to vcf")
    apply_genotypes_to_vcf(src_vcf, out_vcf, merged_genotypes, sample, sum_quals)
    logit("All Done!")
Example #3
    async def write_pending_messages(self) -> int:
        """Flushes the buffer, if there are items in it, to the message store.

        The return value is the number of records that were successfully synced.
        """
        if self._pending.empty():
            return 0

        total = 0
        split_n = math.ceil(self._pending.maxsize / 4.0)

        # ensure the queue is empty before returning
        while not self._pending.empty():
            pending: List[SerializedMessage] = []

            # flush the entire queue
            while not self._pending.empty():
                pending.append(await self._pending.get())

            # divide all the pending into like-typed instances
            partitioned = groupby(attrgetter('type'), pending)
            for _, items in partitioned.items():
                # enumerate the chunks so each gets its own index for a connection name
                for idx, bundle in enumerate(partition_all(split_n, items)):
                    async with self.connection('write_pending_messages-%d' %
                                               idx) as c:
                        async with c.transaction():
                            for msg in bundle:
                                await c.fetchrow(Procs.write_message, *msg)
                                total += 1
        # ~~ no more in pending queue
        return total
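With the enumerate() fix above, each like-typed group is written in fixed-size chunks and every chunk gets its own index. A small synchronous sketch of the groupby-plus-partition_all pattern, using a made-up Msg record in place of SerializedMessage:

from collections import namedtuple
from operator import attrgetter
from toolz import groupby, partition_all

# Msg and the chunk size of 2 are illustrative only
Msg = namedtuple("Msg", ["type", "body"])
pending = [Msg("a", 1), Msg("b", 2), Msg("a", 3), Msg("a", 4), Msg("b", 5)]

# group messages by type, then write each group in chunks
for mtype, items in groupby(attrgetter("type"), pending).items():
    for idx, bundle in enumerate(partition_all(2, items)):
        print(mtype, idx, bundle)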
Example #4
    def partial_fit(self, X, y):

        for i, batch in enumerate(toolz.partition_all(self.batch_size, X)):
            print('batch {}'.format(i))

            rows = []
            response = []

            for row in batch:

                try:
                    row_y = next(y)
                    if self.check_response(row_y):
                        row = self._transform(row)
                        rows.append(row)
                        response.append(row_y)
                except Exception as e:
                    if self.logging:
                        logging.exception(e)

            shuffledRange = list(range(len(rows)))
            # need to shuffle data during each iteration
            for _ in range(self.n_iter):
                random.shuffle(shuffledRange)
                batch_data = sp.sparse.vstack([rows[i] for i in shuffledRange])
                shuffled_response = [response[i] for i in shuffledRange]
                self.steps[-1].partial_fit(batch_data,
                                           shuffled_response,
                                           classes=[0, 1])
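The two partial_fit examples advance y by hand with next() while batching X, which only stays correct if y is an iterator consumed in lockstep with X. One way to avoid that bookkeeping, assuming X and y are equal-length iterables, is to zip them before batching; a sketch (batched_pairs is a hypothetical helper, not part of the code above):

import toolz

def batched_pairs(X, y, batch_size):
    # zip keeps each row paired with its label, so a dropped row cannot
    # shift the remaining labels out of alignment
    for batch in toolz.partition_all(batch_size, zip(X, y)):
        rows = [row for row, _ in batch]
        labels = [label for _, label in batch]
        yield rows, labels

for rows, labels in batched_pairs(range(10), "abcdefghij", 4):
    print(rows, labels)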
Example #5
    def partial_fit(self, X, y):

        for i, batch in enumerate(toolz.partition_all(self.batch_size, X)):
            print('batch {}'.format(i))

            rows = []
            response = []

            for row in batch:

                try:
                    row_y = next(y)
                    if self.check_response(row_y):
                        row = self._transform(row)
                        rows.append(row)
                        response.append(row_y)
                except Exception as e:
                    if self.logging:
                        logging.exception(e)

            shuffledRange = list(range(len(rows)))
            # need to shuffle data during each iteration
            for _ in range(self.n_iter):
                random.shuffle(shuffledRange)
                batch_data = sp.sparse.vstack([rows[i] for i in shuffledRange])
                shuffled_response = [response[i] for i in shuffledRange]
                self.steps[-1].partial_fit(batch_data, shuffled_response, classes=[0, 1])
Example #6
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []

    # Regression test: https://github.com/pycytoolz/cytoolz/issues/387
    class NoCompare(object):
        def __eq__(self, other):
            if self.__class__ == other.__class__:
                return True
            raise ValueError()

    obj = NoCompare()
    result = [(obj, obj, obj, obj), (obj, obj, obj)]
    assert list(partition_all(4, [obj] * 7)) == result
    assert list(partition_all(4, iter([obj] * 7))) == result
Example #7
    def _request_block_parts(
            self, headers: List[BlockHeader],
            request_func: Callable[[ETHPeer, List[BlockHeader]], None]) -> int:
        length = math.ceil(len(headers) / len(self.peer_pool.peers))
        batches = list(partition_all(length, headers))
        for peer, batch in zip(self.peer_pool.peers, batches):
            request_func(cast(ETHPeer, peer), batch)
        return len(batches)
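The batch length above is ceil(len(headers) / len(peers)), which guarantees at most one batch per peer, so the zip over peers and batches never drops a header. A toy calculation showing that arithmetic (headers and peer names are stand-ins):

import math
from toolz import partition_all

headers = list(range(10))   # stand-ins for BlockHeader objects
peers = ["peer-a", "peer-b", "peer-c"]

length = math.ceil(len(headers) / len(peers))      # 4
batches = list(partition_all(length, headers))     # [(0..3), (4..7), (8, 9)]
assert len(batches) <= len(peers)                  # every batch gets a peer

for peer, batch in zip(peers, batches):
    print(peer, batch)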
Example #8
    async def request_nodes(self, node_keys: List[bytes]) -> None:
        batches = list(partition_all(eth.MAX_STATE_FETCH, node_keys))
        for batch in batches:
            peer = await self.get_idle_peer()
            now = time.time()
            for node_key in batch:
                self._pending_nodes[node_key] = now
            self.logger.debug("Requesting %d trie nodes to %s", len(batch), peer)
            peer.sub_proto.send_get_node_data(batch)
            self._peers_with_pending_requests[peer] = now
Example #9
def get_model_preds(model: Model, texts: List[str],
                    classes: np.ndarray) -> List[str]:
    """
    Get model predictions for multiple texts as class labels rather than as a 2dim
    matrix of prediction probabilities.
    """
    # predict in batches, otherwise memory blows UP
    results = (result for texts_pt in itertoolz.partition_all(1000, texts)
               for result in get_topn_preds_and_probs(model.predict(texts_pt),
                                                      1, classes))
    return [lang for result in results for lang, _ in result]
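The generator above evaluates the model 1000 texts at a time so only one batch of probabilities is materialized at once. A stripped-down sketch of the same batching idea with a stand-in predict function (model.predict and get_topn_preds_and_probs are project-specific and not reproduced here):

from toolz import itertoolz

def predict_in_batches(predict_fn, texts, batch_size=1000):
    # evaluate the model on fixed-size slices so only one batch of
    # results is held in memory at a time
    for chunk in itertoolz.partition_all(batch_size, texts):
        yield from predict_fn(list(chunk))

# usage with a stand-in "model" that labels texts by length parity
labels = list(predict_in_batches(
    lambda batch: ["even" if len(t) % 2 == 0 else "odd" for t in batch],
    ["aa", "bbb", "cccc"], batch_size=2))
print(labels)   # ['even', 'odd', 'even']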
Example #10
    def predict_proba(self, newX):

        all_preds = []
        for batch in toolz.partition_all(self.batch_size, newX):
            pred_rows = []
            for newrow in batch:
                newrow = self._transform(newrow)
                pred_rows.append(newrow)
            test_data = sp.sparse.vstack(pred_rows)
            all_preds.append(self.steps[-1].predict_proba(test_data))

        return np.vstack(all_preds)
Example #11
    def predict_proba(self, newX):

        all_preds = []
        for batch in toolz.partition_all(self.batch_size, newX):
            pred_rows = []
            for newrow in batch:
                newrow = self._transform(newrow)
                pred_rows.append(newrow)
            test_data = sp.sparse.vstack(pred_rows)
            all_preds.append(self.steps[-1].predict_proba(test_data))

        return np.vstack(all_preds)
Example #12
async def caching_downloader(get, set, downloader, tiles, num_workers, **kw):
    """
    Download tiles from cache and missing tiles with the downloader.

    Asynchronous generator of map tiles is returned.

    The code flow is

    - caching downloader gets tile data from cache using URLs
    - the original downloader is used to download missing tile data
    - cache is updated with all existing tile data

    The cache getter function (`get` parameter) should return `None` if
    tile data is not in cache for given URL.

    A collection of tiles is returned.

    :param get: Function to get a tile data from cache.
    :param set: Function to put a tile data in cache.
    :param downloader: Original tiles downloader (asyncio coroutine).
    :param tiles: Collection of tiles to fetch.
    :param num_workers: Number of workers used to connect to a map provider
        service.
    :param kw: Parameters passed to downloader coroutine.
    """
    tiles = fetch_from_cache(get, tiles)
    groups = partition_all(10, tiles)
    for tg in groups:
        missing = groupby(lambda t: t.img is None, tg)
        for t in missing.get(False, []):
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t

        result = downloader(missing.get(True, []), num_workers, **kw)
        async for t in result:
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t
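Inside each group of 10 tiles, groupby splits cache hits from misses by testing whether img is None. A synchronous sketch of that split, with plain (url, img) tuples standing in for the tile objects used above:

from toolz import groupby, partition_all

# tile stand-ins: (url, img) where img is None for a cache miss
tiles = [("u1", b"img"), ("u2", None), ("u3", b"img"), ("u4", None), ("u5", None)]

for group in partition_all(3, tiles):
    by_missing = groupby(lambda t: t[1] is None, group)
    cached = by_missing.get(False, [])
    missing = by_missing.get(True, [])
    print("cached:", [u for u, _ in cached], "missing:", [u for u, _ in missing])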
Example #13
async def caching_downloader(get, set, downloader, tiles, num_workers, **kw):
    """
    Download tiles from cache and missing tiles with the downloader.

    Asynchronous generator of map tiles is returned.

    The code flow is

    - caching downloader gets tile data from cache using URLs
    - the original downloader is used to download missing tile data
    - cache is updated with all existing tile data

    The cache getter function (`get` parameter) should return `None` if
    tile data is not in cache for given URL.

    A collection of tiles is returned.

    :param get: Function to get a tile data from cache.
    :param set: Function to put a tile data in cache.
    :param downloader: Original tiles downloader (asyncio coroutine).
    :param tiles: Collection of tiles to fetch.
    :param num_workers: Number of workers used to connect to a map provider
        service.
    :param kw: Parameters passed to downloader coroutine.
    """
    tiles = fetch_from_cache(get, tiles)
    groups = partition_all(10, tiles)
    for tg in groups:
        missing = groupby(lambda t: t.img is None, tg)
        for t in missing.get(False, []):
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t

        result = downloader(missing.get(True, []), num_workers, **kw)
        async for t in result:
            # reset cache for new and old tiles
            set(t.url, t.img)
            yield t
Example #14
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []
Example #15
def test_partition_all():
    assert list(partition_all(2, [1, 2, 3, 4])) == [(1, 2), (3, 4)]
    assert list(partition_all(3, range(5))) == [(0, 1, 2), (3, 4)]
    assert list(partition_all(2, [])) == []
Example #16
def download_twitter_data(dirpath, creds_fpath, force=False):
    """
    Download two collections of (lang, tweet_id) pairs from Twitter --
    a "uniformly sampled" collection of ~120k tweets over all languages and
    a "recall oriented" collection of ~1.5k tweets per language --
    then fetch available tweets' data from the Twitter API.

    Args:
        dirpath (str or :class:`pathlib.Path`)
        creds_fpath (str or :class:`pathlib.Path`)
        force (bool)

    References:
        https://blog.twitter.com/engineering/en_us/a/2015/evaluating-language-identification-performance.html

    TODO: Ideally, use a tweet search endpoint and filter by language,
    then just iterate over all ISO-639-1 language codes.
    """
    dirpath = textacy.utils.to_path(dirpath).resolve()
    url_fnames = [
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/uniformly_sampled.tsv",
            "uniformly_sampled.tsv",
        ),
        (
            "https://raw.githubusercontent.com/mitjat/langid_eval/master/recall_oriented.tsv",
            "recall_oriented.tsv",
        ),
    ]
    # download tweet ids first
    for url, fname in url_fnames:
        textacy.io.download_file(url,
                                 filename=fname,
                                 dirpath=dirpath,
                                 force=force)
    # download full tweets data next
    tweets_fpath = dirpath.joinpath("tweets.jsonl")
    if tweets_fpath.is_file() and force is False:
        logging.info("tweets data already downloaded to %s", tweets_fpath)
        return

    # load twitter ids data from disk
    tweet_lang_ids = []
    for fname in ["uniformly_sampled.tsv", "recall_oriented.tsv"]:
        tweet_lang_ids.extend(
            textacy.io.read_csv(
                dirpath.joinpath(fname),
                delimiter="\t",
                fieldnames=["lang", "status_id"],
                quoting=1,
            ))
    logging.info("loaded %s tweet ids from disk", len(tweet_lang_ids))
    # parse status ids
    status_ids = set()
    for row in tweet_lang_ids:
        try:
            status_ids.add(int(row["status_id"]))
        # there are a small handful of bad status ids, shrug
        except ValueError:
            pass
    logging.info("... of which %s had valid, unique ids", len(status_ids))
    status_ids = list(status_ids)
    # instantiate twitter api client
    with textacy.utils.to_path(creds_fpath).resolve().open(mode="rt") as f:
        creds = yaml.safe_load(f.read())
    api = twitter.Api(sleep_on_rate_limit=True, **creds)
    # get tweets data in chunks
    chunk_size = 100
    pbar = tqdm.tqdm(total=len(status_ids), unit="tweets")
    tweets = []
    try:
        for chunk_ids in itertoolz.partition_all(chunk_size, status_ids):
            chunk_tweets = api.GetStatuses(chunk_ids,
                                           trim_user=True,
                                           include_entities=True,
                                           map=False)
            tweets.extend(chunk_tweets)
            pbar.update(len(chunk_ids))
    except Exception:
        logging.exception("encountered an error while downloading tweets")
    finally:
        pbar.close()
        tweets = [tweet.AsDict() for tweet in tweets]
        logging.info("downloaded data for %s tweets", len(tweets))
        textacy.io.write_json(tweets, tweets_fpath, mode="wt", lines=True)