Example #1
def partial_reduce(func, x, split_every, keepdims=False, dtype=None, name=None):
    """Partial reduction across multiple axes.

    Parameters
    ----------
    func : function
    x : Array
    split_every : dict
        Maximum reduction block sizes in each dimension.

    Example
    -------
    Reduce across axis 0 and 2, merging a maximum of 1 block in the 0th
    dimension, and 3 blocks in the 2nd dimension:

    >>> partial_reduce(np.min, x, {0: 1, 2: 3})    # doctest: +SKIP
    """
    name = name or 'p_reduce-' + tokenize(func, x, split_every, keepdims, dtype)
    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n)
             in enumerate(x.numblocks)]
    keys = product(*map(range, map(len, parts)))
    out_chunks = [tuple(1 for p in partition_all(split_every[i], c)) if i
                  in split_every else c for (i, c) in enumerate(x.chunks)]
    if not keepdims:
        out_axis = [i for i in range(x.ndim) if i not in split_every]
        getter = lambda k: get(out_axis, k)
        keys = map(getter, keys)
        out_chunks = list(getter(out_chunks))
    dsk = {}
    for k, p in zip(keys, product(*parts)):
        decided = dict((i, j[0]) for (i, j) in enumerate(p) if len(j) == 1)
        dummy = dict(i for i in enumerate(p) if i[0] not in decided)
        g = lol_tuples((x.name,), range(x.ndim), decided, dummy)
        dsk[(name,) + k] = (func, g)
    return Array(merge(dsk, x.dask), name, out_chunks, dtype=dtype)
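The grouping logic above leans on partition_all: along each reduced axis it lumps block indices into groups no larger than split_every[i]. A minimal sketch with a hypothetical block grid, showing what `parts` looks like:

from toolz import partition_all

numblocks = (4, 2, 6)            # assumed chunk grid for x
split_every = {0: 1, 2: 3}

parts = [list(partition_all(split_every.get(i, 1), range(n)))
         for i, n in enumerate(numblocks)]
# parts == [[(0,), (1,), (2,), (3,)],     # axis 0: lumps of 1 block
#           [(0,), (1,)],                 # axis 1: not reduced, lumps of 1
#           [(0, 1, 2), (3, 4, 5)]]       # axis 2: lumps of up to 3 blocks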
Example #2
File: umis.py Project: roryk/umis
def cb_filter(fastq, bc1, bc2, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''

    bc1 = set(cb.strip() for cb in bc1)
    if bc2:
        bc2 = set(cb.strip() for cb in bc2)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
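The two-level chunking above is the pattern repeated throughout these umis examples: reads are lumped into chunks of 10,000 per worker call, and the chunks are lumped into groups of `cores` so only that many map tasks are dispatched at once. A runnable sketch with a toy stream and a stand-in worker (keep_even is hypothetical, standing in for filter_cb):

import multiprocessing

import toolz as tz

def keep_even(chunk):
    # stand-in for the real per-chunk filter
    return [x for x in chunk if x % 2 == 0]

if __name__ == '__main__':
    cores = 4
    reads = range(100000)                          # stands in for stream_fastq(fastq)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, reads)        # 10k reads per worker call
    bigchunks = tz.partition_all(cores, chunks)    # `cores` chunks in flight per map
    for bigchunk in bigchunks:
        for chunk in p.map(keep_even, list(bigchunk)):
            for read in chunk:
                pass                               # the real code writes to stdout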
Example #3
def big_kmeans(docs, k, batch_size=1000, n_features=(2**20), single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return group_clusters(docs, labels)
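A hedged, self-contained sketch of the same streaming idea with in-memory strings instead of a fetch step, calling partial_fit explicitly per batch (document contents and sizes here are made up):

import toolz
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import HashingVectorizer

docs = ["spam %d" % i for i in range(500)] + ["ham %d" % i for i in range(500)]
v = HashingVectorizer(n_features=2 ** 18, norm="l2")
km = MiniBatchKMeans(n_clusters=2)

for batch in toolz.partition_all(100, docs):
    km.partial_fit(v.transform(batch))            # one mini-batch update per lump

labels = [int(label)
          for batch in toolz.partition_all(100, docs)
          for label in km.predict(v.transform(batch))]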
Example #4
def export_historical(start,
                      end,
                      export_chunk_func,
                      export_snapshot_func,
                      data_query,
                      generate_snapshot_range_func=None):
    if generate_snapshot_range_func is None:
        generate_snapshot_range_func = _generate_snapshot_range

    interval_map = _get_interval_map(start)
    resolutions = [item['resolution'] for item in interval_map]
    # default resolution is hourly
    resolution = os.environ.get("RESOLUTION", "1h")
    if resolution not in resolutions:
        resolution = "1h"

    for entry in interval_map:
        snapshots = generate_snapshot_range_func(entry["start"], end,
                                                 entry["interval"], data_query)

        chunks = partition_all(CHUNK_SIZE, snapshots)
        batches = partition_all(POOL_SIZE, chunks)
        for b in batches:
            logger.info("starting new pool with %d workers", POOL_SIZE)
            Parallel(n_jobs=POOL_SIZE, backend="multiprocessing", verbose=100)(
                delayed(export_chunk_func)(chunk, export_snapshot_func)
                for chunk in b)

        # if we reached the final resolution we're done
        if entry['resolution'] == resolution:
            break
Example #5
def partial_reduce(func, x, split_every, keepdims=False, dtype=None, name=None):
    """Partial reduction across multiple axes.

    Parameters
    ----------
    func : function
    x : Array
    split_every : dict
        Maximum reduction block sizes in each dimension.

    Example
    -------
    Reduce across axis 0 and 2, merging a maximum of 1 block in the 0th
    dimension, and 3 blocks in the 2nd dimension:

    >>> partial_reduce(np.min, x, {0: 1, 2: 3})    # doctest: +SKIP
    """
    name = name or 'p_reduce-' + tokenize(func, x, split_every, keepdims, dtype)
    parts = [list(partition_all(split_every.get(i, 1), range(n))) for (i, n)
             in enumerate(x.numblocks)]
    keys = product(*map(range, map(len, parts)))
    out_chunks = [tuple(1 for p in partition_all(split_every[i], c)) if i
                  in split_every else c for (i, c) in enumerate(x.chunks)]
    if not keepdims:
        out_axis = [i for i in range(x.ndim) if i not in split_every]
        getter = lambda k: get(out_axis, k)
        keys = map(getter, keys)
        out_chunks = list(getter(out_chunks))
    dsk = {}
    for k, p in zip(keys, product(*parts)):
        decided = dict((i, j[0]) for (i, j) in enumerate(p) if len(j) == 1)
        dummy = dict(i for i in enumerate(p) if i[0] not in decided)
        g = lol_tuples((x.name,), range(x.ndim), decided, dummy)
        dsk[(name,) + k] = (func, g)
    return Array(merge(dsk, x.dask), name, out_chunks, dtype=dtype)
Example #6
def cb_filter(fastq, bc1, bc2, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''

    bc1 = set(cb.strip() for cb in bc1)
    if bc2:
        bc2 = set(cb.strip() for cb in bc2)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        filter_cb = partial(correcting_barcode_filter,
                            bc1hash=bc1hash,
                            bc2hash=bc2hash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #7
File: umis.py Project: lylamha/umis
def fastqtransform(transform, fastq1, fastq2, separate_cb, demuxed_cb,
                   dual_index, cores, min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.
    '''
    if dual_index and separate_cb:
        read_template = '{name}:CELL_{CB1}-{CB2}:UMI_{MB}\n{seq}\n+\n{qual}\n'
    else:
        read_template = '{name}:CELL_{CB}:UMI_{MB}\n{seq}\n+\n{qual}\n'

    transform = json.load(open(transform))
    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None

    fastq1_fh = open(fastq1)
    if fastq1.endswith('gz'):
        fastq1_fh = gzip.GzipFile(fileobj=fastq1_fh)

    fastq_file1 = stream_fastq(fastq1_fh)

    if fastq2:
        fastq2_fh = open(fastq2)
        if fastq2.endswith('gz'):
            fastq2_fh = gzip.GzipFile(fileobj=fastq2_fh)

        fastq_file2 = stream_fastq(fastq2_fh)

    else:
        fastq_file2 = itertools.cycle((None, ))

    transform = partial(transformer,
                        read1_regex=read1_regex,
                        read2_regex=read2_regex,
                        paired=fastq2)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, itertools.izip(fastq_file1, fastq_file2))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(transform, list(bigchunk)):
            for read1_dict in chunk:
                if dual_index:
                    if not separate_cb:
                        read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']

                if demuxed_cb:
                    read1_dict['CB'] = demuxed_cb

                # Deal with spaces in read names
                read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                if len(read1_dict['seq']) >= min_length:
                    sys.stdout.write(read_template.format(**read1_dict))
Example #8
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #9
File: umis.py Project: vals/umis
def mb_filter(fastq, cores):
    ''' Filters umis with non-ACGT bases
    Expects formatted fastq files.
    '''
    filter_mb = partial(umi_filter)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_mb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #10
def fastqtransform(transform, fastq1, fastq2, separate_cb, demuxed_cb,
                   dual_index, cores, min_length):
    ''' Transform input reads to the tagcounts compatible read layout using
    regular expressions as defined in a transform file. Outputs new format to
    stdout.
    '''
    if dual_index and separate_cb:
        read_template = '{name}:CELL_{CB1}-{CB2}:UMI_{MB}\n{seq}\n+\n{qual}\n'
    else:
        read_template = '{name}:CELL_{CB}:UMI_{MB}\n{seq}\n+\n{qual}\n'

    transform = json.load(open(transform))
    read1_regex = re.compile(transform['read1'])
    read2_regex = re.compile(transform['read2']) if fastq2 else None

    fastq1_fh = open(fastq1)
    if fastq1.endswith('gz'):
        fastq1_fh = gzip.GzipFile(fileobj=fastq1_fh)

    fastq_file1 = stream_fastq(fastq1_fh)

    if fastq2:
        fastq2_fh = open(fastq2)
        if fastq2.endswith('gz'):
            fastq2_fh = gzip.GzipFile(fileobj=fastq2_fh)

        fastq_file2 = stream_fastq(fastq2_fh)

    else:
        fastq_file2 = itertools.cycle((None,))

    transform = partial(transformer, read1_regex=read1_regex,
                          read2_regex=read2_regex, paired=fastq2)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, itertools.izip(fastq_file1, fastq_file2))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(transform, list(bigchunk)):
            for read1_dict in chunk:
                if dual_index:
                    if not separate_cb:
                        read1_dict['CB'] = read1_dict['CB1'] + read1_dict['CB2']

                if demuxed_cb:
                    read1_dict['CB'] = demuxed_cb

                # Deal with spaces in read names
                read1_dict['name'] = read1_dict['name'].partition(' ')[0]
                if len(read1_dict['seq']) >= min_length:
                    sys.stdout.write(read_template.format(**read1_dict))
Example #11
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''

    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #12
File: umis.py Project: vals/umis
def add_uid(fastq, cores):
    ''' Adds UID:[samplebc cellbc umi] to readname for umi-tools deduplication
    Expects formatted fastq files with correct sample and cell barcodes.
    '''

    uids = partial(append_uids)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(uids, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #13
    def apply(self, lfs, Xs, block_size=None):

        blocks = Xs
        if block_size is None:
            block_size = int(
                np.ceil(np.sum([len(x) for x in Xs]) / self.num_workers))
            logger.info("auto block size %s", block_size)

        if block_size:
            blocks = list(
                partition_all(block_size, itertools.chain.from_iterable(Xs)))

        lens = np.unique([len(x) for x in blocks])
        logger.info("Partitioned into %s blocks %s sizes ", len(blocks), lens)

        do = delayed(partial(LabelingServer.worker, lfs))
        jobs = (do(batch) for batch in blocks)
        L = sparse.vstack(self.client(jobs))

        # merge matrix blocks
        Ls = []
        i = 0
        for n in [len(x) for x in Xs]:
            Ls.append(L[i:i + n].copy())
            i += n
        return Ls
Example #14
def get_median_rank_recommended_items(train_interactions, eval_users,
                                      item_popularities, n_users_in_chunk):
    train_interactions = sp.lil_matrix(train_interactions)
    max_train_interaction_count = max(
        len(row) for row in train_interactions.rows)
    train_user_items = {
        u: set(train_interactions.rows[u])
        for u in eval_users if train_interactions.rows[u]
    }

    recommended_items = []
    with tf.Session() as sess:
        model = Triplet(sess=sess, params=params)
        logger.info('Get recommended items for users in evaluation')
        for user_chunk in toolz.partition_all(n_users_in_chunk, eval_users):
            recommended_items = recommended_items + \
                                model.get_recommended_items(
                                    user_chunk, train_user_items,
                                    max_train_interaction_count,
                                    k=50)
        logger.info('Get rank for recommended items')
        item_ranks = []
        for iids in recommended_items:
            item_ranks.append([item_popularities[iid] for iid in iids])
        median_ranks = np.median(item_ranks, axis=1)
    tf.reset_default_graph()
    return np.mean(median_ranks)
Example #15
def main(args):

    filelist = glob.glob(f'{args.inputdir}/*')
    batches = partition_all(args.batch_size, filelist)
    print(f'Documents: {len(filelist)}')

    if not os.path.exists(args.outputdir):
        print("output dir does not exist")
        return

    for i,batch in enumerate(batches):
        data = []
        for fpath in batch:
            doc_name = fpath.split("/")[-1].split(".")[0]
            text = open(fpath,'r').read()
            if args.fmt == 'mimic':
                text = mimic_preprocessing(text) if args.preprocess == 'mimic' else text

            # escape whitespace
            text = text.replace('\n', '\\n').replace('\t', '\\t')
            data.append((doc_name, text))

        outfpath = f'{args.outputdir}/{args.batch_size}.{i}.tsv'
        print(outfpath)
        save_tsv(data, outfpath)
        data = []
Example #16
    def apply(self,
              pipeline: Dict[str, float],
              documents: List[List[Document]],
              block_size: Union[str, int] = 'auto'):

        items = itertools.chain.from_iterable(documents)

        if block_size == 'auto':
            num_items = np.sum([len(x) for x in documents])
            block_size = int(np.ceil(num_items / self.num_workers))
            print(f'auto block size={block_size}')

        blocks = list(partition_all(block_size,
                                    items)) if block_size else documents
        print(
            f"Partitioned into {len(blocks)} blocks, {np.unique([len(x) for x in blocks])} sizes"
        )

        do = delayed(partial(TaggerPipelineServer.worker, pipeline))
        jobs = (do(batch) for batch in blocks)
        results = list(itertools.chain.from_iterable(self.client(jobs)))

        i = 0
        items = []
        for n in [len(x) for x in documents]:
            items.append(results[i:i + n].copy())
            i += n
        return items
Example #17
    def apply(self, lfs, Xs, block_size=None):
        """

        :param lfs: labeling functions to apply
        :param Xs: list of datasets (sequences of items) to label
        :param block_size: items per block; 'auto' infers it from num_workers
        :return: list of label matrices, one per dataset in Xs
        """
        blocks = Xs
        if block_size == 'auto':
            block_size = int(
                np.ceil(np.sum([len(x) for x in Xs]) / self.num_workers))
            if self.verbose:
                print(f'auto block size={block_size}')

        if block_size:
            blocks = list(
                partition_all(block_size, itertools.chain.from_iterable(Xs)))

        if self.verbose:
            sizes = np.unique([len(x) for x in blocks])
            print(f"Partitioned into {len(blocks)} blocks, {sizes} sizes")

        do = delayed(partial(LabelingServer.worker, lfs))
        jobs = (do(batch) for batch in blocks)
        L = sparse.vstack(self.client(jobs))

        # merge matrix blocks
        Ls = []
        i = 0
        for n in [len(x) for x in Xs]:
            Ls.append(L[i:i + n].copy())
            i += n
        return Ls
Example #18
    def _update_batch(cls, tuples, trx=True):
        steemd = SteemClient.instance()
        timer = Timer(total=len(tuples), entity='post', laps=['rps', 'wps'])
        tuples = sorted(tuples, key=lambda x: x[1]) # enforce ASC id's

        for tups in partition_all(1000, tuples):
            timer.batch_start()
            buffer = []

            post_args = [tup[0].split('/') for tup in tups]
            posts = steemd.get_content_batch(post_args)
            post_ids = [tup[1] for tup in tups]
            post_levels = [tup[2] for tup in tups]
            for pid, post, level in zip(post_ids, posts, post_levels):
                if post['author']:
                    buffer.append(cls._sql(pid, post, level=level))
                else:
                    print("WARNING: ignoring deleted post {}".format(pid))
                cls._bump_last_id(pid)

            timer.batch_lap()
            cls._batch_queries(buffer, trx)

            timer.batch_finish(len(posts))
            if len(tuples) >= 1000:
                print(timer.batch_status())
Example #19
    def apply(self, lfs, Xs, block_size=None):
        blocks = Xs
        if block_size is None:
            block_size = int(
                np.ceil(np.sum([len(x) for x in Xs]) / self.num_workers))
            print(f'auto block size={block_size}')

        if block_size:
            blocks = list(
                partition_all(block_size, itertools.chain.from_iterable(Xs)))

        print(f"Partitioned into {len(blocks)} blocks, "
              f"{np.unique([len(x) for x in blocks])} sizes")

        do = delayed(partial(SequenceLabelingServer.worker, lfs))
        jobs = (do(batch) for batch in blocks)
        L = np.vstack(self.client(jobs))

        # merge matrix blocks
        Ls = []
        i = 0
        for n in [len(x) for x in Xs]:
            Ls.append(L[i:i + n].copy())
            i += n
        return Ls
Example #20
def main_wrapper(data_dir='/data', sensor='mynt_eye', intention_type='dlm'):
    global IMG_TOPICS
    global TOPICS
    global TOPICS_IDX
    assert (sensor in SENSORS), f'Must be valid sensor in {SENSORS}'
    RGBS = SENSOR_TOPIC[sensor]['RGBS']
    DEPTHS = SENSOR_TOPIC[sensor]['DEPTHS']
    bagfns = glob.glob(data_dir + '/*.bag')
    print('bags:', bagfns)
    bags = chain(*[parse_bag(bagfn, intention_type) for bagfn in bagfns])
    it = ThreadedGenerator(bags, queue_maxsize=6500)
    # make dirs for images
    gendir = 'data'
    if os.path.exists(osp.join(data_dir, gendir)) and os.path.isdir(
            osp.join(data_dir, gendir)):
        shutil.rmtree(osp.join(data_dir, gendir))
    os.mkdir(osp.join(data_dir, gendir))
    with open(osp.join(data_dir, gendir, 'README.txt'), 'w+') as f:
        f.write('THIS DATA IS PARSED TO SERVE PYTORCH MODEL')

    topic_save_path = []
    for idx, rgb_topic in enumerate(RGBS):
        fn = osp.join(data_dir, gendir, f'rgb_{idx}')
        os.mkdir(fn)
        TOPICS.append(rgb_topic)
        TOPICS_IDX[rgb_topic] = len(TOPICS) - 1
        topic_save_path.append(fn)
    for idx, depth_topic in enumerate(DEPTHS):
        fn = osp.join(data_dir, gendir, f'depth_{idx}')
        os.mkdir(fn)
        TOPICS.append(depth_topic)
        TOPICS_IDX[depth_topic] = len(TOPICS) - 1
        topic_save_path.append(fn)
    if intention_type == 'lpe':
        fn = osp.join(data_dir, gendir, 'intention_img')
        os.mkdir(fn)
        TOPICS.append(INTENTION)
        TOPICS_IDX[INTENTION] = len(TOPICS) - 1
        topic_save_path.append(fn)
        IMG_TOPICS = TOPICS[:]
    else:
        IMG_TOPICS = TOPICS[:]
        TOPICS.append(INTENTION)
    TOPICS.append(CONTROL)

    f = open(osp.join(data_dir, gendir, 'label.txt'), 'w')
    labelwriter = csv.writer(f,
                             delimiter=' ',
                             quotechar='|',
                             quoting=csv.QUOTE_MINIMAL)
    labelwriter.writerow([
        'frame', 'intention_type', 'current_velocity', 'steering_wheel_angle',
        'dlm'
    ])
    for chunk in partition_all(CHUNK_SIZE, tqdm(it)):
        for c in chunk:
            for idx, fn in enumerate(topic_save_path):
                cv2.imwrite(osp.join(fn, f'{c.frame}.jpg'), c.imgs[idx])
            labelwriter.writerow(
                [c.frame, intention_type, c.vel, c.steer, c.dlm])
Example #21
    def _lookup_most_recent_symbols(self, sids):
        symbol_cols = self.equity_symbol_mappings.c
        symbols = {
            row.sid: {c: row[c] for c in symbol_columns}
            for row in concat(
                self.engine.execute(
                    sa.select(
                        (symbol_cols.sid,) +
                        tuple(map(op.getitem(symbol_cols), symbol_columns)),
                    ).where(
                        symbol_cols.sid.in_(map(int, sid_group)),
                    ).order_by(
                        symbol_cols.end_date.desc(),
                    ).group_by(
                        symbol_cols.sid,
                    )
                ).fetchall()
                for sid_group in partition_all(
                    SQLITE_MAX_VARIABLE_NUMBER,
                    sids
                ),
            )
        }

        if len(symbols) != len(sids):
            raise EquitiesNotFound(
                sids=set(sids) - set(symbols),
                plural=True,
            )
        return symbols
Example #22
File: convert.py Project: EGQM/odo
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)

    if kwargs.get('add_index'):
        mkindex = _add_index
    else:
        mkindex = _ignore_index

    try:
        first, rest = next(seq2), seq2
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        df = convert(pd.DataFrame, first, **kwargs)
        df1, n1 = mkindex(df, 0)

        def _():
            n = n1
            yield df1
            for i in rest:
                df = convert(pd.DataFrame, i, **kwargs)
                df, n = mkindex(df, n)
                yield df
    return chunks(pd.DataFrame)(_)
Example #23
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)

    if kwargs.get('add_index'):
        mkindex = _add_index
    else:
        mkindex = _ignore_index

    try:
        first, rest = next(seq2), seq2
    except StopIteration:

        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        df = convert(pd.DataFrame, first, **kwargs)
        df1, n1 = mkindex(df, 0)

        def _():
            n = n1
            yield df1
            for i in rest:
                df = convert(pd.DataFrame, i, **kwargs)
                df, n = mkindex(df, n)
                yield df

    return chunks(pd.DataFrame)(_)
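Stripped of the odo machinery (convert, chunks, _add_index), the underlying pattern is simply: lazily turn an arbitrary row iterator into a stream of DataFrames. A plain-pandas sketch of that core idea:

import pandas as pd
import toolz

def iter_dataframes(seq, chunksize=1024, columns=None):
    for chunk in toolz.partition_all(chunksize, seq):
        yield pd.DataFrame(list(chunk), columns=columns)

rows = ((i, i * i) for i in range(10000))
for df in iter_dataframes(rows, chunksize=1024, columns=['x', 'x_squared']):
    print(len(df))        # 1024, 1024, ..., 784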
Example #24
    def evaluate_AUC(self, data1):  # evaluate the results for an input set
        data = data1.values[:, 1:]
        num_example = len(data)  # number of examples to evaluate
        score = []
        for user_chunk in toolz.partition_all(600, [i for i in range(num_example)]):
            pos = data[list(user_chunk)]
            NegativeSample = self.sample_negative(pos, 50)  # sample negative items for each positive
            neg = np.tile(np.expand_dims(copy.deepcopy(pos), axis=1), [1, 50, 1])  # (batch, 50, n_cols)
            neg = neg.reshape([-1, pos.shape[1]])  # (batch * 50, n_cols)
            neg[:, 1] = NegativeSample.reshape([-1])  # plug in the sampled items, giving a negative set 50x the positive set
            # build the feed dicts for scoring negatives and positives
            if self.context and self.time:
                feed_dict_neg = {self.model.Pos: neg[:, :2], self.model.Fea: neg[:, 2:-self.time_dimension], self.model.Tim: neg[:, -self.time_dimension:]}
                feed_dict_pos = {self.model.Pos: pos[:, :2], self.model.Fea: pos[:, 2:-self.time_dimension], self.model.Tim: pos[:, -self.time_dimension:]}
            if self.context and not self.time:
                feed_dict_neg = {self.model.Pos: neg[:, :2], self.model.Fea: neg[:, 2:]}
                feed_dict_pos = {self.model.Pos: pos[:, :2], self.model.Fea: pos[:, 2:]}
            if not self.context and self.time:
                feed_dict_neg = {self.model.Pos: neg[:, :2], self.model.Tim: neg[:, -self.time_dimension:]}
                feed_dict_pos = {self.model.Pos: pos[:, :2], self.model.Tim: pos[:, -self.time_dimension:]}

            # score positives and negatives with the model
            batch_out_pos = self.model.sess.run(self.model.PositiveFeadback, feed_dict=feed_dict_pos)
            self.neg_score = self.model.sess.run(self.model.PositiveFeadback, feed_dict=feed_dict_neg)
            self.pos_score = np.reshape(np.tile(np.expand_dims(batch_out_pos, axis=1), [1, 50, 1]), [-1, 1])

            score.extend(np.reshape(self.pos_score > self.neg_score, [-1]).tolist())

        return np.mean(score)
Example #25
    def __init__(self, server, sizing_mode='stretch_both', **kwargs):
        self.server = server
        self.counter_figures = {}
        self.counter_sources = {}
        self.digest_figures = {}
        self.digest_sources = {}
        self.sizing_mode = sizing_mode

        if self.server.digests:
            for name in self.server.digests:
                self.add_digest_figure(name)
        for name in self.server.counters:
            self.add_counter_figure(name)

        figures = merge(self.digest_figures, self.counter_figures)
        figures = [figures[k] for k in sorted(figures)]

        if len(figures) <= 5:
            self.root = column(figures, sizing_mode=sizing_mode)
        else:
            self.root = column(*[
                row(*pair, sizing_mode=sizing_mode)
                for pair in partition_all(2, figures)
            ],
                               sizing_mode=sizing_mode)
Example #26
def uuid_initializer(uuid_):
    return '{{{}}}'.format(
        ', '.join(
            '0x{:02x}{:02x}'.format(high, low)
            for low, high in toolz.partition_all(2, uuid_.bytes)
        ),
    )
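A usage sketch, assuming Python 3 (where iterating uuid_.bytes yields ints): each byte pair is emitted with the second byte first, matching the low, high unpacking above.

import uuid

uuid_initializer(uuid.UUID('12345678-1234-5678-1234-567812345678'))
# '{0x3412, 0x7856, 0x3412, 0x7856, 0x3412, 0x7856, 0x3412, 0x7856}'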
Example #27
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)

    add_index = kwargs.get('add_index', False)
    if not add_index:
        # Simple, we can dispatch to dask...
        f = lambda d: convert(pd.DataFrame, d, **kwargs)
        data = [partial(f, d) for d in seq2]
        if not data:
            data = [convert(pd.DataFrame, [], **kwargs)]
        return chunks(pd.DataFrame)(data)

    # TODO: Decide whether we should support the `add_index` flag at all.
    # If so, we need to post-process the converted DataFrame objects sequentially,
    # so we can't parallelize the process.
    try:
        first, rest = next(seq2), seq2
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        df = convert(pd.DataFrame, first, **kwargs)
        df1, n1 = _add_index(df, 0)

        def _():
            n = n1
            yield df1
            for i in rest:
                df = convert(pd.DataFrame, i, **kwargs)
                df, n = _add_index(df, n)
                yield df
    return chunks(pd.DataFrame)(_)
Example #28
def partition_map(n: int,
                  func: Any,
                  its: Iterable[Any],
                  name: str = 'compute') -> Iterable[Any]:
    """ Partition sequence into lumps of size `n`, then construct dask delayed computation evaluating to:

    [func(x) for x in its[0:1n]],
    [func(x) for x in its[n:2n]],
    ...
    [func(x) for x in its[]],

    :param n: number of elements to process in one go
    :param func: Function to apply (non-dask)
    :param its: Values to feed to func
    :param name: How the computation should be named in dask visualizations
    """
    def lump_proc(dd):
        return [func(d) for d in dd]

    proc = dask.delayed(lump_proc, nout=1, pure=True)
    data_name = _randomize('data_' + name)
    name = _randomize(name)

    for i, dd in enumerate(toolz.partition_all(n, its)):
        lump = dask.delayed(dd,
                            pure=True,
                            traverse=False,
                            name=data_name + str(i))
        yield proc(lump, dask_key_name=name + str(i))
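A self-contained sketch of the same idea without the private _randomize naming helper: lump the inputs with partition_all, wrap each lump in one delayed task, and compute them together (double is a hypothetical stand-in for func):

import dask
import toolz

def double(x):
    return 2 * x

lumps = [dask.delayed(lambda dd: [double(d) for d in dd])(list(chunk))
         for chunk in toolz.partition_all(4, range(10))]

results = dask.compute(*lumps)
# ([0, 2, 4, 6], [8, 10, 12, 14], [16, 18])
flat = [x for lump in results for x in lump]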
Example #29
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #30
def _get_all_nosetest_xmls(build):
    """
    Return list of nosetest XMLs for all runs (until first 404 is encountered).
    """
    run_path = '{build}/{stage}/1/{job}-runInstance-{run}'
    url = ('https://{user}:{password}@{host}/go/files/' + run_path +
           '/test-results/nosetests.xml')

    context = {
        'host': GOCD_HOST,
        'user': os.getenv('GOCD_USER'),
        'password': os.getenv('GOCD_PASSWORD'),
        'build': build,
    }

    context.update(_get_pipeline_data(build))

    assert context['user'], 'Missing environment variable GOCD_USER'
    assert context['password'], 'Missing environment variable GOCD_PASSWORD'

    # URLs are of the form /something/anotherthing/<run>

    # We do not know the set of valid URLs. I.e., we do not know the maximum
    # valid value of `run` after which all URLs will be 404s.  Instead, we keep
    # trying increasing values of `run` until we start getting 404s. Since we
    # do not know the maximum valid value of run a priori, we cannot create
    # tasks for fetching all valid URLs. Instead we create an initial chunk of
    # tasks, run these concurrently, and then if we haven't started
    # encountering 404s, move on to the next chunk.
    xmls = []
    seen_404 = False

    async def get_xml(run):
        nonlocal context, seen_404
        context = dict(context, run=run)
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            resp = await session.get(url.format(**context))
            if resp.status == 404:
                # We expect to see 404s eventually, but a 404 for the first
                # value of `run` probably means that something is wrong.
                assert run > 1, (
                    'Unexpected 404: %s' %
                    url.format(**dict(context, password='******')))
                seen_404 = True
            else:
                resp.raise_for_status()
                data = await resp.content.read()
                xmls.append(data)

    chunk_size = 30
    ioloop = asyncio.get_event_loop()
    for run_chunk in toolz.partition_all(chunk_size, itertools.count(1)):
        task = asyncio.wait([get_xml(run) for run in run_chunk])
        ioloop.run_until_complete(task)
        if seen_404:
            ioloop.close()
            break

    return xmls
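Because itertools.count(1) is infinite, partition_all acts here as a lazy window generator: each pass of the outer loop pulls exactly one more window of chunk_size run numbers and stops once a 404 is seen.

import itertools
import toolz

runs = toolz.partition_all(3, itertools.count(1))
next(runs)   # (1, 2, 3)
next(runs)   # (4, 5, 6) -- and so on, one window per request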
Example #31
def test_broken_worker_during_computation(c, s, a, b):
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()
    yield gen.sleep(random() / 2)
    with ignoring(OSError):
        n.process.terminate()

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #32
def test_broken_worker_during_computation(c, s, a, b):
    s.allowed_failures = 100
    n = Nanny(s.ip, s.port, ncores=2, loop=s.loop)
    n.start(0)

    start = time()
    while len(s.ncores) < 3:
        yield gen.sleep(0.01)
        assert time() < start + 5

    L = c.map(inc, range(256))
    for i in range(8):
        L = c.map(add, *zip(*partition_all(2, L)))

    from random import random
    yield gen.sleep(random() / 2)
    with ignoring(CommClosedError):  # comm will be closed abrupty
        yield c._run(os._exit, 1, workers=[n.worker_address])
    yield gen.sleep(random() / 2)
    with ignoring(
            CommClosedError,
            EnvironmentError):  # perhaps new worker can't be contacted yet
        yield c._run(os._exit, 1, workers=[n.worker_address])

    result = yield c._gather(L)
    assert isinstance(result[0], int)

    yield n._close()
Example #33
def uuid_initializer(uuid_):
    return "{{{}}}".format(
        ", ".join(
            "0x{:02x}{:02x}".format(high, low)
            for low, high in toolz.partition_all(2, uuid_.bytes)
        ),
    )
Example #34
def append_iterator_to_table(t, rows, dshape=None, **kwargs):
    assert not isinstance(t, type)
    rows = iter(rows)

    # We see if the sequence is of tuples or dicts
    # If tuples then we coerce them to dicts
    try:
        row = next(rows)
    except StopIteration:
        return
    rows = chain([row], rows)
    if isinstance(row, (tuple, list)):
        if dshape and isinstance(dshape.measure, datashape.Record):
            names = dshape.measure.names
            if set(names) != set(discover(t).measure.names):
                raise ValueError("Column names of incoming data don't match "
                                 "column names of existing SQL table\n"
                                 "Names in SQL table: %s\n"
                                 "Names from incoming data: %s\n" %
                                 (discover(t).measure.names, names))
        else:
            names = discover(t).measure.names
        rows = (dict(zip(names, row)) for row in rows)

    engine = t.bind
    with engine.connect() as conn:
        for chunk in partition_all(1000, rows):  # TODO: 1000 is hardcoded
            conn.execute(t.insert(), chunk)

    return t
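A minimal sketch of the same chunked executemany pattern against an in-memory SQLite table (the table name and columns are made up for illustration):

import sqlalchemy as sa
import toolz

engine = sa.create_engine('sqlite://')
metadata = sa.MetaData()
points = sa.Table('points', metadata,
                  sa.Column('x', sa.Integer),
                  sa.Column('y', sa.Integer))
metadata.create_all(engine)

rows = ({'x': i, 'y': i * i} for i in range(2500))
with engine.begin() as conn:                       # commits on successful exit
    for chunk in toolz.partition_all(1000, rows):  # 3 batches: 1000, 1000, 500
        conn.execute(points.insert(), list(chunk))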
Example #35
def chunked_persist(data, n_concurrent, client, verbose=False):
    """
    Force limited concurrency when persisting a large collection.

    This is useful to control memory usage when operating close to capacity.

    Sometimes `client.persist(data)` will run out of memory, not because
    fully-realized data is large, but because of intermediate data memory
    requirements. This is particularly common when using local dask cluster
    with only one worker.

    This function forces evaluation order of the dask graph to control peak
    memory usage.

    Say you have a largish task graph of 10x10 top-level sub-tasks, you have
    enough memory to process 5 sub-tasks concurrently, but Dask might decide
    to schedule more than that and will cause worker restarts due to out of
    memory errors. With this function you can force dask scheduler to
    persist this collection in batches of 5 concurrent sub-tasks, keeping
    the computation within the memory budget.
    """
    delayed = data.to_delayed().ravel()

    persisted = []
    for chunk in partition_all(n_concurrent, delayed):
        chunk = client.persist(chunk)
        _ = dask_wait(chunk)
        persisted.extend(chunk)
        if verbose:
            print(".", end="")

    # at this point it should be almost no-op
    return client.persist(data)
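A hedged usage sketch, assuming a local distributed client and a chunked dask array; chunked_persist then realizes at most four top-level chunks at a time:

import dask.array as da
from distributed import Client

client = Client(n_workers=1, threads_per_worker=2)
data = da.random.random((8000, 8000), chunks=(1000, 1000))
result = chunked_persist((data + 1).sum(axis=0), n_concurrent=4,
                         client=client, verbose=True)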
Example #36
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)

    add_index = kwargs.get('add_index', False)
    if not add_index:
        # Simple, we can dispatch to dask...
        f = lambda d: convert(pd.DataFrame, d, **kwargs)
        data = [partial(f, d) for d in seq2]
        if not data:
            data = [convert(pd.DataFrame, [], **kwargs)]
        return chunks(pd.DataFrame)(data)

    # TODO: Decide whether we should support the `add_index` flag at all.
    # If so, we need to post-process the converted DataFrame objects sequentially,
    # so we can't parallelize the process.
    try:
        first, rest = next(seq2), seq2
    except StopIteration:
        def _():
            yield convert(pd.DataFrame, [], **kwargs)
    else:
        df = convert(pd.DataFrame, first, **kwargs)
        df1, n1 = _add_index(df, 0)

        def _():
            n = n1
            yield df1
            for i in rest:
                df = convert(pd.DataFrame, i, **kwargs)
                df, n = _add_index(df, n)
                yield df
    return chunks(pd.DataFrame)(_)
Example #37
    def from_checkpoints(self, chunk_size=1000):
        """Initial sync strategy: read from blocks on disk.

        This methods scans for files matching ./checkpoints/*.json.lst
        and uses them for hive's initial sync. Each line must contain
        exactly one block in JSON format.
        """
        # pylint: disable=no-self-use
        last_block = Blocks.head_num()

        tuplize = lambda path: [int(path.split('/')[-1].split('.')[0]), path]
        basedir = os.path.dirname(os.path.realpath(__file__ + "/../.."))
        files = glob.glob(basedir + "/checkpoints/*.json.lst")
        tuples = sorted(map(tuplize, files), key=lambda f: f[0])

        last_read = 0
        for (num, path) in tuples:
            if last_block < num:
                log.info("[SYNC] Load %s. Last block: %d", path, last_block)
                with open(path) as f:
                    # each line in file represents one block
                    # we can skip the blocks we already have
                    skip_lines = last_block - last_read
                    remaining = drop(skip_lines, f)
                    for lines in partition_all(chunk_size, remaining):
                        Blocks.process_multi(map(json.loads, lines), True)
                last_block = num
            last_read = num
Example #38
def _split_bed(bed_input, data):
    """Split BED file into sections for processing, allowing better multicore usage.
    """
    split_lines = 100000
    split_info = []
    base, ext = os.path.splitext(bed_input)
    base, ext2 = os.path.splitext(base)
    ext = ext2 + ext
    with open(bed_input) as in_handle:
        for cur_index, line_group in enumerate(
                tz.partition_all(split_lines, in_handle)):
            cur_file = "%s-%s%s" % (base, cur_index, ext)
            if not utils.file_uptodate(cur_file, bed_input):
                with file_transaction(data, cur_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        for line in line_group:
                            out_handle.write(line)
            split_info.append({
                "i": cur_index,
                "orig": bed_input,
                "file": cur_file
            })
    if not split_info:  # empty input file
        split_info.append({"file": bed_input, "orig": bed_input})
    return split_info
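The same line-chunking idea in isolation: write every 100,000-line slice of a text file to its own numbered part file (paths here are hypothetical):

import toolz as tz

with open('regions.bed') as in_handle:
    for cur_index, line_group in enumerate(tz.partition_all(100000, in_handle)):
        with open('regions-%s.bed' % cur_index, 'w') as out_handle:
            out_handle.writelines(line_group)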
Example #39
def append_iterator_to_table(t, rows, dshape=None, bind=None, **kwargs):
    assert not isinstance(t, type)
    engine = getbind(t, bind)
    if not t.exists(bind=engine):
        t.create(bind=engine)
    rows = iter(rows)

    # We see if the sequence is of tuples or dicts
    # If tuples then we coerce them to dicts
    try:
        row = next(rows)
    except StopIteration:
        return t
    rows = chain([row], rows)
    if isinstance(row, (tuple, list)):
        dshape = dshape and datashape.dshape(dshape)
        if dshape and isinstance(dshape.measure, datashape.Record):
            names = dshape.measure.names
            if set(names) != set(discover(t).measure.names):
                raise ValueError("Column names of incoming data don't match "
                                 "column names of existing SQL table\n"
                                 "Names in SQL table: %s\n"
                                 "Names from incoming data: %s\n" %
                                 (discover(t).measure.names, names))
        else:
            names = discover(t).measure.names
        rows = (dict(zip(names, row)) for row in rows)

    with engine.connect() as conn:
        for chunk in partition_all(1000, rows):  # TODO: 1000 is hardcoded
            conn.execute(t.insert(), chunk)

    return t
Example #40
def into(a, b, **kwargs):
    chunks = partition_all(1024, b)
    chunk = next(chunks)
    a = into(a, chunk, **kwargs)
    for chunk in chunks:
        a.append(list(zip(*chunk)))
    a.flush()
    return a
Example #41
def execute(file_name):
    categories = ['distinguished', 'removal_reason']
    f = load(file_name)
    batches = partition_all(200000, f)
    df, frames = peek(map(to_df, batches))
    castra = Castra('./subreddit_dumps/'+file_name+'.castra',
                    template = df, categories = categories)
    castra.extend_sequence(frames, freq = '3h')
Example #42
def into(a, b, **kwargs):
    chunks = partition_all(1024, b)
    chunk = next(chunks)
    a = ctable([into(np.ndarray(0), c2) for c2 in zip(*chunk)], **kwargs)
    for chunk in chunks:
        a.append(list(zip(*chunk)))
    a.flush()
    return a
Example #43
def main():
    images = sorted(f for f in listdir('images/') if f.endswith('.JPG'))
    captions = image_captions()
    content = list(zip(images, captions, count(1)))
    step = 6
    size = len(content) // step
    for i, img_cap_idx_list in enumerate(partition_all(step, content)):
        create_slide(i, size, img_cap_idx_list)
    write_app_cache()
Example #44
def iterator_to_numpy_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)
    first, rest = next(seq2), seq2
    x = convert(np.ndarray, first, **kwargs)
    def _():
        yield x
        for i in rest:
            yield convert(np.ndarray, i, **kwargs)
    return chunks(np.ndarray)(_)
Example #45
def into(a, b, **kwargs):
    kwargs = keyfilter(carray_keywords.__contains__, kwargs)
    chunks = partition_all(1024, b)
    chunk = next(chunks)
    a = into(a, chunk, **kwargs)
    for chunk in chunks:
        a.append(list(zip(*chunk)))
    a.flush()
    return a
Example #46
File: umis.py Project: vals/umis
def sb_filter(fastq, bc, cores, nedit):
    ''' Filters reads with non-matching sample barcodes
    Expects formatted fastq files.
    '''
    barcodes = set(sb.strip() for sb in bc)
    if nedit == 0:
        filter_sb = partial(exact_sample_filter2, barcodes=barcodes)
    else:
        barcodehash = MutationHash(barcodes, nedit)
        filter_sb = partial(correcting_sample_filter2, barcodehash=barcodehash)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_sb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #47
def partition(grouper, sequence, npartitions, p, nelements=2**20):
    """ Partition a bag along a grouper, store partitions on disk """
    for block in partition_all(nelements, sequence):
        d = groupby(grouper, block)
        d2 = defaultdict(list)
        for k, v in d.items():
            d2[abs(hash(k)) % npartitions].extend(v)
        p.append(d2)
    return p
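A toy illustration of one block of this shuffle: group the block's records by key, then spill each group into one of npartitions buckets by hashing the key (the real code appends each d2 to a partd store p):

from collections import defaultdict
from toolz import groupby, partition_all

grouper = lambda x: x % 3
npartitions = 4

for block in partition_all(8, range(20)):
    d = groupby(grouper, block)
    d2 = defaultdict(list)
    for k, v in d.items():
        d2[abs(hash(k)) % npartitions].extend(v)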
Example #48
def cb_filter(fastq, bc1, bc2, cores):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''

    bc1 = set(cb.strip() for cb in bc1)
    if bc2:
        bc2 = set(cb.strip() for cb in bc2)

    filter_cb = partial(cb_filterer, bc1=bc1, bc2=bc2)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, stream_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #49
    def iter_arrays(self, arrays_per_chunk=None):
        """Iterates over the arrays in this store."""
        if arrays_per_chunk is None:
            for key in range(self.journal().numarrays()):
                yield self.get([key])
        elif arrays_per_chunk <= 0:
            raise ValueError('arrays_per_chunk must be None or bigger than 0, it is %r' % arrays_per_chunk)
        else:
            for segments in partition_all(arrays_per_chunk, range(self.journal().numarrays())):
                yield self.get(segments)
Example #50
def parallel_rebin(K, mz_axis, imzb):
    mz_axis_chunks = list(partition_all(K, mz_axis))
    # create dask array manually using tasks
    tasks = {('x', i, 0, 0): (get_mz_images, mz_chunk, imzb) for i, mz_chunk in enumerate(mz_axis_chunks)}
    chunks_mz = [len(c) for c in mz_axis_chunks]
    chunks_x = (imzb.height,)
    chunks_y = (imzb.width,)
    arr = da.Array(tasks, 'x', chunks=(chunks_mz, chunks_x, chunks_y), dtype=float)
    print arr.shape
    return arr
Example #51
def append_local_file_to_hdfs(target, source, blocksize=100000, **kwargs):
    if raises(FileNotFound,
              lambda: target.hdfs.list_dir(target.path.lstrip('/'))):
        target.hdfs.create_file(target.path.lstrip('/'), '')

    with open(source.path, 'r') as f:
        blocks = partition_all(blocksize, f)
        for block in blocks:
            target.hdfs.append_file(target.path.lstrip('/'), ''.join(block))

    return target
Example #52
    def nnls_frob(x, anchors):
        ncols = x.shape[1]
        x_sel = np.array(anchors)
        # print "projection"
        result = np.zeros((x_sel.shape[1], ncols))

        # apply NNLS to chunks so as to avoid loading all m/z images into RAM
        for chunk in partition_all(100, range(ncols)):
            residuals = np.array(x[:, chunk])
            result[:, chunk] = nnlsm_blockpivot(x_sel, residuals)[0]

        return result
Example #53
def iterator_to_DataFrame_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)
    try:
        first, rest = next(seq2), seq2
    except StopIteration:
        return chunks(pd.DataFrame)([])
    df = convert(pd.DataFrame, first, **kwargs)
    def _():
        yield df
        for i in rest:
            yield convert(pd.DataFrame, i, **kwargs)
    return chunks(pd.DataFrame)(_)
Example #54
def get_parallel_regions_block(batch):
    """CWL target to retrieve block group of callable regions for parallelization.

    Uses blocking to handle multicore runs.
    """
    samples = [utils.to_single_data(d) for d in batch]
    regions = _get_parallel_regions(samples[0])
    out = []
    # Currently don't have core information here so aim for about 10 items per partition
    n = 10
    for region_block in tz.partition_all(n, regions):
        out.append({"region_block": ["%s:%s-%s" % (c, s, e) for c, s, e in region_block]})
    return out
Example #55
File: umis.py Project: vals/umis
def cb_filter(fastq, bc1, bc2, bc3, cores, nedit):
    ''' Filters reads with non-matching barcodes
    Expects formatted fastq files.
    '''

    with open_gzipsafe(bc1) as bc1_fh:
        bc1 = set(cb.strip() for cb in bc1_fh)
    if bc2:
        with open_gzipsafe(bc2) as bc2_fh:
            bc2 = set(cb.strip() for cb in bc2_fh)
    if bc3:
        with open_gzipsafe(bc3) as bc3_fh:
            bc3 = set(cb.strip() for cb in bc3_fh)

    annotations = detect_fastq_annotations(fastq)
    re_string = construct_transformed_regex(annotations)

    if nedit == 0:
        filter_cb = partial(exact_barcode_filter, bc1=bc1, bc2=bc2, bc3=bc3,
                            re_string=re_string)
    else:
        bc1hash = MutationHash(bc1, nedit)
        bc2hash = None
        bc3hash = None
        if bc2:
            bc2hash = MutationHash(bc2, nedit)
        if bc3:
            bc3hash = MutationHash(bc3, nedit)
        filter_cb = partial(correcting_barcode_filter, bc1hash=bc1hash,
                            bc2hash=bc2hash, bc3hash=bc3hash, re_string=re_string)
    p = multiprocessing.Pool(cores)

    chunks = tz.partition_all(10000, read_fastq(fastq))
    bigchunks = tz.partition_all(cores, chunks)
    for bigchunk in bigchunks:
        for chunk in p.map(filter_cb, list(bigchunk)):
            for read in chunk:
                sys.stdout.write(read)
Example #56
def _batch_gvcfs(data, region, vrn_files, ref_file, out_file=None):
    """Perform batching of gVCF files if above recommended input count.
    """
    if out_file is None:
        out_file = vrn_files[0]
    if len(vrn_files) >= MAX_BATCH:
        out = []
        for i, batch_vrn_files in enumerate(tz.partition_all(MAX_BATCH, vrn_files)):
            base, ext = utils.splitext_plus(out_file)
            batch_out_file = "%s-b%s%s" % (base, i, ext)
            out.append(_run_combine_gvcfs(batch_vrn_files, region, ref_file, batch_out_file, data))
        return _batch_gvcfs(data, region, out, ref_file)
    else:
        return vrn_files
Example #57
File: core.py Project: jcorbin/dask
    def reduction(self, perpartition, aggregate, split_every=None,
                  out_type=Item, name=None):
        """ Reduce collection with reduction operators

        Parameters
        ----------
        perpartition: function
            reduction to apply to each partition
        aggregate: function
            reduction to apply to the results of all partitions
        split_every: int (optional)
            Group partitions into groups of this size while performing reduction
            Defaults to 8
        out_type: {Bag, Item}
            The out type of the result, Item if a single element, Bag if a list
            of elements.  Defaults to Item.

        Examples
        --------
        >>> b = from_sequence(range(10))
        >>> b.reduction(sum, sum).compute()
        45
        """
        if split_every is None:
            split_every = 8
        if split_every is False:
            split_every = self.npartitions
        token = tokenize(self, perpartition, aggregate, split_every)
        a = '%s-part-%s' % (name or funcname(perpartition), token)
        dsk = dict(((a, i), (perpartition, (self.name, i)))
                   for i in range(self.npartitions))
        k = self.npartitions
        b = a
        fmt = '%s-aggregate-%s' % (name or funcname(aggregate), token)
        depth = 0
        while k > 1:
            c = fmt + str(depth)
            dsk2 = dict(((c, i), (aggregate, [(b, j) for j in inds]))
                        for i, inds in enumerate(partition_all(split_every,
                                                               range(k))))
            dsk.update(dsk2)
            k = len(dsk2)
            b = c
            depth += 1

        if out_type is Item:
            dsk[b] = dsk.pop((b, 0))
            return Item(merge(self.dask, dsk), b)
        else:
            return Bag(merge(self.dask, dsk), b, 1)
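The tree reduction above shrinks k partitions per round by grouping their indices with partition_all; for example, 20 partitions with split_every=8 become three aggregate tasks, which the next round reduces to one:

from toolz import partition_all

list(partition_all(8, range(20)))
# [(0, 1, 2, 3, 4, 5, 6, 7), (8, 9, 10, 11, 12, 13, 14, 15), (16, 17, 18, 19)]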
Example #58
def big_kmeans(docs, k, batch_size=1000, n_features=(2 ** 20),
               single_pass=True):
    """k-means for very large sets of documents.

    See kmeans for documentation. Differs from that function in that it does
    not compute tf-idf or LSA, and fetches the documents in a streaming
    fashion, so they don't need to be held in memory. It does not do random
    restarts.

    If the option single_pass is set to False, the documents are visited
    twice: once to fit a k-means model, once to determine their label in
    this model.
    """
    from sklearn.cluster import MiniBatchKMeans
    from sklearn.feature_extraction.text import HashingVectorizer

    docs = tosequence(docs)

    v = HashingVectorizer(input="content", n_features=n_features, norm="l2")
    km = MiniBatchKMeans(n_clusters=k)

    labels = []
    for batch in toolz.partition_all(batch_size, docs):
        batch = map(fetch, batch)
        batch = v.transform(batch)
        y = km.fit_predict(batch)
        if single_pass:
            labels.extend(y.tolist())

    if not single_pass:
        for batch in toolz.partition_all(batch_size, docs):
            batch = map(fetch, batch)
            batch = v.transform(batch)
            labels.extend(km.predict(batch).tolist())

    return group_clusters(docs, labels)
Example #59
def iterator_to_numpy_chunks(seq, chunksize=1024, **kwargs):
    seq2 = partition_all(chunksize, seq)
    try:
        first, rest = next(seq2), seq2
    except StopIteration:  # seq is empty
        def _():
            yield convert(np.ndarray, [], **kwargs)
    else:
        x = convert(np.ndarray, first, **kwargs)

        def _():
            yield x
            for i in rest:
                yield convert(np.ndarray, i, **kwargs)
    return chunks(np.ndarray)(_)
Example #60
def test_blocked():
    blocks = []
    for k in sorted(files):
        b = files[k]
        lines = b.split(b'\n')
        blocks.append([b'\n'.join(bs) for bs in partition_all(2, lines)])

    df = read_csv_from_bytes(blocks, header, expected.head(), {})
    eq(df.compute().reset_index(drop=True),
       expected.reset_index(drop=True), check_dtype=False)

    expected2 = expected[['name', 'id']]
    df = read_csv_from_bytes(blocks, header, expected2.head(),
                             {'usecols': ['name', 'id']})
    eq(df.compute().reset_index(drop=True),
       expected2.reset_index(drop=True), check_dtype=False)