Esempio n. 1
0
def test_read_write(tmp_path):
    """Round-trip a few integer-coded records through a busfile on disk."""
    expected = [
        busio.Bus_record(*fields)
        for fields in (
            (0, 0, 10, 20, 1),
            (1, 0, 13, 206, 12),
            (2, 0, 14, 250, 13),
        )
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, expected, cb_length=1, umi_length=2)

    # the writer must have produced the file
    assert pathlib.Path(fname).exists()

    # reading must invert writing; buffersize=2 is deliberately smaller than
    # the number of records, buffersize=20 is larger
    for buffersize in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(fname, decode_seq=False, buffersize=buffersize))
        assert roundtrip == expected

    # decode_seq switches CB/UMI between their int and str representation
    for decode, field_type in ((False, int), (True, str)):
        first = next(
            busio.read_binary_bus(fname, decode_seq=decode, buffersize=2))
        assert isinstance(first.CB, field_type) and isinstance(first.UMI, field_type)
Esempio n. 2
0
def iterate_CB_UMI_of_busfile(fname, decode_seq=True):
    """
    Iterate over a sorted busfile, emitting all records that share the same
    CB/UMI together.

    Ideally there would be only one record per CB/UMI, but sometimes there
    are several.

    :param fname: filename of the busfile; must be sorted by (CB, UMI)
    :param decode_seq: passed on to `busio.read_binary_bus`
    :return: generator of `((CB, UMI), [records])` tuples; yields nothing
        for a busfile without records
    :raises ValueError: if a record sorts before its predecessor
    """
    bus_iterator = busio.read_binary_bus(fname, decode_seq)

    # A bare next() would raise StopIteration on an empty file, which Python
    # turns into a RuntimeError inside a generator (PEP 479). An empty file
    # simply has no groups.
    record = next(bus_iterator, None)
    if record is None:
        return

    current_cell = record.CB
    current_umi = record.UMI
    current_recordlist = [record]
    for record in bus_iterator:
        if record.CB > current_cell or (record.CB == current_cell
                                        and record.UMI > current_umi):
            # a new CB/UMI starts: emit the finished group ...
            yield (current_cell, current_umi), current_recordlist

            # ... and start collecting the next one
            current_cell = record.CB
            current_umi = record.UMI
            current_recordlist = [record]
        elif record.CB == current_cell and record.UMI == current_umi:
            current_recordlist.append(record)
        else:
            # the record sorts before the current group
            raise ValueError(
                f'bsufile unsorted:  {record.CB}/{record.UMI}  vs {current_cell}/{current_umi}'
            )

    # emit the final group
    yield (current_cell, current_umi), current_recordlist
Esempio n. 3
0
def iterate_cells_of_busfile(fname, decode_seq=True):
    """
    Run over a SORTED busfile, collecting all records of a single CB and
    yielding them as `cb, record_list`.

    :param fname: filename of the busfile; must be sorted by CB
    :param decode_seq: passed on to `busio.read_binary_bus`
    :return: generator of `(CB, [records])` tuples; yields nothing for a
        busfile without records
    :raises ValueError: if a record's CB is smaller than its predecessor's
    """
    bus_iterator = busio.read_binary_bus(fname, decode_seq)

    # Get the first entry to get started. A bare next() would raise
    # StopIteration on an empty file, which Python turns into a RuntimeError
    # inside a generator (PEP 479); an empty file simply has no cells.
    record = next(bus_iterator, None)
    if record is None:
        return

    current_cell = record.CB
    current_recordlist = [record]

    for record in bus_iterator:
        if record.CB > current_cell:
            # finished with one cell: yield it and start the next
            yield current_cell, current_recordlist

            current_cell = record.CB
            current_recordlist = [record]
        elif record.CB == current_cell:
            current_recordlist.append(record)
        else:
            raise ValueError(
                f'Bus file not sorted!! {record.CB} vs {current_cell}')

    # emit the final cell
    yield current_cell, current_recordlist
Esempio n. 4
0
def in_parallel(bus, outfile, cores):
    """
    Run `cell_producer` and `cores` `pug_writer` processes over `bus` and
    write the merged, sorted result to `outfile`.

    One `cell_producer` process and `cores` `pug_writer` processes share a
    bounded queue; each writer gets its own partial busfile inside a
    temporary folder. Once all processes have finished, the partial files
    are concatenated into one unsorted busfile, which `bustools sort` then
    sorts into `outfile`. The temporary folder is removed afterwards, also
    when an error occurs.

    :param bus: handed unchanged to `cell_producer` and `pug_writer`
    :param outfile: filename of the sorted output busfile
    :param cores: number of `pug_writer` processes (and partial busfiles)
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    import shutil
    import subprocess
    import sys

    QUEUE_LENGTH = 1000

    # all intermediate results go in here
    tmpfolder = tempfile.mkdtemp(prefix='pug_', dir='/tmp')
    try:
        part_files = [f'{tmpfolder}/{i}.bus' for i in range(cores)]

        cb_queue = mp.Queue(QUEUE_LENGTH)  # has (CB, [records]) as elements

        worker_tasks = [
            mp.Process(target=cell_producer, args=(bus, cb_queue, cores))
        ]
        for busfile in part_files:
            worker_tasks.append(
                mp.Process(target=pug_writer, args=(bus, cb_queue, busfile)))

        for t in worker_tasks:
            t.start()

        for t in worker_tasks:
            t.join()

        # cleanup of the multiprocessing resources
        assert cb_queue.empty(), "queue not empty!!"
        cb_queue.close()

        for t in worker_tasks:
            t.close()

        # merge all partial busfiles into a big one
        bus_iterators = [busio.read_binary_bus(busfile) for busfile in part_files]
        big_gen = itertools.chain.from_iterable(bus_iterators)
        unsorted_outfile = f'{tmpfolder}/unsorted.bus'
        # NOTE(review): CB/UMI lengths are hard-coded here rather than taken
        # from the input header -- confirm they match the data.
        write_busfile(unsorted_outfile, big_gen, cb_length=16, umi_length=12)

        # drop the parts before sorting; only the merged file is needed now
        for busfile in part_files:
            os.remove(busfile)

        # sort the file
        ret = subprocess.run(["bustools", "sort", '-o', outfile, unsorted_outfile])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}')
    finally:
        # the unsorted busfile (and anything left over after a failure) must
        # not accumulate in /tmp
        shutil.rmtree(tmpfolder, ignore_errors=True)
Esempio n. 5
0
def speed_test_buffer(buffersize):
    """
    Time how long it takes to iterate over at most 10 million records of
    `B1.bus_file` when reading with the given `buffersize`.

    :param buffersize: forwarded to `read_binary_bus`
    :return: elapsed wall-clock time in seconds
    """
    started = time.time()
    max_records = 10_000_000
    records = toolz.take(
        max_records,
        read_binary_bus(B1.bus_file, decode_seq=False, buffersize=buffersize),
    )
    # exhaust the iterator; the records themselves are not needed
    for _ in records:
        pass
    return time.time() - started
Esempio n. 6
0
    def _gen():
        """
        Yield every record of `inbus` whose (CB, UMI) pair is not contained
        in `dubious_cb_umi`, and print a summary of how many records were
        filtered once the input is exhausted.
        """
        n_filtered = 0
        n_total = 0
        for record in tqdm.tqdm(read_binary_bus(inbus, decode_seq=False)):
            n_total += 1
            if (record.CB, record.UMI) in dubious_cb_umi:
                n_filtered += 1
                continue
            yield record

        # an input without records would otherwise divide by zero here
        percent_filtered = 100 * n_filtered / n_total if n_total else 0.0
        print(
            f'{n_filtered}/{n_total} ({percent_filtered:.3f}%) CUGs filtered'
        )
Esempio n. 7
0
def test_read_write_str(tmp_path):
    """
    Round-trip records whose CB/UMI are given as strings instead of ints.
    """
    expected = [
        busio.Bus_record(cb, umi, ec, count, flag)
        for cb, umi, ec, count, flag in (
            ('ATAT', 'GGG', 10, 20, 1),
            ('TTAT', 'AAA', 13, 206, 12),
            ('TAGA', 'TAT', 14, 250, 13),
        )
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, expected, cb_length=4, umi_length=3)

    # reading with decoding enabled must give back the original records
    reader = busio.read_binary_bus(fname, decode_seq=True, buffersize=2)
    roundtrip = list(reader)
    assert roundtrip == expected
Esempio n. 8
0
def get_number_of_reads_and_molecules(fname):
    """
    Count the reads and the bus-records (approximately the number of UMIs)
    in a busfile, similar to `bustools inspect`.

    :param fname: filename of the busfile
    :return: tuple `(total_reads, total_records)`: the sum of all record
        COUNTs and the number of records
    """
    # TODO: technically not correct: this counts the number of reads (fine)
    # and the number of ENTRIES in the busfile. Ideally each molecule would
    # have a single entry. However, a single molecule might map to two
    # different EC classes (the molecule got fragmented into two places,
    # mapping it to two different locations).
    n_reads = 0
    n_records = 0
    progress = tqdm.tqdm(read_binary_bus(fname, decode_seq=False), desc='counting reads')
    for rec in progress:
        n_records += 1
        n_reads += rec.COUNT
    return n_reads, n_records
Esempio n. 9
0
def subsample_busfile(fname_in, fname_out, fraction):
    """
    Subsample the reads of an existing busfile by `fraction` and write the
    result into a new busfile. Records that end up with 0 reads are dropped
    from the output.

    :param fname_in: filename of the input busfile
    :param fname_out: filename of the resulting, subsampled busfile
    :param fraction: fraction of reads to keep, 0 < fraction < 1
    """
    assert 0 < fraction < 1, "fraction must be in [0,1]"

    # The input file is traversed twice:
    # 1. collect the counts of ALL records so they can be subsampled jointly
    # 2. re-read the records and write each one out with its adjusted count
    first_pass = tqdm.tqdm(
        read_binary_bus(fname_in, decode_seq=False), desc='First pass')
    counts = np.array([rec.COUNT for rec in first_pass])

    n_total = np.sum(counts)
    n_target = int(n_total * fraction)
    print(f'Subsampling from {n_total} to {n_target}')

    # seeded from the current time in milliseconds
    seed = int(time.time()*1000)
    new_counts = _downsample_array(
        counts, target=n_target, random_state=seed, replace=False, inplace=False)
    print(f'Downsampled reads: {new_counts.sum()}')

    def _surviving_records():
        # second traversal: emit only the records that kept at least one read
        second_pass = tqdm.tqdm(
            enumerate(read_binary_bus(fname_in, decode_seq=False)),
            desc='Second pass')
        for idx, rec in second_pass:
            new_count = new_counts[idx]
            if new_count > 0:
                yield Bus_record(rec.CB, rec.UMI, rec.EC, new_count, rec.FLAG)

    record_gen = _surviving_records()
    # the output header must carry the same CB/UMI lengths as the input
    _, cb_len, umi_len, _ = get_header_info(fname_in)
    write_busfile(fname_out, record_gen, cb_length=cb_len, umi_length=umi_len)
Esempio n. 10
0
    """
    comparing the old read_binary_bus vs the new version using struct.iter_unpack
    """
    # NOTE: this snippet contains IPython magics (`%load_ext`, `%%snakeviz`)
    # and is therefore not valid plain Python; it only runs inside an
    # IPython/Jupyter session.
    from pybustools.pybustools import Bus
    from pybustools import busio, busio_old

    import collections
    import tqdm
    import toolz
    import time

    # Bare string, not executed: a gsutil command that copies the dataset
    # referenced below into /tmp.
    "gsutil -m cp -r gs://cruk-01-kallisto-nextflow/dnbseqg400.V300026370_88A.L05A_2-658952_cellranger_v3p0p1_refdata-cellranger-GRCh38-1_2_0.outs/ /tmp"
    f = '/tmp/dnbseqg400.V300026370_88A.L05A_2-658952_cellranger_v3p0p1_refdata-cellranger-GRCh38-1_2_0.outs/kallisto/sort_bus/bus_output'

    # only `B1.bus_file` is used below
    B1 = Bus(folder=f, decode_seq=True)

    # profile iterating over at most 10 million records with the new reader
    %load_ext snakeviz
    %%snakeviz
    for a in toolz.take(10_000_000, busio.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000)):
        pass

    # same profile for the old reader
    # NOTE(review): this calls `busio_old.read_binary_bus2` while the equality
    # check below calls `busio_old.read_binary_bus` -- confirm which one is
    # the intended old implementation.
    %%snakeviz
    for a in toolz.take(10_000_000, busio_old.read_binary_bus2(B1.bus_file, decode_seq=False, buffersize=1000)):
        pass

    # check that both readers emit equal records, pairwise, for the first
    # (at most) 10 million records
    import tqdm
    I1 = toolz.take(10_000_000, busio.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000))
    I2 = toolz.take(10_000_000, busio_old.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000))
    for a, b in tqdm.tqdm(zip(I1, I2)):
        assert a==b
Esempio n. 11
0
 def _helper_gen():
     """
     Re-read `fname_in` and yield a copy of every record whose entry in `x`
     is positive, with the record's count replaced by that entry.
     """
     reader = read_binary_bus(fname_in, decode_seq=False)
     for idx, rec in tqdm.tqdm(enumerate(reader), desc='Second pass'):
         new_count = x[idx]
         # records without a positive new count are dropped
         if not (new_count > 0):
             continue
         yield Bus_record(rec.CB, rec.UMI, rec.EC, new_count, rec.FLAG)
Esempio n. 12
0
 def iterate_bus(self):
     """
     Return an iterator over the records of `self.bus_file`, read with the
     instance's `decode_seq` setting.
     """
     reader = busio.read_binary_bus(self.bus_file, self.decode_seq)
     return reader