def test_read_write(tmp_path):
    """
    Round-trip a few integer CB/UMI records through a busfile and verify
    that reading returns exactly what was written.
    """
    expected = [
        busio.Bus_record(0, 0, 10, 20, 1),
        busio.Bus_record(1, 0, 13, 206, 12),
        busio.Bus_record(2, 0, 14, 250, 13),
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, expected, cb_length=1, umi_length=2)

    # writing must have produced a file on disk
    assert pathlib.Path(fname).exists()

    # read/write must be inverses of each other, both with a buffer that is
    # deliberately smaller than len(expected) and with one that is larger
    for buffersize in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(fname, decode_seq=False, buffersize=buffersize)
        )
        assert roundtrip == expected

    # decode_seq=False leaves CB/UMI as ints ...
    raw = next(busio.read_binary_bus(fname, decode_seq=False, buffersize=2))
    assert isinstance(raw.CB, int)
    assert isinstance(raw.UMI, int)

    # ... while decode_seq=True turns them into strings
    decoded = next(busio.read_binary_bus(fname, decode_seq=True, buffersize=2))
    assert isinstance(decoded.CB, str)
    assert isinstance(decoded.UMI, str)
def iterate_CB_UMI_of_busfile(fname, decode_seq=True):
    """
    Iterate over the CB/UMI groups of a busfile, i.e. all consecutive
    records sharing the same CB and UMI are emitted together.

    Ideally there would be only one record per CB/UMI, but sometimes there
    are several.

    The records must be ordered by (CB, UMI); an empty busfile yields nothing.

    :param fname: filename of the busfile
    :param decode_seq: passed on to `busio.read_binary_bus`
    :yields: `((CB, UMI), records)` where `records` is the list of all
        records with that CB and UMI
    :raises ValueError: if a record compares smaller than the current CB/UMI,
        i.e. the busfile is not sorted
    """
    bus_iterator = busio.read_binary_bus(fname, decode_seq)

    # A bare next() on an exhausted iterator raises StopIteration, which
    # Python converts into a RuntimeError inside a generator (PEP 479).
    # An empty busfile simply has no groups to emit.
    record = next(bus_iterator, None)
    if record is None:
        return

    current_cell = record.CB
    current_umi = record.UMI
    current_recordlist = [record]

    for record in bus_iterator:
        if record.CB > current_cell or (record.CB == current_cell and record.UMI > current_umi):
            # a new CB/UMI starts: emit the finished group, then reset
            yield (current_cell, current_umi), current_recordlist
            current_cell = record.CB
            current_umi = record.UMI
            current_recordlist = [record]
        elif record.CB == current_cell and record.UMI == current_umi:
            current_recordlist.append(record)
        else:
            raise ValueError(
                f'bsufile unsorted: {record.CB}/{record.UMI} vs {current_cell}/{current_umi}'
            )

    # emit the final group
    yield (current_cell, current_umi), current_recordlist
def iterate_cells_of_busfile(fname, decode_seq=True):
    """
    Run over a busfile that is SORTED by CB, collecting all records of a
    single CB and yielding them together.

    An empty busfile yields nothing.

    :param fname: filename of the busfile
    :param decode_seq: passed on to `busio.read_binary_bus`
    :yields: `(CB, records)` where `records` is the list of all records
        with that CB
    :raises ValueError: if a record's CB compares smaller than the current
        one, i.e. the busfile is not sorted
    """
    bus_iterator = busio.read_binary_bus(fname, decode_seq)

    # Get the first record to get started. A bare next() would raise
    # StopIteration on an empty file, which becomes a RuntimeError inside a
    # generator (PEP 479); treat "no records" as "no cells" instead.
    record = next(bus_iterator, None)
    if record is None:
        return

    current_cell = record.CB
    current_recordlist = [record]

    for record in bus_iterator:
        if record.CB > current_cell:
            # finished with one cell: yield it and start the next
            yield current_cell, current_recordlist
            current_cell = record.CB
            current_recordlist = [record]
        elif record.CB == current_cell:
            current_recordlist.append(record)
        else:
            raise ValueError(
                f'Bus file not sorted!! {record.CB} vs {current_cell}')

    # emit the final cell
    yield current_cell, current_recordlist
def in_parallel(bus, outfile, cores):
    """
    Process `bus` with `cores` parallel `pug_writer` processes and write
    the merged, sorted result to `outfile`.

    One `cell_producer` process and `cores` `pug_writer` processes are
    started, all sharing one bounded queue. Each writer is given its own
    busfile inside a temporary folder. After all processes have finished,
    the per-writer busfiles are concatenated into a single unsorted busfile,
    which is then sorted into `outfile` with `bustools sort`.

    The temporary folder (including the unsorted intermediate file) is
    removed when the function exits, whether it succeeds or fails.

    :param bus: passed unchanged to `cell_producer` and `pug_writer`
    :param outfile: filename of the final, sorted busfile
    :param cores: number of `pug_writer` processes to start
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    import shutil
    import subprocess
    import sys

    # all intermediate results go in here
    tmpfolder = tempfile.mkdtemp(prefix='pug_', dir='/tmp')
    try:
        QUEUE_LENGTH = 1000
        cb_queue = mp.Queue(QUEUE_LENGTH)  # has (CB, [records]) as elements
        part_files = [f'{tmpfolder}/{i}.bus' for i in range(cores)]

        worker_tasks = [
            mp.Process(target=cell_producer, args=(bus, cb_queue, cores))
        ]
        for busfile in part_files:
            worker_tasks.append(
                mp.Process(target=pug_writer, args=(bus, cb_queue, busfile))
            )

        for t in worker_tasks:
            t.start()
        for t in worker_tasks:
            t.join()

        # cleanup of the multiprocessing resources
        assert cb_queue.empty(), "queue not empty!!"
        cb_queue.close()
        for t in worker_tasks:
            t.close()

        # merge all per-writer busfiles into a big one
        bus_iterators = [busio.read_binary_bus(busfile) for busfile in part_files]
        big_gen = itertools.chain.from_iterable(bus_iterators)
        unsorted_outfile = f'{tmpfolder}/unsorted.bus'
        # NOTE(review): CB/UMI lengths are hard-coded here rather than taken
        # from the input -- confirm they match the data being processed.
        write_busfile(unsorted_outfile, big_gen, cb_length=16, umi_length=12)

        # the parts are no longer needed once merged
        for busfile in part_files:
            os.remove(busfile)

        # sort the merged file into its final location
        ret = subprocess.run(["bustools", "sort", '-o', outfile, unsorted_outfile])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}'
            )
    finally:
        # never leave the (potentially large) intermediate files behind
        shutil.rmtree(tmpfolder, ignore_errors=True)
def speed_test_buffer(buffersize):
    """
    Measure how long it takes to iterate over (at most) the first
    10 million records of `B1.bus_file` with the given read buffer size.

    :param buffersize: passed on to `read_binary_bus`
    :return: elapsed wall-clock time in seconds
    """
    started = time.time()
    limit = 10_000_000
    records = toolz.take(
        limit,
        read_binary_bus(B1.bus_file, decode_seq=False, buffersize=buffersize),
    )
    # just drain the iterator; only the elapsed time is of interest
    for _ in records:
        pass
    return time.time() - started
def _gen():
    """
    Yield every record of `inbus` whose (CB, UMI) pair is not contained in
    `dubious_cb_umi`, and print a summary of how many records were dropped
    once the input is exhausted.
    """
    n_filtered = 0
    n_total = 0
    for record in tqdm.tqdm(read_binary_bus(inbus, decode_seq=False)):
        n_total += 1
        if (record.CB, record.UMI) in dubious_cb_umi:
            n_filtered += 1
            continue
        yield record

    # an empty busfile would otherwise raise ZeroDivisionError here
    percent = 100 * n_filtered / n_total if n_total else 0.0
    print(
        f'{n_filtered}/{n_total} ({percent:.3f}%) CUGs filtered'
    )
def test_read_write_str(tmp_path):
    """
    Write records whose CB/UMI are strings instead of ints and check that
    reading them back with decode_seq=True reproduces the originals.
    """
    fname = tmp_path / 'some.bus'
    expected = [
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
    ]
    busio.write_busfile(fname, expected, cb_length=4, umi_length=3)

    # read back and compare to the original
    reader = busio.read_binary_bus(fname, decode_seq=True, buffersize=2)
    roundtrip = list(reader)
    assert roundtrip == expected
def get_number_of_reads_and_molecules(fname):
    """
    Similar to `bustools inspect`: get the number of reads and the number of
    bus-records (approx. #UMI) in a busfile.

    :param fname: filename of the busfile
    :return: tuple `(total_reads, total_records)`; the first is the sum of
        all records' COUNT, the second the number of records
    """
    # TODO: technically not correct: this counts the number of reads (fine)
    # and the number of ENTRIES in the busfile. Ideally each molecule would
    # have a single entry. However, a single molecule might map to two
    # different EC classes: the molecule got fragmented into two places,
    # mapping it to two different locations.
    n_reads = 0
    n_records = 0
    records = read_binary_bus(fname, decode_seq=False)
    for rec in tqdm.tqdm(records, desc='counting reads'):
        n_records += 1
        n_reads += rec.COUNT
    return n_reads, n_records
def subsample_busfile(fname_in, fname_out, fraction):
    """
    Subsample the reads of an existing busfile by `fraction`, writing the
    result into a new busfile.

    The major effect is that some entries will receive 0 reads and hence
    disappear from the file!

    :param fname_in: Filename of the input busfile
    :param fname_out: Filename of the resulting, subsampled busfile
    :param fraction: 0<fraction<1, the fraction of subsampling
    """
    assert 0 < fraction < 1, "fraction must be in [0,1]"

    # The input file has to be read twice:
    # 1. collect ALL the counts (one per record) so that they can be
    #    subsampled jointly
    # 2. iterate the input again and write each record, with its adjusted
    #    count, into the new busfile
    counts = np.array([
        record.COUNT
        for record in tqdm.tqdm(
            read_binary_bus(fname_in, decode_seq=False), desc='First pass'
        )
    ])

    n_total = np.sum(counts)
    n_target = int(n_total * fraction)
    print(f'Subsampling from {n_total} to {n_target}')

    # seeded from the current time in milliseconds
    seed = int(time.time()*1000)
    subsampled = _downsample_array(
        counts,
        target=n_target,
        random_state=seed,
        replace=False,
        inplace=False,
    )
    print(f'Downsampled reads: {subsampled.sum()}')

    def _helper_gen():
        # second pass: re-emit every record that kept at least one read,
        # carrying its subsampled count
        records = read_binary_bus(fname_in, decode_seq=False)
        for idx, record in tqdm.tqdm(enumerate(records), desc='Second pass'):
            new_count = subsampled[idx]
            if new_count > 0:
                yield Bus_record(record.CB, record.UMI, record.EC, new_count, record.FLAG)

    # the output needs the same CB/UMI lengths in its header as the input
    _, cb_len, umi_len, _ = get_header_info(fname_in)
    write_busfile(fname_out, _helper_gen(), cb_length=cb_len, umi_length=umi_len)
""" comparing the old read_binary_bus vs the new version using struct.iter_unpack """ from pybustools.pybustools import Bus from pybustools import busio, busio_old import collections import tqdm import toolz import time "gsutil -m cp -r gs://cruk-01-kallisto-nextflow/dnbseqg400.V300026370_88A.L05A_2-658952_cellranger_v3p0p1_refdata-cellranger-GRCh38-1_2_0.outs/ /tmp" f = '/tmp/dnbseqg400.V300026370_88A.L05A_2-658952_cellranger_v3p0p1_refdata-cellranger-GRCh38-1_2_0.outs/kallisto/sort_bus/bus_output' B1 = Bus(folder=f, decode_seq=True) %load_ext snakeviz %%snakeviz for a in toolz.take(10_000_000, busio.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000)): pass %%snakeviz for a in toolz.take(10_000_000, busio_old.read_binary_bus2(B1.bus_file, decode_seq=False, buffersize=1000)): pass import tqdm I1 = toolz.take(10_000_000, busio.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000)) I2 = toolz.take(10_000_000, busio_old.read_binary_bus(B1.bus_file, decode_seq=False, buffersize=1000)) for a, b in tqdm.tqdm(zip(I1, I2)): assert a==b
def _helper_gen():
    """
    Re-read `fname_in` and yield a copy of every record whose entry in `x`
    is positive, with the record's count replaced by that entry.
    """
    records = read_binary_bus(fname_in, decode_seq=False)
    for idx, record in tqdm.tqdm(enumerate(records), desc='Second pass'):
        new_count = x[idx]
        if new_count > 0:
            yield Bus_record(record.CB, record.UMI, record.EC, new_count, record.FLAG)
def iterate_bus(self):
    """
    Return an iterator over the records of `self.bus_file`, read via
    `busio.read_binary_bus` with this object's `decode_seq` setting.
    """
    records = busio.read_binary_bus(self.bus_file, self.decode_seq)
    return records