def test_read_write(tmp_path):
    """
    Round-trip integer-encoded records through a busfile and check that
    reading returns exactly what was written.
    """
    expected = [
        busio.Bus_record(0, 0, 10, 20, 1),
        busio.Bus_record(1, 0, 13, 206, 12),
        busio.Bus_record(2, 0, 14, 250, 13),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, expected, cb_length=1, umi_length=2)

    # writing must have produced a file on disk
    assert pathlib.Path(busfile).exists()

    # read/write are inverses of each other: once with a buffersize that is
    # on purpose smaller than len(expected), once with a larger one
    for size in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(busfile, decode_seq=False, buffersize=size))
        assert roundtrip == expected

    # decode_seq=False yields integer CB/UMI ...
    raw = next(
        busio.read_binary_bus(busfile, decode_seq=False, buffersize=2))
    assert isinstance(raw.CB, int) and isinstance(raw.UMI, int)

    # ... while decode_seq=True yields them as strings
    decoded = next(
        busio.read_binary_bus(busfile, decode_seq=True, buffersize=2))
    assert isinstance(decoded.CB, str) and isinstance(decoded.UMI, str)
def test_iterate_cb_umi(tmp_path):
    """
    iterate_CB_UMI_of_busfile must yield one ((CB, UMI), [records]) item per
    distinct CB/UMI pair, in file order.
    """
    written = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 11, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, written, cb_length=4, umi_length=3)

    # expected (CB, UMI) key and the number of records grouped under it
    expected_groups = [
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    ]

    groups = pybustools.iterate_CB_UMI_of_busfile(busfile)
    for expected_key, expected_size in expected_groups:
        key, members = next(groups)
        assert key == expected_key and len(members) == expected_size
def filter_busfile(inbus, outbus, suspicious):
    """
    Filter the inbus file for CUGs that are listed in the suspicious-file.

    Every record of `inbus` whose (CB, UMI) pair is contained in the
    collection stored in `suspicious` is dropped; all remaining records are
    written to `outbus` using the CB/UMI lengths from the header of `inbus`.
    A summary of how many records were dropped is printed once the input is
    exhausted.

    :param inbus: path of the busfile to be filtered
    :param outbus: path of the filtered busfile to be created
    :param suspicious: path of a pickle file holding an iterable of
        (CB, UMI) pairs. Records are read with decode_seq=False, so the pairs
        presumably have to be in the integer encoding -- TODO confirm
    """
    if VERBOSE:
        print(f'Filtering {inbus} using {suspicious}')

    # NOTE(review): unpickling executes arbitrary code contained in the file;
    # only pass suspicious-files that come from a trusted source.
    with open(suspicious, 'rb') as fh:
        # a set gives O(1) membership tests in the per-record loop below
        dubious_cb_umi = set(pickle.load(fh))

    _, cb_len, umi_len, _ = get_header_info(inbus)

    def _gen():
        n_filtered = 0
        n_total = 0
        for record in tqdm.tqdm(read_binary_bus(inbus, decode_seq=False)):
            n_total += 1
            if (record.CB, record.UMI) in dubious_cb_umi:
                n_filtered += 1
                continue
            yield record

        # an empty input busfile must not end in a ZeroDivisionError
        percentage = 100 * n_filtered / n_total if n_total else 0.0
        print(
            f'{n_filtered}/{n_total} ({percentage:.3f}%) CUGs filtered'
        )

    write_busfile(outbus, _gen(), cb_len, umi_len)
def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is
    unsorted: first with the UMIs out of order within one CB, then with the
    CBs themselves out of order.
    """
    busfile = tmp_path / 'some.bus'

    unsorted_cases = [
        # same CB, UMIs out of order
        [
            busio.Bus_record('ATAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        ],
        # important: the UMI is the same, so only the CB is out of order
        [
            busio.Bus_record('TTAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'TAT', 10, 20, 1),
        ],
    ]

    for unsorted_records in unsorted_cases:
        busio.write_busfile(
            busfile, unsorted_records, cb_length=4, umi_length=3)
        with pytest.raises(ValueError):
            groups = pybustools.iterate_CB_UMI_of_busfile(busfile)
            list(groups)
def in_parallel(bus, outfile, cores):
    """
    Run one `cell_producer` process and `cores` `pug_writer` processes on
    `bus` and store the sorted, merged result in `outfile`.

    All processes share one bounded queue. Each `pug_writer` process writes
    its own intermediate busfile into a temporary folder; these parts are
    concatenated into one unsorted busfile which is then sorted with
    `bustools sort`. The temporary folder is removed afterwards, also when
    an error occurs.

    :param bus: handed through to `cell_producer` and `pug_writer`
    :param outfile: path of the sorted busfile created by `bustools sort`
    :param cores: number of `pug_writer` processes (and intermediate files)
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    import shutil
    import subprocess
    import sys

    # put all the intermediate results in here
    tmpfolder = tempfile.mkdtemp(prefix='pug_', dir='/tmp')
    try:
        QUEUE_LENGTH = 1000
        cb_queue = mp.Queue(QUEUE_LENGTH)  # has (CB, [records]) as elements

        # one intermediate busfile per writer process
        part_files = [f'{tmpfolder}/{i}.bus' for i in range(cores)]

        worker_tasks = [
            mp.Process(target=cell_producer, args=(bus, cb_queue, cores))
        ]
        for busfile in part_files:
            worker_tasks.append(
                mp.Process(target=pug_writer, args=(bus, cb_queue, busfile)))

        for t in worker_tasks:
            t.start()
        for t in worker_tasks:
            t.join()

        # cleanup
        assert cb_queue.empty(), "queue not empty!!"
        cb_queue.close()
        for t in worker_tasks:
            t.close()

        # merge all busfiles into a big one!
        bus_iterators = [
            busio.read_binary_bus(busfile) for busfile in part_files
        ]
        big_gen = itertools.chain.from_iterable(bus_iterators)
        unsorted_outfile = f'{tmpfolder}/unsorted.bus'
        write_busfile(unsorted_outfile, big_gen, cb_length=16, umi_length=12)

        # cleanup the parts right away to free disk space before sorting
        for busfile in part_files:
            os.remove(busfile)

        # sort the file
        ret = subprocess.run(
            ["bustools", "sort", '-o', outfile, unsorted_outfile])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}')
    finally:
        # the temporary folder (incl. unsorted.bus) used to be left behind
        shutil.rmtree(tmpfolder, ignore_errors=True)
def test_write_check_cb_umi_length(tmp_path):
    """
    write_busfile must raise an AssertionError if a record does not match
    the given CB/UMI length.
    """
    target = tmp_path / 'some.bus'
    mismatched = [
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        # CB 'TTA' has only 3 bases although cb_length=4 is requested
        busio.Bus_record('TTA', 'AAA', 13, 206, 12),
    ]
    with pytest.raises(AssertionError):
        busio.write_busfile(target, mismatched, cb_length=4, umi_length=3)
def test_get_header(tmp_path):
    """
    get_header_info must report the CB/UMI lengths the busfile was written
    with.
    """
    busfile = tmp_path / 'some.bus'
    written = [
        busio.Bus_record(0, 0, 10, 20, 1),
        busio.Bus_record(1, 0, 13, 206, 12),
        busio.Bus_record(2, 0, 14, 250, 13),
    ]
    busio.write_busfile(busfile, written, cb_length=12, umi_length=5)

    # only the second and third header field are checked here
    _, cb_len, umi_len, _ = busio.get_header_info(busfile)
    assert (cb_len, umi_len) == (12, 5)
def test_iterate_cells_raise_unsorted(tmp_path):
    """
    iterate_cells_of_busfile must raise a ValueError if the busfile is
    unsorted (in terms of CBs).
    """
    busfile = tmp_path / 'some.bus'
    # 'TTAT' comes before 'ATAT', i.e. the CBs are out of order
    unsorted_records = [
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
    ]
    busio.write_busfile(busfile, unsorted_records, cb_length=4, umi_length=3)

    with pytest.raises(ValueError):
        cells = pybustools.iterate_cells_of_busfile(busfile)
        list(cells)
def test_read_write_str(tmp_path):
    """
    Records whose CB/UMI are given as strings instead of ints must survive a
    write/read round trip when read with decode_seq=True.
    """
    busfile = tmp_path / 'some.bus'
    expected = [
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
    ]
    busio.write_busfile(busfile, expected, cb_length=4, umi_length=3)

    # read back and compare to the original
    reader = busio.read_binary_bus(busfile, decode_seq=True, buffersize=2)
    assert list(reader) == expected
def test_emit_records(tmp_path, ec_matrix_file, transcript_file):
    """
    Two busfiles contain the same CB/UMI, but with a different EC.
    emit_records_based_on_gene is expected to yield two entries for that
    CB/UMI.
    """
    cb_length, umi_length = 4, 3

    # (sample name, file name, records); records are
    # (CB, UMI, EC, COUNT, FLAG) and differ only in EC and COUNT
    samples = [
        ('s1', 'some1.bus', [busio.Bus_record('ATAT', 'GGG', 1, 10, 1)]),
        ('s2', 'some2.bus', [busio.Bus_record('ATAT', 'GGG', 9, 20, 1)]),
    ]

    busobject_dict = {}
    busfiles = []
    for sample_name, file_name, sample_records in samples:
        busfile = tmp_path / file_name
        busio.write_busfile(busfile, sample_records, cb_length, umi_length)
        busobject_dict[sample_name] = Bus(folder='/',
                                          bus_name=busfile,
                                          ec_name=ec_matrix_file,
                                          transcript_name=transcript_file)
        busfiles.append(busfile)

    bus_iter = iterate_bus_cells_umi_multiple(['s1', 's2'],
                                              busfiles,
                                              decode_seq=False)

    for (cb_, umi_), info_dict_ in bus_iter:
        print('----------')
        print(info_dict_)
        print('----------')

        # this splits the CB/UMI according to the gene it maps to
        n_emitted = 0
        for (cb, umi), info_dict in emit_records_based_on_gene(
                cb_, umi_, info_dict_, busobject_dict):
            n_emitted += 1
            print(cb, umi)
            print(info_dict)
        assert n_emitted == 2
def test_subsampling(tmp_path):
    """
    Subsampling a busfile holding 10 reads with fraction=0.5 must leave
    5 reads.
    """
    source = tmp_path / 'some.bus'
    subsampled = tmp_path / 'sub.bus'

    # a total of 3 UMIs, but 10 counts (3 + 3 + 4)
    written = [
        #                CB UMI EC COUNT FLAG
        busio.Bus_record(0, 0, 1, 3, 1),
        busio.Bus_record(1, 0, 2, 3, 12),
        busio.Bus_record(2, 0, 3, 4, 13),
    ]
    busio.write_busfile(source, written, cb_length=1, umi_length=2)

    subsampling.subsample_busfile(source, subsampled, fraction=0.5)

    # check the number of reads in the subsampled version
    nreads, _nmol = subsampling.get_number_of_reads_and_molecules(subsampled)
    assert nreads == 5
def test_return_busrecord(tmp_path):
    """
    The iterators must hand out Bus_record namedtuples, not just plain
    tuples.
    """
    busfile = tmp_path / 'some.bus'
    written = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    busio.write_busfile(busfile, written, cb_length=4, umi_length=3)

    # grouped by cell
    for _cb, grouped in pybustools.iterate_cells_of_busfile(busfile):
        assert all(isinstance(rec, busio.Bus_record) for rec in grouped)

    # grouped by cell and UMI
    for _cbumi, grouped in pybustools.iterate_CB_UMI_of_busfile(busfile):
        assert all(isinstance(rec, busio.Bus_record) for rec in grouped)
def h5_to_bus(h5filename, busfile_output, TMPDIR=None):
    """
    turns a 10x molecule_info.h5 into a "fake" bus file:
    Instead of EC, we just report the actual gene cellranger mapped the read.

    The records are first written to an unsorted temporary busfile, which is
    then sorted into `busfile_output` with `bustools sort`. The temporary
    file is removed afterwards, also when an error occurs.

    :param h5filename: path to the h5file
    :param busfile_output: path to the busfile to be created
    :param TMPDIR: optional, path to store temporary files
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    # every dataset is sliced with [:] before the file is closed, so the
    # record generator below no longer needs the open h5 file
    with h5py.File(h5filename, mode='r') as fh:
        CB_list = [_.decode() for _ in fh['/barcodes'][:]]
        # gene_list = [_.decode() for _ in fh['/features/name'][:]]
        n_entries = fh['/barcode_idx'].shape[0]
        cbs_idx = fh['/barcode_idx'][:]
        gene_idx = fh['/feature_idx'][:]
        counts = fh['/count'][:]
        umis = fh['/umi'][:]

    def _gen():
        for i in tqdm.trange(n_entries):
            cb = _encode_ACGT_to_int(CB_list[cbs_idx[i]])
            # the feature index takes the place of the EC; FLAG is always 0
            yield Bus_record(cb, int(umis[i]), gene_idx[i], counts[i], 0)

    # mkstemp returns an already opened file descriptor; close it right away
    # (only the name is needed) instead of leaking it
    fd, unsorted_name = tempfile.mkstemp('.bus', 'unsorted_', TMPDIR)
    os.close(fd)
    try:
        write_busfile(unsorted_name, _gen(), cb_length=16, umi_length=12)

        print('sorting')
        ret = subprocess.run(
            ["bustools", "sort", '-o', busfile_output, unsorted_name])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}')
    finally:
        # delete the tmp file even if writing or sorting failed
        os.unlink(unsorted_name)
def subsample_busfile(fname_in, fname_out, fraction):
    """
    subsample the reads of an existing busfile by `fraction`, writing the
    result into a new busfile. The major effect is that some entries will
    receive 0 reads and hence disappear from the file!

    :param fname_in: Filename of the input busfile
    :param fname_out: Filename of the resulting, subsampled busfile
    :param fraction: 0<fraction<1, the fraction of subsampling
    :raises AssertionError: if `fraction` is not strictly between 0 and 1
    """
    # the check excludes both end points, hence the open interval
    assert 0 < fraction < 1, "fraction must be in (0,1)"

    # for this to work, we have to pass the input file twice:
    # 1. we have to collect ALL the counts (for each record)
    #    this will then be jointly subsampled
    # 2. we iterate the inputfile again, just now we write out each record
    #    into a different busfile with adjusted count
    first_pass = read_binary_bus(fname_in, decode_seq=False)
    huge_array = np.array([
        record.COUNT for record in tqdm.tqdm(first_pass, desc='First pass')
    ])

    n_total = np.sum(huge_array)
    n_target = int(n_total * fraction)
    print(f'Subsampling from {n_total} to {n_target}')

    # seeded from the wall clock: repeated calls yield different subsamples
    x = _downsample_array(huge_array,
                          target=n_target,
                          random_state=int(time.time()*1000),
                          replace=False,
                          inplace=False)
    print(f'Downsampled reads: {x.sum()}')

    # create a generator for the bus-records
    def _helper_gen():
        second_pass = read_binary_bus(fname_in, decode_seq=False)
        for i, record in tqdm.tqdm(enumerate(second_pass), desc='Second pass'):
            # x[i] is the subsampled count of the i-th record of the file;
            # records that lost all their reads are dropped
            if x[i] > 0:
                yield Bus_record(record.CB, record.UMI, record.EC, x[i],
                                 record.FLAG)

    # we need to write the correct header
    _, cb_len, umi_len, _ = get_header_info(fname_in)
    write_busfile(fname_out, _helper_gen(),
                  cb_length=cb_len, umi_length=umi_len)
def test_ParallelCellGenerator():
    """
    assert that ParallelCellGenerator yields the same (cb, info) items as
    the serial iterate_bus_cells_multiple, and that the records it returns
    are Bus_records, not just tuples
    """
    import tempfile

    samples = ['sample1', 'sample2', 'sample3']
    all_records = [random_buslist(1000, 4, 4, 20) for _ in samples]

    # a private temporary directory replaces the deprecated, race-prone
    # tempfile.mktemp(); it is deleted (with the busfiles) on exit
    with tempfile.TemporaryDirectory() as tmpdir:
        fnames = [f'{tmpdir}/{sample}.bus' for sample in samples]
        for fname, records in zip(fnames, all_records):
            busio.write_busfile(fname, records, cb_length=4, umi_length=4)

        pgen = ParallelCellGenerator(dict(zip(samples, fnames)),
                                     decode_seq=True,
                                     queue_size=10)
        pgen.start_queues()
        parallel_results = dict(pgen.iterate())

        serial_results = dict(iterate_bus_cells_multiple(samples, fnames))

        assert parallel_results == serial_results

        # check that they return Bus_records, not just tuples
        for info in parallel_results.values():
            for record_list in info.values():
                for r in record_list:
                    assert isinstance(r, busio.Bus_record)
def test_fingerprint(tmp_path, ec_matrix_file, transcript_file):
    """
    This should create 3 fingerprints:
    [10, 0] x 2
    [99, 0] x 1
    [0, 20] x 1
    """
    cb_length, umi_length = 4, 3

    # (sample name, file name, records);
    # records are (CB, UMI, EC, COUNT, FLAG)
    samples = [
        ('s1', 'some1.bus', [
            busio.Bus_record('ATAT', 'GGG', 1, 10, 1),
            busio.Bus_record('ATAT', 'TTT', 1, 10, 1),
            busio.Bus_record('CTAT', 'GGG', 1, 99, 1),
        ]),
        ('s2', 'some2.bus', [
            busio.Bus_record('ATAT', 'GGG', 9, 20, 1),
        ]),
    ]

    busobject_dict = {}
    for sample_name, file_name, sample_records in samples:
        busfile = tmp_path / file_name
        busio.write_busfile(busfile, sample_records, cb_length, umi_length)
        busobject_dict[sample_name] = Bus(folder='/',
                                          bus_name=busfile,
                                          ec_name=ec_matrix_file,
                                          transcript_name=transcript_file)

    fingerprints, _cond = phantom_create_dataframes(busobject_dict)
    print(fingerprints)

    # three distinct fingerprints, observed four times in total
    assert len(fingerprints) == 3
    assert fingerprints['freq'].sum() == 4
def test_iterate_cells(tmp_path):
    """
    iterate_cells_of_busfile must yield one (CB, [records]) item per cell,
    in file order, with the records exactly as written.
    """
    written = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, written, cb_length=4, umi_length=3)

    # expected CB together with the records belonging to it;
    # the first cell must have two UMIs
    expected_cells = [
        ('ATAT', written[:2]),
        ('TAGA', [written[2]]),
        ('TTAT', [written[3]]),
    ]

    cells = pybustools.iterate_cells_of_busfile(busfile)
    for expected_cb, expected_records in expected_cells:
        cb, members = next(cells)
        assert cb == expected_cb and len(members) == len(expected_records)
        assert members == expected_records
def random_buslist(n_records, cb_length, umi_length, ngenes): records = sorted([ random_busrecord(cb_length, umi_length, ngenes) for _ in range(n_records) ]) return records if __name__ == '__main__': fname1 = '/tmp/some1.bus' fname2 = '/tmp/some2.bus' records1 = random_buslist(500, cb_length=4, umi_length=5, ngenes=10) records2 = random_buslist(500, cb_length=4, umi_length=5, ngenes=10) busio.write_busfile(fname1, records1, cb_length=4, umi_length=5) busio.write_busfile(fname2, records2, cb_length=4, umi_length=5) PCG = ParallelCellGenerator({ 'sample1': fname1, 'sample2': fname2 }, decode_seq=True, queue_size=3) PCG.start_queues() results = {cb: info for cb, info in PCG.iterate()} from pybustools.pybustools import iterate_bus_cells_multiple results_serial = { cb: info
def pug_writer(bus, queue, outfile, cb_length=16, umi_length=12):
    """
    wrapper for continuously taking an item from the queue and writing it
    to a busfile

    :param bus: handed through to `_gen`
    :param queue: queue the records are taken from (via `_gen`)
    :param outfile: path of the busfile to be written
    :param cb_length: CB length passed to `write_busfile`; defaults to the
        previously hard-coded 16
    :param umi_length: UMI length passed to `write_busfile`; defaults to the
        previously hard-coded 12
    """
    write_busfile(outfile,
                  _gen(bus, queue),
                  cb_length=cb_length,
                  umi_length=umi_length)