def test_read_write(tmp_path):
    """
    Round-trip integer-encoded records through write_busfile and
    read_binary_bus, and check the CB/UMI types selected by decode_seq.
    """
    expected = [
        busio.Bus_record(0, 0, 10, 20, 1),
        busio.Bus_record(1, 0, 13, 206, 12),
        busio.Bus_record(2, 0, 14, 250, 13),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, expected, cb_length=1, umi_length=2)

    # writing must have produced the file
    assert pathlib.Path(busfile).exists()

    # reading must invert writing, both when the buffer is deliberately
    # smaller than len(expected) (2) and when it is larger (20)
    for buffersize in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(busfile, decode_seq=False,
                                  buffersize=buffersize))
        assert roundtrip == expected

    # decode_seq=False must give int CB/UMI, decode_seq=True must give str
    for decode_seq, expected_type in ((False, int), (True, str)):
        first = next(
            busio.read_binary_bus(busfile, decode_seq=decode_seq,
                                  buffersize=2))
        assert isinstance(first.CB, expected_type)
        assert isinstance(first.UMI, expected_type)
def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is
    unsorted, either in the UMIs within one CB or in the CBs themselves
    """
    unsorted_cases = [
        # same CB, UMIs out of order
        [
            busio.Bus_record('ATAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        ],
        # CBs out of order; important: the UMI should be the same
        [
            busio.Bus_record('TTAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'TAT', 10, 20, 1),
        ],
    ]

    for records in unsorted_cases:
        # each case overwrites the same file
        fname = tmp_path / 'some.bus'
        busio.write_busfile(fname, records, cb_length=4, umi_length=3)

        with pytest.raises(ValueError):
            # consume the iterator fully so the check is actually reached
            list(pybustools.iterate_CB_UMI_of_busfile(fname))
def test_iterate_cb_umi(tmp_path):
    """
    iterate_CB_UMI_of_busfile must yield one ((CB, UMI), records) pair per
    distinct CB/UMI combination, in file order.
    """
    records = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 11, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, records, cb_length=4, umi_length=3)

    # expected (CB, UMI) key and the number of records grouped under it
    expected_groups = [
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    ]

    grouped = pybustools.iterate_CB_UMI_of_busfile(fname)
    for expected_key, expected_size in expected_groups:
        key, group = next(grouped)
        assert key == expected_key
        assert len(group) == expected_size
def test_write_check_cb_umi_length(tmp_path):
    """
    write_busfile must raise an AssertionError when a record does not
    match the declared CB/UMI length
    """
    target = tmp_path / 'some.bus'
    mismatched = [
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        # this CB has only 3 bases although cb_length=4 is declared below
        busio.Bus_record('TTA', 'AAA', 13, 206, 12),
    ]

    with pytest.raises(AssertionError):
        busio.write_busfile(target, mismatched, cb_length=4, umi_length=3)
def test_get_header(tmp_path):
    """
    get_header_info must report the CB and UMI lengths the file was
    written with (second and third element of the returned 4-tuple).
    """
    fname = tmp_path / 'some.bus'
    busio.write_busfile(
        fname,
        [
            busio.Bus_record(0, 0, 10, 20, 1),
            busio.Bus_record(1, 0, 13, 206, 12),
            busio.Bus_record(2, 0, 14, 250, 13),
        ],
        cb_length=12,
        umi_length=5,
    )

    # only the two middle header fields are checked here
    _first, cb_len, umi_len, _last = busio.get_header_info(fname)
    assert cb_len == 12
    assert umi_len == 5
def test_iterate_cells_raise_unsorted(tmp_path):
    """
    iterate_cells_of_busfile must raise a ValueError if the busfile is
    unsorted (in terms of CBs)
    """
    # 'TTAT' comes before 'ATAT', so the CBs are out of order
    unsorted_records = [
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, unsorted_records, cb_length=4, umi_length=3)

    with pytest.raises(ValueError):
        # consume the iterator fully so the check is actually reached
        list(pybustools.iterate_cells_of_busfile(busfile))
def test_read_write_str(tmp_path):
    """
    Round-trip records whose CB/UMI are given as strings instead of ints
    """
    original = [
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
    ]
    busfile = tmp_path / 'some.bus'
    busio.write_busfile(busfile, original, cb_length=4, umi_length=3)

    # read back with sequence decoding and compare to the original
    reader = busio.read_binary_bus(busfile, decode_seq=True, buffersize=2)
    assert list(reader) == original
def test_emit_records(tmp_path, ec_matrix_file, transcript_file):
    """
    Two busfiles share the same CB/UMI, but it maps to a different EC/gene
    in each of them: emit_records_based_on_gene should yield multiple
    entries (two) for that single CB/UMI.

    The number of emitted entries is recorded per CB/UMI group and checked
    after the loop, so the test cannot pass vacuously (or fail with an
    unrelated NameError) when the merged iterator yields no group at all.
    """
    cb_length = 4
    umi_length = 3

    # Bus_record fields:         CB      UMI   EC COUNT FLAG
    records1 = [
        busio.Bus_record('ATAT', 'GGG', 1, 10, 1),
    ]
    fname1 = tmp_path / 'some1.bus'
    busio.write_busfile(fname1, records1, cb_length, umi_length)
    bus1 = Bus(folder='/', bus_name=fname1, ec_name=ec_matrix_file,
               transcript_name=transcript_file)

    records2 = [
        busio.Bus_record('ATAT', 'GGG', 9, 20, 1),
    ]
    fname2 = tmp_path / 'some2.bus'
    busio.write_busfile(fname2, records2, cb_length, umi_length)
    bus2 = Bus(folder='/', bus_name=fname2, ec_name=ec_matrix_file,
               transcript_name=transcript_file)

    busobject_dict = {'s1': bus1, 's2': bus2}

    bus_iter = iterate_bus_cells_umi_multiple(['s1', 's2'],
                                              [fname1, fname2],
                                              decode_seq=False)

    # number of entries emitted for each CB/UMI group, in iteration order
    emitted_per_group = []
    for (cb_, umi_), info_dict_ in bus_iter:
        print('----------')
        print(info_dict_)
        print('----------')

        # this splits the group according to gene
        counter = 0
        for (cb, umi), info_dict in emit_records_based_on_gene(
                cb_, umi_, info_dict_, busobject_dict):
            counter += 1
            print(cb, umi)
            print(info_dict)
        emitted_per_group.append(counter)

    # both files contain only the one shared CB/UMI, hence exactly one
    # group, and that group must be split into two entries
    assert emitted_per_group == [2]
def test_subsampling(tmp_path):
    """
    Subsampling with fraction=0.5 a busfile holding 10 reads must leave
    5 reads in the output file.
    """
    # three records whose counts add up to 10 (3 + 3 + 4)
    #                    CB UMI EC COUNT FLAG
    source_records = [
        busio.Bus_record(0, 0, 1, 3, 1),
        busio.Bus_record(1, 0, 2, 3, 12),
        busio.Bus_record(2, 0, 3, 4, 13),
    ]
    source = tmp_path / 'some.bus'
    busio.write_busfile(source, source_records, cb_length=1, umi_length=2)

    target = tmp_path / 'sub.bus'
    subsampling.subsample_busfile(source, target, fraction=0.5)

    # only the read count of the subsampled file is checked
    nreads, _nmol = subsampling.get_number_of_reads_and_molecules(target)
    assert nreads == 5
def random_busrecord(cb_length, umi_length, ngenes):
    """
    Create a Bus_record with random content.

    The CB and UMI are random strings over A/C/G/T of length `cb_length`
    and `umi_length`, the third field is drawn from range(ngenes), the
    count is drawn from 0..50 (both inclusive) and the flag is always 1.
    """
    alphabet = 'ACGT'

    def _random_sequence(length):
        # one randrange call per base, drawn left to right
        return ''.join(
            alphabet[random.randrange(0, 4)] for _ in range(length))

    # keep this order: CB, UMI, gene, counts (it fixes the RNG stream)
    cb = _random_sequence(cb_length)
    umi = _random_sequence(umi_length)
    gene = random.randrange(0, ngenes)
    counts = random.randint(0, 50)
    return busio.Bus_record(cb, umi, gene, counts, 1)
def test_return_busrecord(tmp_path):
    """
    The iterators must hand back Bus_record namedtuples, not plain tuples
    """
    records = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, records, cb_length=4, umi_length=3)

    # first grouped by cell, then grouped by CB/UMI
    iterators = (
        pybustools.iterate_cells_of_busfile,
        pybustools.iterate_CB_UMI_of_busfile,
    )
    for iterate in iterators:
        for _key, grouped_records in iterate(fname):
            assert all(
                isinstance(r, busio.Bus_record) for r in grouped_records)
def test_fingerprint(tmp_path, ec_matrix_file, transcript_file):
    """
    This should create 3 fingerprints:
    [10, 0]  x 2
    [99, 0]  x 1
    [0, 20]  x 1
    """
    cb_length = 4
    umi_length = 3

    def _write_and_wrap(filename, records):
        # write the records to tmp_path/filename and wrap the file in a Bus
        path = tmp_path / filename
        busio.write_busfile(path, records, cb_length, umi_length)
        return Bus(folder='/', bus_name=path, ec_name=ec_matrix_file,
                   transcript_name=transcript_file)

    # Bus_record fields:         CB      UMI   EC COUNT FLAG
    sample1_records = [
        busio.Bus_record('ATAT', 'GGG', 1, 10, 1),
        busio.Bus_record('ATAT', 'TTT', 1, 10, 1),
        busio.Bus_record('CTAT', 'GGG', 1, 99, 1),
    ]
    sample2_records = [
        busio.Bus_record('ATAT', 'GGG', 9, 20, 1),
    ]

    busobject_dict = {
        's1': _write_and_wrap('some1.bus', sample1_records),
        's2': _write_and_wrap('some2.bus', sample2_records),
    }

    df, _cond = phantom_create_dataframes(busobject_dict)
    print(df)

    # three distinct fingerprints, seen four times in total
    assert len(df) == 3
    assert df['freq'].sum() == 4
def test_iterate_cells(tmp_path):
    """
    iterate_cells_of_busfile must yield one (CB, records) pair per cell,
    in file order, with all records of that cell.
    """
    records = [
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
        busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
        busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, records, cb_length=4, umi_length=3)

    # expected CB and the records belonging to it; the first cell must
    # carry two UMIs, the others one each
    expected_cells = [
        ('ATAT', records[:2]),
        ('TAGA', [records[2]]),
        ('TTAT', [records[3]]),
    ]

    cells = pybustools.iterate_cells_of_busfile(fname)
    for expected_cb, expected_records in expected_cells:
        cb, cell_records = next(cells)
        assert cb == expected_cb
        assert len(cell_records) == len(expected_records)
        assert cell_records == expected_records