def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is unsorted.

    Two cases are written to disk and iterated:
    - same CB, but the UMIs are out of order
    - same UMI, but the CBs are out of order
    """
    bus_path = tmp_path / 'some.bus'

    # identical CB, UMI 'TAT' comes before 'AAA'
    unsorted_umis = [
        busio.Bus_record('ATAT', 'TAT', 14, 250, 13),
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
    ]
    # important: the UMI is identical here, only the CB order is wrong
    unsorted_cbs = [
        busio.Bus_record('TTAT', 'TAT', 14, 250, 13),
        busio.Bus_record('ATAT', 'TAT', 10, 20, 1),
    ]

    for bad_records in (unsorted_umis, unsorted_cbs):
        busio.write_busfile(bus_path, bad_records, cb_length=4, umi_length=3)
        with pytest.raises(ValueError):
            # consume the iterator completely, inside the raises-context
            list(pybustools.iterate_CB_UMI_of_busfile(bus_path))
def test_iterate_cb_umi(tmp_path):
    """
    iterate_CB_UMI_of_busfile must yield one (CB, UMI) key per group, together
    with the list of records belonging to that key, in file order.
    """
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path,
        [
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 11, 20, 1),
            busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
            busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        ],
        cb_length=4,
        umi_length=3,
    )

    # (expected key, expected number of records in the group);
    # only ('ATAT', 'GGG') occurs twice in the file
    expected_groups = [
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    ]

    cb_umi_iter = pybustools.iterate_CB_UMI_of_busfile(bus_path)
    for expected_key, expected_size in expected_groups:
        key, group = next(cb_umi_iter)
        assert key == expected_key and len(group) == expected_size
def subsample_bus_unseens_species(busfile, fractions):
    """
    Subsample the reads of a busfile at several depths and record how many
    reads/UMIs remain at each depth.

    Reads are first aggregated per CB/UMI pair (COUNT summed over all records
    of the pair); that vector is then downsampled once per entry of `fractions`.

    :param busfile: busfile to read, handed to iterate_CB_UMI_of_busfile
    :param fractions: sequence of fractions in [0,1] of the total number of
        reads to keep (iterated twice: once for validation, once for sampling)
    :returns: tuple (df, prevalences).
        df: DataFrame with one row per fraction and the columns
        nUMIs (entries with more than zero reads after downsampling),
        percent (the fraction), nReads (requested number of reads) and
        nReads2 (sum of the downsampled vector).
        prevalences: collections.Counter over the per-CB/UMI read counts
        (duplicity of a UMI -> number of observations of that duplicity)
    """
    gc.collect()
    assert all(0 <= frac <= 1 for frac in fractions), "Fractions must be in [0,1]"

    # one entry per CB/UMI pair rather than per CB/UMI/EC:
    # add up the counts of all records that share the pair
    cb_umi_iter = tqdm.tqdm(iterate_CB_UMI_of_busfile(busfile, decode_seq=False))
    reads = np.array([
        sum(rec.COUNT for rec in rec_group)
        for (_cb, _umi), rec_group in cb_umi_iter
    ])

    prevalences = collections.Counter(reads)
    total = reads.sum()

    rows = []
    for percent in fractions:
        print(percent)
        target = int(percent * total)
        # seed taken from the current time in milliseconds
        seed = int(time.time() * 1000)
        downsampled = _downsample_array(reads, target=target, random_state=seed, replace=False)
        rows.append({
            'nUMIs': (downsampled > 0).sum(),
            'percent': percent,
            'nReads': target,
            'nReads2': downsampled.sum(),
        })

    return pd.DataFrame(rows), prevalences
def cell_umi_producer(fname, out_queue, decode_seq):
    """
    Feed the output of iterate_CB_UMI_of_busfile into a queue.

    Each item put on `out_queue` is a tuple (cb_umi, records), where every
    record was converted with _namedtuple_to_tuple. Once the iterator is
    exhausted, TERMINATOR is put on the queue.

    :param fname: busfile, handed to iterate_CB_UMI_of_busfile
    :param out_queue: queue-like object; only its put() method is used
    :param decode_seq: passed through to iterate_CB_UMI_of_busfile
    """
    cb_umi_groups = iterate_CB_UMI_of_busfile(fname, decode_seq)
    for cb_umi, bus_records in cb_umi_groups:
        # namedtuples ran into pickling issues, hence plain tuples go onto the queue
        plain_records = list(map(_namedtuple_to_tuple, bus_records))
        out_queue.put((cb_umi, plain_records))

    out_queue.put(TERMINATOR)
def test_return_busrecord(tmp_path):
    """
    Both iterators must hand out busio.Bus_record instances (the namedtuple),
    not plain tuples.
    """
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path,
        [
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
            busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
            busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        ],
        cb_length=4,
        umi_length=3,
    )

    # first grouped by cell, then grouped by CB/UMI; same check for both
    iterators = (
        pybustools.iterate_cells_of_busfile,
        pybustools.iterate_CB_UMI_of_busfile,
    )
    for iterate in iterators:
        for _key, group in iterate(bus_path):
            for rec in group:
                assert isinstance(rec, busio.Bus_record)
def count_cb_umi_pairs(busfile):
    """
    Count the CB/UMI groups that iterate_CB_UMI_of_busfile yields for `busfile`.

    :param busfile: busfile, handed to iterate_CB_UMI_of_busfile (with decode_seq=False)
    :returns: number of items yielded by the iterator (0 if it yields nothing)
    """
    cb_umi_iter = iterate_CB_UMI_of_busfile(busfile, decode_seq=False)
    # tqdm only wraps the iterator for progress reporting
    return sum(1 for _ in tqdm.tqdm(cb_umi_iter))