Ejemplo n.º 1
0
def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is unsorted.

    Two unsorted busfiles are written and iterated, one after the other:
    1. identical CB, but the second UMI ('AAA') sorts before the first ('TAT')
    2. identical UMI, but the second CB ('ATAT') sorts before the first ('TTAT')
    """
    unsorted_cases = [
        # case 1: same CB, UMIs out of order
        [
            busio.Bus_record('ATAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
        ],
        # case 2: CBs out of order; important: the UMI should be the same
        [
            busio.Bus_record('TTAT', 'TAT', 14, 250, 13),
            busio.Bus_record('ATAT', 'TAT', 10, 20, 1),
        ],
    ]

    for unsorted_records in unsorted_cases:
        # each case overwrites the same file inside the temporary directory
        bus_path = tmp_path / 'some.bus'
        busio.write_busfile(bus_path, unsorted_records, cb_length=4, umi_length=3)

        # creating AND consuming the iterator both happen inside the raises-block,
        # so the error is caught no matter which of the two triggers it
        with pytest.raises(ValueError):
            list(pybustools.iterate_CB_UMI_of_busfile(bus_path))
Ejemplo n.º 2
0
def test_iterate_cb_umi(tmp_path):
    """
    iterate_CB_UMI_of_busfile must yield one (CB, UMI) key plus the list of
    records sharing that key, in the order the keys appear in the busfile.
    """
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path,
        [
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 11, 20, 1),
            busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
            busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        ],
        cb_length=4,
        umi_length=3,
    )

    # expected (CB, UMI) key and the number of records grouped under it;
    # only ('ATAT', 'GGG') occurs twice in the file
    expected_groups = [
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    ]

    gen = pybustools.iterate_CB_UMI_of_busfile(bus_path)
    for expected_key, expected_size in expected_groups:
        key, grouped_records = next(gen)
        assert key == expected_key and len(grouped_records) == expected_size
Ejemplo n.º 3
0
def subsample_bus_unseens_species(busfile, fractions):
    """
    Subsample the reads of a busfile at different depths.

    Records are grouped by CB/UMI (not CB/UMI/EC) and their COUNTs are summed
    per group. For every fraction, these per-group read counts are downsampled
    to `int(fraction * total_reads)` reads via `_downsample_array`, and the
    number of CB/UMI groups that still have at least one read is recorded.

    :param busfile: the busfile, handed to iterate_CB_UMI_of_busfile
    :param fractions: fractions of the total reads to subsample to, each in [0,1]
    :returns: tuple `(df, prevalences)`:
        - df: DataFrame with one row per fraction and the columns
          nUMIs (groups with >0 reads after downsampling), percent (the fraction),
          nReads (targeted number of reads), nReads2 (sum of the downsampled counts)
        - prevalences: Counter mapping reads-per-CB/UMI -> number of CB/UMI
          groups with that many reads (computed on the full, unsampled data)
    """
    gc.collect()

    assert all([0 <= frac <= 1 for frac in fractions]), "Fractions must be in [0,1]"

    # one entry per CB/UMI group: the COUNTs summed over all records (ECs) of that group
    cb_umi_groups = tqdm.tqdm(iterate_CB_UMI_of_busfile(busfile, decode_seq=False))
    reads = np.array([
        sum(rec.COUNT for rec in record_list)
        for (_cb, _umi), record_list in cb_umi_groups
    ])

    prevalences = collections.Counter(reads)
    total = reads.sum()

    rows = []
    for percent in fractions:
        print(percent)
        target = int(percent * total)
        # the seed is taken from the wall clock (milliseconds)
        seed = int(time.time()*1000)
        downsampled = _downsample_array(reads, target=target, random_state=seed, replace=False)

        rows.append({
            'nUMIs': (downsampled > 0).sum(),
            'percent': percent,
            'nReads':  target,
            'nReads2':  downsampled.sum()
        })

    return pd.DataFrame(rows), prevalences
Ejemplo n.º 4
0
def cell_umi_producer(fname, out_queue, decode_seq):
    """
    Feed everything iterate_CB_UMI_of_busfile yields into a queue.

    For each group, `(cb_umi, records)` is put onto `out_queue`, where the
    records have been converted with `_namedtuple_to_tuple`. Once the busfile
    is exhausted, TERMINATOR is put onto the queue as the last item.

    :param fname: the busfile, handed to iterate_CB_UMI_of_busfile
    :param out_queue: any object with a `put` method receiving the items
    :param decode_seq: handed to iterate_CB_UMI_of_busfile
    """
    grouped = iterate_CB_UMI_of_busfile(fname, decode_seq)
    for cb_umi, records in grouped:
        # the namedtuple records ran into pickling issues when put on the
        # queue directly, hence they are deconstructed into plain tuples first
        plain_records = list(map(_namedtuple_to_tuple, records))
        out_queue.put((cb_umi, plain_records))

    # signals that no further items will follow
    out_queue.put(TERMINATOR)
Ejemplo n.º 5
0
def test_return_busrecord(tmp_path):
    """
    Both iterate_cells_of_busfile and iterate_CB_UMI_of_busfile must hand back
    busio.Bus_record namedtuples in their record lists, not plain tuples.
    """
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path,
        [
            busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
            busio.Bus_record('ATAT', 'GGG', 10, 20, 1),
            busio.Bus_record('TAGA', 'TAT', 14, 250, 13),
            busio.Bus_record('TTAT', 'AAA', 13, 206, 12),
        ],
        cb_length=4,
        umi_length=3,
    )

    # first grouped by CB, then grouped by CB/UMI
    iterators = (
        pybustools.iterate_cells_of_busfile,
        pybustools.iterate_CB_UMI_of_busfile,
    )
    for iterate in iterators:
        for _key, grouped_records in iterate(bus_path):
            for rec in grouped_records:
                assert isinstance(rec, busio.Bus_record)
Ejemplo n.º 6
0
def count_cb_umi_pairs(busfile):
    """
    Count the number of items that iterate_CB_UMI_of_busfile yields for the
    given busfile (one item per CB/UMI group), showing a tqdm progress bar.
    """
    cb_umi_groups = tqdm.tqdm(iterate_CB_UMI_of_busfile(busfile, decode_seq=False))
    return sum(1 for _ in cb_umi_groups)