Exemple #1
0
def test_read_write(tmp_path):
    """Round-trip records with integer CB/UMI through a busfile."""
    expected = [
        busio.Bus_record(*fields)
        for fields in (
            (0, 0, 10, 20, 1),
            (1, 0, 13, 206, 12),
            (2, 0, 14, 250, 13),
        )
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, expected, cb_length=1, umi_length=2)

    # writing must have produced a file on disk
    assert pathlib.Path(fname).exists()

    # reading must invert writing, both with a buffer that is (on purpose)
    # smaller than the number of records and with a larger one
    for buffersize in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(fname,
                                  decode_seq=False,
                                  buffersize=buffersize))
        assert roundtrip == expected

    # decode_seq switches the type of CB/UMI: int when off, str when on
    for decode_seq, seq_type in ((False, int), (True, str)):
        first = next(
            busio.read_binary_bus(fname, decode_seq=decode_seq, buffersize=2))
        assert isinstance(first.CB, seq_type) and isinstance(
            first.UMI, seq_type)
Exemple #2
0
def test_iterate_cb_umi(tmp_path):
    """Records of a busfile must come back grouped by their (CB, UMI) pair."""
    rows = (
        ('ATAT', 'AAA', 10, 20, 1),
        ('ATAT', 'GGG', 10, 20, 1),
        ('ATAT', 'GGG', 11, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
        ('TTAT', 'AAA', 13, 206, 12),
    )
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                        cb_length=4,
                        umi_length=3)

    groups = pybustools.iterate_CB_UMI_of_busfile(fname)

    # (CB, UMI) key and number of records expected for each group, in order
    expected_groups = (
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    )
    for expected_key, expected_size in expected_groups:
        key, group = next(groups)
        assert key == expected_key and len(group) == expected_size
Exemple #3
0
def filter_busfile(inbus, outbus, suspicious):
    """
    Write a copy of `inbus` to `outbus`, dropping every record whose
    (CB, UMI) pair is listed in the `suspicious` file.

    :param inbus: path of the busfile to filter
    :param outbus: path of the filtered busfile to create
    :param suspicious: path of a pickle file holding an iterable of (CB, UMI)
        pairs; they are compared against the records as read with
        `decode_seq=False`
    """
    if VERBOSE:
        print(f'Filtering {inbus} using {suspicious}')

    # SECURITY: unpickling can execute arbitrary code, only pass a
    # `suspicious` file that comes from a trusted source
    with open(suspicious, 'rb') as fh:
        dubious_cb_umi = set(pickle.load(fh))

    _, cb_len, umi_len, _ = get_header_info(inbus)

    def _gen():
        n_filtered = 0
        n_total = 0
        for record in tqdm.tqdm(read_binary_bus(inbus, decode_seq=False)):
            n_total += 1
            if (record.CB, record.UMI) in dubious_cb_umi:
                n_filtered += 1
                continue
            yield record

        # an input busfile without any records must not cause a
        # ZeroDivisionError in the summary
        percent = 100 * n_filtered / n_total if n_total else 0.0
        print(f'{n_filtered}/{n_total} ({percent:.3f}%) CUGs filtered')

    write_busfile(outbus, _gen(), cb_len, umi_len)
Exemple #4
0
def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is
    unsorted, no matter whether the UMIs or the CBs are out of order
    """
    unsorted_cases = (
        # same CB, the UMIs are out of order
        (('ATAT', 'TAT', 14, 250, 13), ('ATAT', 'AAA', 10, 20, 1)),
        # important: same UMI, so only the CBs are out of order
        (('TTAT', 'TAT', 14, 250, 13), ('ATAT', 'TAT', 10, 20, 1)),
    )
    for rows in unsorted_cases:
        # the same filename is reused, the second case overwrites the first
        fname = tmp_path / 'some.bus'
        busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                            cb_length=4,
                            umi_length=3)

        with pytest.raises(ValueError):
            list(pybustools.iterate_CB_UMI_of_busfile(fname))
Exemple #5
0
def in_parallel(bus, outfile, cores):
    """
    Run `cell_producer` and `cores` `pug_writer` processes on `bus` and store
    their merged, sorted output in `outfile`.

    One process runs `cell_producer`, `cores` processes run `pug_writer`; all
    of them share a single queue (maximum size 1000). Each `pug_writer` gets
    its own partial busfile inside a temporary folder. Once all processes have
    finished, the partial busfiles are concatenated and sorted with
    `bustools sort`. The temporary folder is removed in any case.

    NOTE(review): the CB/UMI lengths of the merged file are hard-coded to
    16/12 -- confirm that this matches `bus`.

    :param bus: passed through to `cell_producer` and `pug_writer`
    :param outfile: path of the sorted busfile to create
    :param cores: number of `pug_writer` processes to start
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    import shutil
    import subprocess
    import sys

    # all the intermediate results go in here
    tmpfolder = tempfile.mkdtemp(prefix='pug_', dir='/tmp')
    try:
        # one partial busfile per pug_writer process
        part_files = [f'{tmpfolder}/{i}.bus' for i in range(cores)]

        QUEUE_LENGTH = 1000
        cb_queue = mp.Queue(QUEUE_LENGTH)  # has (CB, [records]) as elements

        worker_tasks = [
            mp.Process(target=cell_producer, args=(bus, cb_queue, cores))
        ]
        worker_tasks.extend(
            mp.Process(target=pug_writer, args=(bus, cb_queue, busfile))
            for busfile in part_files)

        for t in worker_tasks:
            t.start()

        for t in worker_tasks:
            t.join()

        # cleanup
        assert cb_queue.empty(), "queue not empty!!"
        cb_queue.close()

        for t in worker_tasks:
            t.close()

        # merge all busfiles into a big one!
        bus_iterators = [
            busio.read_binary_bus(busfile) for busfile in part_files
        ]
        big_gen = itertools.chain.from_iterable(bus_iterators)
        unsorted_outfile = f'{tmpfolder}/unsorted.bus'
        write_busfile(unsorted_outfile, big_gen, cb_length=16, umi_length=12)

        # remove the parts before sorting, they are not needed any longer
        for busfile in part_files:
            os.remove(busfile)

        # sort the file
        ret = subprocess.run(
            ["bustools", "sort", '-o', outfile, unsorted_outfile])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}')
    finally:
        # never leave the temporary folder (and unsorted.bus) behind
        shutil.rmtree(tmpfolder, ignore_errors=True)
Exemple #6
0
def test_write_check_cb_umi_length(tmp_path):
    """
    write_busfile must raise an AssertionError if a record does not match the
    given CB/UMI length
    """
    fname = tmp_path / 'some.bus'
    # the CB of the second record has 3 bases only, but cb_length is 4
    mismatched = [
        busio.Bus_record(cb, umi, ec, count, flag)
        for cb, umi, ec, count, flag in (
            ('ATAT', 'GGG', 10, 20, 1),
            ('TTA', 'AAA', 13, 206, 12),
        )
    ]
    with pytest.raises(AssertionError):
        busio.write_busfile(fname, mismatched, cb_length=4, umi_length=3)
Exemple #7
0
def test_get_header(tmp_path):
    """The header of a busfile must report the CB/UMI lengths it was written with."""
    fname = tmp_path / 'some.bus'
    rows = ((0, 0, 10, 20, 1), (1, 0, 13, 206, 12), (2, 0, 14, 250, 13))
    busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                        cb_length=12,
                        umi_length=5)

    # only the second and third header field are of interest here
    header = busio.get_header_info(fname)
    _, cb, umi, _ = header
    assert (cb, umi) == (12, 5)
Exemple #8
0
def test_iterate_cells_raise_unsorted(tmp_path):
    """
    iterate_cells_of_busfile must raise a ValueError if the CBs of the busfile
    are not sorted
    """
    unsorted_rows = (
        ('TTAT', 'AAA', 13, 206, 12),
        ('ATAT', 'AAA', 10, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
    )
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname,
                        [busio.Bus_record(*row) for row in unsorted_rows],
                        cb_length=4,
                        umi_length=3)

    with pytest.raises(ValueError):
        # exhaust the generator, it has to fail somewhere along the way
        for _ in pybustools.iterate_cells_of_busfile(fname):
            pass
Exemple #9
0
def test_read_write_str(tmp_path):
    """
    round-trip records whose CB/UMI are given as strings instead of ints
    """
    fname = tmp_path / 'some.bus'
    written = [
        busio.Bus_record(cb, umi, ec, count, flag)
        for cb, umi, ec, count, flag in (
            ('ATAT', 'GGG', 10, 20, 1),
            ('TTAT', 'AAA', 13, 206, 12),
            ('TAGA', 'TAT', 14, 250, 13),
        )
    ]
    busio.write_busfile(fname, written, cb_length=4, umi_length=3)

    # reading with decoded sequences must give back the original records
    reader = busio.read_binary_bus(fname, decode_seq=True, buffersize=2)
    assert list(reader) == written
Exemple #10
0
def test_emit_records(tmp_path, ec_matrix_file, transcript_file):
    """
    Two busfiles contain the same CB/UMI, but with different ECs.
    emit_records_based_on_gene is expected to yield two entries for it.
    """
    cb_length = 4
    umi_length = 3

    def _make_bus(filename, rows):
        # write `rows` into a busfile and wrap that file in a Bus object
        path = tmp_path / filename
        busio.write_busfile(path, [busio.Bus_record(*row) for row in rows],
                            cb_length, umi_length)
        bus = Bus(folder='/',
                  bus_name=path,
                  ec_name=ec_matrix_file,
                  transcript_name=transcript_file)
        return path, bus

    # rows are (CB, UMI, EC, COUNT, FLAG)
    fname1, bus1 = _make_bus('some1.bus', [('ATAT', 'GGG', 1, 10, 1)])
    fname2, bus2 = _make_bus('some2.bus', [('ATAT', 'GGG', 9, 20, 1)])

    busobject_dict = {'s1': bus1, 's2': bus2}
    bus_iter = iterate_bus_cells_umi_multiple(['s1', 's2'], [fname1, fname2],
                                              decode_seq=False)
    for (joint_cb, joint_umi), joint_info in bus_iter:
        # this gets split up according to the gene
        print('----------')
        print(joint_info)
        print('----------')

        emitted = emit_records_based_on_gene(joint_cb, joint_umi, joint_info,
                                             busobject_dict)
        n_emitted = 0
        for (cb, umi), info_dict in emitted:
            n_emitted += 1
            print(cb, umi)
            print(info_dict)
        assert n_emitted == 2
def test_subsampling(tmp_path):
    """Subsampling 10 reads with fraction=0.5 must leave 5 reads."""
    # three records (CB, UMI, EC, COUNT, FLAG) whose counts add up to 10
    rows = ((0, 0, 1, 3, 1), (1, 0, 2, 3, 12), (2, 0, 3, 4, 13))
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                        cb_length=1,
                        umi_length=2)

    fname_out = tmp_path / 'sub.bus'
    subsampling.subsample_busfile(fname, fname_out, fraction=0.5)

    # only the number of reads is checked, the number of molecules is not
    nreads, _nmol = subsampling.get_number_of_reads_and_molecules(fname_out)
    assert nreads == 5
Exemple #12
0
def test_return_busrecord(tmp_path):
    """
    both iterators must hand out Bus_record instances, not plain tuples
    """
    rows = (
        ('ATAT', 'AAA', 10, 20, 1),
        ('ATAT', 'GGG', 10, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
        ('TTAT', 'AAA', 13, 206, 12),
    )
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                        cb_length=4,
                        umi_length=3)

    # first grouped per cell, then grouped per (CB, UMI)
    iterators = (
        pybustools.iterate_cells_of_busfile,
        pybustools.iterate_CB_UMI_of_busfile,
    )
    for iterate in iterators:
        for _key, record_list in iterate(fname):
            for r in record_list:
                assert isinstance(r, busio.Bus_record)
Exemple #13
0
def h5_to_bus(h5filename, busfile_output, TMPDIR=None):
    """
    turns a 10x molecule_info.h5 into a "fake" bus file:
    Instead of EC, we just report the actual gene cellranger mapped the read.

    The records are first written into a temporary, unsorted busfile, which
    is then sorted with `bustools sort`. The temporary file is removed in any
    case, also if writing or sorting fails.

    NOTE(review): the CB/UMI lengths are hard-coded to 16/12 -- confirm that
    this holds for every h5 file passed in.

    :param h5filename: path to the h5file
    :param busfile_output: path to the busfile to be created
    :param TMPDIR: optional, path to store temporary files
    :raises ValueError: if `bustools sort` exits with a non-zero return code
    """
    # everything needed is sliced out of the file up front, so the h5 file
    # gets closed (also on error) before the busfile is written
    with h5py.File(h5filename, mode='r') as fh:
        CB_list = [barcode.decode() for barcode in fh['/barcodes'][:]]
        n_entries = fh['/barcode_idx'].shape[0]

        cbs_idx = fh['/barcode_idx'][:]
        gene_idx = fh['/feature_idx'][:]
        counts = fh['/count'][:]
        umis = fh['/umi'][:]

    def _gen():
        for i in tqdm.trange(n_entries):
            cb = _encode_ACGT_to_int(CB_list[cbs_idx[i]])
            yield Bus_record(cb, int(umis[i]), gene_idx[i], counts[i], 0)

    # mkstemp hands back an already opened file descriptor; only the name is
    # needed, so close the descriptor right away instead of leaking it
    fd, unsorted_name = tempfile.mkstemp('.bus', 'unsorted_', TMPDIR)
    os.close(fd)

    try:
        write_busfile(unsorted_name, _gen(), cb_length=16, umi_length=12)

        print('sorting')
        ret = subprocess.run(
            ["bustools", "sort", '-o', busfile_output, unsorted_name])
        if ret.returncode != 0:
            print("Child was terminated by signal", ret, file=sys.stderr)
            raise ValueError(
                f'bustools sort failed with return code {ret.returncode}')
    finally:
        # get rid of the temporary file even if an exception happened above
        os.unlink(unsorted_name)
Exemple #14
0
def subsample_busfile(fname_in, fname_out, fraction):
    """
    subsample the reads of an existing busfile by `fraction`, writing the
    result into a new busfile. The major effect is that some entries will
    receive 0 reads and hence disappear from the file!

    The seed handed to `_downsample_array` is derived from the current time,
    so repeated calls are not seeded identically.

    :param fname_in: Filename of the input busfile
    :param fname_out: Filename of the resulting, subsampled busfile
    :param fraction: 0<fraction<1, the fraction of subsampling
    :raises AssertionError: if `fraction` is not strictly between 0 and 1
    """
    # deliberately kept as an assert, so callers expecting an AssertionError
    # keep working; the message now matches the (exclusive) check
    assert 0 < fraction < 1, "fraction must be in the open interval (0,1)"

    # for this to work, we have to pass the input file twice:
    # 1. we have to collect ALL the counts (one per record),
    #    which are then jointly subsampled
    # 2. we iterate the input file again, this time writing each record
    #    with its adjusted count into the new busfile
    first_pass = read_binary_bus(fname_in, decode_seq=False)
    counts = np.array(
        [record.COUNT for record in tqdm.tqdm(first_pass, desc='First pass')])

    n_total = np.sum(counts)
    n_target = int(n_total * fraction)
    print(f'Subsampling from {n_total} to {n_target}')
    subsampled = _downsample_array(counts,
                                   target=n_target,
                                   random_state=int(time.time() * 1000),
                                   replace=False,
                                   inplace=False)
    print(f'Downsampled reads: {subsampled.sum()}')

    # create a generator for the bus-records
    def _helper_gen():
        second_pass = read_binary_bus(fname_in, decode_seq=False)
        for i, record in tqdm.tqdm(enumerate(second_pass),
                                   desc='Second pass'):
            # records that ended up without any read are dropped
            if subsampled[i] > 0:
                yield Bus_record(record.CB, record.UMI, record.EC,
                                 subsampled[i], record.FLAG)

    # we need to write the correct header
    _, cb_len, umi_len, _ = get_header_info(fname_in)
    write_busfile(fname_out,
                  _helper_gen(),
                  cb_length=cb_len,
                  umi_length=umi_len)
def test_ParallelCellGenerator():
    """
    assert that it returns the same iterator as the serial version
    """
    import tempfile

    samples = ['sample1', 'sample2', 'sample3']

    # a TemporaryDirectory replaces the deprecated, race-prone
    # tempfile.mktemp() and removes the busfiles once the test is done
    with tempfile.TemporaryDirectory() as tmpdir:
        fnames = []
        for sample in samples:
            fname = f'{tmpdir}/{sample}.bus'
            busio.write_busfile(fname,
                                random_buslist(1000, 4, 4, 20),
                                cb_length=4,
                                umi_length=4)
            fnames.append(fname)

        pgen = ParallelCellGenerator(dict(zip(samples, fnames)),
                                     decode_seq=True,
                                     queue_size=10)
        pgen.start_queues()
        parallel_results = dict(pgen.iterate())

        serial_results = dict(iterate_bus_cells_multiple(samples, fnames))

        assert parallel_results == serial_results

        # check that they return Bus_records, not just tuples
        for info in parallel_results.values():
            for record_list in info.values():
                for r in record_list:
                    assert isinstance(r, busio.Bus_record)
Exemple #16
0
def test_fingerprint(tmp_path, ec_matrix_file, transcript_file):
    """
    The two samples below should create 3 fingerprints:
    [10, 0]  x 2
    [99, 0]  x 1
    [0, 20]  x 1
    """
    cb_length = 4
    umi_length = 3

    # (sample name, busfile name, rows of (CB, UMI, EC, COUNT, FLAG))
    samples = (
        ('s1', 'some1.bus', [
            ('ATAT', 'GGG', 1, 10, 1),
            ('ATAT', 'TTT', 1, 10, 1),
            ('CTAT', 'GGG', 1, 99, 1),
        ]),
        ('s2', 'some2.bus', [
            ('ATAT', 'GGG', 9, 20, 1),
        ]),
    )

    busobject_dict = {}
    for sample, filename, rows in samples:
        fname = tmp_path / filename
        busio.write_busfile(fname, [busio.Bus_record(*row) for row in rows],
                            cb_length, umi_length)
        busobject_dict[sample] = Bus(folder='/',
                                     bus_name=fname,
                                     ec_name=ec_matrix_file,
                                     transcript_name=transcript_file)

    df, _cond = phantom_create_dataframes(busobject_dict)

    print(df)
    # three distinct fingerprints, seen four times in total
    assert len(df) == 3
    assert df['freq'].sum() == 4
Exemple #17
0
def test_iterate_cells(tmp_path):
    """Records of a busfile must come back grouped by their CB, in file order."""
    records = [
        busio.Bus_record(*row) for row in (
            ('ATAT', 'AAA', 10, 20, 1),
            ('ATAT', 'GGG', 10, 20, 1),
            ('TAGA', 'TAT', 14, 250, 13),
            ('TTAT', 'AAA', 13, 206, 12),
        )
    ]
    fname = tmp_path / 'some.bus'
    busio.write_busfile(fname, records, cb_length=4, umi_length=3)

    cells = pybustools.iterate_cells_of_busfile(fname)

    # the first cell has two UMIs, the other two have a single one
    expected_cells = (
        ('ATAT', records[:2]),
        ('TAGA', [records[2]]),
        ('TTAT', [records[3]]),
    )
    for expected_cb, expected_records in expected_cells:
        cb, cell_records = next(cells)
        assert cb == expected_cb and len(cell_records) == len(
            expected_records)
        assert cell_records == expected_records
Exemple #18
0
def random_buslist(n_records, cb_length, umi_length, ngenes):
    """
    Create `n_records` records via `random_busrecord` and return them as a
    sorted list.
    """
    buslist = [
        random_busrecord(cb_length, umi_length, ngenes)
        for _ in range(n_records)
    ]
    buslist.sort()
    return buslist


if __name__ == '__main__':

    fname1 = '/tmp/some1.bus'
    fname2 = '/tmp/some2.bus'

    records1 = random_buslist(500, cb_length=4, umi_length=5, ngenes=10)
    records2 = random_buslist(500, cb_length=4, umi_length=5, ngenes=10)
    busio.write_busfile(fname1, records1, cb_length=4, umi_length=5)
    busio.write_busfile(fname2, records2, cb_length=4, umi_length=5)

    PCG = ParallelCellGenerator({
        'sample1': fname1,
        'sample2': fname2
    },
                                decode_seq=True,
                                queue_size=3)
    PCG.start_queues()

    results = {cb: info for cb, info in PCG.iterate()}

    from pybustools.pybustools import iterate_bus_cells_multiple
    results_serial = {
        cb: info
Exemple #19
0
def pug_writer(bus, queue, outfile):
    """
    Write everything that `_gen(bus, queue)` yields into the busfile
    `outfile` (written with cb_length=16 and umi_length=12).
    """
    write_busfile(outfile, _gen(bus, queue), cb_length=16, umi_length=12)