Example #1
0
def test_read_write(tmp_path):
    """
    Round-trip integer-encoded records through write_busfile / read_binary_bus
    """
    bus_path = tmp_path / 'some.bus'
    rows = (
        (0, 0, 10, 20, 1),
        (1, 0, 13, 206, 12),
        (2, 0, 14, 250, 13),
    )
    records = [busio.Bus_record(*row) for row in rows]
    busio.write_busfile(bus_path, records, cb_length=1, umi_length=2)

    # writing must leave a file behind
    assert pathlib.Path(bus_path).exists()

    # reading must give back exactly what was written, both with a buffer
    # that is deliberately smaller than len(records) and with a larger one
    for size in (2, 20):
        roundtrip = list(
            busio.read_binary_bus(bus_path, decode_seq=False, buffersize=size))
        assert roundtrip == records

    # decode_seq switches the type of CB/UMI: int when off, str when on
    for decode, expected_type in ((False, int), (True, str)):
        first = next(
            busio.read_binary_bus(bus_path, decode_seq=decode, buffersize=2))
        assert isinstance(first.CB, expected_type) and isinstance(first.UMI, expected_type)
Example #2
0
def test_iterate_cells_UMI_raise_unsorted(tmp_path):
    """
    iterate_CB_UMI_of_busfile must raise a ValueError if the busfile is unsorted,
    be it in terms of the UMIs (within the same CB) or in terms of the CBs
    """
    bus_path = tmp_path / 'some.bus'

    def check_raises(unsorted_records):
        # (over)write the busfile, then exhaust the iterator inside the raises-block
        busio.write_busfile(bus_path, unsorted_records, cb_length=4, umi_length=3)
        with pytest.raises(ValueError):
            list(pybustools.iterate_CB_UMI_of_busfile(bus_path))

    # same CB, UMIs out of order
    check_raises([
        busio.Bus_record('ATAT', 'TAT', 14, 250, 13),
        busio.Bus_record('ATAT', 'AAA', 10, 20, 1),
    ])

    # also check that it raises when the CB is unsorted
    # important: the UMI should be the same
    check_raises([
        busio.Bus_record('TTAT', 'TAT', 14, 250, 13),
        busio.Bus_record('ATAT', 'TAT', 10, 20, 1),
    ])
Example #3
0
def test_iterate_cb_umi(tmp_path):
    """
    iterate_CB_UMI_of_busfile must group the records of a sorted busfile by (CB, UMI)
    """
    rows = (
        ('ATAT', 'AAA', 10, 20, 1),
        ('ATAT', 'GGG', 10, 20, 1),
        ('ATAT', 'GGG', 11, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
        ('TTAT', 'AAA', 13, 206, 12),
    )
    records = [busio.Bus_record(*row) for row in rows]
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(bus_path, records, cb_length=4, umi_length=3)

    groups = pybustools.iterate_CB_UMI_of_busfile(bus_path)

    # expected (CB, UMI) key and number of records per group, in file order
    expected = (
        (('ATAT', 'AAA'), 1),
        (('ATAT', 'GGG'), 2),
        (('TAGA', 'TAT'), 1),
        (('TTAT', 'AAA'), 1),
    )
    for expected_key, expected_size in expected:
        # next() on purpose: a generator that ends too early must fail the test
        key, grouped = next(groups)
        assert key == expected_key and len(grouped) == expected_size
Example #4
0
def test_write_check_cb_umi_length(tmp_path):
    """
    write_busfile must raise an AssertionError if a record does not match
    the given CB/UMI length
    """
    matching = busio.Bus_record('ATAT', 'GGG', 10, 20, 1)
    # CB has only 3 bases while cb_length=4 is requested below
    short_cb = busio.Bus_record('TTA', 'AAA', 13, 206, 12)
    bus_path = tmp_path / 'some.bus'

    with pytest.raises(AssertionError):
        busio.write_busfile(
            bus_path, [matching, short_cb], cb_length=4, umi_length=3)
Example #5
0
def test_get_header(tmp_path):
    """
    the CB/UMI lengths handed to write_busfile must show up in get_header_info
    """
    rows = (
        (0, 0, 10, 20, 1),
        (1, 0, 13, 206, 12),
        (2, 0, 14, 250, 13),
    )
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path,
        [busio.Bus_record(*row) for row in rows],
        cb_length=12,
        umi_length=5,
    )

    # the header is a 4-tuple; only the second and third entry are checked here
    _, cb_len, umi_len, _ = busio.get_header_info(bus_path)
    assert cb_len == 12
    assert umi_len == 5
Example #6
0
def test_iterate_cells_raise_unsorted(tmp_path):
    """
    iterate_cells_of_busfile must raise a ValueError if the busfile is
    unsorted (in terms of CBs)
    """
    # CBs deliberately out of order: TTAT comes before ATAT
    rows = (
        ('TTAT', 'AAA', 13, 206, 12),
        ('ATAT', 'AAA', 10, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
    )
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path, [busio.Bus_record(*row) for row in rows],
        cb_length=4, umi_length=3)

    with pytest.raises(ValueError):
        # exhaust the iterator inside the raises-block
        list(pybustools.iterate_cells_of_busfile(bus_path))
Example #7
0
def test_read_write_str(tmp_path):
    """
    round-trip records whose CB/UMI are given as strings instead of ints
    """
    rows = (
        ('ATAT', 'GGG', 10, 20, 1),
        ('TTAT', 'AAA', 13, 206, 12),
        ('TAGA', 'TAT', 14, 250, 13),
    )
    records = [busio.Bus_record(*row) for row in rows]
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(bus_path, records, cb_length=4, umi_length=3)

    # read back with decoded sequences and compare to the original
    reader = busio.read_binary_bus(bus_path, decode_seq=True, buffersize=2)
    roundtrip = list(reader)
    assert roundtrip == records
Example #8
0
def test_emit_records(tmp_path, ec_matrix_file, transcript_file):
    """
    two busfiles, they have the same CB/UMI, but it maps to different EC/genes
    emit_records_based_on_gene should yield multiple entries!!

    ec_matrix_file / transcript_file are handed on unchanged to Bus
    (presumably pytest fixtures defined elsewhere -- TODO confirm)
    """
    cb_length = 4
    umi_length = 3

    records1 = [  # #      CB     UMI    EC COUNT FLAG
        busio.Bus_record('ATAT', 'GGG', 1, 10, 1),
    ]
    fname1 = tmp_path / 'some1.bus'
    busio.write_busfile(fname1, records1, cb_length, umi_length)
    bus1 = Bus(folder='/',
               bus_name=fname1,
               ec_name=ec_matrix_file,
               transcript_name=transcript_file)

    records2 = [
        busio.Bus_record('ATAT', 'GGG', 9, 20, 1),
    ]
    fname2 = tmp_path / 'some2.bus'
    busio.write_busfile(fname2, records2, cb_length, umi_length)
    bus2 = Bus(folder='/',
               bus_name=fname2,
               ec_name=ec_matrix_file,
               transcript_name=transcript_file)

    busobject_dict = {'s1': bus1, 's2': bus2}
    bus_iter = iterate_bus_cells_umi_multiple(['s1', 's2'], [fname1, fname2],
                                              decode_seq=False)

    # number of CB/UMI groups seen; needed because the only other assertion
    # lives inside the loop and would never run on an empty iterator
    n_groups = 0
    for (cb_, umi_), info_dict_ in bus_iter:
        n_groups += 1

        # this splits according to same gene
        print('----------')
        print(info_dict_)
        print('----------')

        counter = 0
        for (cb, umi), info_dict in emit_records_based_on_gene(
                cb_, umi_, info_dict_, busobject_dict):
            counter += 1
            print(cb, umi)
            print(info_dict)
        assert counter == 2

    # guard against a vacuous pass: both files contain a record, so the
    # iterator has to yield at least one CB/UMI group
    assert n_groups > 0
Example #9
0
def test_subsampling(tmp_path):
    """
    subsample a busfile at fraction=0.5 and check the number of reads in the result
    """
    # creating a total of 3 UMIs, but 10 counts
    rows = (
        # CB UMI EC COUNT FLAG
        (0, 0, 1, 3, 1),
        (1, 0, 2, 3, 12),
        (2, 0, 3, 4, 13),
    )
    records = [busio.Bus_record(*row) for row in rows]
    source = tmp_path / 'some.bus'
    busio.write_busfile(source, records, cb_length=1, umi_length=2)

    target = tmp_path / 'sub.bus'
    subsampling.subsample_busfile(source, target, fraction=0.5)

    # only the read count of the subsampled file is checked,
    # the molecule count is ignored
    read_count, _molecule_count = \
        subsampling.get_number_of_reads_and_molecules(target)
    assert read_count == 5
Example #10
0
def random_busrecord(cb_length, umi_length, ngenes):
    """
    create a Bus_record with random content

    :param cb_length: number of bases in the random CB
    :param umi_length: number of bases in the random UMI
    :param ngenes: the third record field is drawn from range(ngenes)
    :return: busio.Bus_record(CB, UMI, <0..ngenes-1>, <0..50>, 1)
    """
    bases = 'ACGT'  # index 0..3 -> base

    def random_sequence(length):
        # one randrange call per base, drawn left to right
        return ''.join(bases[random.randrange(0, 4)] for _ in range(length))

    # the order of the draws (CB, UMI, gene, counts) is kept fixed so that a
    # seeded random generator produces reproducible records
    barcode = random_sequence(cb_length)
    molecule = random_sequence(umi_length)
    gene = random.randrange(0, ngenes)
    counts = random.randint(0, 50)  # randint includes both endpoints

    return busio.Bus_record(barcode, molecule, gene, counts, 1)
Example #11
0
def test_return_busrecord(tmp_path):
    """
    the iterators must hand out Bus_record namedtuples, not just plain tuples
    """
    rows = (
        ('ATAT', 'AAA', 10, 20, 1),
        ('ATAT', 'GGG', 10, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
        ('TTAT', 'AAA', 13, 206, 12),
    )
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(
        bus_path, [busio.Bus_record(*row) for row in rows],
        cb_length=4, umi_length=3)

    # first grouped by cell, then grouped by CB/UMI
    iterators = (
        pybustools.iterate_cells_of_busfile,
        pybustools.iterate_CB_UMI_of_busfile,
    )
    for iterate in iterators:
        for _key, grouped in iterate(bus_path):
            assert all(isinstance(rec, busio.Bus_record) for rec in grouped)
Example #12
0
def test_fingerprint(tmp_path, ec_matrix_file, transcript_file):
    """
    This should create 3 fingerprints:
    [10, 0]  x 2
    [99, 0]  x 1
    [0, 20]  x 1
    """
    cb_length = 4
    umi_length = 3

    def make_bus(filename, records):
        # write the records into tmp_path and wrap the resulting file in a Bus object
        path = tmp_path / filename
        busio.write_busfile(path, records, cb_length, umi_length)
        return Bus(folder='/',
                   bus_name=path,
                   ec_name=ec_matrix_file,
                   transcript_name=transcript_file)

    sample1 = make_bus('some1.bus', [
        # CB     UMI    EC COUNT FLAG
        busio.Bus_record('ATAT', 'GGG', 1, 10, 1),
        busio.Bus_record('ATAT', 'TTT', 1, 10, 1),
        busio.Bus_record('CTAT', 'GGG', 1, 99, 1),
    ])
    sample2 = make_bus('some2.bus', [
        busio.Bus_record('ATAT', 'GGG', 9, 20, 1),
    ])

    df, _cond = phantom_create_dataframes({'s1': sample1, 's2': sample2})

    print(df)
    # three distinct fingerprints, observed four times in total
    assert len(df) == 3
    assert df['freq'].sum() == 4
Example #13
0
def test_iterate_cells(tmp_path):
    """
    iterate_cells_of_busfile must group the records of a sorted busfile by CB
    """
    rows = (
        ('ATAT', 'AAA', 10, 20, 1),
        ('ATAT', 'GGG', 10, 20, 1),
        ('TAGA', 'TAT', 14, 250, 13),
        ('TTAT', 'AAA', 13, 206, 12),
    )
    records = [busio.Bus_record(*row) for row in rows]
    bus_path = tmp_path / 'some.bus'
    busio.write_busfile(bus_path, records, cb_length=4, umi_length=3)

    cells = pybustools.iterate_cells_of_busfile(bus_path)

    # expected CB and its records, in file order; the first cell has two UMIs
    expected = (
        ('ATAT', records[:2]),
        ('TAGA', [records[2]]),
        ('TTAT', [records[3]]),
    )
    for expected_cb, expected_records in expected:
        # next() on purpose: a generator that ends too early must fail the test
        cb, grouped = next(cells)
        assert cb == expected_cb and len(grouped) == len(expected_records)
        assert grouped == expected_records