Ejemplo n.º 1
0
def test_bam2fasta_valid_args():
    """Check that the parser maps every CLI flag to its expected value.

    Parses a complete argument list and verifies that each attribute of the
    parsed namespace is a known key and that its value equals the
    expectation recorded for that same key.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)
        parser = bam2fasta_args.create_parser()
        args = [
            '--filename', testdata1,
            '--min-umi-per-barcode', '10',
            '--write-barcode-meta-csv', csv_path,
            '--barcodes-file', barcodes_path,
            '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir,
        ]
        expected_args_vals = {
            "filename": testdata1,
            "min_umi_per_barcode": 10,
            "write_barcode_meta_csv": csv_path,
            "barcodes_file": barcodes_path,
            "rename_10x_barcodes": renamer_path,
            "save_fastas": fastas_dir,
            "processes": bam2fasta_args.DEFAULT_PROCESSES,
            "delimiter": bam2fasta_args.DEFAULT_DELIMITER,
            "line_count": bam2fasta_args.DEFAULT_LINE_COUNT}
        parsed = vars(parser.parse_args(args))
        for key, val in parsed.items():
            assert key in expected_args_vals, \
                "unexpected argument {}".format(key)
            # Compare with the expectation for this key; checking membership
            # in the pool of all expected values would let a value that
            # landed on the wrong argument pass unnoticed.
            assert val == expected_args_vals[key], \
                "{}: expected {!r}, got {!r}".format(
                    key, expected_args_vals[key], val)
Ejemplo n.º 2
0
def test_pass_alignment_qc():
    """Count alignments of the example BAM that pass QC with a barcode list.

    Expects exactly 439 alignments to satisfy ``tenx.pass_alignment_qc``
    when checked against the barcodes read from ``barcodes.tsv``.
    """
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(barcodes_path)
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    bam = tenx.read_bam_file(bam_path)

    total_pass = 0
    for alignment in bam:
        if tenx.pass_alignment_qc(alignment, barcodes):
            total_pass += 1
    assert total_pass == 439
Ejemplo n.º 3
0
def test_pass_alignment_qc_filtered():
    """Count total and QC-passing alignments of the filtered example BAM.

    The file is opened twice because the first pass (counting all
    alignments) consumes the iterator. QC is run with ``None`` as the
    barcode argument.
    """
    def open_filtered_bam():
        # Open a fresh reader over the filtered BAM test file.
        return tenx.read_bam_file(
            utils.get_test_data(
                '10x-example/possorted_genome_bam_filtered.bam'))

    total_alignments = 0
    for _ in open_filtered_bam():
        total_alignments += 1
    bam = open_filtered_bam()
    assert total_alignments == 1500

    total_pass = 0
    for alignment in bam:
        if tenx.pass_alignment_qc(alignment, None):
            total_pass += 1
    assert total_pass == 192
Ejemplo n.º 4
0
def test_get_cell_barcode_umis():
    """Check every key returned by ``get_cell_barcode_umis`` is a known barcode.

    The keys of the mapping built from the example fastq must all appear in
    the barcodes read from ``barcodes.tsv``.
    """
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)
    umis_by_barcode = tenx.get_cell_barcode_umis(
        fastq_path,
        bam2fasta_args.CELL_BARCODE_PATTERN,
        bam2fasta_args.MOLECULAR_BARCODE_PATTERN)
    # Snapshot the keys before iterating, as the original test did.
    observed_barcodes = list(umis_by_barcode.keys())
    for cell_barcode in observed_barcodes:
        assert cell_barcode in known_barcodes
Ejemplo n.º 5
0
def test_bam_to_temp_fasta():
    """Check ``bam_to_temp_fasta`` yields 8 items for the example BAM.

    Runs with the barcode list from ``barcodes.tsv``, no renamer and ``"X"``
    as delimiter.
    """
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)

    conversion_kwargs = {
        "barcodes": known_barcodes,
        "barcode_renamer": None,
        "delimiter": "X",
        "bam_file": bam_path,
    }
    produced = list(tenx.bam_to_temp_fasta(**conversion_kwargs))
    assert len(produced) == 8
Ejemplo n.º 6
0
def test_get_cell_barcode():
    """Check each cell barcode extracted from the fastq is a known barcode.

    Records for which ``tenx.get_cell_barcode`` returns ``None`` are
    skipped.
    """
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)
    with screed.open(fastq_path) as records:
        for record in records:
            cell_barcode = tenx.get_cell_barcode(
                record, bam2fasta_args.CELL_BARCODE_PATTERN)
            if cell_barcode is None:
                continue
            assert cell_barcode in known_barcodes
Ejemplo n.º 7
0
def test_parse_barcode_renamer():
    """Check ``parse_barcode_renamer`` with and without a renamer file.

    Without a renamer file each barcode must map to itself. With the example
    renamer file each new name must contain the barcode and the text
    ``"epithelial_cell"``. In both cases there is one entry per barcode.
    """
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(barcodes_path)

    identity_map = tenx.parse_barcode_renamer(barcodes, None)
    for original, renamed in identity_map.items():
        assert original == renamed
    assert len(identity_map) == len(barcodes)

    renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
    renamed_map = tenx.parse_barcode_renamer(barcodes, renamer_path)
    for original, renamed in renamed_map.items():
        assert original in renamed
        assert "epithelial_cell" in renamed
    assert len(renamed_map) == len(barcodes)
Ejemplo n.º 8
0
def test_write_to_barcode_meta_csv():
    """Check UMI and read counts written by ``write_to_barcode_meta_csv``.

    Converts the example BAM to temporary fastas, groups them per barcode,
    writes per-barcode output into the temp directory, then compares each
    row of the resulting CSV with the expected counts at the same index.
    """
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    expected_counts = list(zip(
        [6, 2, 6, 4, 15, 5, 2, 2],
        [312, 36, 251, 194, 594, 153, 2, 68]))
    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=None,
            barcode_renamer=None,
            delimiter="X",
            bam_file=bam_path,
            temp_folder=location)
        joined_fastas = ",".join(temp_fastas)
        fastas_per_barcode = tenx.get_fastas_per_unique_barcodes(
            joined_fastas)
        tenx.barcode_umi_seq_to_fasta(
            location, "X", True, 0, location, fastas_per_barcode)

        meta_csv_path = os.path.join(location, "meta.csv")
        tenx.write_to_barcode_meta_csv(location, meta_csv_path)

        meta_frame = pd.read_csv(meta_csv_path)
        for index, row in meta_frame.iterrows():
            expected_umis, expected_reads = expected_counts[index]
            assert expected_umis == row[tenx.UMI_COUNT]
            assert expected_reads == row[tenx.READ_COUNT]
Ejemplo n.º 9
0
def test_barcode_umi_seq_to_fasta():
    """Check the files left behind by ``barcode_umi_seq_to_fasta``.

    With ``10`` passed as the fourth argument, the temp directory must end
    up with one ``*_bam2fasta.fasta`` file and eight ``*_meta.txt`` files.
    """
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=None,
            barcode_renamer=None,
            delimiter="X",
            bam_file=bam_path,
            temp_folder=location)
        joined_fastas = ",".join(temp_fastas)
        fastas_per_barcode = tenx.get_fastas_per_unique_barcodes(
            joined_fastas)
        tenx.barcode_umi_seq_to_fasta(
            location, "X", True, 10, location, fastas_per_barcode)

        fasta_pattern = os.path.join(location, "*_bam2fasta.fasta")
        assert len(glob.glob(fasta_pattern)) == 1

        meta_pattern = os.path.join(location, "*_meta.txt")
        assert len(glob.glob(meta_pattern)) == 8
Ejemplo n.º 10
0
def test_shard_bam_file():
    """Check ``shard_bam_file`` splits the example BAM without losing reads.

    A shard size equal to the alignment count must give one shard; half of
    it must give two shards whose alignments, read in shard order, equal
    those of the unsharded file.
    """
    filename = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    whole_reader = tenx.read_bam_file(filename)
    assert isinstance(whole_reader, bs.AlignmentFile)

    expected_alignments = 0
    for _ in whole_reader:
        expected_alignments += 1

    with utils.TempDirectory() as location:
        single_shard = tenx.shard_bam_file(
            filename, expected_alignments, location)
        assert len(single_shard) == 1

    num_shards = 2
    with utils.TempDirectory() as location:
        shard_size = expected_alignments // num_shards
        shard_paths = tenx.shard_bam_file(filename, shard_size, location)
        assert len(shard_paths) == 2

        total_alignments = 0
        for shard_path in shard_paths:
            for _ in tenx.read_bam_file(shard_path):
                total_alignments += 1
        assert total_alignments == expected_alignments

        # Walk the unsharded file in lockstep with the shards.
        reference = tenx.read_bam_file(filename)
        for shard_path in shard_paths:
            for line in tenx.read_bam_file(shard_path):
                assert line == next(reference)
Ejemplo n.º 11
0
def test_get_fastas_per_unique_barcodes():
    """Check ``get_fastas_per_unique_barcodes`` returns 8 entries.

    The input is the comma-joined output of ``bam_to_temp_fasta`` run on the
    example BAM with the barcode list and the renamer file.
    """
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    known_barcodes = tenx.read_barcodes_file(barcodes_path)

    with utils.TempDirectory() as location:
        temp_fastas = tenx.bam_to_temp_fasta(
            barcodes=known_barcodes,
            barcode_renamer=renamer_path,
            delimiter="X",
            bam_file=bam_path,
            temp_folder=location)
        joined_fastas = ",".join(temp_fastas)
        grouped = tenx.get_fastas_per_unique_barcodes(joined_fastas)
        assert len(grouped) == 8
Ejemplo n.º 12
0
def test_filtered_bam_to_umi_fasta():
    """Check ``bam_to_temp_fasta`` yields 32 items for the filtered BAM.

    No barcode list or renamer is supplied; the delimiter is ``'X'``.
    """
    filtered_bam = utils.get_test_data(
        '10x-example/possorted_genome_bam_filtered.bam')
    conversion_kwargs = {
        "barcodes": None,
        "barcode_renamer": None,
        "delimiter": 'X',
        "bam_file": filtered_bam,
    }
    produced = list(tenx.bam_to_temp_fasta(**conversion_kwargs))
    assert len(produced) == 32
Ejemplo n.º 13
0
def test_get_molecular_barcode(umis):
    """Check every molecular barcode extracted from the fastq is in ``umis``.

    ``umis`` is supplied by a fixture whose definition is not visible here;
    it is presumed to hold the expected UMIs -- TODO confirm against the
    fixture. Records for which no barcode is found (``None``) are skipped.
    """
    fastq_file = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    with screed.open(fastq_file) as f:
        for record in f:
            result = tenx.get_molecular_barcode(
                record, bam2fasta_args.MOLECULAR_BARCODE_PATTERN)
            # ``result`` is deliberately not appended to ``umis`` first:
            # doing so would make the membership check below always true
            # and would mutate the fixture.
            if result is not None:
                assert result in umis
Ejemplo n.º 14
0
def test_run_bam2fasta_supply_all_args():
    """Run ``bam2fasta percell`` through the shell with every flag supplied.

    Verifies the exit status, the number of lines in the barcode meta CSV,
    that exactly one renamed fasta is saved, and that its records carry the
    renamed barcode prefix with no ``>`` or ``X`` left in the sequences.

    The intermediate-files directory lives next to the test data rather
    than inside the temporary directory, so it is removed in a ``finally``
    clause to avoid leaving it behind when an assertion fails.
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        temp_fastas_dir = os.path.join(os.path.dirname(testdata1),
                                       "temp_fastas/")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)
        if not os.path.exists(temp_fastas_dir):
            os.makedirs(temp_fastas_dir)

        try:
            status, out, err = utils.run_shell_cmd(
                'bam2fasta percell --filename ' + testdata1 +
                ' --min-umi-per-barcode 10' + ' --write-barcode-meta-csv ' +
                csv_path + ' --save-intermediate-files ' + temp_fastas_dir +
                ' --barcodes-file ' + barcodes_path +
                ' --rename-10x-barcodes ' + renamer_path +
                ' --save-fastas ' + fastas_dir + " --processes 1",
                in_directory=location)

            assert status == 0
            with open(csv_path, 'rb') as f:
                data = [line.split() for line in f]
            assert len(data) == 9
            fasta_files = os.listdir(fastas_dir)
            barcodes = [
                filename.replace(".fasta", "") for filename in fasta_files]
            assert len(barcodes) == 1
            assert len(fasta_files) == 1
            assert barcodes[0] == \
                ('lung_epithelial_cell_AAATGCCCAAACTGCT-1_bam2fasta')
            fasta_file_name = os.path.join(fastas_dir, fasta_files[0])
            # Use the reader as a context manager so the file is closed
            # even when an assertion fails part-way through.
            with screed.open(fasta_file_name) as records:
                for record in records:
                    name = record.name
                    sequence = record.sequence
                    assert name.startswith(
                        'lung_epithelial_cell_AAATGCCCAAACTGCT-1')
                    assert sequence.count(">") == 0
                    assert sequence.count("X") == 0
        finally:
            shutil.rmtree(temp_fastas_dir)
Ejemplo n.º 15
0
def test_run_bam2fasta_percell_no_shard():
    """Check ``cli.percell`` returns 8 files for the example fastq.gz.

    Only ``--filename`` and ``--save-fastas`` are passed.
    """
    with utils.TempDirectory() as location:
        fastq_path = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')

        cli_args = ['--filename', fastq_path, '--save-fastas', location]
        fasta_files = cli.percell(cli_args)
        print(fasta_files)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
Ejemplo n.º 16
0
def test_record_to_fastq_string():
    """Check the fastq string built for the first record of the example file.

    Only the first record is examined; its string form must contain the
    expected header and sequence lines.
    """
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    expected_pieces = [
        "@A00111:50:H2H5YDMXX:2:1334:1886:36072\tUB:Z:AGAAAATACG\n",
        "GATTACTTAGTAGCTGTTTACTTAGCAGCACATTTGCAACAGCATCAAAAGCTATGT",
        "TACTATAAAATCAGTGCGTGAAGTCTGATTTAC\n",
    ]
    expected = "".join(expected_pieces)
    with screed.open(fastq_path) as records:
        for record in records:
            fastq_string = tenx.record_to_fastq_string(record)
            assert expected in fastq_string
            # Stop after the first record.
            break
Ejemplo n.º 17
0
def test_make_per_cell_fastq_gzs():
    """Check the per-cell fastq.gz files are named after known barcodes.

    After ``make_per_cell_fastqs`` runs, every ``*.fastq.gz`` in the output
    directory, stripped of its extension and of the
    ``possorted_aligned__`` prefix, must be one of the barcodes.
    """
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
    known_barcodes = tenx.read_barcodes_file(barcodes_file)

    with utils.TempDirectory() as location:
        outdir = os.path.join(location, "outdir")
        os.makedirs(outdir)
        tenx.make_per_cell_fastqs(
            fastq_path, outdir, "possorted_aligned_", "fastq.gz",
            bam2fasta_args.CELL_BARCODE_PATTERN, barcodes_file)

        for produced in glob.glob(os.path.join(outdir, "*.fastq.gz")):
            base = os.path.basename(produced)
            without_ext = base.replace(".fastq.gz", "")
            barcode = without_ext.replace("possorted_aligned__", "")
            assert barcode in known_barcodes
Ejemplo n.º 18
0
def test_run_make_fastqs_percell():
    """Run ``bam2fasta make_fastqs_percell`` through the shell.

    The command must exit with status 0 and leave exactly one ``*.fastq``
    file in the ``--save-fastas`` directory.
    """
    with utils.TempDirectory() as location:
        fastq_path = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        good_barcodes_path = utils.get_test_data(
            '10x-example/good_barcodes.csv')
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')

        fastas_dir = os.path.join(location, "fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)

        command = "".join([
            'bam2fasta make_fastqs_percell --filename ', fastq_path,
            ' --barcodes-file ', barcodes_path,
            " --barcodes-significant-umis-file ", good_barcodes_path,
            ' --save-fastas ', fastas_dir,
        ])
        status, out, err = utils.run_shell_cmd(
            command, in_directory=location)
        assert status == 0

        fastq_pattern = fastas_dir + "/*.fastq"
        fastqs = glob.glob(fastq_pattern)
        assert len(fastqs) == 1, "fastas_dir is {}".format(fastas_dir)
Ejemplo n.º 19
0
def test_bam2fasta_valid_args():
    """Check that the parser maps every CLI flag to its expected value.

    Parses an argument list that sets the file, UMI threshold, CSV,
    barcode, renamer and output options, then verifies that each attribute
    of the parsed namespace is a known key and that its value equals the
    expectation recorded for that same key (explicit values and defaults).
    """
    with utils.TempDirectory() as location:
        testdata1 = utils.get_test_data('10x-example/possorted_genome_bam.bam')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')
        fastas_dir = os.path.join(location, "fastas")
        save_intermediate_files_dir = os.path.join(location, "temp_fastas")
        if not os.path.exists(fastas_dir):
            os.makedirs(fastas_dir)
        parser = b2fa_args.create_parser()
        args = [
            '--filename', testdata1, '--min-umi-per-barcode', '10',
            '--write-barcode-meta-csv', csv_path, '--barcodes-file',
            barcodes_path, '--rename-10x-barcodes', renamer_path,
            '--save-fastas', fastas_dir, '--save-intermediate-files',
            save_intermediate_files_dir
        ]
        expected_args_vals = {
            "filename": testdata1,
            "min_umi_per_barcode": 10,
            "write_barcode_meta_csv": csv_path,
            "barcodes_file": barcodes_path,
            "rename_10x_barcodes": renamer_path,
            "save_fastas": fastas_dir,
            "barcodes_significant_umis_file": None,
            "channel_id": "",
            "output_format": "fastq",
            "processes": b2fa_args.DEFAULT_PROCESSES,
            "delimiter": b2fa_args.DEFAULT_DELIMITER,
            "shard_size": b2fa_args.DEFAULT_LINE_COUNT,
            "cell_barcode_pattern": b2fa_args.CELL_BARCODE_PATTERN,
            "molecular_barcode_pattern": b2fa_args.MOLECULAR_BARCODE_PATTERN,
            "save_intermediate_files": save_intermediate_files_dir
        }
        parsed = vars(parser.parse_args(args))
        for key, val in parsed.items():
            assert key in expected_args_vals, \
                "unexpected argument {}".format(key)
            # Compare with the expectation for this key; checking membership
            # in the pool of all expected values would let a value that
            # landed on the wrong argument pass unnoticed.
            assert val == expected_args_vals[key], \
                "{}: expected {!r}, got {!r}".format(
                    key, expected_args_vals[key], val)
Ejemplo n.º 20
0
def test_run_bam2fasta_fq_percell_no_shard_nonzero_umi():
    """Check ``cli.percell`` output with ``--min-umi-per-barcode 10``.

    Exactly one file must be returned, and every line of the ground-truth
    sequences file must appear among the sequences of that file.
    """
    with utils.TempDirectory() as location:
        fastq_path = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')

        cli_args = [
            '--filename', fastq_path,
            '--save-fastas', location,
            '--min-umi-per-barcode', '10',
        ]
        fasta_files = cli.percell(cli_args)
        print(fasta_files)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fastq", ""))
        assert len(barcodes) == 1

        with screed.open(fasta_files[0]) as records:
            sequences_fastq = [record.sequence for record in records]

        gt_data = utils.get_test_data(
            '10x-example/groundtruth_fasta_sequences.txt')
        with open(gt_data, "r") as ground_truth:
            for index, line in enumerate(ground_truth):
                assert line.strip() in sequences_fastq, \
                    "failed at index {}".format(index)
Ejemplo n.º 21
0
def test_run_bam2fasta_convert():
    """Check ``cli.convert`` returns 8 files for the example BAM.

    Only ``--filename`` and ``--save-fastas`` are passed.
    """
    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')

        cli_args = ['--filename', bam_path, '--save-fastas', location]
        fasta_files = cli.convert(cli_args)

        barcodes = []
        for filename in fasta_files:
            barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
Ejemplo n.º 22
0
def test_run_bam2fasta_default_args():
    """Run ``bam2fasta percell`` through the shell with only ``--filename``.

    The command is run inside the temp directory; it must exit with status
    0 and leave eight files ending in ``_bam2fasta.fasta`` there.
    """
    with utils.TempDirectory() as location:
        bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')

        command = 'bam2fasta percell --filename ' + bam_path
        status, out, err = utils.run_shell_cmd(
            command, in_directory=location)
        assert status == 0

        barcodes = []
        for filename in os.listdir(location):
            if not filename.endswith("_bam2fasta.fasta"):
                continue
            barcodes.append(filename.replace(".fasta", ""))
        assert len(barcodes) == 8
Ejemplo n.º 23
0
def test_write_fastq():
    """Round-trip the example fastq records through ``tenx.write_fastq``.

    Records read from the example file are written to a new fastq, read
    back, and compared with the originals.
    """
    source_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    with utils.TempDirectory() as location:
        with screed.open(source_path) as source:
            records = list(source)

        write_path = os.path.join(location, "result.fastq")
        tenx.write_fastq(records, write_path)

        with screed.open(write_path) as written:
            records_written = list(written)
            assert records_written == records
Ejemplo n.º 24
0
def test_run_count_umis_percell():
    """Run ``bam2fasta count_umis_percell`` through the shell.

    The command must exit with status 0, write 8 lines to the barcode meta
    CSV and 1 line to the significant-UMI barcodes file.
    """
    def count_lines(file_path):
        # Count lines by reading the file in binary mode.
        with open(file_path, 'rb') as handle:
            return len([line.split() for line in handle])

    with utils.TempDirectory() as location:
        fastq_path = utils.get_test_data(
            '10x-example/possorted_genome_bam.fastq.gz')
        csv_path = os.path.join(location, "all_barcodes_meta.csv")
        good_barcodes_path = os.path.join(location, "good_barcodes.csv")
        barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
        renamer_path = utils.get_test_data('10x-example/barcodes_renamer.tsv')

        command = "".join([
            'bam2fasta count_umis_percell --filename ', fastq_path,
            ' --min-umi-per-barcode 10',
            ' --write-barcode-meta-csv ', csv_path,
            ' --barcodes-file ', barcodes_path,
            ' --rename-10x-barcodes ', renamer_path,
            " --processes 1 ",
            "--barcodes-significant-umis-file ", good_barcodes_path,
        ])
        status, out, err = utils.run_shell_cmd(
            command, in_directory=location)
        assert status == 0
        assert count_lines(csv_path) == 8
        assert count_lines(good_barcodes_path) == 1
Ejemplo n.º 25
0
def test_count_umis_per_cell(expected_good_barcodes):
    """Check the two CSV files written by ``tenx.count_umis_per_cell``.

    The meta CSV must list the expected barcodes in its first column and
    the expected counts in its second. The first column of the second CSV
    must equal the ``expected_good_barcodes`` fixture.
    """
    fastq_path = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    all_barcodes = [
        'AAAGATGCAGATCTGT-1', 'AAATGCCCAAACTGCT-1', 'AACACGTAGTGTACCT-1',
        'AACCATGAGTTGTCGT-1', 'AAATGCCGTGAACCTT-1', 'AAATGCCAGATAGTCA-1',
        'AAACGGGAGGATATAC-1', 'AAACGGGTCTCGTATT-1']
    expected_meta = [6, 15, 2, 2, 5, 4, 6, 2]

    with utils.TempDirectory() as location:
        meta = os.path.join(location, "barcode_umi_meta.csv")
        good_barcodes = os.path.join(
            location, "barcodes_with_significant_umi_records.csv")
        tenx.count_umis_per_cell(
            fastq_path, meta,
            bam2fasta_args.CELL_BARCODE_PATTERN,
            bam2fasta_args.MOLECULAR_BARCODE_PATTERN,
            3, good_barcodes)

        meta_frame = pd.read_csv(meta, header=None)
        assert all_barcodes == meta_frame.iloc[:, 0].values.tolist()
        assert expected_meta == meta_frame.iloc[:, 1].values.tolist()

        good_frame = pd.read_csv(good_barcodes, header=None)
        assert expected_good_barcodes == \
            good_frame.iloc[:, 0].values.tolist()
Ejemplo n.º 26
0
def test_read_barcodes_file():
    """Check ``read_barcodes_file`` returns 10 barcodes for the example."""
    barcodes_path = utils.get_test_data('10x-example/barcodes.tsv')
    loaded = tenx.read_barcodes_file(barcodes_path)
    barcode_count = len(loaded)
    assert barcode_count == 10
Ejemplo n.º 27
0
def test_read_bam_file():
    """Check ``read_bam_file`` returns an ``AlignmentFile`` of 1714 reads."""
    bam_path = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    reader = tenx.read_bam_file(bam_path)
    assert isinstance(reader, bs.AlignmentFile)

    total_alignments = 0
    for _ in reader:
        total_alignments += 1
    assert total_alignments == 1714