Code Example #1
def test_pass_alignment_qc():
    barcodes = tenx.read_barcodes_file(
        utils.get_test_data('10x-example/barcodes.tsv'))
    bam = tenx.read_bam_file(
        utils.get_test_data('10x-example/possorted_genome_bam.bam'))

    total_pass = sum(1 for alignment in bam
                     if tenx.pass_alignment_qc(alignment, barcodes))
    assert total_pass == 439
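
The test snippets on this page are shown without their import preamble. A plausible preamble is sketched below; the exact module paths are assumptions inferred from the identifiers used (utils, tenx, bam2fasta_args), so check the bam2fasta repository for the authoritative imports.

# Assumed imports for the test snippets on this page; the module paths are
# guesses based on the names used (utils, tenx, bam2fasta_args) and may not
# match the repository exactly.
import glob
import itertools
import os

import screed

from bam2fasta import bam2fasta_args
from bam2fasta import tenx_utils as tenx
from bam2fasta.tests import bam2fasta_tst_utils as utils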
Code Example #2
def test_bam_to_temp_fasta():
    filename = utils.get_test_data('10x-example/barcodes.tsv')
    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    barcodes = tenx.read_barcodes_file(filename)

    fastas = tenx.bam_to_temp_fasta(barcodes=barcodes,
                                    barcode_renamer=None,
                                    delimiter="X",
                                    bam_file=bam_file)
    assert len(list(fastas)) == 8
Code Example #3
File: test_tenx_utils.py Project: czbiohub/bam2fasta
def test_get_cell_barcode_umis():
    path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(barcodes_file)
    barcode_cell_umi_dict = tenx.get_cell_barcode_umis(
        path,
        bam2fasta_args.CELL_BARCODE_PATTERN,
        bam2fasta_args.MOLECULAR_BARCODE_PATTERN)
    for barcode in list(barcode_cell_umi_dict.keys()):
        assert barcode in barcodes
Code Example #4
File: test_tenx_utils.py Project: czbiohub/bam2fasta
def test_get_cell_barcode():
    fastq_file = utils.get_test_data(
        '10x-example/possorted_genome_bam.fastq.gz')
    barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(barcodes_file)
    with screed.open(fastq_file) as f:
        for record_count, record in enumerate(f):
            result = tenx.get_cell_barcode(
                record, bam2fasta_args.CELL_BARCODE_PATTERN)
            if result is not None:
                assert result in barcodes
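
Examples #3 and #4 rely on the CELL_BARCODE_PATTERN and MOLECULAR_BARCODE_PATTERN regexes defined in bam2fasta_args, which are not shown on this page. The sketch below illustrates the idea with an assumed pattern and a hypothetical helper; neither is the library's actual definition.

import re

# Illustrative only: bam2fasta_args defines the real CELL_BARCODE_PATTERN.
# This assumed pattern pulls a cell barcode out of a CB:Z: tag carried in
# the read name of the bam-derived fastq.
ASSUMED_CELL_BARCODE_PATTERN = r"CB:Z:([ACGT]+)(-1)?"


def extract_cell_barcode(record_name):
    """Return the cell barcode embedded in a read name, or None."""
    match = re.search(ASSUMED_CELL_BARCODE_PATTERN, record_name)
    return match.group(1) if match else None


# A read name carrying a 10x cell-barcode tag
assert extract_cell_barcode("read1 CB:Z:ACGTACGTACGTACGT-1") == "ACGTACGTACGTACGT"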
Code Example #5
def test_parse_barcode_renamer():
    filename = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(filename)
    renamer = tenx.parse_barcode_renamer(barcodes, None)
    for key, value in renamer.items():
        assert key == value
    assert len(renamer) == len(barcodes)

    renamer = tenx.parse_barcode_renamer(
        barcodes, utils.get_test_data('10x-example/barcodes_renamer.tsv'))
    for key, value in renamer.items():
        assert key in value
        assert "epithelial_cell" in value
    assert len(renamer) == len(barcodes)
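
The assertions in example #5 imply that barcodes_renamer.tsv maps each barcode to a label that contains both the original barcode and the string "epithelial_cell". The mapping below is a hypothetical illustration of that shape, not the actual test fixture.

# Hypothetical renamer mapping of the shape the test asserts on; the real
# barcodes_renamer.tsv fixture is presumably a tab-separated file with one
# barcode-to-label pair per line.
renamer = {
    "AAATGCCCAAACTGCT-1": "lung_epithelial_cell|AAATGCCCAAACTGCT-1",
    "AAATGCCGTGAACCTT-1": "lung_epithelial_cell|AAATGCCGTGAACCTT-1",
}
assert all(key in value and "epithelial_cell" in value
           for key, value in renamer.items())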
Code Example #6
File: test_tenx_utils.py Project: czbiohub/bam2fasta
def test_get_fastas_per_unique_barcodes():
    filename = utils.get_test_data('10x-example/barcodes.tsv')
    renamer_filename = utils.get_test_data('10x-example/barcodes_renamer.tsv')
    bam_file = utils.get_test_data('10x-example/possorted_genome_bam.bam')
    barcodes = tenx.read_barcodes_file(filename)

    with utils.TempDirectory() as location:

        all_fastas = tenx.bam_to_temp_fasta(
            barcodes=barcodes,
            barcode_renamer=renamer_filename,
            delimiter="X",
            bam_file=bam_file,
            temp_folder=location)
        all_fastas = ",".join(itertools.chain(all_fastas))
        fastas_sorted = tenx.get_fastas_per_unique_barcodes(all_fastas)
        assert len(fastas_sorted) == 8
Code Example #7
File: test_tenx_utils.py Project: czbiohub/bam2fasta
def test_make_per_cell_fastq_gzs():
    path = utils.get_test_data('10x-example/possorted_genome_bam.fastq.gz')
    barcodes_file = utils.get_test_data('10x-example/barcodes.tsv')

    barcodes = tenx.read_barcodes_file(barcodes_file)
    with utils.TempDirectory() as location:
        outdir = os.path.join(location, "outdir")
        os.makedirs(outdir)
        tenx.make_per_cell_fastqs(
            path,
            outdir,
            "possorted_aligned_",
            "fastq.gz",
            bam2fasta_args.CELL_BARCODE_PATTERN,
            barcodes_file)
        fastas = glob.glob(os.path.join(outdir, "*.fastq.gz"))
        for fasta in fastas:
            fasta_name = os.path.basename(fasta).replace(
                ".fastq.gz", "").replace("possorted_aligned__", "")
            assert fasta_name in barcodes
Code Example #8
File: cli.py Project: luizirber/bam2fasta
def convert(args):
    """Cli tool to convert bam to fasta files"""
    parser = create_parser()
    args = parser.parse_args(args)

    logger.info(args)

    umi_filter = args.min_umi_per_barcode != 0
    all_fastas_sorted = []
    all_fastas = ""

    def collect_reduce_temp_fastas(index):
        """Convert fasta to sig record"""
        if umi_filter:
            return filtered_umi_to_fasta(index)
        else:
            return unfiltered_umi_to_fasta(index)

    def unfiltered_umi_to_fasta(index):
        """Returns signature records across fasta files for a unique barcode"""

        # Getting all fastas for a given barcode
        # from different shards
        single_barcode_fastas = all_fastas_sorted[index]
        count = 0
        # Iterating through fasta files for single barcode from different
        # fastas
        for fasta in iter_split(single_barcode_fastas, ","):

            # Initializing the fasta file to write
            # all the sequences from all bam shards to
            if count == 0:
                unique_fasta_file = os.path.basename(fasta)
                barcode_name = unique_fasta_file.replace(".fasta", "")
                f = open(os.path.join(args.save_fastas, unique_fasta_file),
                         "w")

            # Add sequence
            for record in screed.open(fasta):
                sequence = record.sequence
                umi = record.name

                split_seqs = sequence.split(args.delimiter)
                for index, seq in enumerate(split_seqs):
                    if seq == "":
                        continue
                    f.write(">{}\n{}\n".format(
                        barcode_name + "_" + umi + "_" +
                        '{:03d}'.format(index), seq))

            # Delete fasta file in tmp folder
            if os.path.exists(fasta):
                os.unlink(fasta)

            count += 1

        # close the fasta file
        f.close()

        return os.path.join(args.save_fastas, unique_fasta_file)

    def filtered_umi_to_fasta(index):
        """Returns signature records for all the fasta files for a unique
        barcode, only if it has more than min_umi_per_barcode number of umis"""

        # Getting all fastas for a given barcode
        # from different shards
        single_barcode_fastas = all_fastas_sorted[index]

        logger.debug("calculating umi counts")
        # Tracking UMI Counts
        umis = defaultdict(int)
        # Iterating through fasta files for single barcode from different
        # fastas
        for fasta in iter_split(single_barcode_fastas, ","):
            # calculate unique umi, sequence counts
            for record in screed.open(fasta):
                umis[record.name] += record.sequence.count(args.delimiter)

        if args.write_barcode_meta_csv:
            unique_fasta_file = os.path.basename(fasta)
            unique_meta_file = unique_fasta_file.replace(".fasta", "_meta.txt")
            with open(unique_meta_file, "w") as f:
                f.write("{} {}".format(len(umis), sum(list(umis.values()))))

        logger.debug("Completed tracking umi counts")
        if len(umis) < args.min_umi_per_barcode:
            return []
        count = 0
        for fasta in iter_split(single_barcode_fastas, ","):

            # Initializing fasta file to save the sequence to
            if count == 0:
                unique_fasta_file = os.path.basename(fasta)
                barcode_name = unique_fasta_file.replace(".fasta", "")
                f = open(os.path.join(args.save_fastas, unique_fasta_file),
                         "w")

            # Add sequences of barcodes with more than min-umi-per-barcode umis
            for record in screed.open(fasta):
                sequence = record.sequence
                umi = record.name

                # Appending sequence of a umi to the fasta
                split_seqs = sequence.split(args.delimiter)
                for index, seq in enumerate(split_seqs):
                    if seq == "":
                        continue
                    f.write(">{}\n{}\n".format(
                        barcode_name + "_" + umi + "_" +
                        '{:03d}'.format(index), seq))
            # Delete fasta file in tmp folder
            if os.path.exists(fasta):
                os.unlink(fasta)
            count += 1

        # close the opened fasta file
        f.close()
        return os.path.join(args.save_fastas, unique_fasta_file)

    def write_to_barcode_meta_csv():
        """ Merge all the meta text files for each barcode to
        one csv file with CELL_BARCODE, UMI_COUNT,READ_COUNT"""
        barcodes_meta_txts = glob.glob("*_meta.txt")

        with open(args.write_barcode_meta_csv, "w") as fp:
            fp.write("{},{},{}".format(CELL_BARCODE, UMI_COUNT, READ_COUNT))
            fp.write('\n')
            for barcode_meta_txt in barcodes_meta_txts:
                with open(barcode_meta_txt, 'r') as f:
                    umi_count, read_count = f.readline().split()
                    umi_count = int(umi_count)
                    read_count = int(read_count)

                    barcode_name = barcode_meta_txt.replace('_meta.txt', '')
                    fp.write("{},{},{}\n".format(barcode_name, umi_count,
                                                 read_count))

    def get_unique_barcodes(all_fastas):
        """ Build a dictionary with each unique barcode as key and
        their fasta files from different shards """
        fasta_files_dict = OrderedDict()
        for fasta in iter_split(all_fastas, ","):
            barcode = os.path.basename(fasta).replace(".fasta", "")
            value = fasta_files_dict.get(barcode, "")
            fasta_files_dict[barcode] = value + fasta + ","
        # Find unique barcodes
        all_fastas_sorted = list(fasta_files_dict.values())
        del fasta_files_dict
        return all_fastas_sorted

    # Initializing time
    startt = time.time()

    # Setting barcodes file, some 10x files don't have a filtered
    # barcode file
    if args.barcodes_file is not None:
        barcodes = tenx_utils.read_barcodes_file(args.barcodes_file)
    else:
        barcodes = None

    # Shard bam file to smaller bam file
    logger.info('... reading bam file from %s', args.filename)
    n_jobs = args.processes
    filenames, mmap_file = np_utils.to_memmap(
        np.array(
            tenx_utils.shard_bam_file(args.filename, args.line_count,
                                      os.getcwd())))

    # Create a per-cell fasta generator of sequences
    # If the reads should be filtered by barcodes and umis
    # umis are saved in fasta file as record name and name of
    # the fasta file is the barcode
    func = partial(tenx_utils.bam_to_temp_fasta, barcodes,
                   args.rename_10x_barcodes, args.delimiter)

    length_sharded_bam_files = len(filenames)
    chunksize = calculate_chunksize(length_sharded_bam_files, n_jobs)
    pool = multiprocessing.Pool(processes=n_jobs)
    logger.info(
        "multiprocessing pool processes {} and chunksize {} calculated".format(
            n_jobs, chunksize))
    # All the fastas are stored in a single comma-separated string instead
    # of a list. This saves roughly 8 bytes of pointer overhead per list
    # element; with unique barcodes on the order of 10^6 before filtering,
    # keeping each barcode as a separate element would produce a huge list,
    # hence the string
    all_fastas = ",".join(
        itertools.chain(*(pool.imap(
            lambda x: func(x.encode('utf-8')), filenames, chunksize=chunksize)
                          )))

    # clean up the memmap and sharded intermediary bam files
    [os.unlink(file) for file in filenames if os.path.exists(file)]
    del filenames
    os.unlink(mmap_file)
    logger.info("Deleted intermediary bam and memmap files")

    all_fastas_sorted = get_unique_barcodes(all_fastas)
    unique_barcodes = len(all_fastas_sorted)
    logger.info("Found %d unique barcodes", unique_barcodes)
    # Cleaning up to retrieve memory from unused large variables
    del all_fastas
    chunksize = calculate_chunksize(unique_barcodes, n_jobs)
    logger.info("Pooled %d and chunksize %d mapped", n_jobs, chunksize)

    fastas = list(
        pool.imap(lambda index: collect_reduce_temp_fastas(index),
                  range(unique_barcodes),
                  chunksize=chunksize))

    if args.write_barcode_meta_csv:
        write_to_barcode_meta_csv()
    logger.info("time taken to convert fastas for 10x folder is %.5f seconds",
                time.time() - startt)
    fastas = [fasta for fasta in fastas if fasta != []]

    pool.close()
    pool.join()
    return fastas
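
convert() parses its own argument list through create_parser(), so it can be driven programmatically as well as from the command line. The flag names in the sketch below are assumptions derived from the attribute names used in the function body (args.filename, args.save_fastas, args.min_umi_per_barcode, args.write_barcode_meta_csv); check create_parser() for the real option names.

# Hypothetical invocation; the flags are inferred from the args attributes
# above and may not match create_parser() exactly.
fastas = convert([
    "--filename", "possorted_genome_bam.bam",
    "--save-fastas", "per_cell_fastas",
    "--min-umi-per-barcode", "10",
    "--write-barcode-meta-csv", "barcodes_meta.csv",
])
print("wrote {} per-cell fastas".format(len(fastas)))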
Code Example #9
File: cli.py Project: czbiohub/bam2fasta
def percell(args):
    """Cli tool to convert bam to per cell fasta files"""
    parser = create_parser()
    args = parser.parse_args(args)

    logger.info(args)

    save_fastas = os.path.abspath(args.save_fastas)
    if not os.path.exists(save_fastas):
        os.makedirs(save_fastas)
    else:
        logger.info("Path {} already exists, might be overwriting data".format(
            save_fastas))

    # Initializing time
    startt = time.time()

    save_intermediate_files = os.path.abspath(args.save_intermediate_files)
    if not os.path.exists(save_intermediate_files):
        os.makedirs(save_intermediate_files)
    else:
        logger.info("Path {} already exists, might be overwriting data".format(
            save_intermediate_files))

    # Setting barcodes file, some 10x files don't have a filtered
    # barcode file
    if args.barcodes_file is not None:
        barcodes = tenx_utils.read_barcodes_file(args.barcodes_file)
    else:
        barcodes = None

    # Shard bam file to smaller bam file
    logger.info('... reading bam file from %s', args.filename)
    n_jobs = args.processes
    input_format = os.path.basename(args.filename).split(".")[-1]
    if input_format == "bam":
        filenames = tenx_utils.shard_bam_file(args.filename, args.shard_size,
                                              save_intermediate_files)

        # Create a per-cell fasta generator of sequences
        # If the reads should be filtered by barcodes and umis
        # umis are saved in fasta file as record name and name of
        # the fasta file is the barcode
        func = partial(tenx_utils.bam_to_temp_fasta, barcodes,
                       args.rename_10x_barcodes, args.delimiter,
                       save_intermediate_files)

        length_sharded_bam_files = len(filenames)
        chunksize = tenx_utils.calculate_chunksize(length_sharded_bam_files,
                                                   n_jobs)
        pool = multiprocessing.Pool(processes=n_jobs)
        logger.info("multiprocessing pool processes {} & chunksize {}".format(
            n_jobs, chunksize))
        # All the fastas are stored in a single comma-separated string
        # instead of a list. This saves roughly 8 bytes of pointer overhead
        # per list element; with unique barcodes on the order of 10^6
        # before filtering, keeping each barcode as a separate element
        # would produce a huge list, hence the string
        all_fastas = ",".join(
            itertools.chain(
                *(pool.imap_unordered(lambda x: func(x.encode('utf-8')),
                                      filenames,
                                      chunksize=chunksize))))

        # clean up the memmap and sharded intermediary bam files
        [os.unlink(file) for file in filenames if os.path.exists(file)]
        del filenames
        logger.info("Deleted intermediary bam")

        all_fastas_sorted = tenx_utils.get_fastas_per_unique_barcodes(
            all_fastas)
        unique_barcodes = len(all_fastas_sorted)
        logger.info("Found %d unique barcodes", unique_barcodes)
        # Cleaning up to retrieve memory from unused large variables
        del all_fastas

        # Gather all barcodes per umis to one fasta
        func = partial(tenx_utils.barcode_umi_seq_to_fasta, save_fastas,
                       args.delimiter, args.write_barcode_meta_csv,
                       args.min_umi_per_barcode, save_intermediate_files)

        chunksize = tenx_utils.calculate_chunksize(unique_barcodes, n_jobs)

        logger.info("Pooled %d and chunksize %d mapped for %d lists", n_jobs,
                    chunksize, len(all_fastas_sorted))
        list(
            pool.imap_unordered(lambda fasta: func([fasta]),
                                all_fastas_sorted,
                                chunksize=chunksize))

        pool.close()
        pool.join()

        # Write barcode meta csv
        if args.write_barcode_meta_csv:
            tenx_utils.write_to_barcode_meta_csv(save_intermediate_files,
                                                 args.write_barcode_meta_csv)

        # Gather all the fastas
        fastas = glob.glob(os.path.join(save_fastas, "*_bam2fasta.fasta"))
    else:
        # if the fastq.gz file is already given
        assert input_format == "gz", \
            ("please convert "
             "bam files to fastq.gz using samtools")
        # Check if the good barcodes with significant umis is already given
        barcodes_significant_umis_file = args.barcodes_significant_umis_file
        logger.info("barcodes_significant_umis_file {}".format(
            barcodes_significant_umis_file))
        # Find the good_barcodes file from the aligned sequences and use it
        # for unaligned.fastq.gz
        basename_wo_format = os.path.basename(args.filename).replace(
            ".fastq.gz", "__")
        args.barcodes_significant_umis_file = os.path.join(
            save_fastas, "barcodes_with_significant_umis.tsv")
        count_umis_percell(args)
        args.channel_id = basename_wo_format
        fastas = make_fastqs_percell(args)
        logger.info("time taken to write fastas is %.5f seconds",
                    time.time() - startt)

    return fastas
Code Example #10
def test_read_barcodes_file():
    filename = utils.get_test_data('10x-example/barcodes.tsv')
    barcodes = tenx.read_barcodes_file(filename)
    assert len(barcodes) == 10
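
The last test only checks that ten barcodes are read from barcodes.tsv. A minimal sketch of what such a reader could look like, assuming one barcode per line and deduplication, is shown below; it is not the library's actual implementation.

def read_barcodes_file_sketch(path):
    """Read one barcode per line from a 10x-style barcodes.tsv file."""
    with open(path) as handle:
        return {line.strip() for line in handle if line.strip()}


# Usage mirroring the test above:
# barcodes = read_barcodes_file_sketch('10x-example/barcodes.tsv')
# assert len(barcodes) == 10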