Beispiel #1
0
def get_bc_counts(genomes, genes, molecule_counter):
    """Tally per-barcode UMI and read counts separately for each genome.

    Args:
        genomes: iterable of genome names to summarize.
        genes: not used by this function.
        molecule_counter: object whose ``get_column(name)`` returns the
            'genome', 'reads' and 'barcode' columns as arrays.

    Returns:
        dict mapping genome name to a tuple
        ``(unique_bcs, umis_per_bc, cmb_reads_per_bc)``. Genomes that have
        no rows at all are left out of the dict.

    Rows are grouped by runs of equal adjacent barcode values, so a barcode
    that reappears after a different barcode produces a second entry.
    """
    all_genome_ids = molecule_counter.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)
    all_reads = molecule_counter.get_column('reads')
    all_barcodes = molecule_counter.get_column('barcode')

    counts_by_genome = {}
    for genome in genomes:
        row_mask = all_genome_ids == cr_reference.get_genome_id(
            genome, genome_index)
        if row_mask.sum() == 0:
            # Nothing recorded for this genome (e.g. empty sample, false
            # barnyard sample, or nothing confidently mapped).
            continue

        genome_bcs = all_barcodes[row_mask]
        genome_reads = all_reads[row_mask]
        # A UMI only counts if it has at least one conf mapped read.
        has_conf_read = genome_reads > 0

        # A run starts wherever the barcode differs from the previous row;
        # the leading 1 forces the first row to open a run.
        deltas = np.concatenate(([1], np.diff(genome_bcs)))
        run_starts = np.nonzero(deltas)[0]

        counts_by_genome[genome] = (
            genome_bcs[run_starts],
            np.add.reduceat(has_conf_read, run_starts),
            np.add.reduceat(genome_reads, run_starts),
        )

    return counts_by_genome
Beispiel #2
0
def write_filtered_molecules(ctr_in, ctr_out, genomes, bcs_per_genome):
    """Copy the molecules of filtered barcodes from ctr_in into ctr_out.

    Args:
        ctr_in: source molecule counter; read through ``get_all_metrics``,
            ``get_column`` and ``get_ref_column``.
        ctr_out: destination molecule counter; receives all metrics, the
            filtered molecule-info columns and the reference columns.
        genomes: genome names used to build the genome index for
            ``bcs_per_genome``.
        bcs_per_genome: mapping of genome name to an iterable of formatted
            barcode strings (each is split into barcode and gem group by
            ``cr_utils.split_barcode_seq``).

    Returns:
        dict ``{'flt_conf_mapped_per_genome': {genome: read_sum}}`` where
        ``read_sum`` is the sum of the 'reads' column written to ctr_out
        for that genome.
    """
    ctr_out.set_all_metrics(ctr_in.get_all_metrics())

    # Build the set of (genome_id, gem_group, compressed_barcode) to keep.
    filtered_bc_tuples = set()
    genome_ids = ctr_in.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)
    # .items() instead of .iteritems() so this runs on Python 2 and 3.
    for genome, formatted_bcs in bcs_per_genome.items():
        genome_id = cr_reference.get_genome_id(genome, genome_index)
        for formatted_bc in formatted_bcs:
            bc, gg = cr_utils.split_barcode_seq(formatted_bc)
            cbc = cr_mol_counter.MoleculeCounter.compress_barcode_seq(bc)
            filtered_bc_tuples.add((genome_id, gg, cbc))

    def keep_molecule(genome_id, gem_group, barcode):
        return (genome_id, gem_group, barcode) in filtered_bc_tuples

    # otypes must be given explicitly: without it np.vectorize probes the
    # first element to infer the output dtype and raises ValueError when the
    # input columns are empty (no molecules at all).
    filter_func = np.vectorize(keep_molecule, otypes=[bool])

    gem_groups = ctr_in.get_column('gem_group')
    barcodes = ctr_in.get_column('barcode')
    filter_index = filter_func(genome_ids, gem_groups, barcodes)

    for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
        ctr_out.add_many(col, ctr_in.get_column(col)[filter_index])

    for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
        ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

    # Summarize the filtered data. The genome list is re-read from ctr_out
    # (kept in its own name so the `genomes` argument is not clobbered).
    out_genomes = ctr_out.get_ref_column('genome_ids')
    filtered_reads = ctr_out.get_column('reads')
    flt_conf_mapped_per_genome = {}
    if len(out_genomes) == 1:
        # Single genome: every molecule belongs to it, no masking needed.
        flt_conf_mapped_per_genome[out_genomes[0]] = filtered_reads.sum()
    else:
        out_genome_ids = ctr_out.get_column('genome')
        out_genome_index = cr_reference.get_genome_index(out_genomes)
        for genome in out_genomes:
            genome_id = cr_reference.get_genome_id(genome, out_genome_index)
            flt_conf_mapped_per_genome[genome] = filtered_reads[
                out_genome_ids == genome_id].sum()

    return {'flt_conf_mapped_per_genome': flt_conf_mapped_per_genome}
Beispiel #3
0
def main(args, outs):
    """Downsample the reads of one molecule-info chunk and write a summary.

    Reads the chunk ``[args.chunk_start, args.chunk_start + args.chunk_len)``
    of ``args.molecules``. When ``args.downsample`` is set and
    ``args.downsample_map`` has more than one entry, each molecule's read
    count is replaced by a binomial draw using the 'frac_reads_kept' value
    of its gem group, and the metrics, molecule-info columns and reference
    columns are written to ``outs.out_molecules``. Otherwise the read counts
    are used as-is and nothing is added to the output counter.

    In both cases a JSON summary with the per-genome read sums and the
    (possibly updated) metrics is written to ``outs.summary``.

    Args:
        args: stage arguments (molecules, chunk_start, chunk_len,
            downsample, downsample_map).
        outs: stage outputs (out_molecules, summary).
    """
    # Fixed seed so the binomial draws are reproducible.
    np.random.seed(0)

    with cr_mol_counter.MoleculeCounter.open(args.molecules,
                                             'r',
                                             start=int(args.chunk_start),
                                             length=int(
                                                 args.chunk_len)) as ctr_in:
        with cr_mol_counter.MoleculeCounter.open(outs.out_molecules,
                                                 'w') as ctr_out:
            metrics_in = ctr_in.get_all_metrics()
            # NOTE(review): if get_all_metrics() returns a plain dict this
            # copy is shallow, so the nested per-gem-group dicts updated
            # below are shared with metrics_in -- confirm that is intended.
            metrics_out = metrics_in.copy()

            reads = ctr_in.get_column('reads')
            gem_groups = ctr_in.get_column('gem_group')

            if args.downsample and len(args.downsample_map) > 1:
                # downsample metrics
                for gg in metrics_out[cr_mol_counter.GEM_GROUPS_METRIC]:
                    frac_reads_kept = args.downsample_map[str(
                        gg)]['frac_reads_kept']
                    total_reads_in = metrics_in[
                        cr_mol_counter.GEM_GROUPS_METRIC][gg][
                            cr_mol_counter.GG_TOTAL_READS_METRIC]
                    total_reads_out = round(frac_reads_kept * total_reads_in)
                    metrics_out[cr_mol_counter.GEM_GROUPS_METRIC][gg][
                        cr_mol_counter.
                        GG_DOWNSAMPLED_READS_METRIC] = total_reads_out

                ctr_out.set_all_metrics(metrics_out)

                # downsample molecule info
                if len(reads) > 0:
                    downsample_func = np.vectorize(
                        lambda gem_group, read_count: np.random.binomial(
                            read_count, args.downsample_map[str(gem_group)][
                                'frac_reads_kept']))
                    subsampled_reads = downsample_func(gem_groups, reads)
                else:
                    # np.vectorize without otypes raises ValueError on
                    # size-0 inputs, so an empty chunk is passed through
                    # unchanged. Setting otypes instead would skip the
                    # extra probe call on the first element and thereby
                    # shift the seeded random stream for non-empty chunks.
                    subsampled_reads = reads

                for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
                    if col == 'reads':
                        data = subsampled_reads
                    else:
                        data = ctr_in.get_column(col)
                    ctr_out.add_many(col, data)

                # pass reference info
                for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
                    ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

            else:
                subsampled_reads = reads

            # collect summary stats
            genomes = ctr_in.get_ref_column('genome_ids')
            raw_conf_mapped_per_genome = {}
            if len(genomes) == 1:
                # Single genome: every molecule belongs to it.
                raw_conf_mapped_per_genome[genomes[0]] = (
                    subsampled_reads.sum())
            else:
                genome_ids = ctr_in.get_column('genome')
                genome_index = cr_reference.get_genome_index(genomes)
                for genome in genomes:
                    genome_id = cr_reference.get_genome_id(
                        genome, genome_index)
                    raw_conf_mapped_per_genome[genome] = subsampled_reads[
                        genome_ids == genome_id].sum()

            summary = {
                'raw_conf_mapped_per_genome': raw_conf_mapped_per_genome,
                'mol_counter_metrics': metrics_out
            }

            with open(outs.summary, 'w') as f:
                tk_json.dump_numpy(summary, f, pretty=True)
Beispiel #4
0
def main(args, outs):
    """Count reads per molecule from a BAM chunk into a molecule counter.

    Reads ``args.chunk_input`` as a BAM file, groups its reads by
    ``cr_utils.barcode_sort_key`` (gem group, barcode, gene ids) and, for
    every group, accumulates one row of counters per (UMI, gene) key. The
    rows, the reference columns and a per-gem-group metric of confidently
    mapped reads in filtered barcodes are stored in a MoleculeCounter opened
    for writing at ``outs.output``.

    Args:
        args: stage arguments (chunk_input, reference_path,
            filtered_barcodes, align).
        outs: stage outputs (output).
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    # Map each per-molecule data column name to its slot in the counts array.
    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # Gene slot used below for reads that are unmapped or not confidently
    # mapped; presumably one past the last valid gene index -- TODO confirm.
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the filtered barcodes of all genomes.
    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)

    # Per-gem-group metrics; each gem group starts with a zero read count.
    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    # itertools.groupby only merges consecutive items with equal keys, so
    # this relies on the BAM being ordered by barcode_sort_key -- TODO
    # confirm against the upstream sort.
    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue
        # Must be evaluated before gem_group is rebound to its compressed
        # form below.
        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        # (umi, gene) key -> array of counters, one slot per data column.
        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(
            barcode)
        # From here on gem_group holds the compressed value.
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)

        # (umi, gene) key -> set of (tid, pos) already seen for that molecule.
        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            # Only the barcode halves are compared below; raw_gg and proc_gg
            # are not used.
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            # NOTE(review): get_high_conf_mapq(args.align) is re-evaluated
            # for every read although its argument does not change.
            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key, map_type = (umi, gene_index.gene_id_to_int(
                    gene_ids[0])), 'reads'

                # Flag the first time this alignment position is seen for
                # the molecule.
                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'unmapped_reads', False
            else:
                mol_key, map_type, uniq_read_pos = (
                    umi, none_gene_id), 'nonconf_mapped_reads', False
            # Update the counters of this molecule: the mapping-type column,
            # plus the UMI-corrected, barcode-corrected and unique-position
            # tallies.
            molecules[mol_key][mol_data_columns[map_type]] += 1
            molecules[mol_key][mol_data_columns['umi_corrected_reads']] += int(
                not raw_umi == umi)
            molecules[mol_key][mol_data_columns[
                'barcode_corrected_reads']] += int(not raw_bc == proc_bc)
            molecules[mol_key][mol_data_columns[
                'conf_mapped_uniq_read_pos']] += int(uniq_read_pos)

        # Emit the molecules of this barcode group in sorted (umi, gene) order.
        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            # NOTE(review): for gene_id == none_gene_id this depends on how
            # int_to_gene_id / get_genome_from_str treat that value --
            # confirm.
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{
                    key: molecule[col_idx]
                    for key, col_idx in mol_data_columns.iteritems()
                })

    in_bam.close()

    # Convert the defaultdict to a plain dict before storing it as a metric.
    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()