예제 #1
0
def write_filtered_barcodes(out_csv, gem_group, mol_counter, bcs_per_genome):
    """Write one ``genome,barcode`` CSV row per filtered barcode.

    Args:
      out_csv (str): Path of the CSV file to create (overwritten if present).
      gem_group: Gem group handed to cr_utils.format_barcode_seq for every barcode.
      mol_counter: Object exposing decompress_barcode_seq(bc_id).
      bcs_per_genome (dict): Maps a genome to an iterable of compressed barcode ids.
    """
    # 'wb' is kept as-is: this code uses Python 2 idioms (iteritems), where
    # csv writers are fed a binary-mode file.
    with open(out_csv, 'wb') as out_handle:
        csv_out = csv.writer(out_handle)
        for genome, compressed_ids in bcs_per_genome.iteritems():
            for compressed_id in compressed_ids:
                raw_seq = mol_counter.decompress_barcode_seq(compressed_id)
                csv_out.writerow(
                    [genome, cr_utils.format_barcode_seq(raw_seq, gem_group)])
    def get_filtered_barcodes(barcode_info, library_info, barcodes,
                              genome_idx=None, library_type=None):
        """Return the formatted strings of barcodes that pass the filter, e.g. ['ACGT-1', ...].

        When neither restriction is given, a barcode counts as passing if it
        passes in any single library or genome.

        Args:
          barcode_info (BarcodeInfo): Provides ``pass_filter``, read here as a 2-D
            array with columns (barcode_idx, library_idx, genome_idx).
          library_info (list of dict): One dict per library; the 'library_type'
            and 'gem_group' keys are read.
          barcodes (np.array): Barcode sequences, indexed by barcode_idx.
          genome_idx (int): Only consider rows for this genome. None for no restriction.
          library_type (str): Only consider rows from libraries of this type.
            None for no restriction.
        Returns:
          list of str: unique barcodes, ordered by (gem_group, barcode_idx).
        """
        pf = barcode_info.pass_filter
        bc_col = pf[:, 0]
        lib_col = pf[:, 1]
        genome_col = pf[:, 2]

        # Start with every row selected, then narrow down per restriction.
        keep = np.ones(pf.shape[0], dtype=bool)

        if genome_idx is not None:
            keep &= (genome_col == genome_idx)

        if library_type is not None:
            wanted_libs = np.array(
                [idx for idx, lib in enumerate(library_info)
                 if lib['library_type'] == library_type],
                dtype=MOLECULE_INFO_COLUMNS['library_idx'])
            keep &= np.isin(lib_col, wanted_libs)

        selected = np.flatnonzero(keep)

        # Lookup table: library index -> gem group
        gem_group_of_lib = np.array([lib['gem_group'] for lib in library_info],
                                    dtype='uint64')
        selected_gg = gem_group_of_lib[lib_col[selected]]

        # np.unique over rows both de-duplicates and sorts the
        # (gem_group, barcode_idx) pairs.
        pairs = np.unique(np.column_stack((selected_gg, bc_col[selected])),
                          axis=0)

        # Build the barcode strings
        return [cr_utils.format_barcode_seq(barcodes[bc_idx], gg)
                for gg, bc_idx in pairs]
예제 #3
0
    def get_molecule_iter(self, barcode_length, subsample_rate=1.0):
        """Return an iterator on Molecule tuples.

        Molecules whose read count is zero (possibly after subsampling) are
        skipped.

        Args:
          barcode_length: Passed to decompress_barcode_seq as ``barcode_length``.
          subsample_rate (float): Value in [0, 1]. When below 1.0, every
            molecule's read count is replaced by a binomial draw with this
            success probability.
        Yields:
          Molecule(barcode, genome, gene_id, reads) where ``barcode`` is the
          decompressed sequence formatted with its gem group.
        Raises:
          AssertionError: if subsample_rate is outside [0, 1].
        """
        assert subsample_rate >= 0 and subsample_rate <= 1.0

        # Remember the last decompressed barcode so that runs of rows sharing
        # the same (barcode, gem_group) are decompressed only once.
        prev_compressed_bc = None
        prev_gem_group = None
        prev_bc = None

        # Load the molecule data
        mol_barcodes = self.get_column('barcode')
        mol_gem_groups = self.get_column('gem_group')
        mol_genome_ints = self.get_column('genome')
        mol_gene_ints = self.get_column('gene')
        mol_reads = self.get_column('reads')

        gene_ids = self.get_ref_column('gene_ids')
        genome_ids = self.get_ref_column('genome_ids')

        if subsample_rate < 1.0:
            mol_reads = np.random.binomial(mol_reads, subsample_rate)

        for compressed_bc, gem_group, genome_int, gene_int, reads in itertools.izip(
                mol_barcodes, mol_gem_groups, mol_genome_ints, mol_gene_ints,
                mol_reads):
            if reads == 0:
                continue

            # Decompress the cell barcode if necessary
            if compressed_bc == prev_compressed_bc and gem_group == prev_gem_group:
                bc = prev_bc
            else:
                bc = cr_utils.format_barcode_seq(
                    self.decompress_barcode_seq(compressed_bc,
                                                barcode_length=barcode_length),
                    gem_group)
                # Refresh the cache; without this the branch above never hits.
                prev_compressed_bc = compressed_bc
                prev_gem_group = gem_group
                prev_bc = bc
            yield Molecule(barcode=bc,
                           genome=genome_ids[genome_int],
                           gene_id=gene_ids[gene_int],
                           reads=reads)
예제 #4
0
def main(args, outs):
    """Count reads per molecule in a BAM chunk and write a molecule info file.

    Reads are grouped by (gem_group, barcode) using
    cr_utils.barcode_sort_key_no_umi; within each group, reads are tallied per
    (umi, library_idx, feature_idx) and appended to the MoleculeCounter
    columns, sorted by feature index. A per-library usable-read metric is
    accumulated for reads belonging to cell-associated barcodes.

    Args:
      args: Stage arguments; reads barcode_whitelist, reference_path,
        feature_reference, chunk_input and filtered_barcodes.
      outs: Stage outputs; the molecule info file is written to outs.output.
    Raises:
      AssertionError: if the BAM is not ordered by (gem_group, barcode index),
        or a counted read is annotated with other than exactly one feature.
    """
    outs.coerce_strings()

    # Load whitelist
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_to_idx = OrderedDict((k, i) for i, k in enumerate(whitelist))

    # Load feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Load library info from BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Get cell-associated barcodes by genome
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    # Create the barcode info
    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Create the molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Initialize per-library metrics
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {}
        lib_metrics[str(lib_idx)][cr_mol_counter.USABLE_READS_METRIC] = 0

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.

    prev_gem_group = None
    prev_barcode_idx = None

    for (gem_group, barcode_seq), reads_iter in \
        itertools.groupby(in_bam, key=cr_utils.barcode_sort_key_no_umi):
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        # Track the previous group once per barcode (not once per counted
        # read), so the ordering check always compares consecutive barcodes,
        # including ones whose reads are all filtered out below.
        prev_gem_group = gem_group
        prev_barcode_idx = barcode_idx

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode_seq, gem_group) in filtered_bc_union

        # (umi, library_idx, feature_idx) -> number of reads
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or \
               read.is_read2 or \
               cr_utils.is_read_low_support_umi(read) or \
               not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq,
                MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][
                    cr_mol_counter.USABLE_READS_METRIC] += 1

        # Record data for this barcode
        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, len(counts)))
        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, len(counts)))

        # `counts` is not modified below, so each pass over its keys/values
        # sees the same order and `order` can be reused for every column.
        feature_ints = np.fromiter(
            (k[2] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('feature_idx'),
            count=len(counts))
        # Sort by feature for fast matrix construction
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        li_ints = np.fromiter(
            (k[1] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('library_idx'),
            count=len(counts))[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = np.fromiter((k[0] for k in counts.iterkeys()),
                               dtype=MoleculeCounter.get_column_dtype('umi'),
                               count=len(counts))[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = np.fromiter(
            counts.itervalues(),
            dtype=MoleculeCounter.get_column_dtype('count'),
            count=len(counts))[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
예제 #5
0
def main(args, outs):
    """Call cell barcodes for each gem group and write the filtering outputs.

    For every gem group (only when a barcode whitelist is available) this
    loads the raw barcode read counts, calls cell barcodes from
    args.umi_info, records the thresholds on the reporter, optionally forces
    the top ``args.force_cells`` barcodes by support, and feeds the running
    totals to the reporter. Afterwards it writes the cell barcode JSON, the
    barcode support CSV, the barcode/UMI summary and the summary JSON.

    Args:
      args: Stage arguments (gem_groups, barcode_whitelist, recovered_cells,
        barcode_counts, umi_info, force_cells, extract_reads_summary,
        min_readpairs_per_umi).
      outs: Stage outputs (cell_barcodes, barcode_support,
        barcode_umi_summary, summary).
    """
    np.random.seed(0)

    reporter = vdj_report.VdjReporter(
        gem_groups=np.unique(args.gem_groups).tolist())

    called_cells = set()
    support_totals = defaultdict(int)

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    # Fall back to the default cutoff scaled by the number of gem groups.
    recovered_cells = args.recovered_cells
    if not recovered_cells:
        recovered_cells = (cr_constants.DEFAULT_TOP_BARCODE_CUTOFF *
                           len(all_gem_groups))

    # Without a whitelist no gem group is processed at all.
    groups_to_process = all_gem_groups if barcode_whitelist is not None else []

    for gem_group in groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        (gg_bc_support, gg_cell_bcs,
         rpu_threshold, umi_threshold, confidence) = call_cell_barcodes(
             args.umi_info, int(gem_group))

        # Record the RPU and UMI thresholds and the confidence
        for metric_name, metric_value in (
                ('vdj_filter_bcs_rpu_threshold', rpu_threshold),
                ('vdj_filter_bcs_umi_threshold', umi_threshold),
                ('vdj_filter_bcs_confidence', confidence)):
            reporter._get_metric_attr(metric_name,
                                      gem_group).set_value(metric_value)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Override the call with the best-supported barcodes.
                ranked = sorted(gg_bc_support.items(),
                                key=lambda kv: kv[1],
                                reverse=True)
                ranked_bcs = [kv[0] for kv in ranked]
                n_forced = min(len(ranked_bcs), args.force_cells)
                gg_cell_bcs = ranked_bcs[:n_forced]

            # Update set of BCs called as cells
            called_cells.update(gg_cell_bcs)

            # Sum BC support
            for supported_bc, n_support in gg_bc_support.iteritems():
                support_totals[supported_bc] += n_support

        # Load the extract_reads summary to get the total raw reads
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        reporter.vdj_filter_barcodes_cb(called_cells, barcodes, counts,
                                        total_read_pairs, recovered_cells)

    save_cell_barcodes_json(called_cells, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as support_file:
        support_file.write('barcode,count\n')
        for supported_bc, n_support in support_totals.iteritems():
            support_file.write('%s,%d\n' % (supported_bc, n_support))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, called_cells)

    reporter.report_summary_json(outs.summary)
예제 #6
0
def main(args, outs):
    """Call cell barcodes for each gem group and write the filtering outputs.

    For every gem group (only when a barcode whitelist is available) this
    loads the raw barcode read counts, calls cell barcodes from
    args.umi_summary, records the read-pair threshold on the reporter,
    optionally forces the top ``args.force_cells`` barcodes by support, and
    reports the running totals together with the number of assemblable read
    pairs in called cells. Afterwards it writes the cell barcode JSON, the
    barcode support CSV, the barcode/UMI summary and the summary JSON.

    Args:
      args: Stage arguments (gem_groups, barcode_whitelist, recovered_cells,
        barcode_counts, umi_summary, min_umis, readpairs_per_umi_nx,
        readpairs_per_umi_ratio, force_cells, extract_reads_summary,
        assemble_metrics_summary, umi_info, min_readpairs_per_umi).
      outs: Stage outputs (cell_barcodes, barcode_support,
        barcode_umi_summary, summary).
    """
    reporter = vdj_report.VdjReporter(
        gem_groups=np.unique(args.gem_groups).tolist())

    called_cells = set()
    support_by_bc = {}

    # Load barcode whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)

    all_gem_groups = sorted(set(args.gem_groups))

    # Fall back to the default cutoff scaled by the number of gem groups.
    recovered_cells = args.recovered_cells
    if not recovered_cells:
        recovered_cells = (cr_constants.DEFAULT_TOP_BARCODE_CUTOFF *
                           len(all_gem_groups))

    # Without a whitelist no gem group is processed at all.
    groups_to_process = all_gem_groups if barcode_whitelist is not None else []

    for gem_group in groups_to_process:
        # Load barcode raw read count distribution
        barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                                  barcode_whitelist,
                                                  gem_group,
                                                  proportions=False)
        counts = np.array(barcode_dist.values())

        # Append gem group to barcode seqs
        barcodes = np.array([cr_utils.format_barcode_seq(seq, gem_group)
                             for seq in barcode_dist.keys()])

        # Call cell barcodes
        gg_bc_support, gg_cell_bcs, threshold = call_cell_barcodes(
            args.umi_summary, int(gem_group), args.min_umis,
            args.readpairs_per_umi_nx, args.readpairs_per_umi_ratio)

        # Record the threshold
        threshold_metric = reporter._get_metric_attr(
            'vdj_filtered_bc_contig_kth_umi_readpair_threshold', gem_group)
        threshold_metric.set_value(threshold)

        if len(gg_bc_support) > 0:
            if args.force_cells is not None:
                # Override the call with the best-supported barcodes.
                ranked = sorted(gg_bc_support.items(),
                                key=lambda kv: kv[1],
                                reverse=True)
                ranked_bcs = [kv[0] for kv in ranked]
                n_forced = min(len(ranked_bcs), args.force_cells)
                gg_cell_bcs = ranked_bcs[:n_forced]

            called_cells.update(gg_cell_bcs)
            support_by_bc.update(gg_bc_support)

        # Load the extract_reads summary to get the total raw reads
        total_read_pairs = cr_utils.get_metric_from_json(
            args.extract_reads_summary, 'total_read_pairs')

        # Load the assembly metrics summary to get the total assemblable reads
        assemblable_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        # Barcodes missing from the summary contribute zero.
        assemblable_read_pairs = sum(
            assemblable_by_bc.get(cell_bc, 0) for cell_bc in called_cells)

        reporter.vdj_filter_barcodes_cb(called_cells, barcodes, counts,
                                        total_read_pairs,
                                        assemblable_read_pairs,
                                        recovered_cells)

    save_cell_barcodes_json(called_cells, outs.cell_barcodes)

    with open(outs.barcode_support, 'w') as support_file:
        support_file.write('barcode,count\n')
        for supported_bc, n_support in support_by_bc.iteritems():
            support_file.write('%s,%d\n' % (supported_bc, n_support))

    write_barcode_umi_summary(args.umi_info, reporter,
                              outs.barcode_umi_summary,
                              args.min_readpairs_per_umi, called_cells)

    reporter.report_summary_json(outs.summary)
예제 #7
0
def make_barcode_tags(qname, reporter, args):
    """Build the barcode, UMI and sample-index tags encoded in a read name.

    Args:
      qname: Read name, parsed with AugmentedFastqHeader.
      reporter: Provides barcode_whitelist, barcode_dist, raw_barcode_cb and
        raw_umi_cb.
      args: Provides gem_group, correct_barcodes and
        barcode_confidence_threshold.
    Returns:
      tuple: (stripped_qname, tags, barcode_info, umi_info) where
        stripped_qname is the header's ``fastq_header`` attribute, tags is a
        list of (tag, value) pairs, and barcode_info / umi_info are
        cr_constants.ProcessedRead instances, or None when the corresponding
        raw sequence is empty.
    """
    gg = args.gem_group
    do_correct = args.correct_barcodes
    conf_threshold = args.barcode_confidence_threshold
    whitelist = reporter.barcode_whitelist
    bc_dist = reporter.barcode_dist

    tags = []
    header = AugmentedFastqHeader(qname)

    # Barcode tags
    raw_bc = header.get_tag(cr_constants.RAW_BARCODE_TAG)
    bc_qual = header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)
    barcode_info = None

    if len(raw_bc) > 0:
        proc_bc = reporter.raw_barcode_cb(raw_bc, bc_qual)

        # Add the gem group
        if proc_bc is not None:
            proc_bc = cr_utils.format_barcode_seq(proc_bc, gem_group=gg)

        # Only try to rescue the barcode when a whitelist is in use.
        needs_rescue = (proc_bc is None) and (whitelist is not None)
        if needs_rescue and do_correct:
            # Try to correct the barcode
            proc_bc = cr_stats.correct_bc_error(conf_threshold, raw_bc,
                                                bc_qual, bc_dist)

            # Add the gem group
            if proc_bc is not None:
                proc_bc = cr_utils.format_barcode_seq(proc_bc, gem_group=gg)
        elif needs_rescue:
            # If the barcode was already corrected, take that (gem group is included)
            proc_bc = header.get_tag(cr_constants.PROCESSED_BARCODE_TAG)

        tags.extend([
            (cr_constants.RAW_BARCODE_TAG, raw_bc),
            (cr_constants.RAW_BARCODE_QUAL_TAG, bc_qual),
        ])
        if proc_bc is not None:
            tags.append((cr_constants.PROCESSED_BARCODE_TAG, proc_bc))
        barcode_info = cr_constants.ProcessedRead(raw_bc, proc_bc, bc_qual)

    # UMI tags
    raw_umi = header.get_tag(cr_constants.RAW_UMI_TAG)
    umi_qual = header.get_tag(cr_constants.UMI_QUAL_TAG)
    umi_info = None

    if len(raw_umi) > 0:
        proc_umi = reporter.raw_umi_cb(raw_umi, umi_qual)
        tags.extend([
            (cr_constants.RAW_UMI_TAG, raw_umi),
            (cr_constants.UMI_QUAL_TAG, umi_qual),
        ])
        if proc_umi is not None:
            tags.append((cr_constants.PROCESSED_UMI_TAG, proc_umi))
        umi_info = cr_constants.ProcessedRead(raw_umi, proc_umi, umi_qual)

    # Sample index tags
    si_seq = header.get_tag(tk_constants.SAMPLE_INDEX_TAG)
    si_qual = header.get_tag(tk_constants.SAMPLE_INDEX_QUAL_TAG)

    if len(si_seq) > 0:
        tags.extend([
            (tk_constants.SAMPLE_INDEX_TAG, si_seq),
            (tk_constants.SAMPLE_INDEX_QUAL_TAG, si_qual),
        ])

    return header.fastq_header, tags, barcode_info, umi_info
예제 #8
0
def main(args, outs):
    """Downsample one chunk of a molecule info file and build its count matrices.

    Reads the slice [args.chunk_start, args.chunk_start + args.chunk_len) of
    the molecule info columns, binomially subsamples each molecule's read
    count with its library's entry in args.frac_reads_kept, and builds a UMI
    count matrix and a read count matrix over (feature, barcode). The raw and
    filtered UMI matrices are saved as H5 files and a JSON chunk summary
    (read summary plus updated molecule metrics) is written.

    Args:
      args: Stage arguments; reads molecules, frac_reads_kept, chunk_start,
        chunk_len and gem_group_barcode_ranges.
      outs: Stage outputs; writes raw_matrix_h5, filtered_matrix_h5 and
        chunk_summary, and sets raw_nnz, filtered_nnz, raw_matrices_mex and
        filtered_matrices_mex.
    Raises:
      AssertionError: if the gem groups in the library info do not start at 1
        or are not contiguous.
    """
    # Fixed seed so the binomial subsampling below is reproducible.
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        # Work on a deep copy so the input metrics object is left untouched.
        metrics_in = mc.get_all_metrics()
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)][cr_mol_counter.
                          DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        # Per-molecule keep fraction, looked up by the molecule's library index.
        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        # Keep only the molecules that still have at least one read pair.
        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert(min(gem_groups) == 1 and \
               np.all(np.diff(np.array(gem_groups,dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        # Both arrays are indexed directly by gem group number; slot 0 stays
        # zero because gem groups start at 1.
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(
                args.gem_group_barcode_ranges.iteritems(),
                key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's gem group
        # (index gem_group - 1 of the cumulative sum is the total length of
        # all earlier gem groups, since slot 0 of the lengths is zero).
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        # One entry of value 1 per kept molecule at (feature, barcode).
        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        # Drop the local references to the per-molecule arrays now that both
        # matrices have been constructed.
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)
        print 'creating barcode strings'
        LogPerf.mem()

        # For each gem group, format its slice of barcode sequences with the
        # gem group, then concatenate the groups in ascending gem group order.
        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        # Make the barcode array read-only.
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        # NOTE(review): this initial {} is overwritten by the
        # summarize_read_matrix result two statements below.
        read_summary = {}
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()
        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary),
                      f,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
예제 #9
0
def main(args, outs):
    """Tally per-molecule read statistics from a BAM chunk into a molecule counter.

    Reads are grouped by cr_utils.barcode_sort_key into
    (gem_group, barcode, gene_ids) runs. Within a run every primary read1
    with a UMI is assigned to a molecule keyed by (umi, gene int); reads that
    are not confidently mapped to the transcriptome share a placeholder gene
    id equal to the number of genes. One row per molecule is added to the
    counter, and the number of confidently mapped reads in cell-associated
    barcodes is recorded per gem group.

    Args:
      args: Stage arguments; reads chunk_input, reference_path,
        filtered_barcodes and align.
      outs: Stage outputs; the molecule file is written to outs.output.
    Raises:
      AssertionError: if a confidently mapped read belongs to a group that
        does not have exactly one gene id.
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    mc_cls = cr_mol_counter.MoleculeCounter
    counter = mc_cls.open(outs.output, mode='w')

    # Data column name -> position inside a molecule's tally array
    column_of = dict(
        (key, idx) for idx, key in enumerate(mc_cls.get_data_columns()))

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    # Placeholder gene id for reads not confidently mapped to a gene.
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    # Union of the cell-associated barcodes over all genomes
    filtered_bcs = set()
    for genome_bcs in cr_utils.load_barcode_csv(
            args.filtered_barcodes).itervalues():
        filtered_bcs.update(genome_bcs)

    def new_gg_metrics():
        return {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0}

    gg_metrics = collections.defaultdict(new_gg_metrics)

    def new_tally():
        return np.zeros(len(column_of), dtype=np.uint64)

    grouped_reads = itertools.groupby(in_bam, key=cr_utils.barcode_sort_key)
    for (gem_group, barcode, gene_ids), reads_iter in grouped_reads:
        if barcode is None or gem_group is None:
            continue

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        molecules = collections.defaultdict(new_tally)

        compressed_barcode = mc_cls.compress_barcode_seq(barcode)
        # From here on gem_group holds the compressed value.
        gem_group = mc_cls.compress_gem_group(gem_group)

        # Molecule key -> set of (tid, pos) already seen for that molecule
        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, _ = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, _ = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            uniq_read_pos = False
            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                map_type = 'reads'

                # A position is unique if this molecule has not had a read there yet.
                read_pos = (read.tid, read.pos)
                seen_positions = read_positions[mol_key]
                uniq_read_pos = read_pos not in seen_positions
                seen_positions.add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1
            else:
                mol_key = (umi, none_gene_id)
                if read.is_unmapped:
                    map_type = 'unmapped_reads'
                else:
                    map_type = 'nonconf_mapped_reads'

            tally = molecules[mol_key]
            tally[column_of[map_type]] += 1
            tally[column_of['umi_corrected_reads']] += int(raw_umi != umi)
            tally[column_of['barcode_corrected_reads']] += int(
                raw_bc != proc_bc)
            tally[column_of['conf_mapped_uniq_read_pos']] += int(
                uniq_read_pos)

        for (umi, gene_id), tally in sorted(molecules.items()):
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            data_columns = dict(
                (key, tally[col_idx])
                for key, col_idx in column_of.iteritems())
            counter.add(barcode=compressed_barcode,
                        gem_group=gem_group,
                        umi=mc_cls.compress_umi_seq(umi),
                        gene=gene_id,
                        genome=genome_id,
                        **data_columns)

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()
예제 #10
0
def main(args, outs):
    """Correct barcodes for a chunk of reads and write one processed barcode per line.

    For each of the first args.initial_reads read pairs, the raw barcode is
    taken from the read1 header. A barcode that is not in the whitelist is
    passed to cr_stats.correct_bc_error; otherwise it is kept as-is unless it
    contains an 'N'. Accepted barcodes are counted and formatted with the gem
    group. One line is written per read: the processed barcode, or an empty
    line if there is none.

    Args:
      args: Stage arguments; reads barcode_whitelist, barcode_counts,
        gem_group, library_type, read1_chunk, read2_chunk, initial_reads and
        barcode_confidence_threshold.
      outs: Stage outputs; corrected_bcs (h5_constants.LZ4_SUFFIX is appended
        to the path), corrected_barcode_counts and chunked_reporter.
    """
    # Load barcode whitelist. Bind the name up front so the no-whitelist case,
    # which the code below handles explicitly, does not raise NameError.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    # NOTE(review): receives barcode_whitelist=None when no whitelist is
    # configured - confirm load_barcode_dist accepts that.
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group,
                                              args.library_type)

    # Set for O(1) membership tests in the per-read loop.
    if args.barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    in_read1_fastq = cr_io.open_maybe_gzip(args.read1_chunk)
    # With no read2 chunk an empty list stands in for the file object.
    in_read2_fastq = cr_io.open_maybe_gzip(
        args.read2_chunk) if args.read2_chunk else []

    outs.corrected_bcs += h5_constants.LZ4_SUFFIX
    out_file = cr_io.open_maybe_gzip(outs.corrected_bcs, 'w')

    bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                         outs.corrected_barcode_counts)

    # Correct barcodes, add processed bc tag to fastq
    read_pair_iter = itertools.izip_longest(tk_fasta.read_generator_fastq(in_read1_fastq), \
                                            tk_fasta.read_generator_fastq(in_read2_fastq))
    for read1, read2 in itertools.islice(read_pair_iter, args.initial_reads):
        read1_header = cr_fastq.AugmentedFastqHeader(read1[0])

        raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
        bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

        processed_bc = None

        if raw_bc:
            if barcode_whitelist_set is not None and raw_bc not in barcode_whitelist_set:
                processed_bc = cr_stats.correct_bc_error(
                    args.barcode_confidence_threshold, raw_bc, bc_qual,
                    barcode_dist)
            else:
                # Disallow Ns in no-whitelist case
                if 'N' in raw_bc:
                    processed_bc = None
                else:
                    processed_bc = raw_bc

            if processed_bc:
                bc_counter.count(None, processed_bc, None)

                # Add gem group to barcode sequence
                processed_bc = cr_utils.format_barcode_seq(
                    processed_bc, gem_group=args.gem_group)

            reporter.vdj_barcode_cb(raw_bc, processed_bc)

        # Always emit a line so the output stays aligned with the input reads.
        out_file.write('%s\n' %
                       (processed_bc if processed_bc is not None else ''))

    in_read1_fastq.close()
    # The [] placeholder is falsy, so close() is only called on a real file.
    if in_read2_fastq:
        in_read2_fastq.close()
    out_file.close()

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
예제 #11
0
def main(args, outs):
    """Correct the barcodes of one chunk of paired FASTQ reads.

    Reads read 1 / read 2 records in lockstep from ``args.read1_chunk`` and
    ``args.read2_chunk`` (at most ``args.initial_reads`` pairs), derives a
    processed barcode from the raw barcode tag carried on the read 1 header,
    and writes both mates to ``outs.corrected_read1s`` /
    ``outs.corrected_read2s``. When a processed barcode is found it is set as
    a tag on both headers; otherwise the headers are written back unchanged.

    Args:
        args: Stage arguments. Fields read here: ``barcode_whitelist``,
            ``barcode_counts``, ``gem_group``, ``read1_chunk``,
            ``read2_chunk``, ``initial_reads`` and
            ``barcode_confidence_threshold``.
        outs: Stage outputs. Fields read here: ``corrected_read1s``,
            ``corrected_read2s``, ``corrected_barcode_counts`` and
            ``chunked_reporter``.
    """
    # Load barcode whitelist. It must be bound even when no whitelist is
    # configured, because it is passed to load_barcode_dist() below.
    barcode_whitelist = None
    if args.barcode_whitelist is not None:
        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)

    reporter = vdj_report.VdjReporter()

    # Load barcode count distribution
    # NOTE(review): in the no-whitelist case this now receives None instead of
    # raising UnboundLocalError - confirm load_barcode_dist accepts None.
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              barcode_whitelist,
                                              args.gem_group)

    # Set form of the whitelist for O(1) membership tests in the read loop.
    if barcode_whitelist is not None:
        barcode_whitelist_set = set(barcode_whitelist)
    else:
        barcode_whitelist_set = None

    # The context manager guarantees all four handles are closed even if
    # parsing or correction raises part-way through the chunk.
    with open(args.read1_chunk) as in_read1_fastq, \
            open(args.read2_chunk) as in_read2_fastq, \
            open(outs.corrected_read1s, 'w') as out_read1_fastq, \
            open(outs.corrected_read2s, 'w') as out_read2_fastq:

        bc_counter = cr_fastq.BarcodeCounter(args.barcode_whitelist,
                                             outs.corrected_barcode_counts)

        # Correct barcodes, add processed bc tag to fastq
        read_pair_iter = itertools.izip(
            tk_fasta.read_generator_fastq(in_read1_fastq),
            tk_fasta.read_generator_fastq(in_read2_fastq))
        for read1, read2 in itertools.islice(read_pair_iter,
                                             args.initial_reads):
            read1_header = cr_fastq.AugmentedFastqHeader(read1[0])
            read2_header = cr_fastq.AugmentedFastqHeader(read2[0])

            # The raw barcode and its quality string are read from read 1 only.
            raw_bc = read1_header.get_tag(cr_constants.RAW_BARCODE_TAG)
            bc_qual = read1_header.get_tag(cr_constants.RAW_BARCODE_QUAL_TAG)

            if raw_bc:
                if barcode_whitelist_set is not None and \
                        raw_bc not in barcode_whitelist_set:
                    # Barcode is not on the whitelist: attempt error correction.
                    processed_bc = cr_stats.correct_bc_error(
                        args.barcode_confidence_threshold, raw_bc, bc_qual,
                        barcode_dist)
                elif 'N' in raw_bc:
                    # Disallow Ns in no-whitelist case
                    processed_bc = None
                else:
                    processed_bc = raw_bc

                if processed_bc:
                    # Counted before the gem group suffix is attached.
                    bc_counter.count(None, processed_bc, None)

                    # Add gem group to barcode sequence
                    processed_bc = cr_utils.format_barcode_seq(
                        processed_bc, gem_group=args.gem_group)
                    read1_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)
                    read2_header.set_tag(cr_constants.PROCESSED_BARCODE_TAG,
                                         processed_bc)

                reporter.vdj_barcode_cb(raw_bc, processed_bc)

            # Every pair is written out, whether or not a barcode was assigned.
            tk_fasta.write_read_fastq(out_read1_fastq,
                                      read1_header.to_string(),
                                      read1[1], read1[2])
            tk_fasta.write_read_fastq(out_read2_fastq,
                                      read2_header.to_string(),
                                      read2[1], read2[2])

    bc_counter.close()

    reporter.save(outs.chunked_reporter)
예제 #12
0
def main(args, outs):
    """Run every subsampling task on one chunk of a molecule info file.

    Loads the molecule columns in the range
    ``[args.chunk_start, args.chunk_start + args.chunk_len)``, binomially
    subsamples the read pairs of each molecule using the per-library rates of
    each task in ``args.subsample_info``, and tallies per-barcode and per-task
    statistics. The resulting arrays are pickled to ``outs.metrics`` as a dict
    with keys ``umis_per_bc``, ``features_det_per_bc``, ``read_pairs``,
    ``umis`` and ``lib_type_genome_any_reads``.

    Args:
        args: Stage arguments. Fields read here: ``molecule_info``,
            ``filtered_barcodes``, ``chunk_start``, ``chunk_len`` and
            ``subsample_info`` (a sequence of dicts, each with a
            ``'library_subsample_rates'`` entry indexed by library index).
        outs: Stage outputs. Only ``metrics`` (output path) is used.
    """
    # Fixed seed so the binomial draws below are reproducible.
    np.random.seed(0)

    mc = MoleculeCounter.open(args.molecule_info, 'r')

    # Get cell-associated barcodes
    genomes = sorted(
        set(
            f.tags.get('genome', '')
            for f in mc.feature_reference.feature_defs))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Load chunk of relevant data from the mol_info
    chunk_start = int(args.chunk_start)
    chunk = slice(chunk_start, chunk_start + int(args.chunk_len))
    mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
    mol_read_pairs = mc.get_column_lazy('count')[chunk]
    mol_gem_group = mc.get_column_lazy('gem_group')[chunk]
    mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk]
    mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk]

    barcodes = mc.get_ref_column('barcodes')

    # Give each cell-associated barcode an integer index
    # NOTE(review): presumably the '' key holds the cell barcodes across all
    # genomes - confirm against get_cell_associated_barcodes.
    cell_bcs = sorted(cell_bcs_by_genome[''])
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    # Give each genome an integer index
    genome_to_int = {g: i for i, g in enumerate(genomes)}
    feature_int_to_genome_int = np.fromiter(
        (genome_to_int[f.tags.get('genome', '')]
         for f in mc.feature_reference.feature_defs),
        dtype=int)
    mol_genome_idx = feature_int_to_genome_int[mol_feature_idx]

    # determine which (library type, genome) pairs have any associated reads
    lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
    lib_type_to_int = {
        lib_type: i for i, lib_type in enumerate(lib_types)}
    # Builtin int/bool are used as dtypes: the np.int / np.bool aliases are
    # deprecated and absent from newer NumPy releases.
    lib_idx_to_lib_type_idx = np.fromiter(
        (lib_type_to_int[lib['library_type']] for lib in mc.library_info),
        dtype=int)

    lib_type_genome_any_reads = np.zeros((len(lib_types), len(genomes)),
                                         dtype=bool)
    has_reads = mol_read_pairs > 0
    lib_genome_idx_pairs = set(
        izip(mol_library_idx[has_reads], mol_genome_idx[has_reads]))
    for (lib_idx, genome_idx) in lib_genome_idx_pairs:
        lib_type_idx = lib_idx_to_lib_type_idx[lib_idx]
        lib_type_genome_any_reads[lib_type_idx, genome_idx] = True

    # Run each subsampling task on this chunk of data
    n_tasks = len(args.subsample_info)
    n_genomes = len(genomes)
    n_cells = len(cell_bcs)

    umis_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
    features_det_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
    read_pairs_per_task = np.zeros((n_tasks, n_genomes))
    umis_per_task = np.zeros((n_tasks, n_genomes))

    for task_idx, task in enumerate(args.subsample_info):
        # Per-library subsampling rates
        rates_per_library = np.array(task['library_subsample_rates'],
                                     dtype=float)

        # A task whose rates are all zero keeps no reads; leave its rows zero.
        if np.count_nonzero(rates_per_library) == 0:
            continue

        mol_rate = rates_per_library[mol_library_idx]

        # Subsampled read pairs per molecule
        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)

        # Compute tallies for each barcode
        # NOTE(review): numpy_groupby presumably yields one group per distinct
        # (gem_group, barcode_idx) key with the matching slices of the value
        # arrays - confirm against cr_utils.
        group_keys = (mol_gem_group, mol_barcode_idx)
        group_values = (mol_feature_idx, mol_genome_idx, new_read_pairs)
        for (gg, bc_idx), (feature_idx, genome_idx, read_pairs) in \
                cr_utils.numpy_groupby(group_values, group_keys):

            barcode = cr_utils.format_barcode_seq(barcodes[bc_idx], gg)

            # None when the barcode is not cell-associated; only used below
            # after the per-genome membership test.
            cell_idx = cell_bc_to_int.get(barcode)

            # Molecules that kept at least one read pair after subsampling.
            survived = read_pairs > 0

            for this_genome_idx in xrange(n_genomes):
                in_genome = genome_idx == this_genome_idx
                umis = np.flatnonzero(survived & in_genome)
                this_genome_read_pairs = np.sum(read_pairs[in_genome])

                # Tally UMIs and median features detected
                if barcode in cell_bcs_by_genome[genomes[this_genome_idx]]:
                    # This is a cell-associated barcode for this genome
                    # NOTE(review): values are assigned, not accumulated -
                    # assumes each barcode appears in a single group; confirm.
                    umis_per_bc[task_idx, this_genome_idx,
                                cell_idx] = len(umis)
                    features_det_per_bc[task_idx, this_genome_idx,
                                        cell_idx] = np.count_nonzero(
                                            np.bincount(feature_idx[umis]))

                # Tally numbers for duplicate fraction
                read_pairs_per_task[task_idx,
                                    this_genome_idx] += this_genome_read_pairs
                umis_per_task[task_idx, this_genome_idx] += len(umis)

    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode.
    with open(outs.metrics, 'wb') as f:
        data = {
            'umis_per_bc': umis_per_bc,
            'features_det_per_bc': features_det_per_bc,
            'read_pairs': read_pairs_per_task,
            'umis': umis_per_task,
            'lib_type_genome_any_reads': lib_type_genome_any_reads,
        }
        cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)