def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support.
        Args: umi_info_path (str) - path to umi info h5
              gem_group (int) -  gem group
        Returns: (bc_support, cell_bcs, rt, ut, conf)
                 where bc_support = dict of { barcode: umi_count },
                       cell_bcs = list(str) of cell barcodes,
                       rt = read pair per umi threshold used,
                       ut = umi threshold,
                       conf = confidence of the cell calling """

    # Get umi info for this gem group only
    bc_idx = vdj_umi_info.get_column(umi_info_path, 'barcode_idx')
    bc_str = vdj_umi_info.get_column(umi_info_path, 'barcodes')
    bc_gg = np.array([int(cr_utils.split_barcode_seq(bc)[1]) for bc in bc_str])
    bc_in_gg = bc_gg == gem_group
    umi_in_gg = bc_in_gg[bc_idx]

    umi_read_pairs = vdj_umi_info.get_column(umi_info_path, 'reads')
    rpu_threshold, umi_threshold, bc_support, confidence = vdj_stats.call_vdj_cells(
        umi_barcode_idx=bc_idx[umi_in_gg],
        umi_read_pairs=umi_read_pairs[umi_in_gg],
        barcodes=bc_str,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )

    cell_bcs = [
        bc for bc, umis in bc_support.iteritems() if umis >= umi_threshold
    ]

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
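A minimal usage sketch for the function above; the HDF5 path and gem-group values are hypothetical:

# Hypothetical driver: call cells per gem group and pool the results.
all_cell_bcs = []
for gg in (1, 2):
    bc_support, cell_bcs, rpu_t, umi_t, conf = call_cell_barcodes(
        'umi_info.h5', gem_group=gg)
    print('gem group %d: %d cell barcodes (rpu >= %s, umis >= %s)' %
          (gg, len(cell_bcs), rpu_t, umi_t))
    all_cell_bcs.extend(cell_bcs)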
Example #2
def main(args, outs):
    np.random.seed(0)

    unique_gem_groups = np.unique(args.gem_groups).tolist()

    reporter = vdj_report.VdjReporter(gem_groups=unique_gem_groups)

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(args.umi_info, args.start_row,
                                          args.end_row)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']
    bc_gg = [str(cr_utils.split_barcode_seq(bc)[1]) for bc in barcodes]
    # Compute N50 read pairs per UMI for this gem group
    umi_read_pairs = []
    total_read_pairs = {}
    chain_bad_read_pairs = {}
    umi_rows = itertools.izip(umi_info['barcode_idx'], umi_info['umi_idx'],
                              umi_info['chain_idx'], umi_info['reads'])
    for bc_idx, data_iter in itertools.groupby(umi_rows, key=lambda x: x[0]):

        bc_umi_read_pairs = {}
        for _, umi, chain_idx, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads
            chain = chains[chain_idx]
            total_read_pairs[chain] = total_read_pairs.get(chain, 0) + reads
            total_read_pairs[
                cr_constants.MULTI_REFS_PREFIX] = total_read_pairs.get(
                    cr_constants.MULTI_REFS_PREFIX, 0) + reads
            if reads < args.min_readpairs_per_umi[bc_gg[bc_idx]]:
                chain_bad_read_pairs[chain] = chain_bad_read_pairs.get(
                    chain, 0) + reads
                chain_bad_read_pairs[
                    cr_constants.MULTI_REFS_PREFIX] = chain_bad_read_pairs.get(
                        cr_constants.MULTI_REFS_PREFIX, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_read_pairs.append(r)

    rppu_n50 = tk_stats.NX(umi_read_pairs, 0.5)
    if rppu_n50 is None:
        rppu_n50 = float('NaN')

    # Report bad read-pairs/umi
    for chain in reporter.vdj_genes:
        bad_count = chain_bad_read_pairs.get(chain, 0)
        total_count = total_read_pairs.get(chain, 0)
        reporter._get_metric_attr('vdj_recombinome_low_support_reads_frac',
                                  chain).set_value(bad_count, total_count)

    reporter._get_metric_attr('vdj_recombinome_readpairs_per_umi_n50',
                              cr_constants.MULTI_REFS_PREFIX,
                              args.gem_group).set_value(rppu_n50)

    reporter.save(outs.chunked_reporter)
Example #3
    def get_compressed_bc_iter(barcodes):
        """ Yields compressed barcode tuples that can be compared against
            a MoleculeCounter's data. Useful for filtering a MoleculeCounter by barcode.
        Args: barcodes (iterable) - list of barcode strings (e.g., ACGT-1)
        Yields: (compressed_bc, compressed_gem_group) tuples """

        for barcode in barcodes:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            compressed_bc = MoleculeCounter.compress_barcode_seq(barcode_seq)
            compressed_gg = MoleculeCounter.compress_gem_group(gem_group)
            yield compressed_bc, compressed_gg
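A brief usage sketch: materialize the compressed tuples into a set and test molecule rows against it. The barcode list is made up; the get_column names follow the MoleculeCounter usage shown in later examples on this page:

keep = set(get_compressed_bc_iter(['ACGT-1', 'TTTT-2']))
# A molecule row passes if its (compressed barcode, gem group) pair is kept:
# row_ok = (mc.get_column('barcode')[i], mc.get_column('gem_group')[i]) in keep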
Example #4
def join(args, outs, chunk_defs, chunk_outs):
    chunks = zip(chunk_defs, chunk_outs)
    chunks.sort(
        key=lambda chunk: cr_utils.split_barcode_seq(chunk[0].prefix)[::-1])

    buckets = []
    outs.total_reads = 0
    for chunk in chunks:
        buckets.append(chunk[1].default)
        outs.total_reads += chunk[1].total_reads

    tk_bam.concatenate(outs.default, buckets)
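The sort key reverses each (sequence, gem_group) tuple so chunks are ordered by gem group first, then barcode prefix. A toy illustration, assuming split_barcode_seq('ACGT-1') returns ('ACGT', 1):

prefixes = ['TTTT-1', 'AAAA-2', 'ACGT-1']
print(sorted(prefixes, key=lambda p: cr_utils.split_barcode_seq(p)[::-1]))
# ['ACGT-1', 'TTTT-1', 'AAAA-2'] -- gem group 1 sorts before gem group 2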
Example #5
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {
        int(k): v
        for k, v in args.chunks_per_gem_group.iteritems()
    }

    with open(args.read1s_chunk) as f1:
        read1s = [read for read in tk_fasta.read_generator_fastq(f1)]

    with open(args.read2s_chunk) as f2:
        read2s = [read for read in tk_fasta.read_generator_fastq(f2)]

    assert len(read1s) == len(read2s)

    fastqs_out = {}
    buckets = {}

    outs.buckets = {}

    for gem_group, bucket_name in enumerate_bucket_names(
            args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        fastqs_out[bucket_name] = open(filename, 'w')
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2 in itertools.izip(read1s, read2s):
        barcode = vdj_utils.get_fastq_read_barcode(read1)

        # Exclude unbarcoded reads
        if barcode is None:
            continue

        assert barcode == vdj_utils.get_fastq_read_barcode(read2)

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq,
                                      args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append(read1)
        buckets[bucket_name].append(read2)

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        fastq_out = fastqs_out[bucket_name]
        for read in bucket:
            tk_fasta.write_read_fastq(fastq_out, *read)

        fastq_out.close()
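enumerate_bucket_names and get_bucket_name are defined elsewhere in this stage. A plausible sketch consistent with how they are called above; the hash-based naming scheme is an assumption, not the actual implementation:

def get_bucket_name(gem_group, barcode_seq, num_chunks):
    # Spread barcodes deterministically across num_chunks buckets per gem group
    # (hypothetical scheme for illustration).
    return '%d-%d' % (gem_group, hash(barcode_seq) % num_chunks)

def enumerate_bucket_names(chunks_per_gem_group):
    for gem_group, num_chunks in sorted(chunks_per_gem_group.items()):
        for chunk_idx in range(num_chunks):
            yield gem_group, '%d-%d' % (gem_group, chunk_idx)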
Example #6
def main(args, outs):
    bam_in = tk_bam.create_bam_infile(args.chunk_input)

    # Get gem groups
    library_info = rna_library.get_bam_library_info(bam_in)
    gem_groups = sorted(list(set(lib['gem_group'] for lib in library_info)))

    # Define buckets
    bucket_names = []
    prefixes = cr_utils.get_seqs(args.nbases)
    for gg in gem_groups:
        for prefix in prefixes:
            bucket_names.append('%s-%d' % (prefix, gg))
    bucket_names.append('')

    # Read all records
    reads = [read for read in bam_in]

    # Bucket the records
    bams_out = {}
    outs.buckets = {}
    buckets = {}
    for bucket_name in bucket_names:
        filename = martian.make_path("bc-%s.bam" % bucket_name)
        bam_out, _ = tk_bam.create_bam_outfile(filename,
                                               None,
                                               None,
                                               template=bam_in,
                                               rgs=args.read_groups,
                                               replace_rg=True)

        bams_out[bucket_name] = bam_out
        outs.buckets[bucket_name] = filename
        buckets[bucket_name] = []

    for r in reads:
        barcode = cr_utils.get_read_barcode(r)
        if barcode is None:
            bucket_name = ''
        else:
            barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
            prefix = barcode_seq[:args.nbases]
            bucket_name = '%s-%d' % (prefix, gem_group)
        buckets[bucket_name].append(r)

    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=cr_utils.barcode_sort_key)
        bam_out = bams_out[bucket_name]
        for r in bucket:
            bam_out.write(r)
        bam_out.close()
Example #7
    def build_barcode_info(filtered_barcodes_by_genome, library_info, barcodes):
        """Generate numpy arrays for per-barcode info
        Args:
          filtered_barcodes_by_genome (dict of str:list(str)): Keys are genomes, values are lists of filtered barcode strings.
          library_info (list of dict): Per-library metadata.
          barcodes (list of str): All barcode sequences (e.g. ['ACGT', ...])
        Returns:
          BarcodeInfo object
        """
        # Replace a genome string with its lexicographical rank
        genome_to_idx = {g:i for i, g in \
                         enumerate(sorted(filtered_barcodes_by_genome.keys()))}

        libraries_for_gem_group = defaultdict(list)
        for lib_idx, lib in enumerate(library_info):
            libraries_for_gem_group[lib['gem_group']].append(lib_idx)

        # Map a barcode sequence to its index into the MoleculeCounter
        #  'barcodes' array
        bc_seq_to_idx = {bc:i for i, bc in enumerate(barcodes)}

        # Populate the "pass filter" array of tuples
        pf_tuples = []
        for genome, bcs in filtered_barcodes_by_genome.iteritems():
            genome_idx = genome_to_idx[genome]
            for bc_str in bcs:
                seq, gg = cr_utils.split_barcode_seq(bc_str)
                barcode_idx = bc_seq_to_idx[seq]

                # FIXME: Assumes no per-library filtering, just per-gem-group
                library_inds = libraries_for_gem_group[gg]
                for library_idx in library_inds:
                    pf_tuples.append((barcode_idx, library_idx, genome_idx))

        if len(pf_tuples) > 0:
            pass_filter = np.array(pf_tuples, dtype=BARCODE_INFO_DTYPES['pass_filter'])
        else:
            pass_filter = np.zeros((0,3), dtype=BARCODE_INFO_DTYPES['pass_filter'])

        assert pass_filter.shape[0] == len(pf_tuples)
        assert pass_filter.shape[1] == 3

        # Sort by barcode index
        pass_filter = pass_filter[np.argsort(pass_filter[:,0]), :]

        return BarcodeInfo(
            pass_filter,
            genomes=sorted(filtered_barcodes_by_genome.keys()),
        )
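A toy run of the pass-filter construction above, using a plain numpy dtype in place of BARCODE_INFO_DTYPES['pass_filter']:

import numpy as np

# (barcode_idx, library_idx, genome_idx) for two cell barcodes, one library, one genome
pf_tuples = [(1, 0, 0), (0, 0, 0)]
pass_filter = np.array(pf_tuples, dtype=np.uint64)
pass_filter = pass_filter[np.argsort(pass_filter[:, 0]), :]  # sort by barcode index
print(pass_filter)  # [[0 0 0]
                    #  [1 0 0]]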
Example #8
def call_cell_barcodes(umi_info_path, gem_group):
    """ Call cell barcodes by UMI support.
        Args: umi_info_path (str) - path to umi info h5
              gem_group (int) -  gem group
        Returns: (bc_support, cell_bcs, rt, ut, conf)
                 where bc_support = dict of { barcode: umi_count },
                       cell_bcs = list(str) of cell barcodes,
                       rt = read pair per umi threshold used,
                       ut = umi threshold,
                       conf = confidence of the cell calling """

    # Get umi info for this gem group only
    bc_str = vdj_umi_info.get_column(umi_info_path, 'barcodes')
    bc_gg = np.array([int(cr_utils.split_barcode_seq(bc)[1]) for bc in bc_str])
    bc_in_gg = bc_gg == gem_group

    umi_info = vdj_umi_info.read_umi_info(umi_info_path)
    umi_barcode_idx = []
    umi_read_pairs = []
    umi_rows = itertools.izip(umi_info['barcode_idx'], umi_info['umi_idx'],
                              umi_info['reads'])
    for bc_idx, data_iter in itertools.groupby(umi_rows, key=lambda x: x[0]):
        if not bc_in_gg[bc_idx]:
            continue

        bc_umi_read_pairs = {}
        for _, umi, reads in data_iter:
            bc_umi_read_pairs[umi] = bc_umi_read_pairs.get(umi, 0) + reads

        for r in bc_umi_read_pairs.itervalues():
            umi_barcode_idx.append(bc_idx)
            umi_read_pairs.append(r)

    rpu_threshold, umi_threshold, bc_support, confidence = vdj_stats.call_vdj_cells(
        umi_barcode_idx=np.array(umi_barcode_idx,
                                 dtype=vdj_umi_info.get_dtype('barcode_idx')),
        umi_read_pairs=np.array(umi_read_pairs,
                                dtype=vdj_umi_info.get_dtype('reads')),
        barcodes=bc_str,
        rpu_mix_init_sd=RPU_MIX_INIT_SD,
        umi_mix_init_sd=UMI_MIX_INIT_SD,
        verbosity=1,
    )

    cell_bcs = [
        bc for bc, umis in bc_support.iteritems() if umis >= umi_threshold
    ]

    return bc_support, cell_bcs, rpu_threshold, umi_threshold, confidence
Example #9
def write_filtered_molecules(ctr_in, ctr_out, genomes, bcs_per_genome):
    ctr_out.set_all_metrics(ctr_in.get_all_metrics())

    filtered_bc_tuples = set()
    genome_ids = ctr_in.get_column('genome')
    genome_index = cr_reference.get_genome_index(genomes)
    for (genome, formatted_bcs) in bcs_per_genome.iteritems():
        genome_id = cr_reference.get_genome_id(genome, genome_index)
        for formatted_bc in formatted_bcs:
            (bc, gg) = cr_utils.split_barcode_seq(formatted_bc)
            cbc = cr_mol_counter.MoleculeCounter.compress_barcode_seq(bc)
            filtered_bc_tuples.add((genome_id, gg, cbc))

    def keep_molecule(genome_id, gem_group, barcode):
        tup = (genome_id, gem_group, barcode)
        return (tup in filtered_bc_tuples)

    filter_func = np.vectorize(keep_molecule)

    gem_groups = ctr_in.get_column('gem_group')
    barcodes = ctr_in.get_column('barcode')
    filter_index = filter_func(genome_ids, gem_groups, barcodes)

    for col in cr_mol_counter.MOLECULE_INFO_COLUMNS:
        data = ctr_in.get_column(col)
        filtered_data = data[filter_index]
        ctr_out.add_many(col, filtered_data)

    for col in cr_mol_counter.MOLECULE_REF_COLUMNS:
        ctr_out.set_ref_column(col, ctr_in.get_ref_column(col))

    # summarize filtered data
    genomes = ctr_out.get_ref_column('genome_ids')
    filtered_reads = ctr_out.get_column('reads')
    flt_conf_mapped_per_genome = {}
    if len(genomes) == 1:
        genome = genomes[0]
        flt_conf_mapped_per_genome[genome] = filtered_reads.sum()
    else:
        genome_ids = ctr_out.get_column('genome')
        genome_index = cr_reference.get_genome_index(genomes)
        for genome in genomes:
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            flt_conf_mapped_per_genome[genome] = filtered_reads[
                genome_ids == genome_id].sum()
    summary = {'flt_conf_mapped_per_genome': flt_conf_mapped_per_genome}
    return summary
Example #10
def join(args, outs, chunk_defs, chunk_outs):
    if args.skip:
        return

    gg_id_to_batch_id, batch_id_to_name = {}, {}

    for lib in args.library_info:
        gg_id_to_batch_id[lib['gem_group']] = lib['batch_id']
        batch_id_to_name[lib['batch_id']] = lib['batch_name']

    matrix = cr_matrix.CountMatrix.load_h5_file(args.matrix_h5)
    matrix = matrix.select_features_by_type(GENE_EXPRESSION_LIBRARY_TYPE)

    batch_ids = np.array([gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]] for bc in matrix.bcs])

    # select intersect of non-zero feature in each batch
    feature_mask = np.ones(matrix.features_dim)
    for b_id in batch_id_to_name:
        batch_bc_indices = np.where(batch_ids == b_id)[0]
        matrix_view = cr_matrix.CountMatrixView(matrix, bc_indices=batch_bc_indices)
        feature_mask = np.logical_and(feature_mask, matrix_view.sum(axis=1))

    matrix = matrix.select_features(np.flatnonzero(feature_mask))

    # filter barcodes with zero count
    bc_indices = np.flatnonzero(matrix.get_counts_per_bc())
    matrix = matrix.select_barcodes(bc_indices)

    # l2 norm
    matrix.m = matrix.m.astype('float64')
    cr_matrix.inplace_csc_column_normalize_l2(matrix.m)

    n_pcs = args.num_pcs if args.num_pcs is not None else analysis_constants.CBC_N_COMPONENTS_DEFAULT
    dimred_matrix = fbpca_reduce_dimension(matrix, n_pcs)

    outs.dimred_matrix = martian.make_path('dimred_matrix.pickle')
    with open(outs.dimred_matrix, 'wb') as fp:
        cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)

    bc_feature_info = {
        'barcodes' : matrix.bcs,
        'features' : matrix.feature_ref.feature_defs,
    }
    outs.matrix_barcode_feature_info = martian.make_path('matrix_barcode_feature_info.pickle')
    with open(outs.matrix_barcode_feature_info, 'wb') as fp:
        cPickle.dump(bc_feature_info, fp, cPickle.HIGHEST_PROTOCOL)
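inplace_csc_column_normalize_l2 presumably scales each column (barcode) of the sparse matrix to unit L2 norm before dimensionality reduction. A self-contained sketch of that operation (an illustration, not the library's implementation):

import numpy as np
import scipy.sparse as sp

def csc_column_normalize_l2(m):
    # Scale each CSC column to unit L2 norm, in place.
    for j in range(m.shape[1]):
        start, end = m.indptr[j], m.indptr[j + 1]
        norm = np.sqrt(np.sum(m.data[start:end] ** 2))
        if norm > 0:
            m.data[start:end] /= norm

m = sp.random(4, 3, density=0.5, format='csc', dtype=np.float64)
csc_column_normalize_l2(m)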
Example #11
def split(args):
    # Get required info from the mol info
    mc = MoleculeCounter.open(args.molecule_info, 'r')

    genomes = sorted(
        set(
            f.tags.get('genome', '')
            for f in mc.feature_reference.feature_defs))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Get cell counts per gem group
    n_cells_per_gg = defaultdict(int)
    for bc in cell_bcs_by_genome['']:
        _, gem_group = cr_utils.split_barcode_seq(bc)
        n_cells_per_gg[gem_group] += 1

    # Assign gem group cell counts to their constituent libraries
    # TODO FIXME: Need to allow for per-library cell counts
    #   because some feature types might only have a subset of the GEX cell-assoc barcodes.
    n_cells_per_lib = np.zeros(len(mc.library_info), dtype=int)
    for lib_idx, lib in enumerate(mc.library_info):
        n_cells_per_lib[lib_idx] = n_cells_per_gg[lib['gem_group']]

    if n_cells_per_lib.sum() == 0:
        return {'chunks': []}

    library_info = mc.library_info

    raw_count_per_lib = np.array(mc.get_raw_read_pairs_per_library())
    raw_rppc_per_lib = raw_count_per_lib.astype(float) / n_cells_per_lib
    usable_count_per_lib = np.array(mc.get_usable_read_pairs_per_library())

    subsamplings = list()  # track subsample info definitions

    library_types = sorted(set(lib['library_type'] for lib in library_info))
    for library_type in library_types:
        # All libraries w/ this type
        lib_indexes = np.array([
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == library_type
        ])

        # For plotting, we want a series of target depths that exist for all
        #   libraries w/ the same library type. When there's a single library
        #   per type (the common case), this is trivial - split it into deciles.
        #   But if there are multiple libraries with different depths, (e.g.,
        #   because gem-group-aggregation was used to increase cell numbers),
        #   we need to find depths that are achievable for all libraries.
        #   For now, let the lowest-depth library for a given type dictate this.
        min_raw_rppc = np.min(raw_rppc_per_lib[lib_indexes])

        # Use deciles of the raw read pairs per cell.
        deciles = np.arange(0.1, 1.1, 0.1)
        plot_targets = map(round, min_raw_rppc * deciles)

        # TODO: separate this work (internal + non)
        raw_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + plot_targets

        # TODO: separate this work (internal + non)
        usable_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + plot_targets

        for targets, depth_type in (
                (raw_targets, cr_constants.RAW_SUBSAMPLE_TYPE),
                (usable_targets, cr_constants.MAPPED_SUBSAMPLE_TYPE)):
            targets = sorted(list(set(map(int, targets))))
            for target_rppc in targets:
                if depth_type == cr_constants.RAW_SUBSAMPLE_TYPE:
                    # Infer the usable depth required to achieve this raw depth
                    usable_read_fracs = usable_count_per_lib.astype(
                        float) / raw_count_per_lib
                    target_usable_counts = target_rppc * n_cells_per_lib * usable_read_fracs
                else:
                    target_usable_counts = target_rppc * n_cells_per_lib

                # Zero out libraries of the other types
                rates = np.zeros(len(library_info), dtype=float)
                rates[lib_indexes] = target_usable_counts[lib_indexes].astype(float) \
                                     / usable_count_per_lib[lib_indexes]

                # Clamp rates that are close to 1 to 1
                rates[np.absolute(rates - 1) < 1e-3] = 1

                # Zero out the libraries for which we have fewer reads than the target
                rates[rates > 1] = 0.0

                enough_data = np.any((rates > 0) & (rates <= 1))
                if not enough_data:
                    rates = np.zeros(len(rates))

                subsamplings.append({
                    'library_type': library_type,
                    'subsample_type': depth_type,
                    'target_read_pairs_per_cell': int(target_rppc),
                    'library_subsample_rates': list(map(float, rates)),
                })

    # Each chunk needs to store a piece of the mol info h5
    tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

    # Split the molecule info h5 into equi-RAM chunks
    chunks = []
    for chunk_start, chunk_len in mc.get_chunks(tgt_chunk_len,
                                                preserve_boundaries=True):
        chunks.append({
            'chunk_start': chunk_start,
            'chunk_len': chunk_len,
            'subsample_info': subsamplings,
            # estimate_mem_gb only counts the memory used by the MoleculeCounter
            # object itself, which underestimates actual usage. Memory profiling
            # with test case fuzzer_114 showed actual usage ~4x the uncapped
            # estimate, so set scale = 6 for headroom.
            '__mem_gb': MoleculeCounter.estimate_mem_gb(chunk_len, scale=6),
        })

    join = {
        '__mem_gb': 6,
    }

    mc.close()

    # TODO: is this really necessary w/ martian 3
    if len(chunks) == 0:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': [],
        })

    return {'chunks': chunks, 'join': join}
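The per-library rate arithmetic at the core of the loop above reduces to a few lines. A toy example with a single library (all numbers made up):

import numpy as np

n_cells = np.array([1000])
usable_count = np.array([50e6])            # usable read pairs in the library
target_rppc = 20000                        # target usable read pairs per cell
rates = (target_rppc * n_cells) / usable_count
rates[np.absolute(rates - 1) < 1e-3] = 1   # clamp rates close to 1
rates[rates > 1] = 0.0                     # not enough reads for this target
print(rates)                               # [0.4]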
Example #12
    def select_barcodes_by_gem_group(self, gem_group):
        return self.select_barcodes_by_seq([
            bc for bc in self.bcs
            if gem_group == cr_utils.split_barcode_seq(bc)[1]
        ])
Example #13
def filter_barcodes(args, outs):
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)
    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE,
               correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        ### report all identified aggregate barcodes, together with their reads, umi corrected reads, fraction of corrected reads, and fraction of total reads
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    summary['filter_barcodes_method'] = get_filter_method_name(method)

    # Get unique gem groups
    unique_gem_groups = sorted(list(set(args.gem_groups)))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(
            float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    if args.force_cells is not None:
        gg_force_cells = int(
            float(args.force_cells) / float(len(unique_gem_groups)))

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:

            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method == FilterMethod.ORDMAG or \
               method == FilterMethod.ORDMAG_NONAMBIENT:
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                with open(args.cell_barcodes) as f:
                    cell_barcodes = json.load(f)
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes
            gg_bcs = sorted(
                list(
                    reduce(set.union, [
                        set(bcs)
                        for group, bcs in filtered_bcs_groups.iteritems()
                        if group[0] == gg
                    ])))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                print 'Failed to call non-ambient barcodes in GEM group %s' % gg
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = []
            for genome in genomes:
                genome_counts.append(gg_matrix.view() \
                                     .select_features_by_genome(genome) \
                                     .select_barcodes(result.eval_bcs) \
                                     .get_counts_per_bc())
            genome_counts = np.column_stack(genome_counts)
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]

            umis_per_bc = gg_matrix.get_counts_per_bc()

            eval_bcs_arrays.append(np.array(gg_matrix.bcs)[result.eval_bcs])
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome)
                                 & (result.is_nonambient)])

        if len(eval_bcs_arrays) > 0:
            nonambient_summary = pd.DataFrame(
                OrderedDict([
                    ('barcode', np.concatenate(eval_bcs_arrays)),
                    ('umis', np.concatenate(umis_per_bc_arrays)),
                    ('ambient_loglk', np.concatenate(loglk_arrays)),
                    ('pvalue', np.concatenate(pvalue_arrays)),
                    ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                    ('nonambient', np.concatenate(nonambient_arrays)),
                    ('genome', np.concatenate(genome_call_arrays)),
                ]))
            nonambient_summary.to_csv(outs.nonambient_calls)

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [
            v for k, v in filtered_metrics_groups.iteritems() if k[1] == genome
        ]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            ('%s_%s_%s' % (genome,
                           key,
                           get_filter_method_name(method))): txome_summary[key] \
            for (key,_) in txome_summary.iteritems()})

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' %
                genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    barcode_sort_key = lambda x: cr_utils.split_barcode_seq(x)[::-1]

    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(list(set(bcs)),
                                             key=barcode_sort_key)
    filtered_bcs = sorted(list(set(filtered_bcs)), key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict(
        [args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
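The per-gem-group union of initial cell calls across genomes (used above before non-ambient calling) can be illustrated with toy data; reduce is a builtin under Python 2:

filtered_bcs_groups = {(1, 'GRCh38'): ['AAAA-1'], (1, 'mm10'): ['CCCC-1'],
                       (2, 'GRCh38'): ['GGGG-2']}
gg = 1
gg_bcs = sorted(reduce(set.union,
                       [set(bcs) for group, bcs in filtered_bcs_groups.items()
                        if group[0] == gg]))
print(gg_bcs)  # ['AAAA-1', 'CCCC-1']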
Example #14
def split(args):
    if args.skip:
        return {'chunks': []}

    gg_id_to_batch_id, batch_id_to_name = {}, {}

    for lib in args.library_info:
        gg_id_to_batch_id[lib['gem_group']] = lib['batch_id']
        batch_id_to_name[lib['batch_id']] = lib['batch_name']

    # load the barcodes
    with open(args.matrix_barcode_feature_info) as fp:
        bc_feature_info = cPickle.load(fp)
        bcs = bc_feature_info.get('barcodes')

    batch_ids = np.array(
        [gg_id_to_batch_id[cr_util.split_barcode_seq(bc)[1]] for bc in bcs])

    with open(args.dimred_matrix) as fp:
        dimred_matrix = cPickle.load(fp)

    # re-order matrix such that barcodes from same batch are grouped together
    new_bc_indices = None
    batch_to_bc_indices = []
    idx_to_batch_id = np.full(dimred_matrix.shape[0], 0, dtype=np.int8)

    base = 0
    for b_id in range(len(batch_id_to_name)):
        batch_bc_indices = np.where(batch_ids == b_id)[0]
        if batch_bc_indices.shape[0] == 0:
            continue

        new_bc_indices = batch_bc_indices if new_bc_indices is None else np.append(
            new_bc_indices, batch_bc_indices)
        batch_to_bc_indices.append((base, base + batch_bc_indices.shape[0]))
        idx_to_batch_id[base:base + batch_bc_indices.shape[0]] = b_id
        base += len(batch_bc_indices)

    # 1. check if needs re-order; 2. if needs re-order, store the original order
    need_reorder_barcode = (not np.all(np.diff(new_bc_indices) >= 0))
    if need_reorder_barcode:
        dimred_matrix = dimred_matrix[new_bc_indices]
        barcode_reorder_index = np.argsort(new_bc_indices)

        barcode_reorder_index_file = martian.make_path(
            'barcode_reorder_index.pickle')
        with open(barcode_reorder_index_file, 'wb') as fp:
            cPickle.dump(barcode_reorder_index, fp, cPickle.HIGHEST_PROTOCOL)

        ordered_dimred_matrix_file = martian.make_path(
            'ordered_dimred_matrix.pickle')
        with open(ordered_dimred_matrix_file, 'wb') as fp:
            cPickle.dump(dimred_matrix, fp, cPickle.HIGHEST_PROTOCOL)
    else:
        barcode_reorder_index_file, ordered_dimred_matrix_file = None, None

    idx_to_batch_id_file = martian.make_path('idx_to_batch_id.pickle')
    with open(idx_to_batch_id_file, 'wb') as fp:
        cPickle.dump(idx_to_batch_id, fp, cPickle.HIGHEST_PROTOCOL)

    nitem, ndim = dimred_matrix.shape
    nbatch = len(batch_to_bc_indices)
    cbc_knn = option(args.cbc_knn, analysis_constants.CBC_KNN)
    matrix_mem_gb = sys.getsizeof(
        dimred_matrix) / 1e9  # float(nitem * ndim) / NUM_ENTRIES_PER_MEM_GB
    # 72 for size of tuple, 32 * 2 for size of 2 np.int64's, and 40% for inefficient dictionaries
    nn_mem_gb = 1.4 * nbatch * nitem * cbc_knn * (72 + 2 * 32) / 1e9
    # presuming all in one batch, dimred_matrix, cur_matrix, ref_matrix
    main_mem_gb = max(int(3.0 * matrix_mem_gb + nn_mem_gb + 1.0),
                      h5_constants.MIN_MEM_GB)

    chunks = []
    for batch_id in xrange(len(batch_to_bc_indices)):
        chunks.append({
            '__mem_gb': main_mem_gb,
            'batch_id': batch_id,
            'batch_to_bc_indices': batch_to_bc_indices,
            'ordered_dimred_matrix': ordered_dimred_matrix_file,
            'idx_to_batch_id': idx_to_batch_id_file,
            'need_reorder_barcode': need_reorder_barcode,
            'barcode_reorder_index': barcode_reorder_index_file,
        })

    return {'chunks': chunks, 'join': {'__mem_gb': JOIN_MEM_GB}}
Example #15
def main(args, outs):
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

    mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
    mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

    gene_index = cr_reference.GeneIndex.load_pickle(
        cr_utils.get_reference_genes_index(args.reference_path))
    genomes = cr_utils.get_reference_genomes(args.reference_path)
    genome_index = cr_reference.get_genome_index(genomes)
    none_gene_id = len(gene_index.get_genes())

    # store reference index columns
    # NOTE - these must be cast to str first, as unicode is not supported
    counter.set_ref_column('genome_ids', [str(genome) for genome in genomes])
    counter.set_ref_column('gene_ids',
                           [str(gene.id) for gene in gene_index.genes])
    counter.set_ref_column('gene_names',
                           [str(gene.name) for gene in gene_index.genes])

    filtered_bcs_per_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bcs = set()
    for _, bcs in filtered_bcs_per_genome.iteritems():
        filtered_bcs |= set(bcs)

    gg_metrics = collections.defaultdict(
        lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

    for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
            in_bam, key=cr_utils.barcode_sort_key):
        if barcode is None or gem_group is None:
            continue
        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode, gem_group) in filtered_bcs
        molecules = collections.defaultdict(
            lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

        compressed_barcode = cr_mol_counter.MoleculeCounter.compress_barcode_seq(
            barcode)
        gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
            gem_group)

        read_positions = collections.defaultdict(set)
        for read in reads_iter:
            umi = cr_utils.get_read_umi(read)
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or umi is None or read.is_read2:
                continue

            raw_umi = cr_utils.get_read_raw_umi(read)
            raw_bc, raw_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_raw_barcode(read))
            proc_bc, proc_gg = cr_utils.split_barcode_seq(
                cr_utils.get_read_barcode(read))

            if cr_utils.is_read_conf_mapped_to_transcriptome(
                    read, cr_utils.get_high_conf_mapq(args.align)):
                assert len(gene_ids) == 1

                mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                map_type = 'reads'

                read_pos = (read.tid, read.pos)
                uniq_read_pos = read_pos not in read_positions[mol_key]
                read_positions[mol_key].add(read_pos)

                if is_cell_barcode:
                    gg_metrics[int(gem_group)][
                        cr_mol_counter.
                        GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

            elif read.is_unmapped:
                mol_key = (umi, none_gene_id)
                map_type, uniq_read_pos = 'unmapped_reads', False
            else:
                mol_key = (umi, none_gene_id)
                map_type, uniq_read_pos = 'nonconf_mapped_reads', False
            mol = molecules[mol_key]
            mol[mol_data_columns[map_type]] += 1
            mol[mol_data_columns['umi_corrected_reads']] += int(raw_umi != umi)
            mol[mol_data_columns['barcode_corrected_reads']] += int(raw_bc != proc_bc)
            mol[mol_data_columns['conf_mapped_uniq_read_pos']] += int(uniq_read_pos)

        for mol_key, molecule in sorted(molecules.items()):
            umi, gene_id = mol_key
            genome = cr_utils.get_genome_from_str(
                gene_index.int_to_gene_id(gene_id), genomes)
            genome_id = cr_reference.get_genome_id(genome, genome_index)
            counter.add(
                barcode=compressed_barcode,
                gem_group=gem_group,
                umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                gene=gene_id,
                genome=genome_id,
                **{
                    key: molecule[col_idx]
                    for key, col_idx in mol_data_columns.iteritems()
                })

    in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()
Example #16
def split(args):
    """ Chunk the UMI info HDF5 file by gem group """

    num_entries = vdj_umi_info.get_num_rows(args.umi_info)
    if num_entries > 1e9:
        print 'Warning: There are >1e9 entries in the umi_info - this could potentially cause an out-of-memory error.'

    # This will cause an OOM if there are >1.5e9 UMIs
    barcode_indices = vdj_umi_info.get_column(args.umi_info, 'barcode_idx')
    barcodes = vdj_umi_info.get_column(args.umi_info, 'barcodes')

    chunks = []

    start_row = 0
    prev_gem_group = None
    prev_barcode_idx = None

    for row, barcode_idx in enumerate(barcode_indices):
        if barcode_idx == prev_barcode_idx:
            continue

        _, gem_group = cr_utils.split_barcode_seq(barcodes[barcode_idx])

        if prev_gem_group is not None and gem_group != prev_gem_group:
            # Write complete chunk
            end_row = row
            mem_gb = max(
                cr_constants.MIN_MEM_GB, 2 * int(
                    np.ceil(
                        vdj_umi_info.get_mem_gb(args.umi_info,
                                                start_row=start_row,
                                                end_row=end_row))))

            chunks.append({
                'gem_group': prev_gem_group,
                'start_row': start_row,
                'end_row': end_row,
                '__mem_gb': mem_gb,
            })

            start_row = end_row

        prev_gem_group = gem_group
        prev_barcode_idx = barcode_idx

    # Write final chunk
    end_row = vdj_umi_info.get_num_rows(args.umi_info)
    mem_gb = max(
        cr_constants.MIN_MEM_GB, 2 * int(
            np.ceil(
                vdj_umi_info.get_mem_gb(
                    args.umi_info, start_row=start_row, end_row=end_row))))

    # Handle case where umi info is empty by supplying a dummy gem group
    if prev_gem_group is None:
        prev_gem_group = args.gem_groups[0]

    chunks.append({
        'gem_group': prev_gem_group,
        'start_row': start_row,
        'end_row': end_row,
        '__mem_gb': mem_gb,
    })

    return {'chunks': chunks}
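The chunking rule above starts a new chunk wherever the gem group changes between consecutive distinct barcodes. A toy walk of that rule, with the gem group parsed from the barcode suffix as split_barcode_seq would:

barcodes = ['AAAA-1', 'CCCC-1', 'GGGG-2']
barcode_indices = [0, 0, 1, 2, 2]          # row -> barcode_idx, sorted by barcode
prev_gg, boundaries = None, []
for row, bc_idx in enumerate(barcode_indices):
    gg = int(barcodes[bc_idx].split('-')[1])
    if prev_gg is not None and gg != prev_gg:
        boundaries.append(row)             # a new chunk starts here
    prev_gg = gg
print(boundaries)                          # [3]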
Example #17
def write_barcode_umi_summary(umi_info_filename, reporter, filename, threshold,
                              cell_barcode_set):
    """ Write a summary of UMI readpair-counts per (barcode, chain) tuple.
        Args: filename - output filename
              threshold (int) - min read pairs per UMI used in asm
              barcodes - set of barcode strings """

    # Load the umi info
    umi_info = vdj_umi_info.read_umi_info(umi_info_filename)
    chains = umi_info['chains']
    barcodes = umi_info['barcodes']

    sep = ','

    with open(filename, 'w') as writer:
        field_names = ["bc"]
        field_names += [chain + "_all_umis" for chain in reporter.vdj_genes] + \
                       [chain + "_good_umis" for chain in reporter.vdj_genes]
        writer.write(sep.join(field_names))
        writer.write("\n")

        # Assume sorted by barcode
        umi_rows = itertools.izip(umi_info['barcode_idx'],
                                  umi_info['chain_idx'], umi_info['reads'])
        for bc_idx, umi_iter in itertools.groupby(umi_rows, key=lambda x: x[0]):
            bc = barcodes[bc_idx]
            if bc not in cell_barcode_set:
                continue

            # Count UMIs
            umis = list(umi_iter)
            chain_counts = defaultdict(int)
            good_chain_counts = defaultdict(int)
            for bc_idx, chain_idx, reads in umis:
                chain = chains[chain_idx]
                chain_counts[chain] += 1
                chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

                _, gem_group = cr_utils.split_barcode_seq(barcodes[bc_idx])

                if reads >= threshold:
                    good_chain_counts[chain] += 1
                    good_chain_counts[cr_constants.MULTI_REFS_PREFIX] += 1

            # Report barcode totals
            flds = {}
            flds["bc"] = bc

            num_good_umis = good_chain_counts[cr_constants.MULTI_REFS_PREFIX]
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_distribution').add(
                    num_good_umis)
            reporter._get_metric_attr(
                'vdj_recombinome_total_umis_per_cell_median').add(
                    num_good_umis)

            # Report per-chain totals for this barcode
            for chain in reporter.vdj_genes:
                chain_all_umis = chain_counts[chain]
                chain_good_umis = good_chain_counts[chain]

                flds[chain + "_all_umis"] = chain_all_umis
                flds[chain + "_good_umis"] = chain_good_umis

                reporter._get_metric_attr(
                    'vdj_recombinome_umis_per_cell_distribution',
                    chain).add(chain_good_umis)
                reporter._get_metric_attr(
                    'vdj_recombinome_umis_per_cell_median',
                    chain).add(chain_good_umis)

            writer.write(sep.join([str(flds[name]) for name in field_names]))
            writer.write("\n")
Example #18
def main(args, outs):
    # Martian coerces dict keys to string
    # Coerce keys back to int
    args.chunks_per_gem_group = {int(k): v for k, v in args.chunks_per_gem_group.iteritems()}

    paired_end = args.read2s_chunk is not None

    # Lazy load R1
    r1_file = cr_io.open_maybe_gzip(args.read1s_chunk)
    read1s = tk_fasta.read_generator_fastq(r1_file)

    # Lazy load R2
    if paired_end:
        r2_file = cr_io.open_maybe_gzip(args.read2s_chunk)
        read2s = tk_fasta.read_generator_fastq(r2_file)
    else:
        read2s = []

    # Lazy load corrected BCs
    bc_file = cr_io.open_maybe_gzip(args.bcs)
    bcs = (line.strip() for line in bc_file)

    buckets = {}

    bucket_filenames = {}

    for gem_group, bucket_name in enumerate_bucket_names(args.chunks_per_gem_group):
        filename = martian.make_path("%s.fastq" % bucket_name)
        bucket_filenames[bucket_name] = filename
        buckets[bucket_name] = []

    for read1, read2, barcode in itertools.izip_longest(read1s, read2s, bcs):
        # Exclude unbarcoded reads
        if barcode == '':
            continue

        # Exclude short reads
        if len(read1[1]) < MIN_READ_LENGTH or (read2 is not None and len(read2[1]) < MIN_READ_LENGTH):
            continue

        # Attach processed barcode to reads
        r1_hdr = cr_fastq.AugmentedFastqHeader(read1[0])
        r1_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
        r1_new_qname = r1_hdr.to_string()

        if paired_end:
            r2_hdr = cr_fastq.AugmentedFastqHeader(read2[0])
            r2_hdr.set_tag(cr_constants.PROCESSED_BARCODE_TAG, barcode)
            r2_new_qname = r2_hdr.to_string()

        barcode_seq, gem_group = cr_utils.split_barcode_seq(barcode)
        bucket_name = get_bucket_name(gem_group, barcode_seq, args.chunks_per_gem_group[gem_group])

        buckets[bucket_name].append((r1_new_qname, read1[1], read1[2]))
        if paired_end:
            buckets[bucket_name].append((r2_new_qname, read2[1], read2[2]))

    outs.buckets = {}

    # Sort and write each bucket
    for bucket_name, bucket in buckets.iteritems():
        bucket.sort(key=vdj_utils.fastq_barcode_sort_key)

        # Don't create empty bucket files.
        # This is common when the reads are ordered by gem group
        # And a chunk sees only a single gem group.
        if len(bucket) == 0:
            continue

        filename = bucket_filenames[bucket_name]
        with cr_io.open_maybe_gzip(filename, 'w') as f:
            for read in bucket:
                tk_fasta.write_read_fastq(f, *read)

        outs.buckets[bucket_name] = bucket_filenames[bucket_name]
Example #19
def call_cell_barcodes(umi_summary_filename, gem_group, min_umis, threshold_nx,
                       threshold_ratio):
    """ Call cell barcodes by contig/UMI read support.
        umi_summary_filename (str) - path to umi summary tsv generated by vdj_asm
        gem_group (int) -  gem group
        min_umis (int) - min passing UMIs on highest-passing-UMI-contig to call cell
        Returns: (d,b,t)
                 where d = dict of { barcode: best_contig_kth_umi_readpairs },
                            k = min_umis and
                            kth_umi_readpairs = 0 if best_contig has <k umis,
                       b = list(str) of cell barcodes)
                       t = read pair threshold used """

    with open(umi_summary_filename) as f:
        # First pass: compute threshold
        reader = csv.reader(f, delimiter='\t')

        hdr = next(reader)
        bc_col = hdr.index('barcode')
        umi_col = hdr.index('umi')
        reads_col = hdr.index('reads')
        thresh_col = hdr.index('min_umi_reads')
        good_col = hdr.index('good_umi')
        contigs_col = hdr.index('contigs')

        def use_umi(row):
            return (row[umi_col] != '') and \
                (row[contigs_col] != '') and \
                (row[good_col] == 'True')

        read_pairs = []
        assembly_rppu_threshold = 1

        bc_support = {}

        for row in reader:
            # Only take this gem group
            _, gg = cr_utils.split_barcode_seq(row[bc_col])
            if str(gg) != str(gem_group):
                continue

            # Initialize all barcodes
            bc_support[row[bc_col]] = 0

            if not use_umi(row):
                continue

            # Get the RPPU threshold that was used in assembly
            # The tsv reports reads per UMI, so divide by 2 for pairs.
            assembly_rppu_threshold = int(row[thresh_col]) / 2
            read_pairs.append(int(row[reads_col]) / 2)

        read_pairs = np.array(read_pairs, dtype=int)

        # Estimate the high end of the distribution
        if len(read_pairs) > 0:
            high_rppu = tk_stats.NX(read_pairs, threshold_nx)
        else:
            high_rppu = 1

        # Take UMIs within X of the high end, roughly corresponding to the
        # highest mode and therefore to molecules amplified from the first cycle.
        threshold = int(
            round(tk_stats.robust_divide(high_rppu, threshold_ratio)))

        # Don't drop below the looser threshold that was used in assembly.
        threshold = max(assembly_rppu_threshold, threshold)

        # Second pass: Call as cell BCs those with at least k UMIs
        # passing the strict threshold computed above.
        f.seek(0)
        reader = csv.reader(f, delimiter='\t')
        next(reader)

        cell_barcodes = []

        good_umi_iter = itertools.ifilter(use_umi, reader)
        bc_group_iter = itertools.groupby(good_umi_iter,
                                          key=lambda row: row[bc_col])

        for bc, rows in bc_group_iter:
            # Restrict to the current gem group
            bc_seq, gg = cr_utils.split_barcode_seq(bc)
            if str(gg) != str(gem_group):
                continue

            # Collect readpair support for all UMIs for all contigs
            contig_umis_readpairs = defaultdict(list)
            for row in rows:
                contig_umis_readpairs[row[contigs_col]].append(
                    int(row[reads_col]) / 2)

            # Get the max (contig-kth-umi)
            best_kth_umi_readpairs = 0

            for contig, umi_readpairs in contig_umis_readpairs.iteritems():
                # Sort UMIs by readpairs, descending
                umi_readpairs = np.array(umi_readpairs, dtype=int)
                umi_readpairs[::-1].sort()

                # Get the kth UMI's readpair support or 0
                if len(umi_readpairs) >= min_umis:
                    kth_umi_readpairs = umi_readpairs[min_umis - 1]
                else:
                    kth_umi_readpairs = 0

                best_kth_umi_readpairs = max(best_kth_umi_readpairs,
                                             kth_umi_readpairs)

            bc_support[bc] = best_kth_umi_readpairs

            if best_kth_umi_readpairs >= threshold:
                cell_barcodes.append(bc)

        return bc_support, cell_barcodes, threshold
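For intuition, tk_stats.NX(values, x) is the weighted Nx statistic (N50 at x = 0.5): the value v such that elements >= v account for at least fraction x of the total. A minimal re-implementation (not tenkit's code), followed by the threshold arithmetic used above with an assumed threshold_ratio of 10:

import numpy as np

def nx(values, x):
    v = np.sort(np.asarray(values))[::-1]   # descending
    csum = np.cumsum(v)
    return v[np.searchsorted(csum, x * csum[-1])]

read_pairs = [100, 90, 80, 10, 5, 2]
high_rppu = nx(read_pairs, 0.5)             # 90
threshold = int(round(high_rppu / 10.0))    # 9
print(high_rppu, threshold)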