Ejemplo n.º 1
0
    def correct_for_saturation(self, summary_json_paths):
        """Apply saturation correction to each matrix in ``self.matrices``.

        The summary JSON files are merged into one dict, and for every genome
        the value stored under ``<genome>_conf_mapped_effective_umi_diversity``
        is passed (as a float) to that matrix's own ``correct_for_saturation``.

        Args:
            summary_json_paths: paths handed to ``cr_utils.merge_jsons_as_dict``.
        """
        summary = cr_utils.merge_jsons_as_dict(summary_json_paths)
        metric_template = '%s_conf_mapped_effective_umi_diversity'

        for genome, matrix in self.matrices.iteritems():
            # A genome with no entry in the merged summary falls back to 0.
            diversity = float(summary.get(metric_template % genome, 0))
            matrix.correct_for_saturation(diversity, genome)
Ejemplo n.º 2
0
def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk molecule info files into ``outs.output``.

    Metrics are derived from the merged upstream summary JSONs, then updated
    with the per-gem-group metric that was summed over the chunk outputs.

    Args:
        args: stage arguments; reads the three ``*_summary`` paths,
            ``gem_groups``, ``recovered_cells`` and ``force_cells``.
        outs: stage outputs; ``outs.output`` is the destination file.
        chunk_defs: unused here.
        chunk_outs: per-chunk outputs; each ``.output`` is an input file.
    """
    counter_cls = cr_mol_counter.MoleculeCounter

    summary_paths = [
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ]
    merged_summary = cr_utils.merge_jsons_as_dict(summary_paths)

    unique_gem_groups = sorted(set(args.gem_groups))
    metrics = counter_cls.get_metrics_from_summary(merged_summary,
                                                   unique_gem_groups,
                                                   args.recovered_cells,
                                                   args.force_cells)

    chunk_h5s = [chunk.output for chunk in chunk_outs]

    # Fold in the metric that was computed inside the chunks.
    metric_name = cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC
    summed_by_gem_group = counter_cls.sum_gem_group_metric(chunk_h5s,
                                                           metric_name)
    for gem_group, total in summed_by_gem_group.iteritems():
        metrics[cr_mol_counter.GEM_GROUPS_METRIC][gem_group][metric_name] = total

    # Sort globally by gem group only: the input is a barcode-sorted BAM, so
    # the data is assumed to be already ordered by barcode.
    counter_cls.concatenate_sort(outs.output,
                                 chunk_h5s,
                                 ['gem_group'],
                                 metrics=metrics)
Ejemplo n.º 3
0
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk molecule info files into ``outs.output``.

    Metrics are built from the merged upstream summary JSONs plus reference
    metadata and the library info read from the first input BAM, then updated
    with the per-library metric summed over the chunk outputs.

    Args:
        args: stage arguments; reads the three ``*_summary`` paths,
            ``reference_path``, ``inputs``, ``recovered_cells`` and
            ``force_cells``.
        outs: stage outputs; ``outs.output`` is the destination file.
        chunk_defs: unused here.
        chunk_outs: per-chunk outputs; each ``.output`` is an input file.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM. The handle is only needed for this lookup,
    # so release it right away instead of leaking it for the rest of the join.
    # NOTE(review): assumes create_bam_infile returns a pysam-style handle
    # exposing close() and that the library info is fully materialised - confirm.
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    try:
        library_info = rna_library.get_bam_library_info(in_bam)
    finally:
        in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary, library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]
    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output,
                                input_h5_filenames,
                                metrics=metrics)
Ejemplo n.º 4
0
def filter_barcodes(args, outs):
    """Call cell-associated barcodes and write the filtering outputs.

    Steps, in order:
      1. Load the raw matrix; if the barcode correction CSV contains the
         antibody library type, remove the barcodes flagged by
         ``remove_bcs_with_high_umi_corrected_reads`` and write them to
         ``outs.aggregate_barcodes``.
      2. Pick the filter method: manual (``args.cell_barcodes`` given),
         top-N (``args.force_cells`` given), otherwise ordmag + non-ambient.
      3. Make initial cell calls per (gem group, genome) on the gene
         expression features.
      4. For ordmag + non-ambient, run ``cr_cell.find_nonambient_barcodes`` per
         gem group, add the non-ambient barcodes to the calls, and write the
         per-barcode table to the ``nonambient_calls`` output.
      5. Write the metrics JSON (``outs.summary``) and the filtered barcodes
         file (``outs.filtered_barcodes``).

    Args:
        args: stage arguments; reads ``barcode_correction_csv``,
            ``matrices_h5``, ``cell_barcodes``, ``force_cells``,
            ``recovered_cells``, ``gem_groups``, ``raw_fastq_summary``,
            ``attach_bcs_summary`` and ``barcode_summary``.
        outs: stage outputs; ``outs.nonambient_calls`` is left as ``None``
            unless the non-ambient table is actually written.

    Returns:
        The matrix restricted to the cell-associated barcodes.
    """
    random.seed(0)
    np.random.seed(0)

    correction_data = pd.read_csv(args.barcode_correction_csv)
    raw_matrix = cr_matrix.CountMatrix.load_h5_file(args.matrices_h5)
    if np.isin(rna_library.ANTIBODY_LIBRARY_TYPE,
               correction_data.library_type):
        matrix, metrics_to_report, removed_bcs_df = remove_bcs_with_high_umi_corrected_reads(
            correction_data, raw_matrix)
        # Report all identified aggregate barcodes, together with their reads,
        # umi corrected reads, fraction of corrected reads, and fraction of
        # total reads.
        removed_bcs_df.to_csv(outs.aggregate_barcodes)
        summary = metrics_to_report
    else:
        matrix = raw_matrix
        summary = {}

    if args.cell_barcodes is not None:
        method = FilterMethod.MANUAL
    elif args.force_cells is not None:
        method = FilterMethod.TOP_N_BARCODES
    else:
        method = FilterMethod.ORDMAG_NONAMBIENT

    summary['total_diversity'] = matrix.bcs_dim
    method_name = get_filter_method_name(method)
    summary['filter_barcodes_method'] = method_name

    # Get unique gem groups
    unique_gem_groups = sorted(set(args.gem_groups))

    # Get per-gem group cell load
    if args.recovered_cells is not None:
        gg_recovered_cells = int(
            float(args.recovered_cells) / float(len(unique_gem_groups)))
    else:
        gg_recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP

    # Only consumed by the TOP_N_BARCODES branch, which implies force_cells
    # was provided.
    gg_force_cells = None
    if args.force_cells is not None:
        gg_force_cells = int(
            float(args.force_cells) / float(len(unique_gem_groups)))

    # The manual barcode list is the same for every (genome, gem group) pair,
    # so parse the file once instead of once per pair.
    # NOTE(review): assumes filter_cellular_barcodes_manual does not mutate
    # the list it is given - confirm.
    cell_barcodes = None
    if method == FilterMethod.MANUAL:
        with open(args.cell_barcodes) as f:
            cell_barcodes = json.load(f)

    # Only use gene expression matrix for cell calling
    gex_matrix = matrix.view().select_features_by_type(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

    # Make initial cell calls for each genome separately
    genomes = gex_matrix.get_genomes()

    # (gem_group, genome) => dict
    filtered_metrics_groups = OrderedDict()
    # (gem_group, genome) => list of barcode strings
    filtered_bcs_groups = OrderedDict()

    for genome in genomes:
        genome_matrix = gex_matrix.select_features_by_genome(genome)

        # Make initial cell calls for each gem group individually
        for gem_group in unique_gem_groups:

            gg_matrix = genome_matrix.select_barcodes_by_gem_group(gem_group)

            if method in (FilterMethod.ORDMAG, FilterMethod.ORDMAG_NONAMBIENT):
                gg_total_diversity = gg_matrix.bcs_dim
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_ordmag(
                    gg_bc_counts, gg_recovered_cells, gg_total_diversity)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            elif method == FilterMethod.MANUAL:
                gg_filtered_bcs, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_manual(
                    gg_matrix, cell_barcodes)

            elif method == FilterMethod.TOP_N_BARCODES:
                gg_bc_counts = gg_matrix.get_counts_per_bc()
                gg_filtered_indices, gg_filtered_metrics, msg = cr_stats.filter_cellular_barcodes_fixed_cutoff(
                    gg_bc_counts, gg_force_cells)
                gg_filtered_bcs = gg_matrix.ints_to_bcs(gg_filtered_indices)

            else:
                martian.exit("Unsupported BC filtering method: %s" % method)

            if msg is not None:
                martian.log_info(msg)

            filtered_metrics_groups[(gem_group, genome)] = gg_filtered_metrics
            filtered_bcs_groups[(gem_group, genome)] = gg_filtered_bcs

    # Do additional cell calling.
    # Remember the destination before clearing the out: calling
    # DataFrame.to_csv(None) returns the CSV text instead of writing a file,
    # so writing through the cleared attribute would silently produce nothing.
    nonambient_calls_path = outs.nonambient_calls
    outs.nonambient_calls = None

    if method == FilterMethod.ORDMAG_NONAMBIENT:
        # We need the full gene expression matrix instead of just a view
        full_gex_matrix = matrix.select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)

        # Track these for recordkeeping
        eval_bcs_arrays = []
        umis_per_bc_arrays = []
        loglk_arrays = []
        pvalue_arrays = []
        pvalue_adj_arrays = []
        nonambient_arrays = []
        genome_call_arrays = []

        # Do it by gem group, but agnostic to genome
        for gg in unique_gem_groups:
            gg_matrix = full_gex_matrix.select_barcodes_by_gem_group(gg)

            # Take union of initial cell calls across genomes.
            # set().union(*...) also copes with zero groups, where a bare
            # reduce() without an initial value would raise.
            gg_bcs = sorted(set().union(*[
                bcs for group, bcs in filtered_bcs_groups.iteritems()
                if group[0] == gg
            ]))

            result = cr_cell.find_nonambient_barcodes(gg_matrix, gg_bcs)
            if result is None:
                # Parenthesised single-argument form prints the same text
                # under the Python 2 print statement and the print function.
                print('Failed at attempt to call non-ambient barcodes in GEM group %s' % gg)
                continue

            # Assign a genome to the cell calls by argmax genome counts
            genome_counts = np.column_stack([
                gg_matrix.view()
                .select_features_by_genome(genome)
                .select_barcodes(result.eval_bcs)
                .get_counts_per_bc()
                for genome in genomes
            ])
            genome_calls = np.array(genomes)[np.argmax(genome_counts, axis=1)]

            umis_per_bc = gg_matrix.get_counts_per_bc()

            # Barcode strings of the evaluated barcodes; computed once and
            # reused for both the record table and the per-genome updates.
            eval_bc_strs = np.array(gg_matrix.bcs)[result.eval_bcs]

            eval_bcs_arrays.append(eval_bc_strs)
            umis_per_bc_arrays.append(umis_per_bc[result.eval_bcs])
            loglk_arrays.append(result.log_likelihood)
            pvalue_arrays.append(result.pvalues)
            pvalue_adj_arrays.append(result.pvalues_adj)
            nonambient_arrays.append(result.is_nonambient)
            genome_call_arrays.append(genome_calls)

            # Update the lists of cell-associated barcodes
            for genome in genomes:
                filtered_bcs_groups[(gg, genome)].extend(
                    eval_bc_strs[(genome_calls == genome)
                                 & (result.is_nonambient)])

        if eval_bcs_arrays and nonambient_calls_path is not None:
            nonambient_summary = pd.DataFrame(
                OrderedDict([
                    ('barcode', np.concatenate(eval_bcs_arrays)),
                    ('umis', np.concatenate(umis_per_bc_arrays)),
                    ('ambient_loglk', np.concatenate(loglk_arrays)),
                    ('pvalue', np.concatenate(pvalue_arrays)),
                    ('pvalue_adj', np.concatenate(pvalue_adj_arrays)),
                    ('nonambient', np.concatenate(nonambient_arrays)),
                    ('genome', np.concatenate(genome_call_arrays)),
                ]))
            nonambient_summary.to_csv(nonambient_calls_path)
            # The file now exists, so expose it as the stage output again.
            outs.nonambient_calls = nonambient_calls_path

    # Record all filtered barcodes
    genome_filtered_bcs = defaultdict(set)
    filtered_bcs = set()
    for (gem_group, genome), bcs in filtered_bcs_groups.iteritems():
        genome_filtered_bcs[genome].update(bcs)
        filtered_bcs.update(bcs)

    # Combine initial-cell-calling metrics
    for genome in genomes:
        # Merge metrics over all gem groups for this genome
        txome_metrics = [
            v for k, v in filtered_metrics_groups.iteritems() if k[1] == genome
        ]
        txome_summary = cr_stats.merge_filtered_metrics(txome_metrics)

        # Append method name to metrics
        summary.update({
            '%s_%s_%s' % (genome, key, method_name): value
            for key, value in txome_summary.iteritems()
        })

        summary['%s_filtered_bcs' % genome] = len(genome_filtered_bcs[genome])

        # NOTE: This metric only applies to the initial cell calls
        summary['%s_filtered_bcs_cv' %
                genome] = txome_summary['filtered_bcs_cv']

    # Deduplicate and sort filtered barcode sequences
    # Sort by (gem_group, barcode_sequence)
    def barcode_sort_key(bc):
        return cr_utils.split_barcode_seq(bc)[::-1]

    # The containers are already sets, so sorting alone deduplicates.
    for genome, bcs in genome_filtered_bcs.iteritems():
        genome_filtered_bcs[genome] = sorted(bcs, key=barcode_sort_key)
    filtered_bcs = sorted(filtered_bcs, key=barcode_sort_key)

    # Re-compute various metrics on the filtered matrix
    reads_summary = cr_utils.merge_jsons_as_dict(
        [args.raw_fastq_summary, args.attach_bcs_summary])
    matrix_summary = rna_report_mat.report_genomes(
        matrix,
        reads_summary=reads_summary,
        barcode_summary_h5_path=args.barcode_summary,
        recovered_cells=args.recovered_cells,
        cell_bc_seqs=genome_filtered_bcs)

    # Write metrics json; entries in `summary` win over matrix_summary.
    combined_summary = matrix_summary.copy()
    combined_summary.update(summary)
    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(combined_summary),
                  f,
                  indent=4,
                  sort_keys=True)

    # Write the filtered barcodes file
    write_filtered_barcodes(outs.filtered_barcodes, genome_filtered_bcs)

    # Select cell-associated barcodes
    filtered_matrix = matrix.select_barcodes_by_seq(filtered_bcs)

    return filtered_matrix
Ejemplo n.º 5
0
    def _report_genome_agnostic_metrics(self, summary_json_paths, barcode_summary_h5, recovered_cells,
                                        cell_bc_seqs):
        """ Report metrics that are computed across all barcodes and all genomes.

        Args:
            summary_json_paths: paths merged with ``cr_utils.merge_jsons_as_dict``;
                the merged dict must contain ``total_reads``.
            barcode_summary_h5: mapping-like barcode summary; indexed by
                ``<genome>_<region>_<read_type>_reads`` keys and ``bc_sequence``.
            recovered_cells: expected cell count, or ``None`` (the difference
                metrics are then reported as 0).
            cell_bc_seqs: cell barcodes, passed to ``self.union_barcodes``.

        Returns:
            dict mapping metric name to value.
        """
        d = {}
        multi_prefix = cr_constants.MULTI_REFS_PREFIX
        txome_region = cr_constants.TRANSCRIPTOME_REGION
        genomes = list(self.matrices.keys())

        # Get total_reads and *_conf_mapped_reads_frac
        merged_jsons = cr_utils.merge_jsons_as_dict(summary_json_paths)
        total_reads = int(merged_jsons['total_reads'])
        conf_mapped_metrics = ['_'.join([ref,
                                         txome_region,
                                         cr_constants.CONF_MAPPED_READ_TYPE,
                                         'reads_frac']) for ref in genomes]
        total_conf_mapped_reads = sum(float(merged_jsons.get(metric, 0)) * float(total_reads) for metric in conf_mapped_metrics)

        # Get number of cell bcs across all genomes
        cell_bcs_union = self.union_barcodes(cell_bc_seqs)
        n_cell_bcs_union = len(cell_bcs_union)
        d['filtered_bcs_transcriptome_union'] = n_cell_bcs_union
        d['%s_filtered_bcs' % multi_prefix] = n_cell_bcs_union

        # Report reads/cell across all genomes
        raw_reads_per_bc = tk_stats.robust_divide(total_reads, n_cell_bcs_union)
        conf_mapped_reads_per_bc = tk_stats.robust_divide(total_conf_mapped_reads, n_cell_bcs_union)
        d['%s_%s_total_raw_reads_per_filtered_bc' % (multi_prefix, txome_region)] = raw_reads_per_bc
        d['%s_%s_total_conf_mapped_reads_per_filtered_bc' % (multi_prefix, txome_region)] = conf_mapped_reads_per_bc

        # Single pass over the matrices: the barcode-filtered sub-matrix feeds
        # both the total UMI count and the matrix density, so build it once.
        total_umi_counts = 0
        total_nonzero_entries, total_entries = 0, 0
        for mat in self.matrices.values():
            filtered_mat = mat.select_barcodes_by_seq(cell_bcs_union)
            total_umi_counts += filtered_mat.m.sum()
            total_nonzero_entries += filtered_mat.m.getnnz()
            total_entries += filtered_mat.m.shape[0] * filtered_mat.m.shape[1]

        # Deviation from cell load
        if recovered_cells is None:
            d['%s_filtered_bcs_difference_from_recovered_cells' % multi_prefix] = 0
            d['%s_filtered_bcs_relative_difference_from_recovered_cells' % multi_prefix] = 0
        else:
            d['%s_filtered_bcs_difference_from_recovered_cells' % multi_prefix] = int(n_cell_bcs_union) - int(recovered_cells)
            d['%s_filtered_bcs_relative_difference_from_recovered_cells' % multi_prefix] = tk_stats.robust_divide(n_cell_bcs_union - recovered_cells, recovered_cells)

        # Reads per filtered barcode for each read type. These totals span all
        # genomes, so compute them once rather than once per genome.
        reads_per_bc_by_type = []
        for read_type in cr_constants.MATRIX_REPORT_READ_TYPES:
            if read_type in cr_constants.MATRIX_USE_MATRIX_FOR_READ_TYPE:
                n_reads = total_umi_counts
            else:
                h5_keys = ['%s_%s_%s_reads' % (txome, txome_region, read_type) for txome in genomes]
                n_reads = sum(np.array(barcode_summary_h5[h5_key]).sum()
                              for h5_key in h5_keys if h5_key in barcode_summary_h5)
            reads_per_bc_by_type.append(
                (read_type, tk_stats.robust_divide(n_reads, n_cell_bcs_union)))

        # Duplicate these metrics across genomes for backwards-compat
        for genome in genomes:
            d['%s_total_raw_reads_per_filtered_bc' % genome] = raw_reads_per_bc
            d['%s_total_conf_mapped_reads_per_filtered_bc' % genome] = conf_mapped_reads_per_bc

            # Written after the two keys above so a read type with the same
            # name overrides them.
            for read_type, reads_per_bc in reads_per_bc_by_type:
                d['%s_total_%s_reads_per_filtered_bc' % (genome, read_type)] = reads_per_bc

        # Report frac reads in cells across all genomes
        total_conf_mapped_reads_in_cells = 0
        total_conf_mapped_barcoded_reads = 0

        for txome, matrix in self.matrices.iteritems():
            h5_key = '%s_%s_%s_reads' % (txome, txome_region,
                                         cr_constants.CONF_MAPPED_BC_READ_TYPE)
            # Skip genomes without this dataset, matching how the other
            # barcode-summary lookups in this method treat missing keys.
            if h5_key not in barcode_summary_h5:
                continue
            cmb_reads = barcode_summary_h5[h5_key]
            cell_bc_indices = matrix.bcs_to_ints(cell_bcs_union)
            total_conf_mapped_reads_in_cells += cmb_reads[list(cell_bc_indices)].sum() if cell_bc_indices else 0
            total_conf_mapped_barcoded_reads += cmb_reads[()].sum()
        d['multi_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'] = tk_stats.robust_divide(total_conf_mapped_reads_in_cells, total_conf_mapped_barcoded_reads)

        # Compute fraction of reads usable (conf mapped, barcoded, filtered barcode)
        unique_barcodes = set(cell_bcs_union)
        # otypes is set explicitly because np.vectorize cannot infer the
        # output type from a size-0 input and raises instead.
        in_unique_barcodes_vectorized = np.vectorize(lambda x: x in unique_barcodes, otypes=[bool])
        filtered_bc_h5_row = in_unique_barcodes_vectorized(np.array(barcode_summary_h5['bc_sequence']))

        usable_reads = 0

        for txome in genomes:
            h5_key = '%s_%s_%s_reads' % (txome,
                                         txome_region,
                                         cr_constants.CONF_MAPPED_BC_READ_TYPE)

            if h5_key not in barcode_summary_h5:
                continue

            # The boolean mask zeroes out rows of non-cell barcodes.
            usable_reads += (filtered_bc_h5_row * np.array(barcode_summary_h5[h5_key])).sum()

        d['%s_transcriptome_usable_reads_frac' % multi_prefix] = tk_stats.robust_divide(usable_reads, total_reads)

        # Matrix density across all genomes (totals accumulated above)
        d['%s_filtered_gene_bc_matrix_density' % multi_prefix] = tk_stats.robust_divide(total_nonzero_entries, total_entries)

        return d