예제 #1
0
def report_genomes(matrix, reads_summary, barcode_summary_h5_path,
                   recovered_cells, cell_bc_seqs):
    """Report metrics for every feature type and genome in this matrix.

    Args:
        matrix: count matrix exposing get_genomes(), view() and
            feature_ref.feature_defs.
        reads_summary: read-level metrics; only consulted through
            _get_total_reads and _get_conf_mapped_reads.
        barcode_summary_h5_path: path of the barcode summary HDF5 file. It is
            opened read-only and closed again before this function returns.
        recovered_cells: passed through unchanged to the reporting helpers.
        cell_bc_seqs: mapping of genome name -> cell barcode sequences; must
            contain exactly one entry per genome of the matrix.

    Returns:
        dict mapping metric name to value. Every key starts with the metric
        prefix of its feature type; genome-specific keys are additionally
        prefixed with '<genome>_'.
    """
    metrics = {}

    # The context manager guarantees the HDF5 handle is released, even when
    # one of the reporting helpers raises (the handle used to be leaked).
    with h5.File(barcode_summary_h5_path, 'r') as barcode_summary_h5:
        # The genome list does not depend on the feature type, so fetch it
        # once instead of once per loop iteration.
        genomes = matrix.get_genomes()
        assert len(cell_bc_seqs) == len(genomes)

        feature_types = sorted(
            set(f.feature_type for f in matrix.feature_ref.feature_defs))

        for ftype in feature_types:
            total_reads = _get_total_reads(reads_summary, ftype)
            if total_reads == 0:
                # Nothing to report for a feature type without reads
                continue

            conf_mapped_reads = _get_conf_mapped_reads(
                reads_summary, genomes, ftype)

            submatrix = matrix.view().select_features_by_type(ftype)
            prefix = rna_library.get_library_type_metric_prefix(ftype)

            # Compute genome-agnostic metrics
            m = _report_genome_agnostic_metrics(
                submatrix, barcode_summary_h5, recovered_cells, cell_bc_seqs,
                total_reads, conf_mapped_reads, prefix)

            if rna_library.has_genomes(ftype):
                for genome in genomes:
                    # Compute genome-specific metrics
                    genome_matrix = matrix.view().select_features_by_genome(
                        genome)
                    genome_summary = _report(
                        genome_matrix, genome, barcode_summary_h5,
                        recovered_cells, cell_bc_seqs[genome], prefix)

                    for key, value in genome_summary.iteritems():
                        m['_'.join([genome, key])] = value
            else:
                # This feature has no genomes: report once, against the
                # union of the cell barcodes of all genomes.
                cell_bcs_union = list(
                    reduce(lambda a, x: a | set(x), cell_bc_seqs.itervalues(),
                           set()))
                genome_summary = _report(
                    submatrix, lib_constants.MULTI_REFS_PREFIX,
                    barcode_summary_h5, recovered_cells, cell_bcs_union,
                    prefix)
                for key, value in genome_summary.iteritems():
                    m['_'.join([lib_constants.MULTI_REFS_PREFIX, key])] = value

            # Prepend feature type to metric keys
            for key, value in m.iteritems():
                metrics[prefix + key] = value

    return metrics
예제 #2
0
def plot_barcode_rank(chart, sample_properties, sample_data):
    """Generate the RNA counter barcode rank plot.

    Returns None when the sample has no genomes, no barcode summary, no cell
    barcodes, or when the expected barcode summary key is absent; otherwise
    returns the result of _plot_counter_barcode_rank.
    """
    genomes = sample_properties.get('genomes')
    if genomes is None:
        return None
    if sample_data.barcode_summary is None:
        return None
    if sample_data.cell_barcodes is None:
        return None
    if len(genomes) == 0:
        return None

    # UMI counts per BC across all genomes present: with several genomes the
    # multi-reference entry is used, otherwise the single genome itself.
    genome = (lib_constants.MULTI_REFS_PREFIX
              if len(genomes) > 1 else genomes[0])

    gex_prefix = rna_library.get_library_type_metric_prefix(
        lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
    key = cr_utils.format_barcode_summary_h5_key(
        gex_prefix, genome, cr_constants.TRANSCRIPTOME_REGION,
        cr_constants.CONF_MAPPED_DEDUPED_READ_TYPE)

    if key not in sample_data.barcode_summary:
        # Not guaranteed to exist, depending on pipeline
        return None

    counts_per_bc, plot_segments = (
        sample_data.counter_barcode_rank_plot_data(key))
    return _plot_counter_barcode_rank(chart, counts_per_bc, plot_segments)
예제 #3
0
def main(args, outs):
    """Count confidently mapped, deduplicated reads of one BAM chunk.

    Iterates the reads of args.chunk_input grouped by query name, adds every
    read group flagged as confidently mapped and deduplicated to a count
    matrix, then saves the matrix to outs.matrices_h5 and the reporter to
    outs.chunked_reporter.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    libraries = rna_library.get_bam_library_info(in_bam)
    distinct_library_types = sorted(
        set(lib['library_type'] for lib in libraries))
    # One metric prefix per library, in library order
    library_prefixes = [
        rna_library.get_library_type_metric_prefix(lib['library_type'])
        for lib in libraries
    ]

    chroms = in_bam.references

    # The detected-barcodes table is only loaded when there is no whitelist
    barcode_whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if barcode_whitelist:
        barcode_summary = None
    else:
        barcode_summary = cr_utils.load_barcode_tsv(args.barcodes_detected)

    # TODO: this is redundant
    gene_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(gene_index_path)
    high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)
    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=high_conf_mapq,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=barcode_whitelist,
        barcode_summary=barcode_summary,
        gem_groups=args.gem_groups,
        library_types=distinct_library_types)

    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Matrix columns: whitelist barcodes expanded per gem group when a
    # whitelist exists, otherwise the detected barcodes.
    if not barcode_whitelist:
        barcode_seqs = barcode_summary
    else:
        barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                    args.gem_groups)

    matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                         barcode_seqs,
                                         dtype='int32')

    for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
        result = reporter.count_genes_bam_cb(
            reads_iter,
            libraries,
            library_prefixes,
            use_umis=cr_chem.has_umis(args.chemistry_def))
        is_conf_mapped_deduped, _genome, feature_id, bc = result
        if is_conf_mapped_deduped:
            matrix.add(feature_id, bc)

    in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
예제 #4
0
def _get_conf_mapped_reads(summary, genomes, library_type):
    """Sum the confidently mapped transcriptome reads over all genomes.

    For each genome, the '<prefix><genome>_..._reads_frac' entry of `summary`
    (0 when absent) is multiplied by the library type's total read count; the
    products are added up and returned.
    """
    prefix = rna_library.get_library_type_metric_prefix(library_type)
    metric_names = []
    for ref in genomes:
        parts = [
            ref,
            cr_constants.TRANSCRIPTOME_REGION,
            cr_constants.CONF_MAPPED_READ_TYPE,
            'reads_frac',
        ]
        metric_names.append(prefix + '_'.join(parts))

    total_reads = _get_total_reads(summary, library_type)

    conf_mapped_reads = 0
    for name in metric_names:
        conf_mapped_reads += float(summary.get(name, 0)) * float(total_reads)
    return conf_mapped_reads
예제 #5
0
def main(args, outs):
    """Call protospacers on the filtered CRISPR guide counts and write results.

    Falls back to set_empty(outs) when either input file is missing or when
    the guide-count submatrix is None or empty.
    """
    have_molecule_info = os.path.isfile(args.molecule_info)
    if not have_molecule_info or not os.path.isfile(
            args.filtered_feature_counts_matrix):
        set_empty(outs)
        return

    with open(args.counter_metrics_json) as f:
        protospacer_call_metrics = json.load(f)
    report_prefix = rna_library.get_library_type_metric_prefix(
        rna_library.CRISPR_LIBRARY_TYPE)

    full_matrix = cr_matrix.CountMatrix.load_h5_file(
        args.filtered_feature_counts_matrix)
    guide_matrix = full_matrix.select_features_by_type(
        rna_library.CRISPR_LIBRARY_TYPE)
    # Number of barcodes in the full filtered matrix (all feature types)
    num_gex_cbs = len(full_matrix.bcs)

    if feature_utils.check_if_none_or_empty(guide_matrix):
        set_empty(outs)
        return

    # Map each guide feature id to its 'sequence' tag (None when untagged)
    feature_map = {}
    for feature_def in guide_matrix.feature_ref.feature_defs:
        feature_map[feature_def.id] = feature_def.tags.get('sequence')

    # Protospacer calling
    ps_results = protospacer_calling.get_ps_calls_and_summary(
        guide_matrix,
        feature_map,
    )
    (perturbation_calls_table, presence_calls, cells_with_ps,
     ps_calls_summary, umi_thresholds) = ps_results

    call_metrics = protospacer_calling.get_protospacer_call_metrics(
        ps_calls_summary, num_gex_cbs, report_prefix)
    protospacer_call_metrics.update(call_metrics)

    # Write all outputs
    perturbation_calls_table.to_csv(outs.protospacer_calls_per_cell)
    ps_calls_summary.to_csv(outs.protospacer_calls_summary)
    feature_utils.write_json_from_dict(cells_with_ps,
                                       outs.cells_per_protospacer)
    feature_utils.write_json_from_dict(umi_thresholds,
                                       outs.protospacer_umi_thresholds_json)
    feature_utils.write_csv_from_dict(umi_thresholds,
                                      outs.protospacer_umi_thresholds_csv,
                                      "Protospacer,UMI threshold\n")
    feature_utils.write_json_from_dict(protospacer_call_metrics,
                                       outs.protospacer_call_metrics_json)
예제 #6
0
def split(args):
    """Define one chunk per input FASTQ set and size the per-gem-group buckets.

    Returns a dict with a 'chunks' list (one entry per read1/read2/corrected
    barcode triple) and a 'join' entry. Every chunk carries the same
    'chunks_per_gem_group' mapping of gem group -> bucket count.
    """
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    if paired_end:
        assert len(args.read1s) == len(args.read2s)
    assert len(args.corrected_bcs) == len(args.read1s)

    with open(args.reads_summary) as f:
        reads_summary = json.load(f)

    # Determine the number of buckets required to achieve
    # the given chunk size (never fewer than 2 per gem group).
    chunks_per_gem_group = {}
    for gg in args.gem_groups:
        # Get the libraries w/ this GEM group (should only be one)
        gg_library_ids = []
        for lib in args.library_info:
            if lib['gem_group'] == gg:
                gg_library_ids.append(lib['library_id'])
        assert len(gg_library_ids) == 1

        lib_type_prefix = rna_library.get_library_type_metric_prefix(
            lib_constants.VDJ_LIBRARY_TYPE)
        metric_name = '%s%s_total_read_pairs_per_library' % (
            lib_type_prefix, gg_library_ids[0])
        readpairs = reads_summary[metric_name]

        n_buckets = int(
            math.ceil(float(readpairs) / args.readpairs_per_chunk))
        chunks_per_gem_group[str(gg)] = max(2, n_buckets)

    # izip_longest pads the shorter inputs with None
    chunks = [
        {
            'read1s_chunk': fastq1,
            'read2s_chunk': fastq2 if paired_end else None,
            'bcs': bcs,
            'chunks_per_gem_group': chunks_per_gem_group,
            '__mem_gb': 6,
        }
        for fastq1, fastq2, bcs in itertools.izip_longest(
            args.read1s, args.read2s, args.corrected_bcs)
    ]
    return {'chunks': chunks, 'join': {'__mem_gb': 2}}
    def get_metrics_from_summary(summary, libraries, total_recovered_cells=None, total_force_cells=None):
        """Extract relevant metrics from a summary dict.

        Args:
            summary: metrics mapping; must contain the four version metrics
                and one '<prefix><idx>_total_read_pairs_per_library' entry
                per library (a missing key raises on lookup).
            libraries: sequence of library dicts with 'library_type' and
                'gem_group' entries; the position in the sequence is used as
                the library index.
            total_recovered_cells: top-level expected cell count, or None.
            total_force_cells: top-level forced cell count, or None.

        Returns:
            dict with the version metrics, every summary entry whose key
            starts with 'chemistry', the per-library metrics under
            LIBRARIES_METRIC and the per-gem-group metrics under
            GEM_GROUPS_METRIC.
        """
        mol_metrics = {}

        version_metrics = ['cellranger_version', 'reference_mkref_version', 'reference_fasta_hash', 'reference_gtf_hash']
        for m in version_metrics:
            mol_metrics[m] = summary[m]

        chemistry_metrics = [m for m in summary if m.startswith('chemistry')]
        for m in chemistry_metrics:
            mol_metrics[m] = summary[m]

        # Per-library values
        lib_metrics = {}
        for lib_idx, lib in enumerate(libraries):
            lib_type_prefix = rna_library.get_library_type_metric_prefix(lib['library_type'])
            summary_name = '%s%s_total_read_pairs_per_library' % (lib_type_prefix, lib_idx)
            lib_metrics[str(lib_idx)] = {
                TOTAL_READS_METRIC: summary[summary_name],
            }

        # Per-gem-group values
        gg_metrics = {}
        # De-duplicate: several libraries may share one gem group, and a
        # repeated gem group must not inflate the divisor used below.
        gem_groups = sorted(set(lib['gem_group'] for lib in libraries))
        n_gem_groups = len(gem_groups)
        for gg in gem_groups:
            # Distribute the toplevel expected and forced cells parameters
            #   evenly among the gem groups.
            # NOTE: plain `/` is kept, so the result follows the
            #   interpreter's division rules for the operand types given.
            recovered_cells = total_recovered_cells / n_gem_groups if total_recovered_cells is not None else None
            force_cells = total_force_cells / n_gem_groups if total_force_cells is not None else None
            gg_metrics[str(gg)] = {
                GG_RECOVERED_CELLS_METRIC: recovered_cells,
                GG_FORCE_CELLS_METRIC: force_cells,
            }

        mol_metrics[LIBRARIES_METRIC] = lib_metrics
        mol_metrics[GEM_GROUPS_METRIC] = gg_metrics
        return mol_metrics
예제 #8
0
def remove_bcs_with_high_umi_corrected_reads(correction_data, matrix):
    """Remove barcodes with an unusually high fraction of UMI-corrected reads.

    Given a CountMatrix and data describing UMI-corrected reads, detect all
    barcodes with an unusually high fraction of corrected reads (probably
    aggregates) and drop them from the matrix.

    Returns:
        (cleaned_matrix, metrics_to_report, removed_bcs_df): the matrix
        without the removed barcodes, a dict with the number of removed
        barcodes and the reads lost to them, and the table of removed
        barcodes as returned by ab_utils.detect_aggregate_bcs.
    """
    aggregate_bcs, reads_lost, removed_bcs_df = ab_utils.detect_aggregate_bcs(
        correction_data)
    removed_indices = set(matrix.bc_to_int(bc) for bc in aggregate_bcs)

    # Walk the barcode indices in ascending order: the kept list must be
    # deterministic or any later bootstrap sampling will not be.
    kept_indices = []
    for idx in xrange(matrix.bcs_dim):
        if idx not in removed_indices:
            kept_indices.append(idx)
    cleaned_matrix = matrix.select_barcodes(kept_indices)

    # Report how many aggregates were found, and the reads those accounted for
    report_prefix = rna_library.get_library_type_metric_prefix(
        rna_library.ANTIBODY_LIBRARY_TYPE)
    metrics_to_report = {
        report_prefix + 'number_highly_corrected_GEMs': len(removed_indices),
        report_prefix + 'reads_lost_to_highly_corrected_GEMs': reads_lost,
    }

    return cleaned_matrix, metrics_to_report, removed_bcs_df
예제 #9
0
def summarize_read_matrix(matrix, library_info, barcode_info, barcode_seqs):
    """Summarize a matrix of read-pair counts per library type and genome.

    For every (library type, genome) pair the returned dict holds the total
    count ('_raw_mapped_reads'), the count restricted to the filtered
    barcodes ('_flt_mapped_reads') and the number of filtered barcodes
    ('_filtered_bcs'). Library types without genomes are reported once under
    the multi-reference prefix.
    """
    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    view = matrix.view()
    summary = {}

    for lib_type in lib_types:
        has_genomes = rna_library.has_genomes(lib_type)
        if has_genomes:
            sum_genomes = [str(g) for g in barcode_info.genomes]
        else:
            sum_genomes = [lib_constants.MULTI_REFS_PREFIX]

        for genome in sum_genomes:
            m = view.select_features_by_type(lib_type)
            genome_idx = None
            if has_genomes:
                m = m.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            lib_prefix = rna_library.get_library_type_metric_prefix(lib_type)
            prefix = '%s%s' % (lib_prefix, genome)
            summary['%s_raw_mapped_reads' % prefix] = m.sum()

            # Barcodes passing filter for this (library type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            filtered_m = m.select_barcodes_by_seq(filtered_bcs)

            summary['%s_flt_mapped_reads' % prefix] = filtered_m.sum()
            summary['%s_filtered_bcs' % prefix] = len(filtered_bcs)

    return summary
예제 #10
0
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk outputs and merge their counts and reporters.

    Collects the per-chunk read/tag/library lists, merges the barcode counts
    by library type, sums the feature counts, merges the chunk reporters per
    library type into a prefixed summary and writes the summary JSON.
    """
    outs.reads = []
    outs.read2s = []
    outs.tags = []
    outs.gem_groups = []
    outs.library_types = []
    outs.library_ids = []
    outs.read_groups = []

    for chunk_out in chunk_outs:
        outs.reads.extend(chunk_out.reads)
        outs.read2s.extend(chunk_out.read2s)
        outs.tags.extend(chunk_out.tags)
        outs.gem_groups.extend(chunk_out.gem_groups)
        outs.library_types.extend(chunk_out.library_types)
        outs.library_ids.extend(chunk_out.library_ids)
        outs.read_groups.extend(chunk_out.read_groups)

    # Ensure that we have non-zero reads
    if not outs.reads:
        martian.exit(
            "No reads found. Check the input fastqs and/or the chemistry definition"
        )

    # Ensure consistency of BAM comments
    assert all(co.bam_comments == chunk_outs[0].bam_comments
               for co in chunk_outs)
    outs.bam_comments = chunk_outs[0].bam_comments

    # Write barcode counts (merged by library_type)
    chunk_barcode_counts = [co.barcode_counts for co in chunk_outs]
    chunk_library_types = [cd.library_type for cd in chunk_defs]
    bc_counters = BarcodeCounter.merge_by(chunk_barcode_counts,
                                          chunk_library_types,
                                          args.barcode_whitelist,
                                          outs.gem_groups)
    with open(outs.barcode_counts, 'w') as f:
        tk_safe_json.dump_numpy(bc_counters, f)

    # Write feature counts: element-wise sum over all chunks
    feature_counts = None
    for _chunk_def, chunk_out in itertools.izip(chunk_defs, chunk_outs):
        with open(chunk_out.feature_counts) as f:
            chunk_counts = np.asarray(json.load(f), dtype=int)
            if feature_counts is not None:
                feature_counts += chunk_counts
            else:
                feature_counts = chunk_counts

    with open(outs.feature_counts, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(list(feature_counts)), f)

    outs.align = cr_utils.select_alignment_params(args.align)

    # Group reporters by library type; chunks without reads are skipped
    outs.chunked_reporter = None
    reporter_groups = defaultdict(list)
    for _chunk_def, chunk_out in zip(chunk_defs, chunk_outs):
        if not chunk_out.reads:
            continue
        chunk_lib_types = set(chunk_out.library_types)
        assert len(chunk_lib_types) == 1
        lib_type = next(iter(chunk_lib_types))
        reporter_groups[lib_type].append(chunk_out.chunked_reporter)

    # Merge reporters and prefix JSON keys by library type
    summary = {}
    for lib_type, reporters in reporter_groups.iteritems():
        merged_json = cr_report.merge_reporters(reporters).to_json()
        prefix = rna_library.get_library_type_metric_prefix(lib_type)
        for key, value in merged_json.iteritems():
            summary[prefix + key] = value

    # Use a temporary reporter to generate the metadata (w/o a prefix)
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_chemistry_metadata(args.chemistry_def)
    summary.update(tmp_reporter.to_json())

    # Write summary JSON
    with open(outs.summary, 'w') as f:
        tk_safe_json.dump_numpy(summary, f, pretty=True)
예제 #11
0
def join(args, outs, chunk_defs, chunk_outs):
    """Combine the per-chunk normalization results and write the summary JSON.

    Passes the chunk matrices and their non-zero counts through to `outs`,
    sums the chunks' read summaries, and reports pre/post-normalization read
    metrics per library and per library type to outs.summary.

    Args:
        args: stage arguments; args.molecules is opened with MoleculeCounter.
        outs: stage outputs; raw/filtered matrix lists, nnz counts and the
            summary path are read or set here.
        chunk_defs: chunk definitions; the first one supplies
            frac_reads_kept and num_cells (both indexed by library).
        chunk_outs: chunk outputs; each supplies matrices, nnz counts and a
            chunk_summary JSON file.
    """
    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    # This attribute used to be misspelled `filted_nnz`, which left the
    # filtered counterpart of `raw_nnz` unset.
    outs.filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics: sum each key over all chunks
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            d = json.load(f)
            for k in d['read_summary'].iterkeys():
                read_summary[k] += d['read_summary'][k]
    summary.update(read_summary)

    # Get summary metrics (taken from the first chunk only)
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    chem_keys = [
        k for k in mol_metrics.iterkeys() if k.startswith('chemistry')
    ]
    for k in chem_keys:
        summary[k] = mol_metrics[k]
    # Parenthesized single argument: same output as the print statement
    print(json.dumps(mol_metrics, indent=4, sort_keys=True))

    # Report normalization metrics
    # OrderedDict used as an ordered set of batch names
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == lib_type
        ]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            # The gem group suffix is only appended for gem groups above 1
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = lib_metrics[
                cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(
                min_frac_reads_kept[lib_type_idx], frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(
                mapped_read_pairs, n_cells)

            # Prefix with batch and library type; all custom library types
            # share the generic custom prefix.
            if lib_type.lower().startswith(
                    rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(
                    lib_type)

            p = (batch, lib_prefix)
            summary.update({
                '%s_%sfrac_reads_kept' % p:
                frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p:
                pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p:
                pre_norm_mapped_rppc,
            })
    summary['batches'] = all_batches.keys()

    # Per-library-type totals and means
    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(total_raw_read_pairs[lib_type_idx],
                                           total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(
            total_ds_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p:
            total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p:
            total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p:
            total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            ds_mean_rppc,
            '%slowest_frac_reads_kept' % p:
            min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
예제 #12
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the chunked matrices, summarize them and write all outputs.

    Writes the merged raw and filtered matrices (HDF5 and MEX), a barcode
    summary HDF5 file with the gene expression counts per barcode, and an
    updated copy of the input summary JSON carrying per-library-type and
    per-genome metrics of the filtered matrix.
    """
    version = martian.get_pipelines_version()

    with open(args.summary) as f:
        summary = json.load(f)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # make attrs for user-added columns in aggr csv,
    # then track original library/gem info alongside them
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    extra_attrs.update(cr_matrix.make_library_map_aggr(args.gem_group_index))

    # Merge raw matrix
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    genomes = raw_matrix.get_genomes()

    # Create barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as f:
        cr_io.create_hdf5_string_dataset(f, cr_constants.H5_BC_SEQUENCE_COL,
                                         raw_matrix.bcs)

        gex_view = raw_matrix.view().select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
        gex_bc_counts = gex_view.sum(axis=0).astype('uint64')
        if len(genomes) == 1:
            genome_key = genomes[0]
        else:
            genome_key = lib_constants.MULTI_REFS_PREFIX
        dataset_name = ('_%s_transcriptome_conf_mapped_deduped_barcoded_reads'
                        % genome_key)
        f.create_dataset(dataset_name, data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # Merge filtered matrix
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        # A library type without genomes is summarized once, genome=None
        if rna_library.has_genomes(lib_type):
            genomes = filt_mat.get_genomes()
        else:
            genomes = [None]

        mat_lib = filt_mat.view().select_features_by_type(lib_type)

        for genome in genomes:
            if genome is not None:
                mat = mat_lib.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)
            else:
                mat = mat_lib
                genome_idx = None

            # Select barcodes passing filter for this (lib_type, genome)
            filtered_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            mat = mat.select_barcodes_by_seq(filtered_bcs)

            features_per_bc = mat.count_ge(
                axis=0, threshold=cr_constants.MIN_COUNTS_PER_GENE)
            median_features = np.median(features_per_bc)
            median_counts = np.median(mat.sum(axis=0))

            if genome is None:
                genome_prefix = lib_constants.MULTI_REFS_PREFIX
            else:
                genome_prefix = genome
            key_prefix = '%s%s' % (libtype_prefix, genome_prefix)

            if genome is not None:
                # Per-genome fraction of mapped reads that fall in cells
                flt_reads = summary[key_prefix + '_flt_mapped_reads']
                raw_reads = summary[key_prefix + '_raw_mapped_reads']
                summary[key_prefix +
                        '_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'] = (
                            tk_stats.robust_divide(flt_reads, raw_reads))

            summary[key_prefix + '_filtered_bcs_median_counts'] = median_counts
            summary[key_prefix +
                    '_filtered_bcs_median_unique_genes_detected'] = (
                        median_features)

        # Compute frac reads in cells across all genomes
        key_prefixes = [
            '%s%s' % (libtype_prefix, g) for g in genomes if g is not None
        ]
        if not key_prefixes:
            key_prefixes = [
                '%s%s' % (libtype_prefix, lib_constants.MULTI_REFS_PREFIX)
            ]
        flt_reads = sum(summary[p + '_flt_mapped_reads'] for p in key_prefixes)
        raw_reads = sum(summary[p + '_raw_mapped_reads'] for p in key_prefixes)

        multi_key = '%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % (
            libtype_prefix, lib_constants.MULTI_REFS_PREFIX)
        summary[multi_key] = tk_stats.robust_divide(flt_reads, raw_reads)

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
예제 #13
0
def _get_total_reads(summary, library_type):
    """Return the '<prefix>total_reads' entry of `summary` as an int.

    The prefix is the metric prefix of `library_type`; a missing entry
    counts as 0.
    """
    metric_name = (rna_library.get_library_type_metric_prefix(library_type) +
                   'total_reads')
    total_reads = summary.get(metric_name, 0)
    return int(total_reads)
예제 #14
0
def main(args, outs):
    """Compute VDJ assembly metrics per barcode and write the summary JSON.

    Loads the contig annotations, the optional contig/UMI/filter summary
    tables and the contig FASTA, feeds every barcode's contigs to a
    VdjReporter, sets outs.chain_type (a single chain type when exactly one
    is significantly present, all chain types otherwise) and writes the
    reporter summary to outs.summary.

    Args:
        args: stage arguments (input paths and the VDJ reference path).
        outs: stage outputs; `chain_type` is set and `summary` is written.
    """
    reporter = vdj_report.VdjReporter()

    # Set a default value of 0 for number of paired cells so that it will be
    # present in the metric summary csv even when there are no paired cells
    # or in denovo mode
    reporter._get_metric_attr(
        'vdj_assembly_contig_pair_productive_full_len_bc_count',
        MULTI_REFS_PREFIX).set_value(0)

    barcode_contigs = defaultdict(list)
    contig_annotations = {}

    # Get annotations for each contig; the file is closed once parsed
    with open(args.annotations) as annotations_file:
        for annotation in json.load(annotations_file):
            contig_annotations[annotation['contig_name']] = annotation

    if args.contig_summary and os.path.isfile(args.contig_summary):
        contig_summary = pd.read_csv(args.contig_summary,
                                     header=0,
                                     index_col=None,
                                     sep='\t',
                                     dtype={
                                         'component': int,
                                         'num_reads': int,
                                         'num_pairs': int,
                                         'num_umis': int,
                                         'umi_list': str,
                                     })
        contig_summary = contig_summary.groupby('barcode')
    else:
        contig_summary = None

    if args.umi_summary and os.path.isfile(args.umi_summary):
        umi_summary = pd.read_csv(args.umi_summary,
                                  header=0,
                                  index_col=None,
                                  sep='\t')
        umi_summary = umi_summary.groupby('barcode')
    else:
        umi_summary = None

    if args.filter_summary:
        filter_summary = vdj_utils.load_contig_summary_table(
            args.filter_summary)
    else:
        filter_summary = None

    # Get contigs for each barcode. The FASTA handle stays open for the whole
    # iteration and is closed afterwards.
    with open(args.contigs) as contigs_file:
        for contig_hdr, contig_seq in cr_utils.get_fasta_iter(contigs_file):
            contig_name = contig_hdr.split(' ')[0]
            # With a filter summary, skip contigs it does not report as
            # filtered
            if (filter_summary is not None
                    and not vdj_utils.is_contig_filtered(
                        filter_summary, contig_name)):
                continue

            barcode = vdj_utils.get_barcode_from_contig_name(contig_name)
            barcode_contigs[barcode].append((contig_name, contig_seq))

    # Compute metrics for each barcode
    if args.cell_barcodes:
        barcodes = vdj_utils.load_cell_barcodes_json(args.cell_barcodes)
    else:
        # Pass an empty barcode JSON for bulk
        barcodes = {''}

    reference = vdj_ref.VdjReference(args.vdj_reference_path)

    for barcode in barcodes:
        contigs = barcode_contigs[barcode]
        annotations = [contig_annotations[contig[0]] for contig in contigs]

        reporter.vdj_barcode_contig_cb(barcode, contigs, annotations,
                                       reference)

        if contig_summary is not None and barcode in contig_summary.groups:
            bc_contig_summary = contig_summary.get_group(barcode)
        else:
            bc_contig_summary = None

        if umi_summary is not None and barcode in umi_summary.groups:
            bc_umi_summary = umi_summary.get_group(barcode)
        else:
            bc_umi_summary = None

        reporter.vdj_assembly_cb(bc_contig_summary, bc_umi_summary,
                                 annotations, reference)

    ## Compute post-assembly per-cell metrics
    # Load the assembly metrics summary to get the total assemblable reads
    if args.assemble_metrics_summary and args.reads_summary:
        assemblable_read_pairs_by_bc = cr_utils.get_metric_from_json(
            args.assemble_metrics_summary, 'assemblable_read_pairs_by_bc')
        assemblable_read_pairs = sum(
            assemblable_read_pairs_by_bc.get(bc, 0) for bc in barcodes)

        lib_type_prefix = rna_library.get_library_type_metric_prefix(
            LIBRARY_TYPE)
        total_read_pairs = cr_utils.get_metric_from_json(
            args.reads_summary, '%stotal_read_pairs' % lib_type_prefix)

        reporter._get_metric_attr(
            'vdj_assemblable_read_pairs_per_filtered_bc').set_value(
                assemblable_read_pairs, len(barcodes))
        reporter._get_metric_attr('vdj_sequencing_efficiency').set_value(
            assemblable_read_pairs, total_read_pairs)

    ## Try to autodetect the chain type
    # Find all chains w/ a significant presence.
    # If there's exactly one, set the chain type filter to that.
    # Otherwise, show all chain types.

    chain_count = defaultdict(int)
    for anno_dict in contig_annotations.itervalues():
        contig = vdj_annotations.AnnotatedContig.from_dict(
            anno_dict, reference)
        if contig.is_cell and contig.high_confidence and contig.productive:
            for anno in contig.annotations:
                if anno.feature.chain_type in vdj_constants.VDJ_CHAIN_TYPES:
                    chain_count[anno.feature.chain_type] += 1

    outs.chain_type = vdj_constants.ALL_CHAIN_TYPES

    # Parenthesized single argument: same output as the print statement
    print(chain_count)

    if len(chain_count) > 0:
        n_contigs = sum(chain_count.itervalues())
        sig_chains = [
            ct
            for ct, count in chain_count.iteritems() if tk_stats.robust_divide(
                count, n_contigs) >= MIN_CHAIN_TYPE_CONTIG_FRAC
        ]
        if len(sig_chains) == 1:
            outs.chain_type = sig_chains[0]

    reporter.report_summary_json(outs.summary)
예제 #15
0
def make_metric_name(name, library_type, genome, ss_type, ss_depth):
    """Build '<lib prefix><genome>_<ss_type>_<ss_depth>_<name>'.

    The library prefix is the metric prefix of `library_type`.
    """
    lt_prefix = rna_library.get_library_type_metric_prefix(library_type)
    fields = (lt_prefix, genome, ss_type, ss_depth, name)
    return '%s%s_%s_%s_%s' % fields