Ejemplo n.º 1
0
def get_library_mapping(aggr_id, libraries):
    """Get the mapping of gem groups and library indices to their new values.

    As a side effect, every dict in ``libraries`` gets an ``'index'`` key
    holding its position in the list.

    Args:
      aggr_id (str): The label given to a set of libraries in the aggr CSV file.
      libraries (list of dict): New library info.
    Returns:
      tuple of (gem_group_map, library_map) (np.array, np.array):
        gem_group_map maps the old gem group integer to the new one
        library_map maps the old library index integer to the new one
        Positions that no library of this aggr_id maps to are left at zero.
    Raises:
      ValueError: If no entry of ``libraries`` carries the given aggr_id.
    """
    for i, lib in enumerate(libraries):
        lib['index'] = i

    my_libs = [lib for lib in libraries if lib['aggr_id'] == aggr_id]
    if not my_libs:
        # max() over an empty sequence would raise an opaque ValueError;
        # fail with a message that names the offending aggr_id instead.
        raise ValueError('No libraries found for aggr_id %r' % (aggr_id,))

    max_old_gg = max(lib['old_gem_group'] for lib in my_libs)
    max_old_lib_idx = max(lib['old_library_index'] for lib in my_libs)

    # Lookup tables indexed by the old value, hence the "1 + max" sizes.
    gem_group_map = np.zeros(
        1 + max_old_gg, dtype=MoleculeCounter.get_column_dtype('gem_group'))
    lib_idx_map = np.zeros(
        1 + max_old_lib_idx,
        dtype=MoleculeCounter.get_column_dtype('library_idx'))
    for lib in my_libs:
        gem_group_map[lib['old_gem_group']] = lib['gem_group']
        lib_idx_map[lib['old_library_index']] = lib['index']

    return gem_group_map, lib_idx_map
Ejemplo n.º 2
0
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the per-chunk molecule files and record barcode ranges.

    The merged metrics are flagged as aggregated before the chunk files are
    concatenated into ``outs.merged_molecules``. Afterwards
    ``outs.gem_group_barcode_ranges`` maps each new gem group (as a string)
    to the ``[offset, end]`` barcode index range of the chunk it came from.
    """
    molecule_files = []
    for chunk_out in chunk_outs:
        molecule_files.append(chunk_out.molecule_h5)

    merged_metrics = MoleculeCounter.naive_concatenate_metrics(molecule_files)
    merged_metrics[cr_mol_counter.IS_AGGREGATED_METRIC] = True
    MoleculeCounter.concatenate(
        outs.merged_molecules, molecule_files, metrics=merged_metrics)

    # For each gem group, the range of barcode indices it can contain.
    outs.gem_group_barcode_ranges = {
        str(gem_group): [
            chunk_def.barcode_idx_offset,
            chunk_def.barcode_idx_end,
        ]
        for chunk_def, chunk_out in zip(chunk_defs, chunk_outs)
        for gem_group in chunk_out.new_gem_groups
    }
Ejemplo n.º 3
0
def main(args, outs):
    """Subsample one chunk of the molecule info and save the results.

    Seeds numpy's RNG with 0, then returns immediately (writing no outputs)
    when ``args.subsample_info`` has no ``'subsample_rate'``. Otherwise it
    builds subsampled raw matrices from the chunk
    ``[chunk_start, chunk_start + chunk_len)`` of ``args.molecule_info``,
    filters them to the barcodes in ``args.filtered_barcodes``, records the
    subsampled duplication metrics in a reporter, and saves the reporter and
    both matrices.
    """
    np.random.seed(0)

    subsample_rate = args.subsample_info.get('subsample_rate')
    if subsample_rate is None:
        return

    mol_counter = MoleculeCounter.open(args.molecule_info,
                                       'r',
                                       start=int(args.chunk_start),
                                       length=int(args.chunk_len))
    # Guarantee the molecule info handle is released even if any of the
    # matrix/report steps below raises.
    try:
        # Subsample the matrices
        subsample_result = {}
        subsampled_raw_mats = cr_matrix.GeneBCMatrices.build_from_mol_counter(
            mol_counter,
            subsample_rate=subsample_rate,
            subsample_result=subsample_result)

        # Filter the subsampled matrices
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        subsampled_filt_mats = subsampled_raw_mats.filter_barcodes(
            filtered_bcs_per_genome)

        # Calculations for subsampled duplication rate
        reporter = cr_report.Reporter(
            genomes=map(str, mol_counter.get_ref_column('genome_ids')),
            subsample_types=cr_constants.ALL_SUBSAMPLE_TYPES,
            subsample_depths=args.subsample_info['all_target_rpc'])

        reporter.subsampled_duplication_frac_cb(
            subsampled_raw_mats,
            mol_counter,
            subsample_rate,
            args.subsample_info['subsample_type'],
            args.subsample_info['target_rpc'],
            subsample_result['mapped_reads'],
        )
    finally:
        mol_counter.close()

    reporter.save(outs.chunked_reporter)

    outs.subsampled_matrices = {}
    outs.subsampled_matrices['raw_matrices'] = martian.make_path(
        'raw_matrices.h5')
    outs.subsampled_matrices['filtered_matrices'] = martian.make_path(
        'filtered_matrices.h5')

    subsampled_raw_mats.save_h5(outs.subsampled_matrices['raw_matrices'])
    subsampled_filt_mats.save_h5(outs.subsampled_matrices['filtered_matrices'])
Ejemplo n.º 4
0
def join(args, outs, chunk_defs, chunk_outs):
    """Concatenate the chunk molecule files into ``outs.output`` with metrics.

    Metrics are assembled from the merged upstream summary JSONs, the
    reference metadata, and the library info read from the first input BAM.
    The per-library usable-read counts computed in the chunks are summed and
    written into those metrics before the chunk files are concatenated.
    """
    summary = cr_utils.merge_jsons_as_dict([
        args.extract_reads_summary,
        args.attach_bcs_and_umis_summary,
        args.mark_duplicates_summary,
    ])

    # Hack for getting reference metadata -
    # this used to be computed in prior stages.
    # This is needed for storage in the molecule_info HDF5.
    tmp_reporter = cr_report.Reporter()
    tmp_reporter.store_reference_metadata(args.reference_path,
                                          cr_constants.REFERENCE_TYPE,
                                          cr_constants.REFERENCE_METRIC_PREFIX)
    ref_metadata = tmp_reporter.report(cr_constants.DEFAULT_REPORT_TYPE)
    summary.update(ref_metadata)

    # Load library info from BAM; only the library info is needed, so the
    # file handle is released right away instead of being left open.
    in_bam = tk_bam.create_bam_infile(args.inputs[0])
    try:
        library_info = rna_library.get_bam_library_info(in_bam)
    finally:
        in_bam.close()

    metrics = MoleculeCounter.get_metrics_from_summary(summary, library_info,
                                                       args.recovered_cells,
                                                       args.force_cells)

    input_h5_filenames = [chunk_out.output for chunk_out in chunk_outs]
    # update with metrics that were computed in the chunks
    chunk_metric = cr_mol_counter.USABLE_READS_METRIC
    summed_lib_metrics = MoleculeCounter.sum_library_metric(
        input_h5_filenames, chunk_metric)
    for lib_key, value in summed_lib_metrics.iteritems():
        metrics[cr_mol_counter.LIBRARIES_METRIC][lib_key][chunk_metric] = value

    MoleculeCounter.concatenate(outs.output,
                                input_h5_filenames,
                                metrics=metrics)
Ejemplo n.º 5
0
def split(args):
    """Create one chunk per input sample and pickle the merged barcode list.

    Samples sharing a barcode whitelist share one offset into the merged
    barcode list; the first sample seen with a given whitelist contributes
    its barcodes to that list. Every chunk records its sample's barcode index
    range, a memory request and the path of the pickled merged barcodes.
    """
    merged_barcodes_file = martian.make_path('merged_barcodes.pickle')

    offset_by_whitelist = {}
    next_offset = 0
    all_barcodes = []
    chunk_list = []

    for sample_def in args.sample_defs:
        h5_path = sample_def[cr_constants.AGG_H5_FIELD]
        with MoleculeCounter.open(h5_path, 'r') as mol_counter:
            estimate = MoleculeCounter.estimate_mem_gb(mol_counter.nrows())
            whitelist = mol_counter.get_barcode_whitelist()
            sample_barcodes = mol_counter.get_barcodes()
            num_barcodes = len(sample_barcodes)

            # First time this whitelist is seen: reserve its index range
            if whitelist not in offset_by_whitelist:
                offset_by_whitelist[whitelist] = next_offset
                all_barcodes.extend(sample_barcodes)
                next_offset += num_barcodes

            range_start = offset_by_whitelist[whitelist]

            chunk_list.append({
                'aggr_id': sample_def[cr_constants.AGG_ID_FIELD],
                'molecule_h5': h5_path,
                '__mem_gb': int(1.5 * estimate),
                'barcode_idx_offset': range_start,
                'barcode_idx_end': range_start + num_barcodes,
                'merged_barcodes': merged_barcodes_file,
            })

    with open(merged_barcodes_file, 'wb') as pickle_file:
        cPickle.dump(all_barcodes, pickle_file, cPickle.HIGHEST_PROTOCOL)

    return {'chunks': chunk_list, 'join': {'__mem_gb': 6}}
Ejemplo n.º 6
0
def split(args):
    """Request join resources; this stage defines no chunks.

    The join memory request is derived from a matrix with one column per
    (barcode, gem group) pair and ``args.raw_nnz`` nonzero entries.
    """
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        # Distinct gem groups, used for the memory request calculation
        gem_groups = set()
        for lib in mc.get_library_info():
            gem_groups.add(lib['gem_group'])

        # Number of barcodes in the full matrix
        n_barcodes = mc.get_ref_column_lazy('barcodes').shape[0]

    # args.raw_nnz: worst case number of nonzero elements in final matrix
    mem_gb = CountMatrix.get_mem_gb_from_matrix_dim(
        n_barcodes * len(gem_groups), args.raw_nnz)

    join_request = {'__mem_gb': mem_gb, '__threads': 2}
    return {'chunks': [], 'join': join_request}
Ejemplo n.º 7
0
def summarize_read_matrix(matrix, library_info, barcode_info, barcode_seqs):
    """Tally read-pair counts per library type and genome.

    For each library type in ``library_info`` (and each genome in
    ``barcode_info.genomes`` when the library type has genomes, otherwise the
    single multi-reference prefix) three metrics are stored: the sum over the
    selected matrix subset, the sum restricted to filtered barcodes, and the
    number of filtered barcodes.

    Returns:
      dict: metric name -> value.
    """
    summary = {}
    view = matrix.view()
    library_types = sorted({lib['library_type'] for lib in library_info})

    for lib_type in library_types:
        per_genome = rna_library.has_genomes(lib_type)
        if per_genome:
            genome_names = [str(g) for g in barcode_info.genomes]
        else:
            genome_names = [lib_constants.MULTI_REFS_PREFIX]

        for genome in genome_names:
            # Restrict the view to this library type (and genome, if any)
            subset = view.select_features_by_type(lib_type)
            genome_idx = None
            if per_genome:
                subset = subset.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)

            prefix = '%s%s' % (
                rna_library.get_library_type_metric_prefix(lib_type), genome)

            cell_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)

            summary['%s_raw_mapped_reads' % prefix] = subset.sum()
            summary['%s_flt_mapped_reads' % prefix] = \
                subset.select_barcodes_by_seq(cell_bcs).sum()
            summary['%s_filtered_bcs' % prefix] = len(cell_bcs)

    return summary
Ejemplo n.º 8
0
def main(args, outs):
    """Count reads per molecule in a BAM chunk and write a molecule info file.

    Reads of ``args.chunk_input`` are grouped by (gem group, barcode). Within
    each group, reads that pass the filters below are tallied per
    (UMI, library index, feature index) key, and one row per key is appended
    to the molecule info file at ``outs.output``. Reads belonging to
    cell-associated barcodes are additionally counted per library as the
    "usable reads" metric.
    """
    outs.coerce_strings()

    # Load whitelist
    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    # Barcode sequence -> position in the whitelist
    barcode_to_idx = OrderedDict((k, i) for i, k in enumerate(whitelist))

    # Load feature reference
    feature_ref = rna_feature_ref.from_transcriptome_and_csv(
        args.reference_path, args.feature_reference)

    # Load library info from BAM
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    library_info = rna_library.get_bam_library_info(in_bam)

    # Get cell-associated barcodes by genome
    filtered_bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    filtered_bc_union = cr_utils.get_cell_associated_barcode_set(
        args.filtered_barcodes)

    # Create the barcode info
    barcode_info = MoleculeCounter.build_barcode_info(filtered_bcs_by_genome,
                                                      library_info, whitelist)

    # Create the molecule info file
    mc = MoleculeCounter.open(outs.output,
                              mode='w',
                              feature_ref=feature_ref,
                              barcodes=whitelist,
                              library_info=library_info,
                              barcode_info=barcode_info)

    # Initialize per-library metrics
    # Keys are the library index as a string; each starts at zero usable reads.
    lib_metrics = {}
    for lib_idx in xrange(len(library_info)):
        lib_metrics[str(lib_idx)] = {}
        lib_metrics[str(lib_idx)][cr_mol_counter.USABLE_READS_METRIC] = 0

    # Record read-counts per molecule. Note that UMIs are not contiguous
    # in the input because no sorting was done after UMI correction.

    prev_gem_group = None
    prev_barcode_idx = None

    # groupby only merges *consecutive* reads with the same key, so this
    # relies on the BAM being sorted by (gem group, barcode).
    for (gem_group, barcode_seq), reads_iter in \
        itertools.groupby(in_bam, key=cr_utils.barcode_sort_key_no_umi):
        if barcode_seq is None:
            continue

        barcode_idx = barcode_to_idx[barcode_seq]

        # Assert expected sort order of input BAM
        # NOTE(review): on the first group prev_gem_group is None; this
        # comparison presumably relies on Python 2 ordering None below any
        # number - confirm before porting to Python 3.
        assert gem_group >= prev_gem_group
        if gem_group == prev_gem_group:
            assert barcode_idx >= prev_barcode_idx

        is_cell_barcode = cr_utils.format_barcode_seq(
            barcode_seq, gem_group) in filtered_bc_union

        # (umi, library index, feature index) -> number of counted reads
        counts = defaultdict(int)

        for read in reads_iter:
            # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
            if read.is_secondary or \
               read.is_read2 or \
               cr_utils.is_read_low_support_umi(read) or \
               not cr_utils.is_read_conf_mapped_to_feature(read):
                continue

            umi_seq = cr_utils.get_read_umi(read)
            if umi_seq is None:
                continue

            # The second argument is the bit width of the 'umi' column dtype
            umi_int = MoleculeCounter.compress_umi_seq(
                umi_seq,
                MoleculeCounter.get_column_dtype('umi').itemsize * 8)

            feature_ids = cr_utils.get_read_gene_ids(read)
            assert len(feature_ids) == 1
            feature_int = feature_ref.id_map[feature_ids[0]].index

            library_idx = cr_utils.get_read_library_index(read)

            counts[(umi_int, library_idx, feature_int)] += 1

            if is_cell_barcode:
                lib_metrics[str(library_idx)][
                    cr_mol_counter.USABLE_READS_METRIC] += 1

            # NOTE(review): these updates sit inside the read loop, after the
            # filters, so prev_* only advances for groups with at least one
            # counted read. Looks like they were meant to run once per
            # (gem group, barcode) group - confirm.
            prev_gem_group = gem_group
            prev_barcode_idx = barcode_idx

        # Record data for this barcode
        # One row per distinct key in `counts`; gem group and barcode index
        # are constant within the group, so they are simply repeated.
        gg_int = MoleculeCounter.get_column_dtype('gem_group').type(gem_group)
        mc.append_column('gem_group', np.repeat(gg_int, len(counts)))
        bc_int = MoleculeCounter.get_column_dtype('barcode_idx').type(
            barcode_idx)
        mc.append_column('barcode_idx', np.repeat(bc_int, len(counts)))

        feature_ints = np.fromiter(
            (k[2] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('feature_idx'),
            count=len(counts))
        # Sort by feature for fast matrix construction
        # The same `order` is applied to every column below so rows stay
        # aligned; this depends on `counts` being iterated in the same order
        # each time, which holds because it is not modified in between.
        order = np.argsort(feature_ints)
        feature_ints = feature_ints[order]
        mc.append_column('feature_idx', feature_ints)
        del feature_ints

        li_ints = np.fromiter(
            (k[1] for k in counts.iterkeys()),
            dtype=MoleculeCounter.get_column_dtype('library_idx'),
            count=len(counts))[order]
        mc.append_column('library_idx', li_ints)
        del li_ints

        umi_ints = np.fromiter((k[0] for k in counts.iterkeys()),
                               dtype=MoleculeCounter.get_column_dtype('umi'),
                               count=len(counts))[order]
        mc.append_column('umi', umi_ints)
        del umi_ints

        count_ints = np.fromiter(
            counts.itervalues(),
            dtype=MoleculeCounter.get_column_dtype('count'),
            count=len(counts))[order]
        mc.append_column('count', count_ints)
        del count_ints

    in_bam.close()

    mc.set_metric(cr_mol_counter.LIBRARIES_METRIC, dict(lib_metrics))

    mc.save()
Ejemplo n.º 9
0
def main(args, outs):
    """Assign new gem groups, library ids and batch ids for aggregation.

    Every (sample, old gem group) pair receives a fresh gem group number,
    counting up from 1 across all samples. Each library of each sample is
    copied and annotated with its new gem group, a uniquified library id,
    its old library index and gem group, its aggr id and its batch.

    Outputs:
      outs.libraries: list of the annotated library dicts.
      outs.gem_group_index: new gem group -> (aggr_id, old gem group).
      outs.chemistry_batch_correction: True only if a batch column was given
        and more than one distinct batch is present.
      outs.gem_group_index_json: path the gem group index is written to as
        JSON.
    """
    new_gg = 0
    gg_index = {}
    libraries = []
    chemistry_batch_correction = False

    ### Batch info
    # If a column 'batch' is given in sample_defs (read from input csv), that
    # column will be used as batch identifier and chemistry_batch_correction
    # will be turned on. Otherwise, aggr_id will be used as batch identifier.
    # Each batch will have a distinct batch_id, which is an increasing integer.
    batch_name_to_id = {}

    sample_defs = [] if args.sample_defs is None else args.sample_defs

    for sample_def in sample_defs:
        # Old gem group -> newly assigned gem group, for this sample only.
        # An explicit map (rather than "bump the counter when unseen") keeps
        # the assignment correct when libraries sharing an old gem group are
        # not adjacent in the library list, e.g. old gem groups 1, 2, 1, 2.
        old_to_new_gg = {}

        aggr_id = sample_def[cr_constants.AGG_ID_FIELD]

        if cr_constants.AGG_BATCH_FIELD in sample_def:
            chemistry_batch_correction = True
            batch_name = sample_def[cr_constants.AGG_BATCH_FIELD]
        else:
            batch_name = aggr_id

        if batch_name not in batch_name_to_id:
            batch_name_to_id[batch_name] = len(batch_name_to_id)

        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD], 'r') as mc:
            old_libraries = mc.get_library_info()

        for lib_idx, old_lib in enumerate(old_libraries):
            # Remap gem groups
            old_gg = int(old_lib['gem_group'])

            # Allocate a new gem group the first time this old one is seen
            # within the current input sample
            if old_gg not in old_to_new_gg:
                new_gg += 1
                old_to_new_gg[old_gg] = new_gg
                gg_index[new_gg] = (aggr_id, old_gg)
            lib_gg = old_to_new_gg[old_gg]

            # Remap libraries
            new_lib = copy.deepcopy(old_lib)
            new_lib['gem_group'] = lib_gg

            # Make the new library id unique
            new_lib['library_id'] += ".%d" % (lib_gg)
            new_lib['old_library_index'] = lib_idx
            new_lib['old_gem_group'] = old_gg
            new_lib['aggr_id'] = aggr_id
            new_lib['batch_name'] = batch_name
            new_lib['batch_id'] = batch_name_to_id[batch_name]
            libraries.append(new_lib)

    if chemistry_batch_correction and len(batch_name_to_id) <= 1:
        chemistry_batch_correction = False
        martian.log_info('Warning: only one batch sepecified in the input csv, chemistry_batch_correction is disabled.')

    outs.libraries = libraries
    outs.gem_group_index = gg_index
    outs.chemistry_batch_correction = chemistry_batch_correction

    # Write the "gem group index" (a legacy structure) for Loupe
    with open(outs.gem_group_index_json, 'w') as outfile:
        json.dump({"gem_group_index": gg_index}, outfile)
Ejemplo n.º 10
0
def split(args):
    """Plan the subsampling work over the molecule info file.

    One subsampling definition is produced per (subsample type, target
    reads-per-cell) pair whose computed rate is not above 1.0, and each
    definition is paired with every fixed-size slice of the molecule info
    rows to form the chunks. With no cell-associated barcodes, or when no
    chunk results, a single placeholder chunk with an empty
    ``subsample_info`` is returned.
    """
    # Get the cell count: distinct filtered barcodes across all genomes
    bcs_by_genome = cr_utils.load_barcode_csv(args.filtered_barcodes)
    cell_bcs = set()
    for genome_bcs in bcs_by_genome.itervalues():
        cell_bcs.update(genome_bcs)
    n_cells = len(cell_bcs)

    if n_cells == 0:
        placeholder = {'chunk_start': 0, 'chunk_len': 0, 'subsample_info': {}}
        return {'chunks': [placeholder]}

    # Get required info from the mol info
    with MoleculeCounter.open(args.molecule_info, 'r') as mol_counter:
        n_rows = mol_counter.nrows()
        whitelist = mol_counter.get_barcode_whitelist()
        gem_groups = mol_counter.get_gem_groups()
        raw_reads = mol_counter.get_total_raw_reads()
        mapped_reads = mol_counter.get_total_conf_mapped_filtered_bc_reads()

    raw_rpc = tk_stats.robust_divide(raw_reads, n_cells)
    mapped_read_frac = tk_stats.robust_divide(mapped_reads, raw_reads)

    # Extra target depths at each decile of the raw reads per cell
    subsampling_deciles = []
    if raw_reads > 0:
        for decile in np.arange(0.1, 1.1, 0.1):
            subsampling_deciles.append(round(decile * raw_rpc))

    # All target depths
    target_rpcs = cr_constants.SUBSAMPLE_READS_PER_CELL + subsampling_deciles

    type_multipliers = (
        (cr_constants.RAW_SUBSAMPLE_TYPE, mapped_read_frac),
        (cr_constants.MAPPED_SUBSAMPLE_TYPE, 1.0),
    )

    # Generate subsampling definitions
    subsamplings = []
    for subsample_type, rpc_multiplier in type_multipliers:
        for target_rpc in target_rpcs:
            wanted_mapped_reads = int(
                float(target_rpc) * float(n_cells) * rpc_multiplier)
            rate = tk_stats.robust_divide(wanted_mapped_reads, mapped_reads)

            # Cannot subsample to more reads than are available
            if rate > 1.0:
                continue

            subsamplings.append({
                'subsample_type': subsample_type,
                'target_rpc': target_rpc,
                'subsample_rate': rate,
                'all_target_rpc': target_rpcs,
            })

    # Each chunk needs to store the entire gene-bc matrix and a piece of the
    # mol info h5
    matrix_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        whitelist, gem_groups)
    chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK
    chunk_mem_gb = matrix_mem_gb + MoleculeCounter.estimate_mem_gb(chunk_len)

    # Split the molecule info h5 into equi-RAM chunks
    chunks = [
        {
            'chunk_start': str(row_start),
            'chunk_len': str(min(n_rows - row_start, chunk_len)),
            'subsample_info': info,
            '__mem_gb': chunk_mem_gb,
        }
        for info in subsamplings
        for row_start in xrange(0, n_rows, chunk_len)
    ]

    if not chunks:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': {},
        })

    return {'chunks': chunks, 'join': {'__mem_gb': matrix_mem_gb}}
Ejemplo n.º 11
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge per-chunk subsampling tallies and write summary metrics as JSON.

    The pickled tallies of all chunks are summed key by key. For each
    subsampling task listed in the first chunk definition, the median UMI
    count and median detected features per cell-associated barcode and the
    duplication fraction are computed per genome, plus one duplication
    fraction over all genomes. The result is written to ``outs.summary``.
    """
    # Merge tallies
    data = None
    for chunk in chunk_outs:
        # Pickles are binary data, so read them in binary mode; text mode
        # only works by accident on platforms that do not translate bytes.
        # NOTE(review): unpickling is only safe for trusted files; these are
        # presumably written by this pipeline's own chunks - confirm.
        with open(chunk.metrics, 'rb') as f:
            chunk_data = cPickle.load(f)
        if data is None:
            data = chunk_data
        else:
            for k in data:
                data[k] += chunk_data[k]

    # Compute metrics for each subsampling rate
    summary = {}

    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        genomes = sorted(
            set(
                f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        lib_types = sorted(set(lib['library_type'] for lib in mc.library_info))
        lib_type_map = dict((lt, idx) for (idx, lt) in enumerate(lib_types))
    cell_bcs_by_genome = get_cell_associated_barcodes(genomes,
                                                      args.filtered_barcodes)

    # Give each cell-associated barcode an integer index
    # (the '' key is used here as the set covering all genomes)
    cell_bcs = sorted(list(cell_bcs_by_genome['']))
    cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

    subsample_info = chunk_defs[0].subsample_info if len(
        chunk_defs) > 0 else []

    # The position i of a task is also its index along the first axis of
    # the tally arrays.
    for i, task in enumerate(subsample_info):
        lib_type = task['library_type']
        lib_type_idx = lib_type_map[lib_type]
        ss_type = task['subsample_type']
        ss_depth = task['target_read_pairs_per_cell']

        if rna_library.has_genomes(lib_type):
            genome_ints = list(range(data['umis_per_bc'].shape[1]))
        else:
            genome_ints = [0]

        # Per-genome metrics
        for g in genome_ints:
            if not data['lib_type_genome_any_reads'][lib_type_idx, g]:
                continue
            genome = genomes[g]

            # Only compute on cell-associated barcodes for this genome.
            # This only matters when there are multiple genomes present.
            cell_inds = np.array(
                sorted(cell_bc_to_int[bc]
                       for bc in cell_bcs_by_genome[genome]))

            median_umis_per_cell = np.median(data['umis_per_bc'][i, g,
                                                                 cell_inds])
            summary[make_metric_name('subsampled_filtered_bcs_median_counts',
                                     lib_type, genome, ss_type,
                                     ss_depth)] = median_umis_per_cell

            median_features_per_cell = np.median(
                data['features_det_per_bc'][i, g, cell_inds])
            summary[make_metric_name(
                'subsampled_filtered_bcs_median_unique_genes_detected',
                lib_type, genome, ss_type,
                ss_depth)] = median_features_per_cell

            dup_frac = compute_dup_frac(data['read_pairs'][i, g],
                                        data['umis'][i, g])
            summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                     genome, ss_type, ss_depth)] = dup_frac

        # Whole-dataset duplication frac
        all_read_pairs = np.sum(data['read_pairs'][i, :])
        all_umis = np.sum(data['umis'][i, :])
        dup_frac = compute_dup_frac(all_read_pairs, all_umis)
        summary[make_metric_name('subsampled_duplication_frac', lib_type,
                                 lib_constants.MULTI_REFS_PREFIX, ss_type,
                                 ss_depth)] = dup_frac

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
Ejemplo n.º 12
0
def main(args, outs):
    """Downsample one chunk of molecule info and build its count matrices.

    Each molecule's read-pair count in the chunk
    ``[chunk_start, chunk_start + chunk_len)`` is resampled with a binomial
    draw at its library's rate from ``args.frac_reads_kept``; molecules left
    with zero reads are dropped. From the survivors the raw and filtered UMI
    matrices are written to ``outs.raw_matrix_h5`` and
    ``outs.filtered_matrix_h5``, their nonzero counts are stored in
    ``outs.raw_nnz`` and ``outs.filtered_nnz``, and a JSON summary (read
    summary plus updated molecule metrics) is written to
    ``outs.chunk_summary``.
    """
    # Fixed seed so the binomial downsampling below is reproducible
    np.random.seed(0)

    LogPerf.mem()

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()

        metrics_in = mc.get_all_metrics()
        # Work on a copy so the input metrics are left untouched
        metrics_out = copy.deepcopy(metrics_in)

        # Compute subsampling rate and approximate new total readpair count
        # frac_reads_kept is indexed by library index (see mol_rate below)
        frac_reads_kept = np.array(args.frac_reads_kept, dtype=float)
        total_reads_in = mc.get_raw_read_pairs_per_library()
        total_reads_out = total_reads_in * frac_reads_kept

        for lib_idx, _ in enumerate(library_info):
            metrics_out[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)][cr_mol_counter.
                          DOWNSAMPLED_READS_METRIC] = total_reads_out[lib_idx]

        # downsample molecule info
        chunk = slice(args.chunk_start, args.chunk_start + args.chunk_len)
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]

        # Per-molecule keep rate, looked up from the molecule's library
        mol_rate = frac_reads_kept[mol_library_idx]
        del mol_library_idx

        # Each molecule keeps a binomial(count, rate) number of read pairs
        new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)
        del mol_read_pairs
        del mol_rate

        # Indices (within the chunk) of molecules that kept at least one read
        keep_mol = np.flatnonzero(new_read_pairs)
        new_read_pairs = new_read_pairs[keep_mol]

        mol_gem_group = mc.get_column_lazy('gem_group')[chunk][keep_mol]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk][keep_mol]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk][keep_mol]

        # Assert that gem groups start at 1 and are contiguous
        gem_groups = sorted(set(lib['gem_group'] for lib in library_info))
        assert(min(gem_groups) == 1 and \
               np.all(np.diff(np.array(gem_groups,dtype=int)) == 1))

        feature_ref = mc.get_feature_ref()

        # Compute matrix dimensions
        # Get the range of possible barcode indices for each gem group.
        # Arrays have one extra slot so they can be indexed directly by the
        # 1-based gem group; slot 0 stays zero unless a "0" key is present
        # in gem_group_barcode_ranges (presumably never - TODO confirm).
        gg_barcode_idx_start = np.zeros(1 + len(gem_groups), dtype=int)
        gg_barcode_idx_len = np.zeros(1 + len(gem_groups), dtype=int)
        for gg_str, idx_range in sorted(
                args.gem_group_barcode_ranges.iteritems(),
                key=lambda kv: int(kv[0])):
            gg = int(gg_str)
            gg_barcode_idx_start[gg] = idx_range[0]
            gg_barcode_idx_len[gg] = idx_range[1] - idx_range[0]

        num_bcs = gg_barcode_idx_len.sum()
        num_features = feature_ref.get_num_features()

        print 'downsampled'
        LogPerf.mem()

        # Convert molecule barcode indices into matrix barcode indices
        # The molecule info barcode_idx is in this space:
        #  [W_0, W_1, ...] where W_i is distinct original whitelist i.
        # The matrix is in, e.g., this space:
        #  [w_0-1, w_1-2, w_0-3, ...] where w_i-j is a copy of whitelist i for gem group j.

        # Return to the original whitelist index
        mol_barcode_idx -= gg_barcode_idx_start.astype(
            np.uint64)[mol_gem_group]

        # Offset by the cumulative whitelist length up to a barcode's gem group
        # With slot 0 of the length array at zero, cumsum[gg - 1] is the
        # total length of all gem groups before gg.
        gg_barcode_matrix_start = np.cumsum(gg_barcode_idx_len).astype(
            np.uint64)
        mol_barcode_idx += gg_barcode_matrix_start[mol_gem_group - 1]

        # One entry of value 1 per surviving molecule at its
        # (feature, barcode) coordinate.
        ones = np.ones(len(mol_barcode_idx),
                       dtype=cr_matrix.DEFAULT_DATA_DTYPE)
        umi_matrix = sp_sparse.coo_matrix(
            (ones, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        print 'created umi matrix'
        LogPerf.mem()

        # Create a read-count matrix so we can summarize reads per barcode
        read_matrix = sp_sparse.coo_matrix(
            (new_read_pairs, (mol_feature_idx, mol_barcode_idx)),
            shape=(num_features, num_bcs))
        # Drop the per-molecule arrays now that both matrices exist
        del ones
        del mol_feature_idx
        del mol_barcode_idx
        del new_read_pairs

        # Get all barcodes strings for the raw matrix
        barcode_seqs = mc.get_barcodes()

        print len(barcode_seqs), len(gem_groups)
        print 'creating barcode strings'
        LogPerf.mem()

        # Build the matrix column labels: for each gem group, its slice of
        # the barcode list formatted together with that gem group.
        barcodes = []
        for gg in gem_groups:
            idx_start = gg_barcode_idx_start[gg]
            idx_end = idx_start + gg_barcode_idx_len[gg]
            gg_bcs = np.array([
                cr_utils.format_barcode_seq(bc, gg)
                for bc in barcode_seqs[idx_start:idx_end]
            ])
            barcodes.append(gg_bcs)
        barcodes = np.concatenate(barcodes)
        # Mark the label array read-only
        barcodes.flags.writeable = False

        print 'created barcode strings'
        LogPerf.mem()

        # Get mapped reads per barcode per library,genome
        # (this initial empty dict is overwritten two statements below)
        read_summary = {}
        read_matrix = CountMatrix(feature_ref, barcodes, read_matrix)
        read_matrix.m = read_matrix.m.tocsc(copy=True)
        read_summary = summarize_read_matrix(read_matrix, library_info,
                                             barcode_info, barcode_seqs)
        del read_matrix

        print 'created read matrix'
        LogPerf.mem()
        # Construct the raw UMI matrix
        raw_umi_matrix = CountMatrix(feature_ref, barcodes, umi_matrix)
        raw_umi_matrix.save_h5_file(outs.raw_matrix_h5)
        outs.raw_nnz = raw_umi_matrix.m.nnz

        # Construct the filtered UMI matrix
        filtered_bcs = MoleculeCounter.get_filtered_barcodes(
            barcode_info, library_info, barcode_seqs)
        filtered_umi_matrix = raw_umi_matrix.select_barcodes_by_seq(
            filtered_bcs)
        filtered_umi_matrix.save_h5_file(outs.filtered_matrix_h5)
        outs.filtered_nnz = filtered_umi_matrix.m.nnz

        print 'created filtered umi matrix'
        LogPerf.mem()

        summary = {
            'read_summary': read_summary,
            'mol_metrics': metrics_out,
        }

        with open(outs.chunk_summary, 'w') as f:
            json.dump(tk_safe_json.json_sanitize(summary),
                      f,
                      indent=4,
                      sort_keys=True)

    # Don't write MEX from chunks.
    outs.raw_matrices_mex = None
    outs.filtered_matrices_mex = None
Ejemplo n.º 13
0
def join(args, outs, chunk_defs, chunk_outs):
    """Cross-check the aggregated matrix against the input molecule files.

    Gathers genomes, features, the number of gem groups, and per
    (library_id, gem group) molecule tallies per barcode and per feature
    from every input molecule file. Gathers the matching values from the
    merged raw matrix, logs a reason and calls martian.exit whenever a
    comparison fails, and finally writes a small JSON summary.

    Args:
      args: Stage arguments; reads input_sample_defs and
        merged_raw_gene_bc_matrices_h5.
      outs: Stage outputs; the summary JSON is written to outs.summary.
      chunk_defs: Not used by this function.
      chunk_outs: Not used by this function.
    """
    exit_message = (
        'An internal problem in the aggr pipeline has been detected '
        'that might lead to incorrect results. Please report this '
        'problem to [email protected].')

    def abort(reason):
        # Log the specific reason, then stop with the generic user message.
        martian.log_info(reason)
        martian.exit(exit_message)

    # ---- Invariants computed from the inputs ----
    genomes_in = set()
    features_in = set()
    bc_counts_in = {}
    feature_counts_in = {}
    n_gem_groups_in = 0

    for sample_def in args.input_sample_defs:
        library_id = sample_def['library_id']
        with MoleculeCounter.open(sample_def[cr_constants.AGG_H5_FIELD],
                                  'r') as mc:
            genomes_in.update(mol_counter_genomes(mc))
            features_in.update(mol_counter_features_id_type(mc))
            gem_groups = mc.get_gem_groups()
            n_gem_groups_in += len(gem_groups)

            gg_column = mc.get_column('gem_group')

            # Molecules per barcode index, for each gem group
            bc_column = mc.get_column('barcode_idx')
            for gg in gem_groups:
                per_barcode = np.zeros(len(mc.get_ref_column('barcodes')))
                seen_idx, n_molecules = np.unique(bc_column[gg_column == gg],
                                                  return_counts=True)
                per_barcode[seen_idx] = n_molecules
                bc_counts_in[(library_id, gg)] = per_barcode
            # Drop the column before loading the next one
            del bc_column

            # Molecules per feature index, for each gem group
            feature_column = mc.get_column('feature_idx')
            for gg in gem_groups:
                per_feature = np.zeros(len(mc.feature_reference.feature_defs))
                seen_idx, n_molecules = np.unique(
                    feature_column[gg_column == gg], return_counts=True)
                per_feature[seen_idx] = n_molecules
                feature_counts_in[(library_id, gg)] = per_feature
            del feature_column

    # ---- Invariants computed from the aggregated output ----
    output_matrix = cr_matrix.CountMatrix.load_h5_file(
        args.merged_raw_gene_bc_matrices_h5)
    genomes_out = set(output_matrix.get_genomes())
    features_out = set(count_matrix_features_id_type(output_matrix))
    bc_counts_out = {}
    feature_counts_out = {}
    gem_index_out = cr_matrix.get_gem_group_index(
        args.merged_raw_gene_bc_matrices_h5)
    n_gem_groups_out = len(gem_index_out)

    for new_gg in gem_index_out:
        # Each entry maps a gem group of the output to (library_id, old gem group)
        src_library_id, src_gg = gem_index_out[new_gg]
        key = (src_library_id, src_gg)
        gg_matrix = output_matrix.select_barcodes_by_gem_group(new_gg)
        bc_counts_out[key] = gg_matrix.get_counts_per_bc()
        feature_counts_out[key] = gg_matrix.get_counts_per_feature()

    # ---- Compare ----
    if genomes_in != genomes_out:
        abort(
            'Genomes differ between input molecule files and aggregated matrix'
        )
    if features_in != features_out:
        abort(
            'Features differ between input molecule files and aggregated matrix'
        )
    if n_gem_groups_in != n_gem_groups_out:
        abort(
            'Number of GEM groups differs between input molecule files and aggregated matrix'
        )

    for lib_gg in bc_counts_in:
        bcs_before = bc_counts_in[lib_gg]
        bcs_after = bc_counts_out[lib_gg]
        if len(bcs_before) != len(bcs_after):
            abort(
                'Barcode list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
        # The output may not contain more counts than the input had
        if np.any(bcs_before < bcs_after):
            abort(
                'Barcode(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))

        features_before = feature_counts_in[lib_gg]
        features_after = feature_counts_out[lib_gg]
        if len(features_before) != len(features_after):
            abort(
                'Feature list for library {}, GEM group {} has different length '
                'in aggregated output compared to input.'.format(
                    lib_gg[0], lib_gg[1]))
        if np.any(features_before < features_after):
            abort(
                'Feature(s) in library {}, GEM group {} have higher UMI counts '
                'in aggregated output compared to inputs'.format(
                    lib_gg[0], lib_gg[1]))

    summary = {
        'genomes_present': list(genomes_in),
        'num_features_in_ref': len(features_in),
        'num_gem_groups': n_gem_groups_in,
    }

    with open(outs.summary, 'w') as summary_file:
        json.dump(tk_safe_json.json_sanitize(summary),
                  summary_file,
                  indent=4,
                  sort_keys=True)
Ejemplo n.º 14
0
def join(args, outs, chunk_defs, chunk_outs):
    """Merge the chunked matrices, write them out, and extend the summary.

    Merges the raw and filtered matrices named in args, saves each as H5 and
    MEX, writes a barcode summary HDF5 file, and adds per library type and
    genome metrics on the filtered barcodes to the summary loaded from
    args.summary. The result is written to outs.summary.

    Args:
      args: Stage arguments; reads summary, molecules, sample_defs,
        gem_group_index, raw_matrices_h5 and filtered_matrices_h5.
      outs: Stage outputs; reads the raw/filtered H5 and MEX paths,
        barcode_summary_h5 and summary.
      chunk_defs: Not used by this function.
      chunk_outs: Not used by this function.
    """
    version = martian.get_pipelines_version()

    with open(args.summary) as summary_in:
        summary = json.load(summary_in)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        barcode_info = mc.get_barcode_info()
        barcode_seqs = mc.get_barcodes()

    lib_types = sorted({lib['library_type'] for lib in library_info})

    # Attributes from user-added aggr CSV columns, plus the original
    # library/gem group info
    extra_attrs = get_custom_aggr_columns(args.sample_defs)
    extra_attrs.update(cr_matrix.make_library_map_aggr(args.gem_group_index))

    # ---- Raw matrix ----
    raw_matrix = cr_matrix.merge_matrices(args.raw_matrices_h5)
    raw_matrix.save_h5_file(outs.raw_matrix_h5, extra_attrs=extra_attrs)

    raw_genomes = raw_matrix.get_genomes()

    # Barcode summary HDF5 file w/ GEX data for the barcode rank plot
    with h5py.File(outs.barcode_summary_h5, 'w') as h5:
        cr_io.create_hdf5_string_dataset(h5, cr_constants.H5_BC_SEQUENCE_COL,
                                         raw_matrix.bcs)

        gex_view = raw_matrix.view().select_features_by_type(
            lib_constants.GENE_EXPRESSION_LIBRARY_TYPE)
        gex_bc_counts = gex_view.sum(axis=0).astype('uint64')
        if len(raw_genomes) == 1:
            genome_key = raw_genomes[0]
        else:
            genome_key = lib_constants.MULTI_REFS_PREFIX
        h5.create_dataset(
            '_%s_transcriptome_conf_mapped_deduped_barcoded_reads' % genome_key,
            data=gex_bc_counts)

    rna_matrix.save_mex(raw_matrix, outs.raw_matrix_mex, version)
    del raw_matrix

    # ---- Filtered matrix ----
    filt_mat = cr_matrix.merge_matrices(args.filtered_matrices_h5)
    filt_mat.save_h5_file(outs.filtered_matrix_h5, extra_attrs=extra_attrs)

    # Summarize the matrix across library types and genomes
    for lib_type in lib_types:
        libtype_prefix = rna_library.get_library_type_metric_prefix(lib_type)

        # Library types without genomes get a single pass with genome=None
        if rna_library.has_genomes(lib_type):
            type_genomes = filt_mat.get_genomes()
        else:
            type_genomes = [None]

        typed_view = filt_mat.view().select_features_by_type(lib_type)

        for genome in type_genomes:
            if genome is not None:
                sub_matrix = typed_view.select_features_by_genome(genome)
                genome_idx = barcode_info.genomes.index(genome)
                genome_prefix = genome
            else:
                sub_matrix = typed_view
                genome_idx = None
                genome_prefix = lib_constants.MULTI_REFS_PREFIX

            # Select barcodes passing filter for this (lib_type, genome)
            passing_bcs = MoleculeCounter.get_filtered_barcodes(
                barcode_info,
                library_info,
                barcode_seqs,
                genome_idx=genome_idx,
                library_type=lib_type)
            sub_matrix = sub_matrix.select_barcodes_by_seq(passing_bcs)

            features_per_bc = sub_matrix.count_ge(
                axis=0, threshold=cr_constants.MIN_COUNTS_PER_GENE)
            median_features = np.median(features_per_bc)
            median_counts = np.median(sub_matrix.sum(axis=0))

            key_parts = (libtype_prefix, genome_prefix)
            if genome is not None:
                flt_reads = summary['%s%s_flt_mapped_reads' % key_parts]
                raw_reads = summary['%s%s_raw_mapped_reads' % key_parts]
                summary['%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac'
                        % key_parts] = tk_stats.robust_divide(
                            flt_reads, raw_reads)

            summary['%s%s_filtered_bcs_median_counts' % key_parts] = median_counts
            summary['%s%s_filtered_bcs_median_unique_genes_detected'
                    % key_parts] = median_features

        # Compute frac reads in cells across all genomes
        all_parts = [(libtype_prefix, g) for g in type_genomes
                     if g is not None]
        if not all_parts:
            all_parts = [(libtype_prefix, lib_constants.MULTI_REFS_PREFIX)]
        flt_reads = sum(
            summary['%s%s_flt_mapped_reads' % parts] for parts in all_parts)
        raw_reads = sum(
            summary['%s%s_raw_mapped_reads' % parts] for parts in all_parts)

        multi_key = '%s%s_filtered_bcs_conf_mapped_barcoded_reads_cum_frac' % (
            libtype_prefix, lib_constants.MULTI_REFS_PREFIX)
        summary[multi_key] = tk_stats.robust_divide(flt_reads, raw_reads)

    # Write MEX format (do it last because it converts the matrices to COO)
    rna_matrix.save_mex(filt_mat, outs.filtered_matrix_mex, version)

    with open(outs.summary, 'w') as summary_out:
        json.dump(tk_safe_json.json_sanitize(summary),
                  summary_out,
                  indent=4,
                  sort_keys=True)
Ejemplo n.º 15
0
def split(args):
    """Define the subsampling targets and chunk the molecule info file.

    For every library type, a list of target read pairs per cell is built
    from cr_constants.SUBSAMPLE_READS_PER_CELL plus the deciles of the
    lowest raw read-pairs-per-cell among the libraries of that type. Each
    target is converted into a per-library subsampling rate for both the
    raw and the mapped depth type.

    The molecule info file is opened in a `with` block so it is closed on
    every exit path, including the early return and any exception.

    Args:
      args: Stage arguments; reads molecule_info and filtered_barcodes.

    Returns:
      dict: {'chunks': []} when no library has any cell-associated barcode.
        Otherwise {'chunks': [...], 'join': {...}}, where every chunk
        carries 'chunk_start', 'chunk_len' and the shared 'subsample_info'
        list.
    """
    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        # Get required info from the mol info
        genomes = sorted(
            set(
                f.tags.get('genome', '')
                for f in mc.feature_reference.feature_defs))
        cell_bcs_by_genome = get_cell_associated_barcodes(
            genomes, args.filtered_barcodes)

        # Get cell counts per gem group
        n_cells_per_gg = defaultdict(int)
        for bc in cell_bcs_by_genome['']:
            _, gem_group = cr_utils.split_barcode_seq(bc)
            n_cells_per_gg[gem_group] += 1

        library_info = mc.library_info

        # Assign gem group cell counts to their constituent libraries
        # TODO FIXME: Need to allow for per-library cell counts
        #   because some feature types might only have a subset of the GEX
        #   cell-assoc barcodes.
        n_cells_per_lib = np.zeros(len(library_info), dtype=int)
        for lib_idx, lib in enumerate(library_info):
            n_cells_per_lib[lib_idx] = n_cells_per_gg[lib['gem_group']]

        if n_cells_per_lib.sum() == 0:
            # Nothing to subsample; the `with` block closes the file here too
            return {'chunks': []}

        raw_count_per_lib = np.array(mc.get_raw_read_pairs_per_library())
        raw_rppc_per_lib = raw_count_per_lib.astype(float) / n_cells_per_lib
        usable_count_per_lib = np.array(
            mc.get_usable_read_pairs_per_library())
        # Fraction of raw read pairs that are usable, per library. It does
        # not depend on the target, so it is computed once.
        usable_read_fracs = usable_count_per_lib.astype(
            float) / raw_count_per_lib

        subsamplings = []  # track subsample info definitions

        library_types = sorted(
            set(lib['library_type'] for lib in library_info))
        for library_type in library_types:
            # All libraries w/ this type
            lib_indexes = np.array([
                i for i, lib in enumerate(library_info)
                if lib['library_type'] == library_type
            ])

            # For plotting, we want a series of target depths that exist for
            #   all libraries w/ the same library type. When there's a single
            #   library per type (the common case), this is trivial - split
            #   it into deciles. But if there are multiple libraries with
            #   different depths, (e.g., because gem-group-aggregation was
            #   used to increase cell numbers), we need to find depths that
            #   are achievable for all libraries. For now, let the
            #   lowest-depth library for a given type dictate this.
            min_raw_rppc = np.min(raw_rppc_per_lib[lib_indexes])

            # Use deciles of the raw read pairs per cell. A list (not a
            # lazy map) is required because it is concatenated below.
            deciles = np.arange(0.1, 1.1, 0.1)
            plot_targets = [round(t) for t in min_raw_rppc * deciles]

            # TODO: separate this work (internal + non)
            raw_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + \
                          plot_targets

            # TODO: separate this work (internal + non)
            usable_targets = cr_constants.SUBSAMPLE_READS_PER_CELL + \
                             plot_targets

            for targets, depth_type in (
                    (raw_targets, cr_constants.RAW_SUBSAMPLE_TYPE),
                    (usable_targets, cr_constants.MAPPED_SUBSAMPLE_TYPE)):
                for target_rppc in sorted(set(int(t) for t in targets)):
                    if depth_type == cr_constants.RAW_SUBSAMPLE_TYPE:
                        # Infer the usable depth required to achieve this
                        # raw depth
                        target_usable_counts = \
                            target_rppc * n_cells_per_lib * usable_read_fracs
                    else:
                        target_usable_counts = target_rppc * n_cells_per_lib

                    # Zero out libraries of the other types
                    rates = np.zeros(len(library_info), dtype=float)
                    rates[lib_indexes] = \
                        target_usable_counts[lib_indexes].astype(float) \
                        / usable_count_per_lib[lib_indexes]

                    # Clamp rates that are close to 1 to 1
                    rates[np.absolute(rates - 1) < 1e-3] = 1

                    # Zero out the libraries for which we have fewer reads
                    # than the target
                    rates[rates > 1] = 0.0

                    enough_data = np.any((rates > 0) & (rates <= 1))
                    if not enough_data:
                        rates = np.zeros(len(rates))

                    subsamplings.append({
                        'library_type': library_type,
                        'subsample_type': depth_type,
                        'target_read_pairs_per_cell': int(target_rppc),
                        'library_subsample_rates':
                        [float(r) for r in rates],
                    })

        # Each chunk needs to store a piece of the mol info h5
        tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

        # Split the molecule info h5 into equi-RAM chunks
        chunks = []
        for chunk_start, chunk_len in mc.get_chunks(
                tgt_chunk_len, preserve_boundaries=True):
            chunks.append({
                'chunk_start': chunk_start,
                'chunk_len': chunk_len,
                'subsample_info': subsamplings,
                # estimate_mem_gb only counts the memory used by the
                # MoleculeCounter object, which under-estimates the actual
                # memory usage. Based on memory profiling with test case
                # fuzzer_114, actual memory usage is ~4x more than
                # estimate_mem_gb (without cap), so scale = 6 is used here.
                '__mem_gb':
                MoleculeCounter.estimate_mem_gb(chunk_len, scale=6),
            })

    join_def = {
        '__mem_gb': 6,
    }

    # TODO: is this really necessary w/ martian 3
    if not chunks:
        chunks.append({
            'chunk_start': str(0),
            'chunk_len': str(0),
            'subsample_info': [],
        })

    return {'chunks': chunks, 'join': join_def}
Ejemplo n.º 16
0
def join(args, outs, chunk_defs, chunk_outs):
    """Pass through the chunk matrices and write the normalization summary.

    Collects the per-chunk matrix paths and sums the per-chunk non-zero
    counts, merges the per-chunk read summaries, and reports per-library and
    per-library-type normalization metrics in outs.summary.

    Args:
      args: Stage arguments; reads molecules.
      outs: Stage outputs; sets raw_matrices_h5, raw_nnz,
        filtered_matrices_h5 and filtered_nnz, and writes JSON to
        outs.summary.
      chunk_defs: Chunk definitions; frac_reads_kept and num_cells are read
        from the first one.
      chunk_outs: Chunk outputs; reads raw_matrix_h5, raw_nnz,
        filtered_matrix_h5, filtered_nnz and chunk_summary.
    """
    # Pass through the matrix chunks and nnz counts
    outs.raw_matrices_h5 = [o.raw_matrix_h5 for o in chunk_outs]
    outs.raw_nnz = sum(o.raw_nnz for o in chunk_outs)
    outs.filtered_matrices_h5 = [o.filtered_matrix_h5 for o in chunk_outs]
    # This was previously assigned to the misspelled 'filted_nnz', so the
    # total was stored under the wrong attribute name.
    outs.filtered_nnz = sum(o.filtered_nnz for o in chunk_outs)

    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()

    lib_types = sorted(set(lib['library_type'] for lib in library_info))

    # Every chunk carries the same values, so the first one is used
    # (presumably - the values are produced per chunk by split; confirm there)
    summary = {
        'frac_reads_kept': chunk_defs[0].frac_reads_kept,
        'num_cells_by_library': chunk_defs[0].num_cells,
    }

    # Merge read summary metrics by summing them over the chunks
    read_summary = defaultdict(int)
    for filename in [co.chunk_summary for co in chunk_outs]:
        with open(filename) as f:
            chunk_read_summary = json.load(f)['read_summary']
            for k in chunk_read_summary:
                read_summary[k] += chunk_read_summary[k]
    summary.update(read_summary)

    # Get summary metrics
    with open(chunk_outs[0].chunk_summary) as f:
        mol_metrics = json.load(f)['mol_metrics']
    for k in [k for k in mol_metrics if k.startswith('chemistry')]:
        summary[k] = mol_metrics[k]
    print(json.dumps(mol_metrics, indent=4, sort_keys=True))

    # Report normalization metrics.
    # An OrderedDict is used as an insertion-ordered set of batch names.
    all_batches = OrderedDict()

    # These are all per-library-type
    min_frac_reads_kept = np.ones(len(lib_types), dtype='float')
    total_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_ds_raw_read_pairs = np.zeros(len(lib_types), dtype='uint64')
    total_cells = np.zeros(len(lib_types), dtype='uint64')

    for lib_type_idx, lib_type in enumerate(lib_types):
        lib_inds = [
            i for i, lib in enumerate(library_info)
            if lib['library_type'] == lib_type
        ]
        for lib_idx in lib_inds:
            aggr_id = library_info[lib_idx]['aggr_id']
            old_gg = library_info[lib_idx]['old_gem_group']
            # Only gem groups above 1 get a numeric suffix in the batch name
            batch = aggr_id + ('-%d' % old_gg if old_gg > 1 else '')
            all_batches[batch] = None

            n_cells = summary['num_cells_by_library'][lib_idx]
            total_cells[lib_type_idx] += n_cells

            lib_metrics = mol_metrics[cr_mol_counter.LIBRARIES_METRIC][str(
                lib_idx)]
            raw_read_pairs = lib_metrics[cr_mol_counter.TOTAL_READS_METRIC]
            mapped_read_pairs = lib_metrics[cr_mol_counter.USABLE_READS_METRIC]
            ds_read_pairs = lib_metrics[
                cr_mol_counter.DOWNSAMPLED_READS_METRIC]

            total_raw_read_pairs[lib_type_idx] += raw_read_pairs
            total_ds_raw_read_pairs[lib_type_idx] += ds_read_pairs

            frac_reads_kept = summary['frac_reads_kept'][lib_idx]
            min_frac_reads_kept[lib_type_idx] = min(
                min_frac_reads_kept[lib_type_idx], frac_reads_kept)

            pre_norm_raw_rppc = tk_stats.robust_divide(raw_read_pairs, n_cells)
            pre_norm_mapped_rppc = tk_stats.robust_divide(
                mapped_read_pairs, n_cells)

            # Prefix with batch and library type
            if lib_type.lower().startswith(
                    rna_library.CUSTOM_LIBRARY_TYPE_PREFIX.lower()):
                lib_prefix = rna_library.CUSTOM_LIBRARY_TYPE_PREFIX + '_'
            else:
                lib_prefix = rna_library.get_library_type_metric_prefix(
                    lib_type)

            p = (batch, lib_prefix)
            summary.update({
                '%s_%sfrac_reads_kept' % p:
                frac_reads_kept,
                '%s_%spre_normalization_raw_reads_per_filtered_bc' % p:
                pre_norm_raw_rppc,
                '%s_%spre_normalization_cmb_reads_per_filtered_bc' % p:
                pre_norm_mapped_rppc,
            })
    summary['batches'] = list(all_batches.keys())

    for lib_type_idx, lib_type in enumerate(lib_types):
        mean_rppc = tk_stats.robust_divide(total_raw_read_pairs[lib_type_idx],
                                           total_cells[lib_type_idx])
        ds_mean_rppc = tk_stats.robust_divide(
            total_ds_raw_read_pairs[lib_type_idx], total_cells[lib_type_idx])

        p = rna_library.get_library_type_metric_prefix(lib_type)
        summary.update({
            '%spre_normalization_total_reads' % p:
            total_raw_read_pairs[lib_type_idx],
            '%spost_normalization_total_reads' % p:
            total_ds_raw_read_pairs[lib_type_idx],
            '%sfiltered_bcs_transcriptome_union' % p:
            total_cells[lib_type_idx],
            '%spre_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            mean_rppc,
            '%spost_normalization_multi_transcriptome_total_raw_reads_per_filtered_bc' % p:
            ds_mean_rppc,
            '%slowest_frac_reads_kept' % p:
            min_frac_reads_kept[lib_type_idx],
        })

    with open(outs.summary, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(summary),
                  f,
                  indent=4,
                  sort_keys=True)
Ejemplo n.º 17
0
def split(args):
    """Compute per-library downsampling rates and chunk the molecule info.

    Unless args.normalization_mode equals cr_constants.NORM_MODE_NONE, each
    library is assigned the fraction of reads to keep that brings its usable
    read pairs per cell down to the lowest value among libraries of the
    same library type.

    Args:
      args: Stage arguments; reads normalization_mode and molecules.

    Returns:
      dict: 'chunks' holds one entry per molecule info chunk with
        'frac_reads_kept', 'num_cells', 'chunk_start', 'chunk_len' and
        '__mem_gb'; 'join' holds the resource request for the join.
    """
    # Downsampling by mapped reads is the default
    downsample = args.normalization_mode != cr_constants.NORM_MODE_NONE

    # Gather per-library usable reads and cell counts
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        library_info = mc.get_library_info()
        usable_reads = mc.get_usable_read_pairs_per_library()
        n_libraries = len(library_info)
        cells = np.array([
            mc.get_num_filtered_barcodes_for_library(lib_idx)
            for lib_idx in range(n_libraries)
        ])

        print("Libraries: %s" % library_info)
        print("Usable reads: %s" % usable_reads)
        print("Cells: %s" % cells)

        # Libraries without cells keep a depth of 0.0
        usable_rpc = np.zeros(n_libraries, dtype=float)
        for lib_idx, n_cells in enumerate(cells):
            if n_cells > 0:
                usable_rpc[lib_idx] = tk_stats.robust_divide(
                    usable_reads[lib_idx], n_cells)

    # Determine lowest depth for each library type
    rpcs_by_type = defaultdict(list)
    for lib, rpc in zip(library_info, usable_rpc):
        rpcs_by_type[lib['library_type']].append(rpc)
    min_rpc_by_lt = {
        lib_type: min(rpcs)
        for lib_type, rpcs in rpcs_by_type.items()
    }

    for lib, rpc in zip(library_info, usable_rpc):
        lib_type = lib['library_type']
        print("%s Usable read pairs per cell: %s" % (lib_type, rpc))
        print("%s Minimum read pairs usable per cell: %d" %
              (lib_type, min_rpc_by_lt[lib_type]))

    if downsample:
        # Entries stay 0 where the lowest depth of the library type is 0
        frac_reads_kept = np.zeros(n_libraries, dtype=float)
        for lib_idx, lib in enumerate(library_info):
            lowest_rpc = min_rpc_by_lt[lib['library_type']]
            if lowest_rpc != 0:
                frac_reads_kept[lib_idx] = tk_stats.robust_divide(
                    lowest_rpc, usable_rpc[lib_idx])
    else:
        frac_reads_kept = np.ones(n_libraries, dtype=float)

    # Split the molecule info h5 into equi-RAM chunks, preserving
    # (barcode, gem_group) boundaries.
    # Assumes the molecule_info is sorted by (gem_group, barcode)
    tgt_chunk_len = cr_constants.NUM_MOLECULE_INFO_ENTRIES_PER_CHUNK

    # For memory request calculation
    num_gem_groups = len({lib['gem_group'] for lib in library_info})

    chunks = []
    with MoleculeCounter.open(args.molecules, 'r') as mc:
        # Number of barcodes in the full matrix
        num_barcodes = mc.get_ref_column_lazy('barcodes').shape[0]

        chunk_bounds = mc.get_chunks(tgt_chunk_len, preserve_boundaries=True)
        for chunk_start, chunk_len in chunk_bounds:
            mol_mem_gb = MoleculeCounter.estimate_mem_gb(chunk_len,
                                                         scale=2.0,
                                                         cap=False)
            print('molecule_info mem_gb = %d' % mol_mem_gb)

            # Worst case number of nonzero elements in chunk matrix is one
            # per molecule in the chunk
            matrix_mem_gb = CountMatrix.get_mem_gb_from_matrix_dim(
                num_barcodes * num_gem_groups, chunk_len)
            print('matrix mem_gb = %d' % matrix_mem_gb)

            chunk_def = {
                'frac_reads_kept': list(frac_reads_kept),
                'num_cells': list(cells),
                'chunk_start': chunk_start,
                'chunk_len': chunk_len,
                # Matrix plus molecule info, but never below the minimum
                '__mem_gb': max(h5_constants.MIN_MEM_GB,
                                matrix_mem_gb + mol_mem_gb),
            }
            chunks.append(chunk_def)

    # Join is not loading the merged matrix, so it doesn't need much memory.
    # WRITE_MATRICES will use the precise nnz counts to make an appropriate
    # mem request.
    return {'chunks': chunks, 'join': {'__mem_gb': 3, '__threads': 2}}
Ejemplo n.º 18
0
def main(args, outs):
    """Rewrite one molecule info file with remapped indices.

    Copies every column of args.molecule_h5 into outs.molecule_h5, replacing
    gem groups and library indices by the values given by
    get_library_mapping and shifting barcode indices by
    args.barcode_idx_offset. The barcode info and the per-gem-group and
    per-library metrics are remapped in the same way.

    Args:
      args: Stage arguments; reads molecule_h5, aggr_id, libraries,
        barcode_idx_offset and merged_barcodes.
      outs: Stage outputs; molecule_h5 is written and new_gem_groups is set
        to the gem groups present after remapping.
    """
    with MoleculeCounter.open(args.molecule_h5, 'r') as in_mc:
        # Lookup arrays from old to new gem group / library index
        gg_map, lib_idx_map = get_library_mapping(args.aggr_id, args.libraries)

        # load merged barcode whitelists
        # NOTE(review): this unpickles args.merged_barcodes; presumably it is
        # produced by an upstream stage - confirm it is never user-supplied.
        bc_idx_offset = args.barcode_idx_offset
        with open(args.merged_barcodes) as pickle_file:
            merged_barcodes = cPickle.load(pickle_file)

        # FIXME: Handle heterogeneous feature references
        merged_feature_ref = in_mc.get_feature_ref()

        # Remap the barcode info (the pass_filter array is modified in place):
        # column 0 is shifted like the barcode indices, column 1 is mapped
        # like the library indices
        old_barcode_info = in_mc.get_barcode_info()
        pass_filter = old_barcode_info.pass_filter
        pass_filter[:, 0] = pass_filter[:, 0] + bc_idx_offset
        pass_filter[:, 1] = lib_idx_map[pass_filter[:, 1]]

        new_barcode_info = cr_mol_counter.BarcodeInfo(
            pass_filter=pass_filter,
            genomes=old_barcode_info.genomes,
        )

        with MoleculeCounter.open(
                outs.molecule_h5,
                'w',
                feature_ref=merged_feature_ref,
                barcodes=merged_barcodes,
                library_info=args.libraries,
                barcode_info=new_barcode_info,
        ) as out_mc:

            # Copy the datasets, rewriting the ones we remap
            for col, ds in in_mc.columns.iteritems():
                values = ds[:]
                if col == 'gem_group':
                    values = gg_map[values]
                elif col == 'library_idx':
                    values = lib_idx_map[values]
                elif col == 'barcode_idx':
                    values = values + bc_idx_offset
                out_mc.append_column(col, values)

                if col == 'gem_group':
                    # Gem groups that occur at least once after remapping
                    outs.new_gem_groups = np.flatnonzero(
                        np.bincount(values)).tolist()

            # Copy over all standard metrics
            out_metrics = in_mc.get_all_metrics()

            # Remap the per-gem-group and per-library metrics; the keys are
            # the old indices as strings
            old_gg_metrics = in_mc.get_metric(cr_mol_counter.GEM_GROUPS_METRIC)
            out_metrics[cr_mol_counter.GEM_GROUPS_METRIC] = {
                str(gg_map[int(old_gg)]): metric
                for old_gg, metric in old_gg_metrics.iteritems()
            }
            old_lib_metrics = in_mc.get_metric(cr_mol_counter.LIBRARIES_METRIC)
            out_metrics[cr_mol_counter.LIBRARIES_METRIC] = {
                str(lib_idx_map[int(old_lib)]): metric
                for old_lib, metric in old_lib_metrics.iteritems()
            }

            out_mc.set_all_metrics(out_metrics)
Ejemplo n.º 19
0
def main(args, outs):
    """Subsample one chunk of molecules and tally per-barcode metrics.

    For every task in ``args.subsample_info`` the read pairs of each molecule
    in the chunk are binomially subsampled at the rate of the molecule's
    library, and UMI / detected-feature / read-pair tallies are accumulated
    per genome. The results are pickled to ``outs.metrics``.

    Args:
      args: Stage arguments. Reads ``molecule_info``, ``filtered_barcodes``,
        ``chunk_start``, ``chunk_len`` and ``subsample_info`` (a sequence of
        dicts, each with a ``'library_subsample_rates'`` entry).
      outs: Stage outputs. ``outs.metrics`` is the path the pickle is
        written to.

    The pickled dict has the keys ``umis_per_bc``, ``features_det_per_bc``
    (both shaped tasks x genomes x cells), ``read_pairs``, ``umis`` (both
    shaped tasks x genomes) and ``lib_type_genome_any_reads`` (library types
    x genomes, boolean).
    """
    # Fixed seed so the binomial subsampling below is reproducible.
    np.random.seed(0)

    # Keep the molecule info file open only while it is needed; the context
    # manager releases the handle even if the computation raises.
    with MoleculeCounter.open(args.molecule_info, 'r') as mc:
        feature_defs = mc.feature_reference.feature_defs

        # Get cell-associated barcodes
        genomes = sorted(set(f.tags.get('genome', '') for f in feature_defs))
        cell_bcs_by_genome = get_cell_associated_barcodes(
            genomes, args.filtered_barcodes)

        # Load chunk of relevant data from the mol_info
        chunk_start = int(args.chunk_start)
        chunk = slice(chunk_start, chunk_start + int(args.chunk_len))
        mol_library_idx = mc.get_column_lazy('library_idx')[chunk]
        mol_read_pairs = mc.get_column_lazy('count')[chunk]
        mol_gem_group = mc.get_column_lazy('gem_group')[chunk]
        mol_barcode_idx = mc.get_column_lazy('barcode_idx')[chunk]
        mol_feature_idx = mc.get_column_lazy('feature_idx')[chunk]

        barcodes = mc.get_ref_column('barcodes')

        # Give each cell-associated barcode an integer index.
        # NOTE(review): presumably the '' entry holds the cell barcodes of
        # all genomes combined -- confirm against
        # get_cell_associated_barcodes.
        cell_bcs = sorted(cell_bcs_by_genome[''])
        cell_bc_to_int = {bc: i for i, bc in enumerate(cell_bcs)}

        # Give each genome an integer index
        genome_to_int = {g: i for i, g in enumerate(genomes)}
        feature_int_to_genome_int = np.fromiter(
            (genome_to_int[f.tags.get('genome', '')] for f in feature_defs),
            dtype=int)
        mol_genome_idx = feature_int_to_genome_int[mol_feature_idx]

        # Determine which (library type, genome) pairs have any associated
        # reads
        library_info = mc.library_info
        lib_types = sorted(set(lib['library_type'] for lib in library_info))
        lib_type_to_int = {l: i for i, l in enumerate(lib_types)}
        lib_idx_to_lib_type_idx = np.fromiter(
            (lib_type_to_int[lib['library_type']] for lib in library_info),
            dtype=int)

        lib_type_genome_any_reads = np.zeros(
            (len(lib_types), len(genomes)), dtype=bool)
        has_reads = mol_read_pairs > 0
        lib_genome_idx_pairs = set(
            izip(mol_library_idx[has_reads], mol_genome_idx[has_reads]))
        for lib_idx, genome_idx in lib_genome_idx_pairs:
            lib_type_idx = lib_idx_to_lib_type_idx[lib_idx]
            lib_type_genome_any_reads[lib_type_idx, genome_idx] = True

        # Run each subsampling task on this chunk of data
        n_tasks = len(args.subsample_info)
        n_genomes = len(genomes)
        n_cells = len(cell_bcs)

        umis_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
        features_det_per_bc = np.zeros((n_tasks, n_genomes, n_cells))
        read_pairs_per_task = np.zeros((n_tasks, n_genomes))
        umis_per_task = np.zeros((n_tasks, n_genomes))

        for task_idx, task in enumerate(args.subsample_info):
            # Per-library subsampling rates
            rates_per_library = np.array(task['library_subsample_rates'],
                                         dtype=float)

            # Nothing is kept when every rate is zero; leave the tallies
            # at 0.
            if np.count_nonzero(rates_per_library) == 0:
                continue

            mol_rate = rates_per_library[mol_library_idx]

            # Subsampled read pairs per molecule
            new_read_pairs = np.random.binomial(mol_read_pairs, mol_rate)

            # Compute tallies for each barcode
            group_keys = (mol_gem_group, mol_barcode_idx)
            group_values = (mol_feature_idx, mol_genome_idx, new_read_pairs)
            for (gg, bc_idx), (feature_idx, genome_idx, read_pairs) in \
                    cr_utils.numpy_groupby(group_values, group_keys):

                barcode = cr_utils.format_barcode_seq(barcodes[bc_idx], gg)

                # None when the barcode is not in the cell barcode index.
                cell_idx = cell_bc_to_int.get(barcode)

                for this_genome_idx, genome in enumerate(genomes):
                    genome_mask = genome_idx == this_genome_idx
                    umis = np.flatnonzero((read_pairs > 0) & genome_mask)
                    this_genome_read_pairs = np.sum(read_pairs[genome_mask])

                    # Tally UMIs and median features detected.
                    # The None check matters: indexing with None acts as
                    # np.newaxis, so the assignment would broadcast over
                    # every cell instead of failing.
                    if (cell_idx is not None
                            and barcode in cell_bcs_by_genome[genome]):
                        # This is a cell-associated barcode for this genome
                        umis_per_bc[task_idx, this_genome_idx,
                                    cell_idx] = len(umis)
                        features_det_per_bc[
                            task_idx, this_genome_idx,
                            cell_idx] = np.count_nonzero(
                                np.bincount(feature_idx[umis]))

                    # Tally numbers for duplicate fraction
                    read_pairs_per_task[
                        task_idx, this_genome_idx] += this_genome_read_pairs
                    umis_per_task[task_idx, this_genome_idx] += len(umis)

    # HIGHEST_PROTOCOL is a binary pickle format, so the file must be opened
    # in binary mode.
    with open(outs.metrics, 'wb') as f:
        data = {
            'umis_per_bc': umis_per_bc,
            'features_det_per_bc': features_det_per_bc,
            'read_pairs': read_pairs_per_task,
            'umis': umis_per_task,
            'lib_type_genome_any_reads': lib_type_genome_any_reads,
        }
        cPickle.dump(data, f, protocol=cPickle.HIGHEST_PROTOCOL)