예제 #1
0
def main(args, outs):
    '''Merge metric summaries and render the final count reports.

    The per-chunk summaries in ``args.summaries`` are merged into
    ``outs.metrics_summary_json``. That merged file, together with the barcode
    summary, analysis and filtered-barcode inputs, is loaded as sample data
    and used to write the web summary HTML (with alerts going to
    ``outs.alerts``) and the metrics summary CSV.

    Args:
        args: Stage arguments. Reads ``summaries``, ``barcode_summary_h5``,
            ``analysis``, ``filtered_barcodes``, ``reference_path``,
            ``sample_id`` and ``sample_desc``.
        outs: Stage outputs. Uses ``metrics_summary_json``, ``web_summary``,
            ``alerts`` and ``metrics_summary_csv``.
    '''
    merged_summary = outs.metrics_summary_json
    cr_report.merge_jsons(args.summaries, merged_summary)

    # Locations of every input the web summary loader needs.
    data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=merged_summary,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
        filtered_barcodes_path=args.filtered_barcodes,
    )

    # The webshim helpers take the sample properties as a plain dict, so the
    # namedtuple-style record is converted right away.
    reference_genomes = cr_utils.get_reference_genomes(args.reference_path)
    properties = dict(
        CountSampleProperties(
            sample_id=args.sample_id,
            sample_desc=args.sample_desc,
            genomes=reference_genomes,
            version=martian.get_pipelines_version(),
        )._asdict())

    loaded_data = cr_webshim.load_sample_data(properties, data_paths)

    cr_webshim.build_web_summary_html(
        outs.web_summary,
        properties,
        loaded_data,
        PIPELINE_COUNT,
        alerts_output_filename=outs.alerts)
    cr_webshim.build_metrics_summary_csv(
        outs.metrics_summary_csv, properties, loaded_data, PIPELINE_COUNT)
예제 #2
0
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    '''Create a FeatureReference.

    Create a FeatureReference from a transcriptome ref and a feature barcode ref.
    Gene features from the transcriptome (if given) come first; features from
    the definition CSV (if given) are appended after them, with their indices
    offset by the number of gene features.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None, in
            which case no gene features are created and the reference is not
            touched at all.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.
    Returns:
        FeatureReference
    '''

    feature_defs = []
    all_tag_keys = ['genome']

    # Load gene info
    if gene_ref_path is not None:
        # Only query the reference when a path was actually supplied; the
        # genome list is needed solely to tag the gene features below.
        genomes = cr_utils.get_reference_genomes(gene_ref_path)

        gene_idx_filename = cr_utils.get_reference_genes_index(gene_ref_path)
        # NOTE(review): load_pickle presumably unpickles the index file, so
        # gene_ref_path should only ever point at a trusted reference.
        gene_index = cr_reference.GeneIndex.load_pickle(gene_idx_filename)

        # Stuff relevant fields of Gene tuple into FeatureDef
        for gene in gene_index.genes:
            genome = cr_utils.get_genome_from_str(gene.id, genomes)
            fd = FeatureDef(
                index=len(feature_defs),
                id=gene.id,
                name=gene.name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={
                    'genome': genome,
                })
            feature_defs.append(fd)

    # Load feature definition file
    if feature_def_filename is not None:
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # check the CRISPR 'target_gene_id' field, if it exists
        # it needs to match a transcriptome entry
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
예제 #3
0
def split(args):
    '''Plan the chunks and memory reservations for this stage.

    One chunk is created per entry in ``args.inputs``; every chunk gets the
    memory request derived from the barcode whitelist. The join request starts
    from the whitelist/gem-group based estimate and adds room for the
    reporters' barcode and UMI diversity dicts, then is clamped to
    ``[cr_constants.MIN_MEM_GB, cr_constants.COUNT_GENES_MAX_MEM_GB]``.

    Args:
        args: Stage arguments. Reads ``barcode_whitelist``, ``inputs``,
            ``gem_groups``, ``reference_path``, ``barcode_summary`` and
            ``chemistry_def``.
    Returns:
        dict: ``{'chunks': [...], 'join': {'__mem_gb': ...}}``.
    '''
    per_chunk_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    chunk_defs = [{
        'chunk_input': one_input,
        '__mem_gb': per_chunk_gb,
    } for one_input in args.inputs]

    base_join_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Account for memory used by reporters (particularly the bc and umi diversity dicts)
    genomes = cr_utils.get_reference_genomes(args.reference_path)

    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if whitelist is None:
        # No whitelist: fall back to the barcode count from the summary.
        bc_entries = cr_utils.get_num_barcodes_from_barcode_summary(
            args.barcode_summary)
    else:
        bc_entries = len(whitelist) * max(args.gem_groups)

    # Every possible UMI sequence over a 4-letter alphabet.
    umi_entries = 4**cr_chem.get_umi_length(args.chemistry_def)

    def diversity_gb(num_entries):
        # Multiply by 2 to hold the current reporter + accumulating reporter in the merge
        return (2 * num_entries *
                cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                (len(genomes) + 1) *
                len(cr_constants.READ_TYPES)) / 1e9

    wanted_gb = int(base_join_gb + diversity_gb(bc_entries) +
                    diversity_gb(umi_entries))
    at_least_min_gb = max(cr_constants.MIN_MEM_GB, wanted_gb)
    clamped_gb = min(cr_constants.COUNT_GENES_MAX_MEM_GB, at_least_min_gb)

    return {'chunks': chunk_defs, 'join': {'__mem_gb': clamped_gb}}
예제 #4
0
def main(args, outs):
    '''Count genes per barcode for one BAM chunk.

    Every read in ``args.chunk_input`` is passed to the reporter's
    ``count_genes_bam_cb``; reads it flags as confidently mapped and
    deduplicated are added to the gene-barcode matrices. The matrices are
    saved to ``outs.matrices_h5`` and the reporter to
    ``outs.chunked_reporter``.

    The barcode universe comes from the whitelist when one is loaded;
    otherwise the barcode summary is loaded and used instead.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``barcode_whitelist``,
            ``barcode_summary``, ``reference_path``, ``align``,
            ``gem_groups`` and ``chemistry_def``.
        outs: Stage outputs. Uses ``matrices_h5`` and ``chunked_reporter``.
    '''
    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    # Close the BAM handle even if setup or counting fails part-way through.
    try:
        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The barcode summary is only needed when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_summary(
            args.barcode_summary) if not barcode_whitelist else None

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                        args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(),
                                                genomes)
        matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

        # Loop-invariant: the chemistry does not change between reads.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for read in in_bam:
            is_conf_mapped_deduped, genome, gene_id, bc = \
                reporter.count_genes_bam_cb(read, use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrices.add(genome, gene_id, bc)
    finally:
        in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
예제 #5
0
def main(args, outs):
    '''Summarize one BAM chunk into per-molecule read counts.

    Reads are grouped with ``itertools.groupby`` on
    ``cr_utils.barcode_sort_key`` into (gem_group, barcode, gene_ids) runs.
    Within a run, every usable read is assigned to a molecule keyed by
    (UMI, integer gene id) and tallied into the columns reported by
    ``MoleculeCounter.get_data_columns()``. The molecules of each run are
    added to the ``MoleculeCounter`` opened at ``outs.output``, which also
    receives the reference columns and a per-gem-group metric.

    NOTE(review): ``groupby`` only merges adjacent items, so this assumes the
    input BAM is already ordered by ``cr_utils.barcode_sort_key`` - confirm
    against the upstream stage.

    Args:
        args: Stage arguments. Reads ``chunk_input``, ``reference_path``,
            ``filtered_barcodes`` and ``align``.
        outs: Stage outputs. ``coerce_strings()`` is called on it and the
            counter is written to ``output``.
    '''
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)

    # Close the BAM handle even if setup or counting fails part-way through.
    try:
        counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

        # Map each data column name to its slot in a molecule's count array.
        mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
        mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genome_index = cr_reference.get_genome_index(genomes)
        # One past the last gene index; stands in for "no gene" on reads that
        # are unmapped or not confidently mapped.
        none_gene_id = len(gene_index.get_genes())

        # store reference index columns
        # NOTE - these must be cast to str first, as unicode is not supported
        counter.set_ref_column('genome_ids',
                               [str(genome) for genome in genomes])
        counter.set_ref_column('gene_ids',
                               [str(gene.id) for gene in gene_index.genes])
        counter.set_ref_column('gene_names',
                               [str(gene.name) for gene in gene_index.genes])

        # Union of the filtered barcodes across all genomes.
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        filtered_bcs = set()
        for bcs in filtered_bcs_per_genome.values():
            filtered_bcs.update(bcs)

        gg_metrics = collections.defaultdict(
            lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

        # Loop-invariant: the MAPQ threshold does not change between reads.
        high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)

        for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
                in_bam, key=cr_utils.barcode_sort_key):
            if barcode is None or gem_group is None:
                continue

            # Must be evaluated before gem_group is replaced by its
            # compressed form below.
            is_cell_barcode = cr_utils.format_barcode_seq(
                barcode, gem_group) in filtered_bcs

            # One zero-initialised count array per (umi, gene) molecule.
            molecules = collections.defaultdict(
                lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

            compressed_barcode = \
                cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
            gem_group = cr_mol_counter.MoleculeCounter.compress_gem_group(
                gem_group)

            # Positions already seen per molecule, to count unique positions.
            read_positions = collections.defaultdict(set)
            for read in reads_iter:
                umi = cr_utils.get_read_umi(read)
                # ignore read2 to avoid double-counting. the mapping + annotation should be equivalent.
                if read.is_secondary or umi is None or read.is_read2:
                    continue

                raw_umi = cr_utils.get_read_raw_umi(read)
                # Only the barcode sequences are compared; the gem-group
                # halves of the split are not needed.
                raw_bc, _ = cr_utils.split_barcode_seq(
                    cr_utils.get_read_raw_barcode(read))
                proc_bc, _ = cr_utils.split_barcode_seq(
                    cr_utils.get_read_barcode(read))

                if cr_utils.is_read_conf_mapped_to_transcriptome(
                        read, high_conf_mapq):
                    assert len(gene_ids) == 1

                    mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                    map_type = 'reads'

                    read_pos = (read.tid, read.pos)
                    seen_positions = read_positions[mol_key]
                    uniq_read_pos = read_pos not in seen_positions
                    seen_positions.add(read_pos)

                    if is_cell_barcode:
                        gg_metrics[int(gem_group)][
                            cr_mol_counter.
                            GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1
                else:
                    mol_key = (umi, none_gene_id)
                    if read.is_unmapped:
                        map_type = 'unmapped_reads'
                    else:
                        map_type = 'nonconf_mapped_reads'
                    uniq_read_pos = False

                counts = molecules[mol_key]
                counts[mol_data_columns[map_type]] += 1
                counts[mol_data_columns['umi_corrected_reads']] += int(
                    raw_umi != umi)
                counts[mol_data_columns['barcode_corrected_reads']] += int(
                    raw_bc != proc_bc)
                counts[mol_data_columns['conf_mapped_uniq_read_pos']] += int(
                    uniq_read_pos)

            # Emit this barcode's molecules in sorted (umi, gene) order.
            for mol_key, molecule in sorted(molecules.items()):
                umi, gene_id = mol_key
                genome = cr_utils.get_genome_from_str(
                    gene_index.int_to_gene_id(gene_id), genomes)
                genome_id = cr_reference.get_genome_id(genome, genome_index)
                counter.add(
                    barcode=compressed_barcode,
                    gem_group=gem_group,
                    umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                    gene=gene_id,
                    genome=genome_id,
                    **{
                        key: molecule[col_idx]
                        for key, col_idx in mol_data_columns.items()
                    })
    finally:
        in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))

    counter.save()