def main(args, outs):
    '''Merge the per-chunk summaries and emit the web summary and metrics CSV.

    Args:
        args: Stage inputs. Reads summaries, barcode_summary_h5, analysis,
            filtered_barcodes, reference_path, sample_id and sample_desc.
        outs: Stage outputs. Writes metrics_summary_json, web_summary,
            alerts and metrics_summary_csv.
    '''
    cr_report.merge_jsons(args.summaries, outs.metrics_summary_json)

    # The merged JSON written above is what the web summary loader reads back.
    data_paths = cr_webshim_data.SampleDataPaths(
        summary_path=outs.metrics_summary_json,
        barcode_summary_path=args.barcode_summary_h5,
        analysis_path=args.analysis,
        filtered_barcodes_path=args.filtered_barcodes,
    )

    # The webshim helpers are handed a plain dict rather than the namedtuple.
    properties = dict(
        CountSampleProperties(
            sample_id=args.sample_id,
            sample_desc=args.sample_desc,
            genomes=cr_utils.get_reference_genomes(args.reference_path),
            version=martian.get_pipelines_version(),
        )._asdict())

    sample_data = cr_webshim.load_sample_data(properties, data_paths)

    cr_webshim.build_web_summary_html(
        outs.web_summary,
        properties,
        sample_data,
        PIPELINE_COUNT,
        alerts_output_filename=outs.alerts)
    cr_webshim.build_metrics_summary_csv(
        outs.metrics_summary_csv,
        properties,
        sample_data,
        PIPELINE_COUNT)
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    '''Create a FeatureReference.

    Create a FeatureReference from a transcriptome ref and a feature barcode ref.
    Gene features (if any) come first, followed by the features from the
    Feature Definition CSV (if any); CSV feature indices are offset by the
    number of gene features.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.

    Returns:
        FeatureReference
    '''
    feature_defs = []
    all_tag_keys = ['genome']

    # Load gene info
    if gene_ref_path is not None:
        # The genome list is only needed to tag genes, so it is looked up
        # inside the guard: the reference is never touched when
        # gene_ref_path is None.
        genomes = cr_utils.get_reference_genomes(gene_ref_path)

        gene_idx_filename = cr_utils.get_reference_genes_index(gene_ref_path)
        gene_index = cr_reference.GeneIndex.load_pickle(gene_idx_filename)

        # Stuff relevant fields of Gene tuple into FeatureDef
        for gene in gene_index.genes:
            genome = cr_utils.get_genome_from_str(gene.id, genomes)
            fd = FeatureDef(
                index=len(feature_defs),
                id=gene.id,
                name=gene.name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={
                    'genome': genome,
                })
            feature_defs.append(fd)

    # Load feature definition file
    if feature_def_filename is not None:
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # check the CRISPR 'target_gene_id' field, if it exists
        # it needs to match a transcriptome entry
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
def split(args):
    '''Build the chunk definitions and the join-stage memory request.

    One chunk is created per entry in args.inputs, each with the same
    memory request. The join request adds an estimate for the barcode and
    UMI diversity dicts held by the reporters, then is clamped to
    [MIN_MEM_GB, COUNT_GENES_MAX_MEM_GB].

    Args:
        args: Stage inputs. Reads barcode_whitelist, inputs, gem_groups,
            reference_path, barcode_summary and chemistry_def.

    Returns:
        dict: {'chunks': [...], 'join': {'__mem_gb': ...}}
    '''
    per_chunk_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist)
    chunks = [
        {
            'chunk_input': chunk_input,
            '__mem_gb': per_chunk_mem_gb,
        }
        for chunk_input in args.inputs
    ]

    base_join_mem_gb = cr_utils.get_mem_gb_request_from_barcode_whitelist(
        args.barcode_whitelist, args.gem_groups, use_min=False)

    # Account for memory used by reporters (particularly the bc and umi
    # diversity dicts)
    genomes = cr_utils.get_reference_genomes(args.reference_path)

    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    if whitelist is None:
        num_barcodes = cr_utils.get_num_barcodes_from_barcode_summary(
            args.barcode_summary)
    else:
        num_barcodes = len(whitelist) * max(args.gem_groups)

    max_bc_entries = num_barcodes
    max_umi_entries = 4**cr_chem.get_umi_length(args.chemistry_def)

    def diversity_mem_gb(max_entries):
        # Multiply by 2 to hold the current reporter + accumulating reporter
        # in the merge
        return (2 * max_entries *
                cr_constants.BYTES_PER_STR_INT_DICT_ENTRY *
                (len(genomes) + 1) * len(cr_constants.READ_TYPES)) / 1e9

    bc_diversity_mem_gb = diversity_mem_gb(max_bc_entries)
    umi_diversity_mem_gb = diversity_mem_gb(max_umi_entries)

    requested_gb = int(
        base_join_mem_gb + bc_diversity_mem_gb + umi_diversity_mem_gb)
    at_least_min_gb = max(cr_constants.MIN_MEM_GB, requested_gb)
    join_mem_gb = min(cr_constants.COUNT_GENES_MAX_MEM_GB, at_least_min_gb)

    return {
        'chunks': chunks,
        'join': {
            '__mem_gb': join_mem_gb,
        },
    }
def main(args, outs):
    '''Count reads per (genome, gene, barcode) for one BAM chunk.

    Every read in args.chunk_input is passed through the reporter; reads it
    flags as confidently mapped and deduplicated are added to the
    gene-barcode matrices. The matrices and the reporter are then saved.

    Args:
        args: Stage inputs. Reads chunk_input, barcode_whitelist,
            barcode_summary, reference_path, align, gem_groups and
            chemistry_def.
        outs: Stage outputs. Writes matrices_h5 and chunked_reporter.
    '''
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    try:
        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The barcode summary is only consulted when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_summary(
            args.barcode_summary) if not barcode_whitelist else None

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(
                barcode_whitelist, args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(),
                                                genomes)
        matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

        # The chemistry does not change while reading, so decide once
        # whether UMIs are in play instead of once per read.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for read in in_bam:
            is_conf_mapped_deduped, genome, gene_id, bc = \
                reporter.count_genes_bam_cb(read, use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrices.add(genome, gene_id, bc)
    finally:
        # Release the BAM handle even if loading or counting raised.
        in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    '''Collapse one BAM chunk into per-molecule read counts.

    Reads are grouped with cr_utils.barcode_sort_key into
    (gem_group, barcode, gene_ids) runs. Within each run, reads are tallied
    per (umi, gene) molecule and one row per molecule is added to the
    MoleculeCounter opened at outs.output. Reads that are not confidently
    mapped to the transcriptome are tallied under a sentinel gene index
    equal to the number of genes.

    NOTE(review): itertools.groupby only merges adjacent items, so this
    presumably relies on the BAM being sorted consistently with
    cr_utils.barcode_sort_key -- confirm against the upstream stage.

    Args:
        args: Stage inputs. Reads chunk_input, reference_path,
            filtered_barcodes and align.
        outs: Stage outputs. Writes output (the molecule counter file).
    '''
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    try:
        counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

        mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
        mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genome_index = cr_reference.get_genome_index(genomes)
        # Sentinel gene index for reads without a confident gene assignment.
        none_gene_id = len(gene_index.get_genes())

        # store reference index columns
        # NOTE - these must be cast to str first, as unicode is not supported
        counter.set_ref_column('genome_ids',
                               [str(genome) for genome in genomes])
        counter.set_ref_column('gene_ids',
                               [str(gene.id) for gene in gene_index.genes])
        counter.set_ref_column('gene_names',
                               [str(gene.name) for gene in gene_index.genes])

        # Union of the filtered barcodes across all genomes.
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        filtered_bcs = set()
        for bcs in filtered_bcs_per_genome.values():
            filtered_bcs.update(bcs)

        gg_metrics = collections.defaultdict(
            lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

        # The MAPQ threshold depends only on args, so compute it once
        # rather than once per read.
        high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)

        for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
                in_bam, key=cr_utils.barcode_sort_key):
            if barcode is None or gem_group is None:
                continue

            is_cell_barcode = cr_utils.format_barcode_seq(
                barcode, gem_group) in filtered_bcs

            # One counter row (indexed by mol_data_columns) per molecule.
            molecules = collections.defaultdict(
                lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

            compressed_barcode = \
                cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
            compressed_gem_group = \
                cr_mol_counter.MoleculeCounter.compress_gem_group(gem_group)

            # Distinct (tid, pos) pairs already seen for each molecule.
            read_positions = collections.defaultdict(set)

            for read in reads_iter:
                umi = cr_utils.get_read_umi(read)

                # ignore read2 to avoid double-counting. the mapping +
                # annotation should be equivalent.
                if read.is_secondary or umi is None or read.is_read2:
                    continue

                raw_umi = cr_utils.get_read_raw_umi(read)
                raw_bc, _ = cr_utils.split_barcode_seq(
                    cr_utils.get_read_raw_barcode(read))
                proc_bc, _ = cr_utils.split_barcode_seq(
                    cr_utils.get_read_barcode(read))

                if cr_utils.is_read_conf_mapped_to_transcriptome(
                        read, high_conf_mapq):
                    assert len(gene_ids) == 1

                    mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                    map_type = 'reads'

                    read_pos = (read.tid, read.pos)
                    uniq_read_pos = read_pos not in read_positions[mol_key]
                    read_positions[mol_key].add(read_pos)

                    if is_cell_barcode:
                        gg_metrics[int(compressed_gem_group)][
                            cr_mol_counter.
                            GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1
                elif read.is_unmapped:
                    mol_key = (umi, none_gene_id)
                    map_type = 'unmapped_reads'
                    uniq_read_pos = False
                else:
                    mol_key = (umi, none_gene_id)
                    map_type = 'nonconf_mapped_reads'
                    uniq_read_pos = False

                molecule = molecules[mol_key]
                molecule[mol_data_columns[map_type]] += 1
                molecule[mol_data_columns['umi_corrected_reads']] += int(
                    raw_umi != umi)
                molecule[mol_data_columns['barcode_corrected_reads']] += int(
                    raw_bc != proc_bc)
                molecule[mol_data_columns['conf_mapped_uniq_read_pos']] += int(
                    uniq_read_pos)

            # Molecule keys are unique, so sorting orders by (umi, gene)
            # and never compares the count arrays.
            for mol_key, molecule in sorted(molecules.items()):
                umi, gene_id = mol_key
                genome = cr_utils.get_genome_from_str(
                    gene_index.int_to_gene_id(gene_id), genomes)
                genome_id = cr_reference.get_genome_id(genome, genome_index)
                counter.add(
                    barcode=compressed_barcode,
                    gem_group=compressed_gem_group,
                    umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                    gene=gene_id,
                    genome=genome_id,
                    **{
                        key: molecule[col_idx]
                        for key, col_idx in mol_data_columns.items()
                    })
    finally:
        # Release the BAM handle even if counting raised part-way through.
        in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))
    counter.save()