def main(args, outs):
    """Count one BAM chunk into a feature-barcode matrix.

    Reads from ``args.chunk_input`` are iterated grouped by query name. Each
    group is passed to ``reporter.count_genes_bam_cb``; when that call flags
    the group as confidently mapped and deduplicated, the returned
    (feature id, barcode) pair is added to the count matrix.

    Args:
        args: Stage arguments. Fields read here: ``chunk_input``,
            ``barcode_whitelist``, ``barcodes_detected``, ``reference_path``,
            ``align``, ``gem_groups``, ``feature_reference`` and
            ``chemistry_def``.
        outs: Stage outputs. The matrix is written to ``outs.matrices_h5`` and
            the reporter to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    try:
        libraries = rna_library.get_bam_library_info(in_bam)
        distinct_library_types = sorted(
            {lib['library_type'] for lib in libraries})
        # Built as a concrete list because it is handed to the reporter again
        # for every read group below.
        library_prefixes = [
            rna_library.get_library_type_metric_prefix(lib['library_type'])
            for lib in libraries
        ]

        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The detected-barcode list is only loaded when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_tsv(
            args.barcodes_detected) if not barcode_whitelist else None

        # TODO: this is redundant
        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))

        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups,
            library_types=distinct_library_types)

        feature_ref = rna_feature_ref.from_transcriptome_and_csv(
            args.reference_path, args.feature_reference)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                        args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        matrix = cr_matrix.CountMatrix.empty(feature_ref,
                                             barcode_seqs,
                                             dtype='int32')

        # Loop-invariant: depends only on the chemistry definition.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for _qname, reads_iter, _ in cr_utils.iter_by_qname(in_bam, None):
            is_conf_mapped_deduped, _genome, feature_id, bc = \
                reporter.count_genes_bam_cb(reads_iter,
                                            libraries,
                                            library_prefixes,
                                            use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrix.add(feature_id, bc)
    finally:
        # Release the BAM handle even if setup or counting raises.
        in_bam.close()

    reporter.store_reference_metadata(args.reference_path,
                                      cr_constants.REFERENCE_TYPE,
                                      cr_constants.REFERENCE_METRIC_PREFIX)

    matrix.save_h5_file(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def from_transcriptome_and_csv(gene_ref_path, feature_def_filename):
    '''Create a FeatureReference.

    Create a FeatureReference from a transcriptome ref and a feature barcode ref.
    Gene features (if any) come first, followed by the features defined in
    the CSV file, whose indices start after the last gene feature.

    Args:
        gene_ref_path (str): Path to transcriptome reference. Can be None.
        feature_def_filename (str): Path to Feature Definition CSV file. Can be None.

    Returns:
        FeatureReference
    '''

    # Load gene info
    feature_defs = []
    all_tag_keys = ['genome']

    if gene_ref_path is not None:
        # The genome list is only needed for (and only resolvable from) an
        # actual reference, so it is looked up inside the None guard.
        genomes = cr_utils.get_reference_genomes(gene_ref_path)

        gene_idx_filename = cr_utils.get_reference_genes_index(gene_ref_path)
        gene_index = cr_reference.GeneIndex.load_pickle(gene_idx_filename)

        # Stuff relevant fields of Gene tuple into FeatureDef
        for gene in gene_index.genes:
            genome = cr_utils.get_genome_from_str(gene.id, genomes)
            fd = FeatureDef(
                index=len(feature_defs),
                id=gene.id,
                name=gene.name,
                feature_type=lib_constants.GENE_EXPRESSION_LIBRARY_TYPE,
                tags={
                    'genome': genome,
                })
            feature_defs.append(fd)

    # Load feature definition file
    if feature_def_filename is not None:
        csv_feature_defs, csv_tag_keys = parse_feature_def_file(
            feature_def_filename, index_offset=len(feature_defs))

        # check the CRISPR 'target_gene_id' field, if it exists
        # it needs to match a transcriptome entry
        check_crispr_target_gene(csv_feature_defs, feature_defs)

        feature_defs.extend(csv_feature_defs)
        all_tag_keys.extend(csv_tag_keys)

    return FeatureReference(feature_defs, all_tag_keys)
def main(args, outs):
    """Run the external ``annotate_reads main`` command on one chunk.

    The pickled gene index is first converted to ``outs.gene_index_tab``.
    The command line is then assembled from the stage arguments, echoed to
    stderr with each token single-quoted, and executed in the current working
    directory. Afterwards ``outs.num_alignments`` is filled in from the
    ``num_alignments`` entry of the JSON file at ``outs.chunk_metadata``.

    Args:
        args: Stage arguments (inputs, reference, chemistry, library info).
        outs: Stage outputs; ``num_alignments`` is set on this object.
    """
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    # Pick the whitelist token: the literal 'null' when none was given, the
    # value itself when it is an existing path, otherwise the path returned
    # by the whitelist lookup helper.
    whitelist_arg = args.barcode_whitelist
    if whitelist_arg is None:
        whitelist_arg = 'null'
    elif not os.path.exists(whitelist_arg):
        whitelist_arg = cr_utils.get_barcode_whitelist_path(whitelist_arg)

    cmd = ['annotate_reads', 'main']
    # Positional arguments, in the order they are passed to the command.
    cmd.extend([
        args.chunk_genome_input,
        args.chunk_tags,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        whitelist_arg,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
        args.feature_counts,
        args.library_type or lib_constants.DEFAULT_LIBRARY_TYPE,
        args.library_id,
        args.library_info_json,
    ])
    cmd.extend(['--bam-comments', args.bam_comments_json])

    # Optional switches.
    endedness = cr_chem.get_endedness(args.chemistry_def)
    if endedness == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')
    if args.skip_translate:
        cmd.append('--skip-translate')
    if args.feature_reference is not None:
        cmd.extend(['--feature-ref', args.feature_reference])

    quoted_tokens = ["'%s'" % token for token in cmd]
    print >> sys.stderr, 'Running', ' '.join(quoted_tokens)

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(outs.chunk_metadata) as metadata_file:
        chunk_info = json.load(metadata_file)
    outs.num_alignments = chunk_info['num_alignments']
def main(args, outs):
    """Process one chunk of alignments with a STAR index and a reporter.

    Loads the STAR index, gene index, barcode whitelist and barcode
    distribution, builds a ``Reporter`` from them, and runs
    ``process_alignments`` between the reporter's ``attach_bcs_init`` and
    ``attach_bcs_finalize`` calls. The value returned by
    ``process_alignments`` is stored in ``outs.num_alignments`` and the
    reporter is saved to ``outs.chunked_reporter``.

    Args:
        args: Stage arguments (reference path, barcode inputs, chemistry,
            chunk inputs, UMI quality threshold).
        outs: Stage outputs (``output``, ``num_alignments``,
            ``chunked_reporter``).
    """
    star_index = cr_transcriptome.build_star_index(
        cr_utils.get_reference_star_path(args.reference_path))
    chroms = star_index[0][0]

    genes_index_path = cr_utils.get_reference_genes_index(args.reference_path)
    gene_index = cr_reference.GeneIndex.load_pickle(genes_index_path)

    whitelist = cr_utils.load_barcode_whitelist(args.barcode_whitelist)
    barcode_dist = cr_utils.load_barcode_dist(args.barcode_counts,
                                              whitelist,
                                              args.gem_group)

    reporter = cr_report.Reporter(
        reference_path=args.reference_path,
        high_conf_mapq=cr_constants.STAR_DEFAULT_HIGH_CONF_MAPQ,
        gene_index=gene_index,
        chroms=chroms,
        barcode_whitelist=whitelist,
        barcode_dist=barcode_dist,
        gem_groups=args.gem_groups,
        umi_length=cr_chem.get_umi_length(args.chemistry_def),
        umi_min_qual_threshold=args.umi_min_qual_threshold)

    # The alignment pass runs between the reporter's init/finalize calls.
    reporter.attach_bcs_init()
    num_alignments = process_alignments(args.chunk_genome_input,
                                        args.chunk_trimmed_input,
                                        outs.output,
                                        args.bam_comments,
                                        reporter,
                                        gene_index,
                                        star_index,
                                        args)
    outs.num_alignments = num_alignments
    reporter.attach_bcs_finalize()

    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Count one BAM chunk into per-genome gene-barcode matrices.

    Every read in ``args.chunk_input`` is passed to
    ``reporter.count_genes_bam_cb``; when that call flags the read as
    confidently mapped and deduplicated, the returned (genome, gene id,
    barcode) triple is added to the matrices.

    Args:
        args: Stage arguments. Fields read here: ``chunk_input``,
            ``barcode_whitelist``, ``barcode_summary``, ``reference_path``,
            ``align``, ``gem_groups`` and ``chemistry_def``.
        outs: Stage outputs. The matrices are written to ``outs.matrices_h5``
            and the reporter to ``outs.chunked_reporter``.
    """
    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    try:
        chroms = in_bam.references

        barcode_whitelist = cr_utils.load_barcode_whitelist(
            args.barcode_whitelist)
        # The barcode summary is only loaded when there is no whitelist.
        barcode_summary = cr_utils.load_barcode_summary(
            args.barcode_summary) if not barcode_whitelist else None

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))

        reporter = cr_report.Reporter(
            reference_path=args.reference_path,
            high_conf_mapq=cr_utils.get_high_conf_mapq(args.align),
            gene_index=gene_index,
            chroms=chroms,
            barcode_whitelist=barcode_whitelist,
            barcode_summary=barcode_summary,
            gem_groups=args.gem_groups)

        if barcode_whitelist:
            barcode_seqs = cr_utils.format_barcode_seqs(barcode_whitelist,
                                                        args.gem_groups)
        else:
            barcode_seqs = barcode_summary

        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genes = cr_utils.split_genes_by_genomes(gene_index.get_genes(),
                                                genomes)
        matrices = cr_matrix.GeneBCMatrices(genomes, genes, barcode_seqs)

        # Loop-invariant: depends only on the chemistry definition.
        use_umis = cr_chem.has_umis(args.chemistry_def)

        for read in in_bam:
            is_conf_mapped_deduped, genome, gene_id, bc = \
                reporter.count_genes_bam_cb(read, use_umis=use_umis)
            if is_conf_mapped_deduped:
                matrices.add(genome, gene_id, bc)
    finally:
        # Release the BAM handle even if setup or counting raises.
        in_bam.close()

    matrices.save_h5(outs.matrices_h5)
    reporter.save(outs.chunked_reporter)
def main(args, outs):
    """Run the external ``annotate_reads main`` command on one chunk.

    The pickled gene index is first converted to ``outs.gene_index_tab``.
    The command line is then assembled from the stage arguments, echoed to
    stderr, and executed in the current working directory. Afterwards
    ``outs.num_alignments`` is filled in from the ``num_alignments`` entry of
    the JSON file at ``outs.chunk_metadata``.

    Args:
        args: Stage arguments (inputs, reference, chemistry, barcodes).
        outs: Stage outputs; ``num_alignments`` is set on this object.
    """
    convert_pickle_to_rust_index(
        cr_utils.get_reference_genes_index(args.reference_path),
        outs.gene_index_tab)

    # Pick the whitelist token: the literal 'null' when none was given, the
    # value itself when it is an existing path, otherwise the path returned
    # by the whitelist lookup helper.
    whitelist_arg = args.barcode_whitelist
    if whitelist_arg is None:
        whitelist_arg = 'null'
    elif not os.path.exists(whitelist_arg):
        whitelist_arg = cr_utils.get_barcode_whitelist_path(whitelist_arg)

    cmd = ['annotate_reads', 'main']
    # Positional arguments, in the order they are passed to the command.
    cmd.extend([
        args.chunk_genome_input,
        outs.output,
        outs.chunked_reporter,
        args.reference_path,
        outs.gene_index_tab,
        args.barcode_counts,
        whitelist_arg,
        str(args.gem_group),
        outs.chunk_metadata,
        cr_chem.get_strandedness(args.chemistry_def),
    ])
    cmd.extend(['--bam-comments', args.bam_comments_json])

    endedness = cr_chem.get_endedness(args.chemistry_def)
    if endedness == cr_constants.FIVE_PRIME:
        cmd.append('--fiveprime')

    command_line = ' '.join(cmd)
    print >> sys.stderr, 'Running', command_line

    tk_subproc.check_call(cmd, cwd=os.getcwd())

    with open(outs.chunk_metadata) as metadata_file:
        chunk_info = json.load(metadata_file)
    outs.num_alignments = chunk_info['num_alignments']
def main(args, outs):
    """Collapse a BAM chunk into per-molecule read counts.

    Reads from ``args.chunk_input`` are grouped with ``itertools.groupby``
    keyed by ``cr_utils.barcode_sort_key``, which yields
    (gem_group, barcode, gene_ids) keys. Because ``groupby`` only merges
    adjacent records, the input is presumably already sorted by that key --
    confirm against the upstream stage. Within each group, reads are tallied
    per (UMI, gene) molecule and one record per molecule is appended to the
    ``MoleculeCounter`` opened at ``outs.output``.

    Args:
        args: Stage arguments. Fields read here: ``chunk_input``,
            ``reference_path``, ``filtered_barcodes`` and ``align``.
        outs: Stage outputs; the molecule counter is written to
            ``outs.output``.
    """
    outs.coerce_strings()

    in_bam = tk_bam.create_bam_infile(args.chunk_input)
    try:
        counter = cr_mol_counter.MoleculeCounter.open(outs.output, mode='w')

        mol_data_keys = cr_mol_counter.MoleculeCounter.get_data_columns()
        mol_data_columns = {key: idx for idx, key in enumerate(mol_data_keys)}

        gene_index = cr_reference.GeneIndex.load_pickle(
            cr_utils.get_reference_genes_index(args.reference_path))
        genomes = cr_utils.get_reference_genomes(args.reference_path)
        genome_index = cr_reference.get_genome_index(genomes)
        # Gene id used for reads that are not confidently mapped: one past
        # the last real gene index.
        none_gene_id = len(gene_index.get_genes())

        # store reference index columns
        # NOTE - these must be cast to str first, as unicode is not supported
        counter.set_ref_column('genome_ids',
                               [str(genome) for genome in genomes])
        counter.set_ref_column('gene_ids',
                               [str(gene.id) for gene in gene_index.genes])
        counter.set_ref_column('gene_names',
                               [str(gene.name) for gene in gene_index.genes])

        # Union of the filtered barcodes across all genomes.
        filtered_bcs_per_genome = cr_utils.load_barcode_csv(
            args.filtered_barcodes)
        filtered_bcs = set()
        for bcs in filtered_bcs_per_genome.itervalues():
            filtered_bcs.update(bcs)

        gg_metrics = collections.defaultdict(
            lambda: {cr_mol_counter.GG_CONF_MAPPED_FILTERED_BC_READS_METRIC: 0})

        # Loop-invariant: depends only on the aligner settings.
        high_conf_mapq = cr_utils.get_high_conf_mapq(args.align)

        for (gem_group, barcode, gene_ids), reads_iter in itertools.groupby(
                in_bam, key=cr_utils.barcode_sort_key):
            if barcode is None or gem_group is None:
                continue

            is_cell_barcode = cr_utils.format_barcode_seq(
                barcode, gem_group) in filtered_bcs

            # One counts vector per (umi, gene) molecule in this group.
            molecules = collections.defaultdict(
                lambda: np.zeros(len(mol_data_columns), dtype=np.uint64))

            compressed_barcode = \
                cr_mol_counter.MoleculeCounter.compress_barcode_seq(barcode)
            compressed_gem_group = \
                cr_mol_counter.MoleculeCounter.compress_gem_group(gem_group)

            # Distinct (tid, pos) pairs seen so far for each molecule.
            read_positions = collections.defaultdict(set)

            for read in reads_iter:
                umi = cr_utils.get_read_umi(read)

                # ignore read2 to avoid double-counting. the mapping +
                # annotation should be equivalent.
                if read.is_secondary or umi is None or read.is_read2:
                    continue

                raw_umi = cr_utils.get_read_raw_umi(read)
                # Only the sequence halves are compared; the gem-group
                # halves of the split are not used.
                raw_bc, _raw_gg = cr_utils.split_barcode_seq(
                    cr_utils.get_read_raw_barcode(read))
                proc_bc, _proc_gg = cr_utils.split_barcode_seq(
                    cr_utils.get_read_barcode(read))

                if cr_utils.is_read_conf_mapped_to_transcriptome(
                        read, high_conf_mapq):
                    assert len(gene_ids) == 1

                    mol_key = (umi, gene_index.gene_id_to_int(gene_ids[0]))
                    map_type = 'reads'

                    read_pos = (read.tid, read.pos)
                    uniq_read_pos = read_pos not in read_positions[mol_key]
                    read_positions[mol_key].add(read_pos)

                    if is_cell_barcode:
                        gg_metrics[int(compressed_gem_group)][
                            cr_mol_counter.
                            GG_CONF_MAPPED_FILTERED_BC_READS_METRIC] += 1

                elif read.is_unmapped:
                    mol_key = (umi, none_gene_id)
                    map_type = 'unmapped_reads'
                    uniq_read_pos = False
                else:
                    mol_key = (umi, none_gene_id)
                    map_type = 'nonconf_mapped_reads'
                    uniq_read_pos = False

                molecule = molecules[mol_key]
                molecule[mol_data_columns[map_type]] += 1
                molecule[mol_data_columns['umi_corrected_reads']] += int(
                    not raw_umi == umi)
                molecule[mol_data_columns['barcode_corrected_reads']] += int(
                    not raw_bc == proc_bc)
                molecule[mol_data_columns['conf_mapped_uniq_read_pos']] += int(
                    uniq_read_pos)

            # Emit this group's molecules in sorted (umi, gene) order.
            for mol_key, molecule in sorted(molecules.items()):
                umi, gene_id = mol_key
                genome = cr_utils.get_genome_from_str(
                    gene_index.int_to_gene_id(gene_id), genomes)
                genome_id = cr_reference.get_genome_id(genome, genome_index)
                counter.add(
                    barcode=compressed_barcode,
                    gem_group=compressed_gem_group,
                    umi=cr_mol_counter.MoleculeCounter.compress_umi_seq(umi),
                    gene=gene_id,
                    genome=genome_id,
                    **{
                        key: molecule[col_idx]
                        for key, col_idx in mol_data_columns.iteritems()
                    })
    finally:
        # Release the BAM handle even if setup or counting raises.
        in_bam.close()

    counter.set_metric(cr_mol_counter.GEM_GROUPS_METRIC, dict(gg_metrics))
    counter.save()