def count_mapped_bp(args, tempdir, genes): """ Count number of bp mapped to each gene across pangenomes. Return number covered genes and average gene depth per species. Result contains only covered species, but being a defaultdict, would yield 0 for any uncovered species, which is appropriate. """ bam_path = f"{tempdir}/pangenomes.bam" bamfile = AlignmentFile(bam_path, "rb") covered_genes = {} # loop over alignments, sum values per gene for aln in bamfile.fetch(until_eof=True): gene_id = bamfile.getrname(aln.reference_id) gene = genes[gene_id] gene["aligned_reads"] += 1 if keep_read(aln, args.aln_mapid, args.aln_readq, args.aln_mapq, args.aln_cov): gene["mapped_reads"] += 1 gene["depth"] += len(aln.query_alignment_sequence) / float(gene["length"]) covered_genes[gene_id] = gene tsprint("Pangenome count_mapped_bp: total aligned reads: %s" % sum(g["aligned_reads"] for g in genes.values())) tsprint("Pangenome count_mapped_bp: total mapped reads: %s" % sum(g["mapped_reads"] for g in genes.values())) # Filter to genes with non-zero depth, then group by species nonzero_gene_depths = defaultdict(list) for g in covered_genes.values(): gene_depth = g["depth"] if gene_depth > 0: # This should always pass, because ags.aln_cov is always >0. species_id = g["species_id"] nonzero_gene_depths[species_id].append(gene_depth) # Compute number of covered genes per species, and average gene depth. num_covered_genes = defaultdict(int) mean_coverage = defaultdict(float) for species_id, non_zero_depths in nonzero_gene_depths.items(): num_covered_genes[species_id] = len(non_zero_depths) mean_coverage[species_id] = np.mean(non_zero_depths) return num_covered_genes, mean_coverage, covered_genes
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True) total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format( cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info( 'Creating reservoir of subsampled reads ({} per cell)'.format( subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) targets = [x["SN"] for x in sam_file.header["SQ"]] track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True for i, aln in enumerate(track): if count and not count % 100000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info( "%d were filtered for not matching known barcodes." % nomatchcb) count += 1 if aln.is_unmapped: unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format( tally_time, int(60. * count / tally_time))) logger.info('Collapsing evidence') logger.info('Writing evidence') with tempfile.NamedTemporaryFile('w+t') as out_handle: for key in evidence: line = '{},{}\n'.format(key, evidence[key]) out_handle.write(line) out_handle.flush() out_handle.seek(0) evidence_table = pd.read_csv(out_handle, header=None) del evidence evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby( ['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns = ['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby( ['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: # make data frame have a complete accounting of transcripts targets = pd.Series(index=set(targets)) targets = targets.sort_index() expanded = expanded.reindex(targets.index.values, fill_value=0) genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) if sparse: pd.Series(genes.index).to_csv(out + ".rownames", index=False) pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False) with open(out, "w+b") as out_handle: scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes)) else: genes.to_csv(out)
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from cStringIO import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: gene_map = dict(p.strip().split() for p in fh) if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') logger.info('Tallying evidence') start_tally = time.time() evidence = collections.defaultdict(int) sam_file = AlignmentFile(sam, mode='r') track = sam_file.fetch(until_eof=True) for i, aln in enumerate(track): if aln.is_unmapped: continue match = parser_re.match(aln.qname) CB = match.group('CB') MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits evidence[e_tuple] += weigh_evidence(aln.tags) tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * i / tally_time))) logger.info('Collapsing evidence') buf = StringIO() for key in evidence: line = '{},{}\n'.format(key, evidence[key]) buf.write(line) buf.seek(0) evidence_table = pd.read_csv(buf) evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) genes.to_csv(out)
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from cStringIO import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: gene_map = dict(p.strip().split() for p in fh) if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' cb_set = set() if cb_histogram: with open(cb_histogram) as fh: cb_map = dict(p.strip().split() for p in fh) cb_set = set([k for k, v in cb_map.items() if int(v) > cb_cutoff]) logger.info('Keeping %d out of %d cellular barcodes.' % (len(cb_map), len(cb_set))) parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') logger.info('Tallying evidence') start_tally = time.time() evidence = collections.defaultdict(int) sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) track = sam_file.fetch(until_eof=True) count = 0 kept = 0 for i, aln in enumerate(track): count += 1 if not count % 100000: logger.info("Processed %d alignments, kept %d." % (count, kept)) if aln.is_unmapped: continue match = parser_re.match(aln.qname) CB = match.group('CB') if cb_set and CB not in cb_set: continue MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format( tally_time, int(60. * i / tally_time))) logger.info('Collapsing evidence') buf = StringIO() for key in evidence: line = '{},{}\n'.format(key, evidence[key]) buf.write(line) buf.seek(0) evidence_table = pd.read_csv(buf) evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns = ['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby( ['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns = ['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby( ['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) genes.to_csv(out)
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence if sam.endswith(".sam"): logger.error( "To use the fasttagcount subcommand, the alignment file must be a " "coordinate sorted, indexed BAM file.") sys.exit(1) logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True) total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format( cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info( 'Creating reservoir of subsampled reads ({} per cell)'.format( subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(lambda: collections.defaultdict(float)) bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) transcript_map = collections.defaultdict(set) sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]] if gene_map: for transcript, gene in gene_map.items(): if transcript in sam_transcripts: transcript_map[gene].add(transcript) else: for transcript in sam_transcripts: transcript_map[transcript].add(transcript) missing_transcripts = set() alignments_processed = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' current_transcript = None count_this_read = True transcripts_processed = 0 genes_processed = 0 cells = list(cb_hist.index) targets_seen = set() if umi_matrix: bare_evidence_handle = open(umi_matrix, "w") bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): for transcript in transcripts: for aln in sam_file.fetch(transcript): alignments_processed += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) continue else: target_name = txid targets_seen.add(target_name) # Scale evidence by number of hits evidence[CB][MB] += weigh_evidence(aln.tags) bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: logger.info("%d genes processed." % genes_processed) logger.info("%d transcripts processed." % transcripts_processed) logger.info("%d alignments processed." % alignments_processed) earray = [] for cell in cells: umis = [ 1 for _, v in evidence[cell].items() if v >= minevidence ] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") earray = [] if umi_matrix: for cell in cells: earray.append(str(int(bare_evidence[cell]))) bare_evidence_handle.write(",".join([gene] + earray) + "\n") evidence = collections.defaultdict( lambda: collections.defaultdict(int)) bare_evidence = collections.defaultdict(int) genes_processed += 1 if umi_matrix: bare_evidence_handle.close() # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) targets = targets.sort_index() df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(out) if umi_matrix: df = pd.read_csv(umi_matrix, index_col=0, header=0) df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(umi_matrix)
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_table(cb_histogram, index_col=0, header=-1, squeeze=True) total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) track = sam_file.fetch(until_eof=True) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True for i, aln in enumerate(track): count += 1 if not count % 100000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info("%d were filtered for not matching known barcodes." % nomatchcb) if aln.is_unmapped: unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue MB = match.group('MB') txid = sam_file.getrname(aln.reference_id) if gene_map: target_name = gene_map[txid] else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time))) logger.info('Collapsing evidence') buf = StringIO() for key in evidence: line = '{},{}\n'.format(key, evidence[key]) buf.write(unicode(line), "utf-8") buf.seek(0) evidence_table = pd.read_csv(buf) evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] else: genes = expanded genes.replace(pd.np.nan, 0, inplace=True) logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) genes.to_csv(out)
def fasttagcount(sam, out, genemap, positional, minevidence, cb_histogram, cb_cutoff, subsample, parse_tags, gene_tags, umi_matrix): ''' Count up evidence for tagged molecules, this implementation assumes the alignment file is coordinate sorted ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence if sam.endswith(".sam"): logger.error("To use the fasttagcount subcommand, the alignment file must be a " "coordinate sorted, indexed BAM file.") sys.exit(1) logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(lambda: collections.defaultdict(float)) bare_evidence = collections.defaultdict(float) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) transcript_map = collections.defaultdict(set) sam_transcripts = [x["SN"] for x in sam_file.header["SQ"]] if gene_map: for transcript, gene in gene_map.items(): if transcript in sam_transcripts: transcript_map[gene].add(transcript) else: for transcript in sam_transcripts: transcript_map[transcript].add(transcript) missing_transcripts = set() alignments_processed = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' current_transcript = None count_this_read = True transcripts_processed = 0 genes_processed = 0 cells = list(cb_hist.index) targets_seen = set() if umi_matrix: bare_evidence_handle = open(umi_matrix, "w") bare_evidence_handle.write(",".join(["gene"] + cells) + "\n") with open(out, "w") as out_handle: out_handle.write(",".join(["gene"] + cells) + "\n") for gene, transcripts in transcript_map.items(): for transcript in transcripts: for aln in sam_file.fetch(transcript): alignments_processed += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) continue else: target_name = txid targets_seen.add(target_name) # Scale evidence by number of hits evidence[CB][MB] += weigh_evidence(aln.tags) bare_evidence[CB] += weigh_evidence(aln.tags) kept += 1 transcripts_processed += 1 if not transcripts_processed % 1000: logger.info("%d genes processed." % genes_processed) logger.info("%d transcripts processed." % transcripts_processed) logger.info("%d alignments processed." % alignments_processed) earray = [] for cell in cells: umis = [1 for _, v in evidence[cell].items() if v >= minevidence] earray.append(str(sum(umis))) out_handle.write(",".join([gene] + earray) + "\n") earray = [] if umi_matrix: for cell in cells: earray.append(str(int(bare_evidence[cell]))) bare_evidence_handle.write(",".join([gene] + earray) + "\n") evidence = collections.defaultdict(lambda: collections.defaultdict(int)) bare_evidence = collections.defaultdict(int) genes_processed += 1 if umi_matrix: bare_evidence_handle.close() # fill dataframe with missing values, sort and output df = pd.read_csv(out, index_col=0, header=0) targets = pd.Series(index=set(transcript_map.keys())) targets = targets.sort_index() df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(out) if umi_matrix: df = pd.read_csv(umi_matrix, index_col=0, header=0) df = df.reindex(targets.index.values, fill_value=0) df = df.sort_index() df.to_csv(umi_matrix)
def tagcount(sam, out, genemap, output_evidence_table, positional, minevidence, cb_histogram, cb_cutoff, no_scale_evidence, subsample, sparse, parse_tags, gene_tags): ''' Count up evidence for tagged molecules ''' from pysam import AlignmentFile from io import StringIO import pandas as pd from utils import weigh_evidence logger.info('Reading optional files') gene_map = None if genemap: with open(genemap) as fh: try: gene_map = dict(p.strip().split() for p in fh) except ValueError: logger.error('Incorrectly formatted gene_map, need to be tsv.') sys.exit() if positional: tuple_template = '{0},{1},{2},{3}' else: tuple_template = '{0},{1},{3}' if not cb_cutoff: cb_cutoff = 0 if cb_histogram and cb_cutoff == "auto": cb_cutoff = guess_depth_cutoff(cb_histogram) cb_cutoff = int(cb_cutoff) cb_hist = None filter_cb = False if cb_histogram: cb_hist = pd.read_csv(cb_histogram, index_col=0, header=-1, squeeze=True, sep="\t") total_num_cbs = cb_hist.shape[0] cb_hist = cb_hist[cb_hist > cb_cutoff] logger.info('Keeping {} out of {} cellular barcodes.'.format(cb_hist.shape[0], total_num_cbs)) filter_cb = True parser_re = re.compile('.*:CELL_(?P<CB>.*):UMI_(?P<MB>.*)') if subsample: logger.info('Creating reservoir of subsampled reads ({} per cell)'.format(subsample)) start_sampling = time.time() reservoir = collections.defaultdict(list) cb_hist_sampled = 0 * cb_hist cb_obs = 0 * cb_hist track = stream_bamfile(sam) current_read = 'none_observed_yet' for i, aln in enumerate(track): if aln.qname == current_read: continue current_read = aln.qname if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if CB not in cb_hist.index: continue cb_obs[CB] += 1 if len(reservoir[CB]) < subsample: reservoir[CB].append(i) cb_hist_sampled[CB] += 1 else: s = pd.np.random.randint(0, cb_obs[CB]) if s < subsample: reservoir[CB][s] = i index_filter = set(itertools.chain.from_iterable(reservoir.values())) sam_file.close() sampling_time = time.time() - start_sampling logger.info('Sampling done - {:.3}s'.format(sampling_time)) evidence = collections.defaultdict(int) logger.info('Tallying evidence') start_tally = time.time() sam_mode = 'r' if sam.endswith(".sam") else 'rb' sam_file = AlignmentFile(sam, mode=sam_mode) targets = [x["SN"] for x in sam_file.header["SQ"]] track = sam_file.fetch(until_eof=True) count = 0 unmapped = 0 kept = 0 nomatchcb = 0 current_read = 'none_observed_yet' count_this_read = True missing_transcripts = set() for i, aln in enumerate(track): if count and not count % 1000000: logger.info("Processed %d alignments, kept %d." % (count, kept)) logger.info("%d were filtered for being unmapped." % unmapped) if filter_cb: logger.info("%d were filtered for not matching known barcodes." % nomatchcb) count += 1 if aln.is_unmapped: unmapped += 1 continue if gene_tags and not aln.has_tag('GX'): unmapped += 1 continue if aln.qname != current_read: current_read = aln.qname if subsample and i not in index_filter: count_this_read = False continue else: count_this_read = True else: if not count_this_read: continue if parse_tags: CB = aln.get_tag('CR') else: match = parser_re.match(aln.qname) CB = match.group('CB') if filter_cb: if CB not in cb_hist.index: nomatchcb += 1 continue if parse_tags: MB = aln.get_tag('UM') else: MB = match.group('MB') if gene_tags: target_name = aln.get_tag('GX').split(',')[0] else: txid = sam_file.getrname(aln.reference_id) if gene_map: if txid in gene_map: target_name = gene_map[txid] else: missing_transcripts.add(txid) target_name = txid else: target_name = txid e_tuple = tuple_template.format(CB, target_name, aln.pos, MB) # Scale evidence by number of hits if no_scale_evidence: evidence[e_tuple] += 1.0 else: evidence[e_tuple] += weigh_evidence(aln.tags) kept += 1 tally_time = time.time() - start_tally if missing_transcripts: logger.warn('The following transcripts were missing gene_ids, so we added them as the transcript ids: %s' % str(missing_transcripts)) logger.info('Tally done - {:.3}s, {:,} alns/min'.format(tally_time, int(60. * count / tally_time))) logger.info('Collapsing evidence') logger.info('Writing evidence') with tempfile.NamedTemporaryFile('w+t') as out_handle: for key in evidence: line = '{},{}\n'.format(key, evidence[key]) out_handle.write(line) out_handle.flush() out_handle.seek(0) evidence_table = pd.read_csv(out_handle, header=None) del evidence evidence_query = 'evidence >= %f' % minevidence if positional: evidence_table.columns=['cell', 'gene', 'umi', 'pos', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi', 'pos'].size() else: evidence_table.columns=['cell', 'gene', 'umi', 'evidence'] collapsed = evidence_table.query(evidence_query).groupby(['cell', 'gene'])['umi'].size() expanded = collapsed.unstack().T if gene_map: # This Series is just for sorting the index genes = pd.Series(index=set(gene_map.values())) genes = genes.sort_index() # Now genes is assigned to a DataFrame genes = expanded.ix[genes.index] elif gene_tags: expanded.sort_index() genes = expanded else: # make data frame have a complete accounting of transcripts targets = pd.Series(index=set(targets)) targets = targets.sort_index() expanded = expanded.reindex(targets.index.values, fill_value=0) genes = expanded genes.fillna(0, inplace=True) genes = genes.astype(int) genes.index.name = "gene" logger.info('Output results') if subsample: cb_hist_sampled.to_csv('ss_{}_'.format(subsample) + os.path.basename(cb_histogram), sep='\t') if output_evidence_table: import shutil buf.seek(0) with open(output_evidence_table, 'w') as etab_fh: shutil.copyfileobj(buf, etab_fh) if sparse: pd.Series(genes.index).to_csv(out + ".rownames", index=False, header=False) pd.Series(genes.columns.values).to_csv(out + ".colnames", index=False, header=False) with open(out, "w+b") as out_handle: scipy.io.mmwrite(out_handle, scipy.sparse.csr_matrix(genes)) else: genes.to_csv(out)
def pesam2bed(input_file, chrlen_file, output_file, logfile, errfile, filter_function=check_good_read): ''' Converts aligned BAM file into BED file of fragments filtering reads that do not pass the filtering criteria Input: input_file - aligned (unsorted) paired end BAM chrlen_file - TSV file of format chromosome<TAB>length output_file - output file (BED file with start and end corresponding to both ends of the PE fragment) logfile, errfile - logging files filter_function - function that accepts an aligned read (pysam.AlignedSegment) and returns true, false or weight of the fragment Default: utils.check_good_reads Note: the function can return weight, if trying to accommodate reads from repetitive sequences Output: None (written into output file) See also: check_good_read ''' samfile = AlignmentFile(input_file) take_read = [x for x in itertools.imap(filter_function, samfile)] # check good_read also checks if this is R1... samfile.reset() logfile.write("%d out of %d reads passed filtering\n" % (sum(take_read), len(take_read) / 2)) logfile.flush() chrlens = genomic_utils.get_chr_lens(chrlen_file) outfile = open(output_file, 'w') count = 0 for i, r in enumerate(samfile): count += 1 if count % 1000000 == 0: logfile.write("pesam2bed: %s, Read # %d\n" % (input_file, count)) logfile.flush() read_1 = r if take_read[i] == 0: continue if not read_1.is_reverse: rstart = read_1.reference_start start = max(0, rstart) rend = rstart + read_1.template_length end = min(rend, chrlens[samfile.getrname(read_1.reference_id)]) if start > end: continue try: outfile.write(samfile.getrname(read_1.reference_id)) outfile.write('\t') outfile.write('%d\t%d\t%s\t%d\t+\n' % (start, end, read_1.qname, take_read[i] * 100)) except ValueError: errfile.write( "***** pesam2bed failed with the following value\n") errfile.write(samfile.getrname(read_1.reference_id)) errfile.write(' %d\t%d\t%s\t%d\t-\n' % (start, end, read_1.qname, take_read[i] * 100)) errfile.flush() sys.exit(-1) else: rstart = read_1.reference_end - abs(read_1.template_length) start = max(0, rstart) rend = read_1.reference_end end = min(rend, chrlens[samfile.getrname(read_1.reference_id)]) if start > end: continue try: outfile.write(samfile.getrname(read_1.reference_id)) outfile.write('\t') outfile.write('%d\t%d\t%s\t%d\t-\n' % (start, end, read_1.qname, take_read[i] * 100)) except ValueError: errfile.write( "***** pesam2bed failed with the following value\n") errfile.write(samfile.getrname(read_1.reference_id)) errfile.write(' %d\t%d\t%s\t%d\t-\n' % (start, end, read_1.qname, take_read[i] * 100)) errfile.flush() sys.exit(-1) outfile.close()