def testDoubleCalling(self):
    """Run ``pysam.idxstats`` twice in a row on the same BAM file.

    The second call would fail if there were an issue with stdout being
    improperly caught by the first one, so finishing both calls without
    an exception is the whole check; the return values are not inspected.
    """
    bam_path = os.path.join(DATADIR, "ex1.bam")
    for _attempt in range(2):
        pysam.idxstats(bam_path)
def test_merge_and_switch():
    """Check read totals after merging BAMs and after swapping chrX reads.

    The merged file must hold the reads of the first two inputs, the
    swapped file the reads of the first and third inputs, and the header
    written for the swap must contain the requested @PG entry.
    """

    def open_bam(name):
        # Every BAM in this test is opened the same way.
        return bam.BamFile(
            os.path.join(dir, name), "samtools", no_initial_index=True)

    def total_reads(bam_path):
        # idxstats rows are name/length/mapped/unmapped separated by tabs;
        # rows with fewer than four fields (such as the trailing blank
        # line) are ignored.
        rows = [
            line.split("\t")
            for line in pysam.idxstats(bam_path).split("\n")]
        return sum([int(row[2]) + int(row[3]) for row in rows if len(row) > 3])

    test_bam1 = open_bam("chr19_window.bam")
    test_bam2 = open_bam("chrX_window1.bam")
    test_bam3 = open_bam("chrX_window2.bam")

    bam.samtools_merge(
        "samtools",
        [test_bam1.filepath, test_bam2.filepath],
        os.path.join(dir, "merged1"),
        1)
    merged = open_bam("merged1.merged.bam")

    test1_reads = total_reads(test_bam1.filepath)
    test2_reads = total_reads(test_bam2.filepath)
    test3_reads = total_reads(test_bam3.filepath)
    merged1_reads = total_reads(merged.filepath)
    assert merged1_reads == test1_reads + test2_reads

    bam.switch_sex_chromosomes_sambamba(
        "samtools", "sambamba", merged.filepath, test_bam3.filepath,
        "chrX", dir, "swapped", 1,
        {"CL": ["foo"], "ID": "xyalign"})
    swapped = open_bam("swapped.merged.bam")

    swapped_reads = total_reads(swapped.filepath)
    assert swapped_reads == test1_reads + test3_reads

    header = read_bed(os.path.join(dir, "swapped.header.sam"))
    assert ["@PG", "ID:xyalign", "CL:foo"] in header
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None):
    """Count the reads of a BAM file that fall into blacklisted regions.

    Arguments
    ---------
    bam_handle : object exposing ``filename`` and ``count(reference, start, end)``
    chroms_to_ignore : container of chromosome names to skip (may be empty/None)
    blackListFileName : path to the blacklist file, or None

    Returns
    -------
    int : number of reads counted in blacklisted regions; 0 when no
        blacklist file is given.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    import pysam
    import deeptools.mapReduce as mapReduce

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. empty idxstats output).
            continue
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    # Close the blacklist file as soon as the interval tree is built.
    with open(blackListFileName, "r") as blacklist_handle:
        bl = mapReduce.BED_to_interval_tree(blacklist_handle)

    for chrom in bl.keys():
        if chroms_to_ignore and chrom in chroms_to_ignore:
            continue
        if chrom not in chromLens:
            # Blacklisted contig that is absent from the BAM: nothing to count.
            continue
        for reg in bl[chrom].find(0, chromLens[chrom]):
            blacklisted += bam_handle.count(reference=chrom, start=reg.start, end=reg.end)
    return blacklisted
def _setup(self, config, temp):
    """Write ``contigs.table`` into *temp*, then run ``CommandNode._setup``.

    One row (ID, Size, Ns, Hits) is written per idxstats contig whose
    plink-style name is numeric or 'X' and which is present in
    ``self._contigs``.  A NodeError is raised when the size reported for
    a contig differs from the size recorded in ``self._contigs``.
    """
    table_path = os.path.join(temp, "contigs.table")
    with open(table_path, "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats_text = "".join(pysam.idxstats(self._input_file))
        for raw_line in idxstats_text.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            bam_name, bam_size, hits, _ = stripped.split('\t')
            name = contig_name_to_plink_name(bam_name)
            if name is None:
                continue
            if not name.isdigit() and name != 'X':
                continue
            if name not in self._contigs:
                # Excluding contigs is allowed
                continue

            contig = self._contigs[name]
            if int(bam_size) != contig['Size']:
                raise NodeError(
                    "Size mismatch between database and BAM; "
                    "expected size %i, found %i for contig %r"
                    % (int(bam_size), contig['Size'], name))

            handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(
                ID=name, Size=contig['Size'], Ns=contig['Ns'], Hits=hits))

    CommandNode._setup(self, config, temp)
def getNumReads(bamfile):
    '''count number of reads in bam file.

    This methods works through pysam.idxstats and sums the third column
    of its output over all reference sequences.

    Arguments
    ---------
    bamfile : string
        Filename of :term:`bam` formatted file. The file needs
        to be indexed.

    Returns
    -------
    nreads : int
        Number of reads

    Raises
    ------
    IndexError
        If a line of the idxstats output has fewer than three columns.
    '''
    lines = pysam.idxstats(bamfile)
    # Depending on the pysam release the result is either one string or a
    # list of lines; iterate over lines in both cases.
    if isinstance(lines, str):
        lines = lines.splitlines()

    try:
        nreads = sum(
            int(x.split("\t")[2])
            for x in lines
            if x.strip() and not x.startswith("#"))
    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, lines))
    return nreads
def test_idxstats_parse_split_lines():
    """Check that every line returned with ``split_lines=True`` has four fields."""
    # Test pysam 0.8.X style output, which returns a list of lines
    stat_lines = pysam.idxstats("./pysam_data/ex2.bam", split_lines=True)
    for stat_line in stat_lines:
        # The unpacking raises ValueError unless there are exactly four
        # whitespace-separated fields.
        _seqname, _seqlen, nmapped, _nunmapped = stat_line.split()
def print_sex(bam):
    """
    Print sex based on chr x ratio

    The mapped-reads / length ratio of the 23rd idxstats entry is compared
    with the mean and standard deviation of the ratios of the first 24
    entries: within two standard deviations prints 'female', more than two
    below prints 'male', anything else prints 'unkown'.

    Args:
        bam (str): Path to bam file
    """
    idxstats = pysam.idxstats(bam)
    # The result may be a single string or a list of lines depending on the
    # pysam release; the positional indexing below needs a list of lines.
    if isinstance(idxstats, str):
        idxstats = idxstats.splitlines()

    def mapped_per_base(stat_line):
        # Columns: name, length, mapped reads, unmapped reads.
        fields = stat_line.strip('\n').split('\t')
        return float(fields[2]) / float(fields[1])

    # Calculate read / chromosome length ratio per chromosome
    chr_ratio = [mapped_per_base(stat_line) for stat_line in idxstats[0:24]]
    chr_ratio_std = numpy.std(chr_ratio)
    chr_ratio_mean = numpy.mean(chr_ratio)

    # NOTE(review): index 22 is presumably chrX for a reference ordered
    # chr1..chr22, X, Y - confirm for other references.
    chr_x_ratio = mapped_per_base(idxstats[22])

    lower_bound = chr_ratio_mean - (2 * chr_ratio_std)
    upper_bound = chr_ratio_mean + (2 * chr_ratio_std)
    if lower_bound < chr_x_ratio < upper_bound:
        print('female')
    elif chr_x_ratio < lower_bound:
        print('male')
    else:
        # Spelling kept as-is: downstream consumers may match this output.
        print("unkown")
def bam_total_reads(bam_handle, chroms_to_ignore):
    """Count the total number of mapped reads in a BAM file, filtering
    the chromosome given in chroms_to_ignore list

    Without chromosomes to ignore, ``bam_handle.mapped`` is returned
    directly; otherwise the mapped-read column of idxstats is summed over
    the chromosomes that are not ignored.
    """
    if not chroms_to_ignore:
        return bam_handle.mapped

    import pysam

    lines = toString(pysam.idxstats(bam_handle.filename))
    if type(lines) is str:
        lines = lines.strip().split('\n')

    if len(lines) == 0:
        # check if this is a test running under nose
        # in which case it will fail.
        running_under_nose = any(
            module_name.find("nose") >= 0
            for module_name in list(sys.modules.keys()))
        if running_under_nose:
            # The message has no placeholder, so .format() leaves it unchanged.
            sys.stderr.write("To run this code inside a test use disable "
                             "output buffering `nosetest -s`\n".format(bam_handle.filename))
        else:
            sys.stderr.write("Error running idxstats on {}\n".format(bam_handle.filename))

    tot_mapped_reads = 0
    for fields in (entry.split('\t') for entry in lines):
        chrom, _len, nmapped, _nunmapped = fields
        if chrom in chroms_to_ignore:
            continue
        tot_mapped_reads += int(nmapped)
    return tot_mapped_reads
def get_contigs_with_reads(bam_path: str, with_length: bool = False) -> Generator:
    """
    Yield every contig that has at least one read assigned to it.

    Args:
        bam_path(str): path to bam file
        with_length(bool): also yield the length of the contig

    Yields:
        contig(str), or a (contig, length) tuple when with_length is set
    """
    for stat_line in pysam.idxstats(bam_path).split('\n'):
        try:
            contig, contig_len, n_mapped, n_unmapped = stat_line.strip().split()
            n_mapped = int(n_mapped)
            n_unmapped = int(n_unmapped)
            if n_mapped <= 0 and n_unmapped <= 0:
                continue
            if with_length:
                yield contig, int(contig_len)
            else:
                yield contig
        except ValueError:
            # Lines without exactly four fields or with non-numeric
            # counts are skipped.
            pass
def bam_blacklisted_reads(bam_handle, chroms_to_ignore, blackListFileName=None, numberOfProcessors=1):
    """Count the reads of a BAM file that fall into blacklisted regions.

    Arguments
    ---------
    bam_handle : object exposing ``filename``
    chroms_to_ignore : container of chromosome names to skip (may be empty/None)
    blackListFileName : path to the blacklist file, or None
    numberOfProcessors : worker processes used when there is more than
        one region to count

    Returns
    -------
    Sum of the values returned by ``bam_blacklisted_worker`` for every
    blacklisted region; 0 when no blacklist file is given.
    """
    blacklisted = 0
    if blackListFileName is None:
        return blacklisted

    # Get the chromosome lengths
    chromLens = {}
    lines = pysam.idxstats(bam_handle.filename)
    lines = toString(lines)
    if isinstance(lines, str):
        lines = lines.strip().split('\n')
    for line in lines:
        chrom, _len, _nmapped, _nunmapped = line.split('\t')
        chromLens[chrom] = int(_len)

    bl = GTF(blackListFileName)
    regions = []
    for chrom in bl.chroms:
        if (not chroms_to_ignore or chrom not in chroms_to_ignore) and chrom in chromLens:
            for reg in bl.findOverlaps(chrom, 0, chromLens[chrom]):
                regions.append([bam_handle.filename, chrom, reg[0], reg[1]])

    if len(regions) > 0:
        if len(regions) > 1 and numberOfProcessors > 1:
            import multiprocessing
            pool = multiprocessing.Pool(numberOfProcessors)
            try:
                res = pool.map_async(bam_blacklisted_worker, regions).get(9999999)
            finally:
                # Always release the worker processes, also on failure.
                pool.terminate()
                pool.join()
        else:
            res = [bam_blacklisted_worker(x) for x in regions]
        for val in res:
            blacklisted += val

    return blacklisted
def _setup(self, config, temp):
    """Write ``contigs.table`` into *temp*, then run ``CommandNode._setup``.

    Contig names from idxstats are translated through ``self._mapping``
    (unmapped names are kept); one row (ID, Size, Ns, Hits) is written
    for every contig present in ``self._contigs``.
    """
    table_path = os.path.join(temp, "contigs.table")
    with open(table_path, "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats_text = "".join(pysam.idxstats(self._input_file))
        stripped_lines = (raw.strip() for raw in idxstats_text.split("\n"))
        for stat_line in stripped_lines:
            if not stat_line:
                continue

            bam_name, _size, hits, _ = stat_line.split("\t")
            name = self._mapping.get(bam_name, bam_name)
            if name not in self._contigs:
                # Excluding contigs is allowed
                continue

            contig = self._contigs[name]
            handle.write("{ID}\t{Size}\t{Ns}\t{Hits}\n".format(
                ID=name, Size=contig["Size"], Ns=contig["Ns"], Hits=hits))

    CommandNode._setup(self, config, temp)
def getNumReads(bamfile):
    '''count number of reads in bam file.

    This methods works through pysam.idxstats and sums the third column
    of its output over all reference sequences.

    Arguments
    ---------
    bamfile : string
        Filename of :term:`bam` formatted file. The file needs
        to be indexed.

    Returns
    -------
    nreads : int
        Number of reads

    Raises
    ------
    IndexError
        If a line of the idxstats output has fewer than three columns.
    '''
    lines = pysam.idxstats(bamfile).splitlines()

    try:
        nreads = sum(
            int(x.split("\t")[2])
            for x in lines
            if not x.startswith("#"))
    except IndexError as msg:
        raise IndexError(
            "can't get number of reads from bamfile, msg=%s, data=%s" %
            (msg, lines))
    # The computed total was previously never handed back to the caller.
    return nreads
def test_idxstats_parse():
    """Check that every tab-separated idxstats line has exactly four fields."""
    # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
    raw_output = pysam.idxstats("./pysam_data/ex2.bam", split_lines=False)
    for record in raw_output.splitlines():
        # The unpacking raises ValueError on any other field count.
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
def _coverage(self,bam_obj):
    """Return (mapped reads / total reference bases) * average read length.

    Mapped reads are summed from the third idxstats column, reference
    bases from ``bam_obj.lengths``; the average read length comes from
    ``self._avg_read_len``.
    """
    # NOTE(review): under Python 2 without true division the ratio below
    # is an integer division - confirm the interpreter this runs on.
    stat_lines = pysam.idxstats(bam_obj.filename).rstrip().split('\n')
    tot_reads = 0
    for stat_line in stat_lines:
        tot_reads += int(stat_line.split('\t')[2])
    tot_bases = sum(bam_obj.lengths)
    reads_per_base = tot_reads/tot_bases
    return reads_per_base*self._avg_read_len(bam_obj)
def main(args):
    """Write one line of idxstats-derived statistics per BAM file.

    ``args.bams`` lists the BAM files and ``args.output`` is handed to
    ``smartOut``.  If any BAM file is missing, a message is printed and
    the process exits with status 1 before anything is written.
    """
    # Fail-fast check for file existence
    for b in args.bams:
        if not os.path.isfile(b):
            print("Could not find file: {}!".format(str(b)))
            # Exit with a failure status so callers can detect the error
            # (a bare sys.exit() reports success).
            sys.exit(1)

    data = {}
    with smartOut(args.output) as out:
        for b in args.bams:
            data[b] = bamStats()
            for line in pysam.idxstats(b).split('\n'):
                segs = line.split('\t')
                if len(segs) < 4:
                    # Blank or truncated line.
                    continue
                data[b].addChr(segs[0], int(segs[1]), int(segs[2]), int(segs[3]))

        out.write(
            "BamName\tTotalReads\tMappedReads\tUnmappedReads\tMapProportion\tRawXcov\tMapXcov\tavgRawChrCov\tavgMapChrCov\n"
        )
        for b, v in data.items():
            v.calculateStats()
            out.write("{}\t{}\t{}\t{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}\n"
                      .format(b, v.totalReads, v.mappedReads, v.unmappedReads,
                              v.mapProportion, v.rawXcov, v.mapXcov,
                              v.avgRawChrCov, v.avgMapChrCov))
def print_sex(bam):
    """
    Print sex based on chr x ratio

    The mapped-reads / length ratio of the 23rd idxstats entry is compared
    with the mean and standard deviation of the ratios of the first 24
    entries: within two standard deviations prints "female", more than
    two below prints "male", anything else prints "unkown".

    Args:
        bam (str): Path to bam file
    """
    idxstats = pysam.idxstats(bam)
    # The result may be a single string or a list of lines depending on the
    # pysam release; the positional indexing below needs a list of lines.
    if isinstance(idxstats, str):
        idxstats = idxstats.splitlines()

    def ratio_of(stat_line):
        # Columns: name, length, mapped reads, unmapped reads.
        columns = stat_line.strip("\n").split("\t")
        return float(columns[2]) / float(columns[1])

    # Calculate read / chromosome length ratio per chromosome
    chr_ratio = [ratio_of(stat_line) for stat_line in idxstats[0:24]]
    chr_ratio_std = numpy.std(chr_ratio)
    chr_ratio_mean = numpy.mean(chr_ratio)

    # NOTE(review): index 22 is presumably chrX for a reference ordered
    # chr1..chr22, X, Y - confirm for other references.
    chr_x_ratio = ratio_of(idxstats[22])

    low = chr_ratio_mean - (2 * chr_ratio_std)
    high = chr_ratio_mean + (2 * chr_ratio_std)
    if (chr_x_ratio > low) and (chr_x_ratio < high):
        print("female")
    elif chr_x_ratio < low:
        print("male")
    else:
        # Spelling kept as-is: downstream consumers may match this output.
        print("unkown")
def _setup(self, config, temp):
    """Write ``contigs.table`` into *temp*, then run ``CommandNode._setup``.

    One row (ID, Size, Ns, Hits) is written per idxstats contig whose
    plink-style name is numeric or 'X'.  A NodeError is raised when the
    size reported for a contig differs from ``self._contigs``.
    """
    table_path = os.path.join(temp, "contigs.table")
    with open(table_path, "w") as handle:
        handle.write("ID\tSize\tNs\tHits\n")

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        idxstats_text = "".join(pysam.idxstats(self._input_file))
        for raw_line in idxstats_text.split('\n'):
            stripped = raw_line.strip()
            if not stripped:
                continue

            bam_name, bam_size, hits, _ = stripped.split('\t')
            name = contig_name_to_plink_name(bam_name)
            if name is None:
                continue
            if not name.isdigit() and name != 'X':
                continue

            contig = self._contigs[name]
            if int(bam_size) != contig['Size']:
                raise NodeError("TODO: size mismatch")

            handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(
                ID=name, Size=contig['Size'], Ns=contig['Ns'], Hits=hits))

    CommandNode._setup(self, config, temp)
def main(argv):
    """Collect metadata about the BAM file named in ``argv[1]`` and open
    a connection to the sample database.

    The info dict is filled from the file path components, the file
    modification time, the first @PG header entry and the mapped-read
    total taken from idxstats; every value is then wrapped in single
    quotes for use in MySQL statements.
    """
    fullname = os.path.abspath(argv[1])
    bamfile = pysam.Samfile(fullname)
    header = bamfile.header
    fullname_s = fullname.split("/")

    # Populate info dict
    info = dict()
    info['idsequencing'] = fullname_s[-2].split("seq")[1]
    info['filename'] = os.path.basename(fullname)
    info['aligner_index'] = fullname_s[-4]
    bam_datetime = datetime.datetime.fromtimestamp(os.stat(fullname).st_mtime)
    info['align_datetime'] = bam_datetime.strftime("%Y-%m-%d %H:%M:%S")
    info['aligner'] = header['PG'][0]['PN']
    info['command'] = header['PG'][0]['cl']

    # Compute total number of aligned reads
    stats = pysam.idxstats(fullname)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.splitlines()
    total_reads = 0
    for el in stats:
        if not el.strip():
            continue
        total_reads += int(el.split("\t")[2])
    info['total_reads'] = total_reads

    # Format dict entries for MySQL
    for key in list(info):
        info[key] = "'" + str(info[key]) + "'"

    ## Connect to db
    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to configuration.
    try:
        conn = mdb.connect('localhost', 'brad', 'Eu23ler1', 'sample_db')
        cur = conn.cursor()
    except mdb.Error as e:
        # The two error arguments must be passed as a tuple; adding them
        # together raised a TypeError and hid the real error.
        print("MySQLdb error %d: %s " % (e.args[0], e.args[1]))
def bamStats(bamfile):
    """ Extract average depths + idxstats data from BAM file, return data frame

    For every idxstats entry the read length is estimated from up to
    roughly the first 10000 fetched reads, and COVERAGE is
    MAPPED * READLEN / NT.  Entries for which this sampling fails keep
    READLEN 0 and COVERAGE 0.0.

    Returns a pandas.DataFrame with the columns
    CHROM, NT, MAPPED, UNMAPPED, READLEN, COVERAGE.
    """
    columns = ["CHROM", "NT", "MAPPED", "UNMAPPED", "READLEN", "COVERAGE"]

    istats = pysam.idxstats(bamfile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(istats, str):
        istats = istats.splitlines()

    result = []
    samfile = pysam.Samfile(bamfile, "rb")
    try:
        for x in istats:
            if not x.strip():
                continue
            xs = x.replace("\n", "").split("\t")
            rec = {
                "CHROM": xs[0],
                "NT": int(xs[1]),
                "MAPPED": int(xs[2]),
                "UNMAPPED": int(xs[3]),
                "READLEN": 0,
                "COVERAGE": 0.0,
            }

            count = 0
            rls = 0.0
            try:
                for read in samfile.fetch(xs[0]):
                    rls += float(read.rlen)
                    count += 1
                    if count > 10000:
                        break

                rls /= count
                rec["READLEN"] = rls
                rec["COVERAGE"] = float(rec["MAPPED"] * rec["READLEN"]) / float(rec["NT"])
            except Exception:
                # Best effort: entries that cannot be fetched, have no reads
                # or have zero length keep the defaults set above.
                pass
            result.append(rec)
    finally:
        samfile.close()

    if result:
        return pandas.DataFrame(result, columns=columns)
    return pandas.DataFrame(columns=columns)
def find_coverage(df, read_length=150.):
    """Add a 'cov' column holding the estimated coverage of each BAM file.

    For every row the BAM at ``row['BAM_path']`` is inspected through
    idxstats; coverage is (mapped + unmapped reads) * read_length divided
    by the summed reference lengths, ignoring the '*' entry.

    Arguments
    ---------
    df : pandas.DataFrame with a 'BAM_path' column; modified in place
    read_length : assumed read length in bases (default 150)

    Returns
    -------
    The same data frame.
    """
    print("finding coverage of BAM files")
    df['cov'] = None

    for index, row in df.iterrows():
        bam = row['BAM_path']
        stats = pysam.idxstats(bam)
        # The result is one string or a list of lines depending on the
        # pysam release; work on lines in both cases.
        if isinstance(stats, str):
            stats = stats.split("\n")

        nreads_mapped = 0
        nreads_unmapped = 0
        total_bp = 0
        for stat_line in stats:
            fields = stat_line.rstrip("\r\n").split("\t")
            if len(fields) > 3 and fields[0] != '*':
                total_bp += int(fields[1])
                nreads_mapped += int(fields[2])
                nreads_unmapped += int(fields[3])

        cov = (nreads_unmapped + nreads_mapped) * read_length / total_bp
        df.loc[index, 'cov'] = cov
    return df
def get_num_reads(filename):
    """Return the number of reads (mapped + unmapped) reported by idxstats.

    Returns 0 and writes a message to stderr when the reads cannot be
    counted.
    """
    num_reads = 0
    try:
        stats = pysam.idxstats(filename)
        # The result is one string or a list of lines depending on the
        # pysam release; work on lines in both cases.
        if isinstance(stats, str):
            stats = stats.splitlines()
        # Columns three onwards hold the read counts; int() replaces the
        # former eval() of text taken from the file.
        num_reads = sum(
            int(count)
            for line in stats if line.strip()
            for count in line.rstrip('\n').split('\t')[2:])
    except Exception:
        sys.stderr.write("Unable to count reads in file: %s" % filename)
    return num_reads
def count_all(bamfile):
    """Return the number of mapped reads reported by idxstats for *bamfile*.

    Only the third idxstats column (mapped reads) is summed; unmapped
    reads are not included.
    """
    stats = pysam.idxstats(bamfile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.splitlines()
    #count mapped reads
    return sum(
        int(l.rstrip('\n').split('\t')[2]) for l in stats if l.strip())
def testReturnValueString(self):
    """Check that idxstats returns the native string type of the interpreter."""
    bam_path = os.path.join(BAM_DATADIR, "ex1.bam")
    retval = pysam.idxstats(bam_path)
    if not IS_PYTHON3:
        # Python 2: a byte string that is also a basestring.
        self.assertTrue(isinstance(retval, bytes))
        self.assertTrue(isinstance(retval, basestring))
        return
    # Python 3: text, never bytes.
    self.assertFalse(isinstance(retval, bytes))
    self.assertTrue(isinstance(retval, str))
def test_idxstats_parse():
    """Check that every tab-separated idxstats line has exactly four fields."""
    bam_path = os.path.join(BAM_DATADIR, "ex2.bam")
    # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
    raw_output = pysam.idxstats(bam_path, split_lines=False)
    for record in raw_output.splitlines():
        # The unpacking raises ValueError on any other field count.
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
def _get_bam_stats(bam_filepath):
    """Return the idxstats table of *bam_filepath* as a DataFrame.

    The frame is indexed by "ref chrom" and has the columns
    "ref len", "mapped" and "unmapped".
    """
    idxstats_buffer = StringIO(pysam.idxstats(bam_filepath))
    return pd.read_csv(
        idxstats_buffer,
        delimiter="\t",
        names=["ref chrom", "ref len", "mapped", "unmapped"],
        index_col="ref chrom")
def getChromsFromBAM(filename):
    """Return the reference names listed by idxstats, in file order.

    The '*' entry and blank lines are left out.
    """
    first_columns = (
        row.split("\t")[0] for row in pysam.idxstats(filename).split("\n"))
    return [name for name in first_columns if name not in ('*', '')]
def getBamReads(bam):
    ''' get total reads from a bam file

    Sums the third idxstats column (mapped reads) over all entries.
    '''
    stats = pysam.idxstats(bam)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.split('\n')
    # Skip blank lines explicitly instead of assuming the last element is
    # always the empty remainder after a trailing newline.
    return sum(int(l.split('\t')[2]) for l in stats if l.strip())
def Main():
    """Annotate each interaction of the input file with per-dataset RPKM values.

    Total read counts are taken from idxstats for BAM datasets and by
    counting records for everything else.  Every non-blank input line
    whose two partners are not on chrM is written to the output with the
    RPKM columns of both partners inserted.
    """
    args = ParseArg()

    if len(args.data) != len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # Exit with a failure status so callers can detect the error.
        sys.exit(1)

    # store data information
    data = {}
    total_reads = {}
    for i in range(len(args.data)):
        temp_name = args.name[i]
        print >> sys.stderr, "\n Reading data file:" + temp_name + "..."
        total_reads[temp_name] = 0
        if args.format[i] == "bam":
            stats = pysam.idxstats(args.data[i])
            # The result is one string or a list of lines depending on
            # the pysam release; work on lines in both cases.
            if isinstance(stats, str):
                stats = stats.splitlines()
            total_reads[temp_name] = sum(
                int(l.rstrip('\n').split('\t')[2]) for l in stats if l.strip())
        else:
            Format = "bed"
            for b in TableIO.parse(args.data[i], Format):
                total_reads[temp_name] += 1
                if total_reads[temp_name] % 50000 == 0:
                    print >> sys.stderr, " reading %d reads..\r" % (
                        total_reads[temp_name]),
        data[temp_name] = DBI.init(args.data[i], args.format[i])

    # Both files are closed (and the output flushed) when the block ends.
    with open(args.output, 'w') as output, open(args.input, 'r') as Input:
        lines = Input.read().split("\n")

        # header
        header = ["chr", "start", "end", "type", "name", "subtype", "count"
                  ] + data.keys()
        print >> output, "\t".join(g + "_%d" % (f) for f in [1, 2]
                                   for g in header) + "\tinteraction\tp-value"

        num = 0
        print >> sys.stderr, "Start process interactions:"
        for l in lines:
            if l.strip() == '':
                continue
            l = l.strip().split('\t')
            num = num + 1
            if l[0] == "chrM" or l[7] == "chrM":
                continue
            C1 = Bed([l[0], int(l[1]), int(l[2])])
            C2 = Bed([l[7], int(l[8]), int(l[9])])
            rpkm1 = "\t".join(
                str(f) for f in
                [RPKM(C1, data[n], total_reads[n], n) for n in data.keys()])
            rpkm2 = "\t".join(
                str(f) for f in
                [RPKM(C2, data[n], total_reads[n], n) for n in data.keys()])
            print >> output, "\t".join(
                str(f) for f in l[:7] + [rpkm1] + l[7:14] + [rpkm2, l[14], l[15]])
            if num % 1000 == 0:
                print >> sys.stderr, " Output interaction: %d\r" % (num),
def calculate_samples(bamfile, count, ref_prefix=None):
    """Distribute *count* over the references in proportion to their mapped reads.

    Arguments
    ---------
    bamfile : path of an indexed BAM file
    count : total to distribute
    ref_prefix : if given, only references whose name starts with this
        prefix are considered

    Returns
    -------
    dict mapping reference name to (mapped reads, rounded share of count).

    Raises
    ------
    ZeroDivisionError
        If the considered references have no mapped reads at all.
    """
    stats = pysam.idxstats(bamfile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.split("\n")

    refcounts = dict()
    for s in stats:
        tok = s.rstrip().split("\t")
        if len(tok) < 3:
            # The blank remainder after the final newline used to raise
            # IndexError when no prefix filter was given.
            continue
        if ref_prefix is not None and not tok[0].startswith(ref_prefix):
            continue
        refcounts[tok[0]] = int(tok[2])

    coef = (count * 1.0) / sum(refcounts.values())
    return dict((k, (v, int(np.round(v * coef)))) for k, v in refcounts.items())
def countReadsInBAM(filename):
    """Return the sum of the third idxstats column over all named references.

    Lines with fewer than three fields and the '*' entry are ignored.
    """
    split_rows = (
        row.split("\t") for row in pysam.idxstats(filename).split("\n"))
    return sum(
        int(fields[2])
        for fields in split_rows
        if len(fields) > 2 and fields[0] != '*')
def get_total_reads(bam_filename):
    """Return the sum of the second-to-last idxstats column over all lines.

    Empty lines are skipped.
    """
    stat_lines = pysam.idxstats(bam_filename).split('\n')
    return sum(
        int(stat_line.split('\t')[-2])
        for stat_line in stat_lines
        if stat_line)
def test_idxstats_parse():
    """Check that every tab-separated idxstats line has exactly four fields."""
    # Test pysam 0.9.X style output, which returns a string that needs to be split by \n
    raw_output = pysam.idxstats("./pysam_data/ex2.bam", split_lines=False)
    for record in raw_output.splitlines():
        # The unpacking raises ValueError on any other field count.
        _seqname, _seqlen, nmapped, _nunmapped = record.split("\t")
def calculate_samples(bamfile, count, ref_prefix=None):
    """Distribute *count* over the references in proportion to their mapped reads.

    Arguments
    ---------
    bamfile : path of an indexed BAM file
    count : total to distribute
    ref_prefix : if given, only references whose name starts with this
        prefix are considered

    Returns
    -------
    dict mapping reference name to (mapped reads, rounded share of count).

    Raises
    ------
    ZeroDivisionError
        If the considered references have no mapped reads at all.
    """
    stats = pysam.idxstats(bamfile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.split("\n")

    refcounts = dict()
    for s in stats:
        tok = s.rstrip().split("\t")
        if len(tok) < 3:
            # The blank remainder after the final newline used to raise
            # IndexError when no prefix filter was given.
            continue
        if ref_prefix is not None and not tok[0].startswith(ref_prefix):
            continue
        refcounts[tok[0]] = int(tok[2])

    coef = (count * 1.0) / sum(refcounts.values())
    return dict(
        (k, (v, int(np.round(v * coef)))) for k, v in refcounts.items())
def _method_pysam(self, *args, **kwargs):
    """Index ``self.infile`` and write its idxstats table to ``self.outfile``.

    The output starts with a header line naming the four columns,
    followed by the idxstats output unchanged.
    """
    import pysam

    # index the bam file
    pysam.index(self.infile)

    # create count table
    with open(self.outfile, 'wt') as out:
        # Text-mode files translate "\n" themselves; writing os.linesep
        # here produced "\r\r\n" on Windows.
        out.write("Reference sequence name\tSequence length\t"
                  "Mapped reads\tUnmapped reads\n")
        stats = pysam.idxstats(self.infile)
        out.write(stats if isinstance(stats, str) else "".join(stats))
def bam_total_reads(bam_fname):
    """Count the total number of mapped reads in a BAM file.

    Uses the BAM index to do this quickly.

    Raises ValueError if a non-blank idxstats line does not have exactly
    four fields.
    """
    lines = pysam.idxstats(bam_fname)
    # The result is one string or a list of lines depending on the pysam
    # release; iterating a string directly would yield single characters.
    if isinstance(lines, str):
        lines = lines.splitlines()

    tot_mapped_reads = 0
    for line in lines:
        if not line.strip():
            continue
        _seqname, _seqlen, nmapped, _nunmapped = line.split()
        tot_mapped_reads += int(nmapped)
    return tot_mapped_reads
def getTotalReads(bam):
    """Return (total reads, reads per reference) taken from idxstats.

    Each count is mapped + unmapped reads; blank lines and the '*' entry
    are skipped.
    """
    perChromCount = {}
    totalReads = 0
    for stat_line in pysam.idxstats(bam).split('\n'):
        fields = stat_line.split()
        if not fields or fields[0] == "*":
            continue
        # mapped + unmapped reads
        reads_here = int(fields[2]) + int(fields[3])
        perChromCount[fields[0]] = reads_here
        totalReads += reads_here
    return totalReads, perChromCount
def creatChromeSize(bamFileName):
    """Write a "<name>\\t<length>" table of the BAM references and return its path.

    The table is written next to the BAM file, with the extension
    replaced by ".chromesize"; the '*' idxstats entry is left out.
    """
    preffixName, suffixName = os.path.splitext(bamFileName)
    tmpChromeSizeFilename = preffixName + ".chromesize"
    # The context manager closes the file even if parsing fails.
    with open(tmpChromeSizeFilename, "w") as ftmp:
        stats = pysam.idxstats(bamFileName)
        # The result is one string or a list of lines depending on the
        # pysam release; work on lines in both cases.
        if isinstance(stats, str):
            stats = stats.strip().split('\n')
        for line in stats:
            fields = line.strip().split()
            if len(fields) < 2:
                # Blank line, e.g. when idxstats produced no output.
                continue
            if fields[0] != "*":
                ftmp.write(fields[0] + "\t" + fields[1] + "\n")
    return tmpChromeSizeFilename
def normalization(options):
    """
    find total number of mapped reads for each chromosome to be used as
    a scaling factor when comparing between samples

    Returns a dict mapping each idxstats reference name of
    ``options.file`` to its mapped-read count.
    """
    stats = pysam.idxstats(options.file)
    # The result is one string or a list of lines depending on the pysam
    # release; iterating a string directly would yield single characters.
    if isinstance(stats, str):
        stats = stats.splitlines()

    norms = {}
    for stat_line in stats:
        fields = stat_line.split()
        if len(fields) < 3:
            continue
        norms[fields[0]] = int(fields[2])
    return norms
def idxstats(bam_fname, drop_unmapped=False):
    """Get chromosome names, lengths, and number of mapped/unmapped reads.

    Use the BAM index (.bai) to get the number of reads and size of each
    chromosome.

    With ``drop_unmapped`` set, rows with zero mapped reads are removed
    and the 'unmapped' column is dropped; otherwise the full table with
    the columns chromosome, length, mapped, unmapped is returned.
    """
    column_names = ['chromosome', 'length', 'mapped', 'unmapped']
    raw_text = pysam.idxstats(bam_fname, split_lines=False)
    table = pd.read_table(StringIO(raw_text), header=None, names=column_names)
    if not drop_unmapped:
        return table
    has_mapped_reads = table.mapped != 0
    return table[has_mapped_reads].drop('unmapped', axis=1)
def CountRandom(BamFile):
    """Count mapped reads that lie outside the chromosomes of GetChromoList().

    Returns ``[BamFile, {"Random": n}]`` where n is the total number of
    mapped reads minus those mapped to the listed chromosomes.
    """
    samIdxStats = pysam.idxstats(BamFile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(samIdxStats, str):
        samIdxStats = samIdxStats.splitlines()

    samfile = pysam.Samfile(BamFile, "rb")
    try:
        TotalMapped = samfile.mapped
    finally:
        samfile.close()

    List = GetChromoList()
    countAlign = 0
    for stat in samIdxStats:
        fields = stat.split()
        if fields and fields[0] in List:
            # int() is arbitrary precision, so the Python-2-only long()
            # is not needed.
            countAlign = countAlign + int(fields[2])
    RandAlign = TotalMapped - countAlign
    return [BamFile, {"Random": RandAlign}]
def _init_read_number(self, bamFile):
    """Compute number of reads and number of mapped reads for CoverageSet

    Sets ``self.reads`` to mapped + unmapped reads.  ``self.mapped_reads``
    is set to the mapped-read total when idxstats returns a string
    (pysam >= 0.9) and stays None for the list output of older releases,
    as before.
    """
    # XXX ToDo add number of mapped reads in all cases
    stats = pysam.idxstats(bamFile)
    # Decide on the type actually returned instead of parsing the pysam
    # version with distutils, which is gone from recent Python releases.
    string_output = isinstance(stats, str)
    lines = stats.split('\n') if string_output else stats

    mapped_reads = 0
    unmapped_read = 0
    for line in lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 4:
            # Blank remainder after the final newline.
            continue
        # Plain int() parsing replaces the former eval() of file content.
        mapped_reads += int(fields[2])
        unmapped_read += int(fields[3])

    self.reads = mapped_reads + unmapped_read
    self.mapped_reads = mapped_reads if string_output else None
def _validate_mito_bam(data, handle, info): if data.mitochondria is None: # No mitochondrial data .. skip phylogeny return True references = handle.references min_length = min((len(record.sequence)) for record in data.mitochondria.itervalues()) for bam_contig, bam_length in zip(references, handle.lengths): if bam_contig not in data.mitochondria: continue db_sequence = data.mitochondria[bam_contig].sequence db_length = len(db_sequence) - db_sequence.count("-") if bam_length != db_length: print_err("ERROR: Length of mitochondrial contig %r (%i bp) " "does not match the length of the corresponding " "sequence in the database (%i bp)" % (bam_contig, bam_length, db_length)) return False if not os.path.exists(handle.filename + '.bai') \ and not os.path.exists(swap_ext(handle.filename, '.bai')): print_info(' - Attempting to index BAM file %r!' % (handle.filename,)) pysam.index(handle.filename) # Workaround for pysam < 0.9 returning list, >= 0.9 returning str for line in "".join(pysam.idxstats(handle.filename)).split('\n'): line = line.strip() if not line: continue name, _, hits, _ = line.split('\t') if (name == bam_contig) and not int(hits): print_err("WARNING: Mitochondrial BAM (%r) does not contain " "any reads aligned to contig %r; inferring an " "phylogeny is not possible." % (handle.filename, name)) return True info.mt_contig = bam_contig info.mt_length = bam_length info.mt_padding = len(db_sequence) - min_length return True return True
def verify_chrom_in_paths(genome_path, bamfile1, bamfile2, chrom_sizes):
    """Check whether the chromsome info overlap in bamfiles, genome path and chrom size path

    Returns True as soon as one genome sequence name occurs in both BAM
    files and in the chromosome sizes file, and False when there is no
    such name.  If the BAM files cannot be inspected the check is skipped
    and True is returned.
    """

    def bam_chroms(bamfile):
        # One idxstats call per file; the result is one string or a list
        # of lines depending on the pysam release, so decide on the type
        # rather than on an exact version string.
        stats = pysam.idxstats(bamfile)
        if isinstance(stats, str):
            stats = stats.split('\n')
        return set(line.split('\t')[0] for line in stats if line.strip())

    #check bam files
    try:
        chrom_bams = bam_chroms(bamfile1) & bam_chroms(bamfile2)
    except Exception:
        # Best effort, as before: an unreadable BAM does not fail the check.
        return True

    #check chrom_sizes
    chrom_chrom_sizes = set()
    with open(chrom_sizes) as f:
        for line in f:
            chrom_chrom_sizes.add(line.split('\t')[0])

    tmp = chrom_bams & chrom_chrom_sizes
    if len(tmp) == 0:
        return False

    #check genome
    chrom_genome = set()
    for s in FastaReader(genome_path):
        chrom_genome.add(s.name)
        if s.name in tmp:  #one overlap is sufficient
            return True

    return len(chrom_bams & chrom_genome & chrom_chrom_sizes) >= 1
def get_chrom_lengths(path_to_bam):
    '''
    Uses pysam to retrieve chromosome sizes form bam. Useful helper to use with some pybedtools functions (e.g. coverage), when a bam was mapped with custom genome not available in UCSC.
    Input: path to bam file (should be indexed)
    Output: dictionary mapping every idxstats entry name to (0, length).
    Example output: {'chr4': (0, 1351857), 'chr3L': (0, 24543557), 'chr2L': (0, 23011544), '*': (0, 0), 'chrX': (0, 22422827), 'chr2R': (0, 21146708), 'chr3R': (0, 27905053)}
    '''
    split_rows = (
        row.split("\t") for row in pysam.idxstats(path_to_bam).splitlines())
    return dict((fields[0], (0, int(fields[1]))) for fields in split_rows)
def _init_read_number(self, bamFile):
    """Compute number of reads and number of mapped reads for CoverageSet

    Sets ``self.reads`` to mapped + unmapped reads.  ``self.mapped_reads``
    is set to the mapped-read total when idxstats returns a string and
    stays None for the list output of older pysam releases.  Both are
    set to None if the statistics cannot be computed.
    """
    # XXX ToDo add number of mapped reads in all cases
    try:
        stats = pysam.idxstats(bamFile)
        # Decide on the type actually returned: comparing the version with
        # the exact string '0.9.0' sent every later release down the list
        # branch, which then failed and left both attributes at None.
        string_output = isinstance(stats, str)
        lines = stats.split('\n') if string_output else stats

        mapped_reads = 0
        unmapped_read = 0
        for line in lines:
            fields = line.rstrip('\n').split('\t')
            if len(fields) < 4:
                # Blank remainder after the final newline.
                continue
            # Plain int() parsing replaces the former eval() of file content.
            mapped_reads += int(fields[2])
            unmapped_read += int(fields[3])

        reads = mapped_reads + unmapped_read
        if not string_output:
            mapped_reads = None
    except Exception:
        reads = None
        mapped_reads = None

    self.reads = reads
    self.mapped_reads = mapped_reads
def bam_total_reads(bam_handle, chroms_to_ignore):
    """Count the total number of mapped reads in a BAM file, filtering
    the chromosome given in chroms_to_ignore list

    Without chromosomes to ignore, ``bam_handle.mapped`` is returned
    directly; otherwise the mapped-read column of idxstats is summed over
    the chromosomes that are not ignored.
    """
    if not chroms_to_ignore:
        return bam_handle.mapped

    import pysam

    lines = pysam.idxstats(bam_handle.filename)
    # The result is one string or a list of lines depending on the pysam
    # release; iterating a string directly would yield single characters.
    if isinstance(lines, str):
        lines = lines.strip().split("\n")

    tot_mapped_reads = 0
    for line in lines:
        if not line.strip():
            continue
        chrom, _len, nmapped, _nunmapped = line.split("\t")
        if chrom not in chroms_to_ignore:
            tot_mapped_reads += int(nmapped)
    return tot_mapped_reads
def write_table(filename, outfile):
    """ Create a count table for a BAM file using pysam.

    The BAM file is indexed first; the output of pysam's idxstats is then
    written unchanged to the count table.

    Args :
        filename [STR] : BAM file to count
        outfile [STR] : count table name

    No Returns
    """
    # index the bam file
    pysam.index(filename)
    # create count table
    count_table = pysam.idxstats(filename)
    # write the count table
    with open(outfile, 'wt') as out:
        out.writelines(count_table)
def get_chromosomes_info(bam_path):
    """Return [name, length, mapped reads] for every reference of the BAM.

    An index is generated first if ``bam_path + ".bai"`` does not exist.
    The last idxstats entry is dropped.  If the statistics cannot be
    read, an error is logged and the process exits with status 1.
    """
    # Check if there is an index file, create one if there isn't
    if not os.path.isfile(bam_path + ".bai"):
        pysam.index(bam_path)
        logging.info('No BAM index file was found, new index was generated : `{}`'.format(bam_path + ".bai"))

    # Take chromosome data from BAM index:
    # (ref.seq. name, ref.seq. length, number of mapped reads and number of unmapped reads)
    chromosomes_info = []
    logging.info('Collecting information about sample from .bai file: '
                 '[ref.seq. name, ref.seq. length, number of mapped and unmapped reads]')
    logging.info("\nGenome ID {} \nEstimated mappability {}".format('?', '?'))

    try:
        stats = pysam.idxstats(bam_path)
        # The result is one string or a list of lines depending on the
        # pysam release; iterating a string would yield single characters.
        if isinstance(stats, str):
            stats = stats.splitlines()
        for stat_line in stats:
            if not stat_line.strip():
                continue
            # Keep every column but the last one (unmapped reads).
            chromosomes_info.append(stat_line.split("\t")[:-1])
        # Last line is unmapped reads, we don't need them
        chromosomes_info.pop()
    except Exception:
        logging.error("\nPROBLEM WITH BAM FILE OR pysam.idxstats() COMMAND\nYour BAM file {} probably is not sorted."
                      "\n\nTo sort it with samtools use comand: \n'samtools sort {} {}'"
                      .format(bam_path, bam_path, bam_path[:-3] + 'sorted'))
        sys.exit(1)

    return chromosomes_info
def Main():
    """Annotate each interaction of the input file with per-dataset RPKM values.

    Total read counts are taken from idxstats for BAM datasets and by
    counting records for everything else.  Every non-blank input line
    whose two partners are not on chrM is written to the output with the
    RPKM columns of both partners inserted.
    """
    args=ParseArg()

    if len(args.data)!=len(args.name):
        print >> sys.stderr, "ERROR: Number of data is not the same as number of names!"
        # Exit with a failure status so callers can detect the error.
        sys.exit(1)

    # store data information
    data={}
    total_reads={}
    for i in range(len(args.data)):
        temp_name=args.name[i]
        print >> sys.stderr, "\n Reading data file:"+temp_name+"..."
        total_reads[temp_name]=0
        if args.format[i]=="bam":
            stats=pysam.idxstats(args.data[i])
            # The result is one string or a list of lines depending on
            # the pysam release; work on lines in both cases.
            if isinstance(stats, str):
                stats=stats.splitlines()
            total_reads[temp_name]=sum(
                int(l.rstrip('\n').split('\t')[2]) for l in stats if l.strip())
        else:
            Format="bed"
            for b in TableIO.parse(args.data[i],Format):
                total_reads[temp_name]+=1
                if total_reads[temp_name]%50000==0:
                    print >> sys.stderr, " reading %d reads..\r"%(total_reads[temp_name]),
        data[temp_name]=DBI.init(args.data[i],args.format[i])

    # Both files are closed (and the output flushed) when the block ends.
    with open(args.output,'w') as output, open(args.input,'r') as Input:
        lines=Input.read().split("\n")

        # header
        header=["chr","start","end","type","name","subtype","count"]+data.keys()
        print >> output, "\t".join(g+"_%d"%(f) for f in [1,2] for g in header)+"\tinteraction\tp-value"

        num=0
        print >> sys.stderr, "Start process interactions:"
        for l in lines:
            if l.strip()=='':
                continue
            l=l.strip().split('\t')
            num=num+1
            if l[0]=="chrM" or l[7]=="chrM":
                continue
            C1=Bed([l[0],int(l[1]),int(l[2])])
            C2=Bed([l[7],int(l[8]),int(l[9])])
            rpkm1="\t".join(str(f) for f in [RPKM(C1,data[n],total_reads[n],n) for n in data.keys()])
            rpkm2="\t".join(str(f) for f in [RPKM(C2,data[n],total_reads[n],n) for n in data.keys()])
            print >> output, "\t".join(str(f) for f in l[:7]+[rpkm1]+l[7:14]+[rpkm2,l[14],l[15]])
            if num%1000==0:
                print >> sys.stderr, " Output interaction: %d\r"%(num),
def count_telomeric_reads(bamfile, q):
    """Count telomeric reads of *bamfile* and report them relative to all reads.

    Reads containing ``options.repsize`` consecutive TTAGGG or CCCTAA
    repeats are extracted into a "_TelomericReads.sam" file (unless that
    file already exists) and counted by line.

    The result list [sample name, total reads, telomeric reads,
    telomeric reads per 100000 reads], all as strings, is put on queue
    *q* and returned.
    """
    # generate Telomere reads file name
    telofile = bamfile.replace(options.bamdir,options.outdir).replace(".bam","_TelomericReads.sam")

    # check if the file was already generated
    if not os.path.exists(telofile):
        # extract telomeric reads and write to file
        # NOTE(review): the command is assembled by string concatenation
        # and run through the shell, so file names with spaces or shell
        # metacharacters will break it.
        cmd = options.sambamba+" view "+bamfile+" -t "+ str(options.nr_cpus) +" | LC_ALL=C grep -E \"" + "TTAGGG"*options.repsize +"|"+ "CCCTAA"*options.repsize + "\"" + " > " + telofile
        print("++++ Generating SAM file: "+telofile)
        os.system(cmd)

    # count total number of reads
    stats = pysam.idxstats(bamfile)
    # The result is one string or a list of lines depending on the pysam
    # release; work on lines in both cases.
    if isinstance(stats, str):
        stats = stats.splitlines()
    # Columns three onwards hold the read counts; int() replaces the
    # former eval() of text taken from the file.
    total_rc = sum(
        int(read_count)
        for stat_line in stats if stat_line.strip()
        for read_count in stat_line.rstrip('\n').split('\t')[2:])
    sleep(1)

    telomere_rc = 0
    if os.path.exists(telofile):
        # count number of telomeric reads by line count
        with open(telofile,'r') as telo_handle:
            telomere_rc = sum(1 for line in telo_handle)
    else:
        print("Something went wrong with BAM file: "+bamfile)

    # return results
    result = [str(bamfile.split("/")[-1].split("_")[0]), str(total_rc), str(telomere_rc), str((telomere_rc/(total_rc*1.0))*100000.0)]
    q.put(result)
    return(result)
def bam_statistics(bam_filename):
    """Return mapped / unmapped / total read counts taken from idxstats.

    The last idxstats entry is left out of all three totals.

    Returns a dict with the keys 'mapped', 'notmapped' and 'all'.
    """
    stats = pysam.idxstats(bam_filename)
    # The result is one string or a list of lines depending on the pysam
    # release; a string cannot be truncated with del, so build a list of
    # split rows in both cases.
    if isinstance(stats, str):
        stats = stats.splitlines()
    rows = [el.split("\t") for el in stats if el.strip()]
    rows = rows[:-1]  # * 0 0 0 0 ...?

    mapped_reads = sum(int(row[2]) for row in rows)
    notmapped_reads = sum(int(row[3]) for row in rows)
    return {'mapped':mapped_reads, 'notmapped':notmapped_reads, 'all':mapped_reads+notmapped_reads}
def test_idxstats_parse_split_lines():
    """Check that every line returned with ``split_lines=True`` has four fields."""
    # Test pysam 0.8.X style output, which returns a list of lines
    stat_lines = pysam.idxstats("./pysam_data/ex2.bam", split_lines=True)
    for stat_line in stat_lines:
        # The unpacking raises ValueError unless there are exactly four
        # whitespace-separated fields.
        _seqname, _seqlen, nmapped, _nunmapped = stat_line.split()
def coverage_from_bam(self, bam_file, read_size = 200, binsize = 100, stepsize = 50, rmdup = True, mask_file = None):
    """Compute the coverage of each genomic region from <bam_file>.

    One numpy array per region of self.genomicRegions is appended to
    self.coverage; self.coverageorig receives a copy of that list.
    Nothing is returned.

    Consider reads in <bam_file> with a extension size of <read_size>
    (the length of the first read in the file is added to it).
    Remove duplicates (read with same position) with rmdup=True (default).
    Divide the genomic regions in bins with a width of <binsize> and use <stepsize> to smooth the signal.
    If <mask_file> names an existing file, read positions falling into the
    regions it lists are ignored.

    Also sets self.binsize, self.stepsize, self.mapped_reads and self.reads."""
    self.binsize = binsize
    self.stepsize = stepsize

    bam = pysam.Samfile(bam_file, "rb" )
    # Extend read_size by the length of the first read only.
    for read in bam.fetch():
        read_size += read.rlen
        break

    # NOTE(review): both totals iterate the idxstats result as a list of
    # lines and eval() text from it; a pysam release that returns a single
    # string would be iterated per character - confirm the pysam version.
    self.mapped_reads = reduce(lambda x, y: x + y, [ eval('+'.join(l.rstrip('\n').split('\t')[2:3]) ) for l in pysam.idxstats(bam_file) ])
    self.reads = reduce(lambda x, y: x + y, [ eval('+'.join(l.rstrip('\n').split('\t')[2:]) ) for l in pysam.idxstats(bam_file) ])
    #print("Loading reads of %s..." %self.name, file=sys.stderr)

    #check whether one should mask
    next_it = True
    if mask_file is not None and os.path.exists(mask_file):
        mask = True
        # NOTE(review): this handle is not closed anywhere in this method.
        f = open(mask_file, 'r')
        c_help, s_help, e_help = self.genomicRegions.sequences[0].chrom, -1, -1
    else:
        mask = False

    chrom_regions = [r.chrom for r in self.genomicRegions.sequences] #chroms by regions

    for region in self.genomicRegions:
        # NOTE(review): "/" yields a float under Python 3, which cannot be
        # used to repeat a list - presumably written for Python 2.
        cov = [0] * (len(region) / stepsize)
        positions = []
        j = 0
        read_length = -1
        try:
            for read in bam.fetch(region.chrom, max(0, region.initial-read_size), region.final+read_size):
                j += 1
                read_length = read.rlen
                if not read.is_unmapped:
                    pos = read.pos - read_size if read.is_reverse else read.pos
                    pos_help = read.pos - read.qlen if read.is_reverse else read.pos

                    #if position in mask region, then ignore
                    if mask:
                        while next_it and c_help not in chrom_regions: #do not consider this deadzone
                            c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                        if c_help != -1 and chrom_regions.index(region.chrom) >= chrom_regions.index(c_help): #deadzones behind, go further
                            while next_it and c_help != region.chrom: #get right chromosome
                                c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                            while next_it and e_help <= pos_help and c_help == region.chrom: #check right position
                                c_help, s_help, e_help, next_it = self._get_bedinfo(f.readline())
                            if next_it and s_help <= pos_help and c_help == region.chrom:
                                continue #pos in mask region

                    positions.append(pos)
        except ValueError:
            # A ValueError raised while fetching or masking ends the read
            # collection for this region; positions gathered so far are kept.
            pass
        if rmdup:
            positions = list(set(positions))

        # Descending order, so that pop() below yields the smallest position.
        positions.sort()
        positions.reverse()

        i = 0
        while positions:
            # Window i is centred on i * stepsize and is binsize wide.
            win_s = max(0, i * stepsize - binsize*0.5) + region.initial
            win_e = i * stepsize + binsize*0.5 + region.initial
            c = 0
            taken = []
            while True:
                s = positions.pop()

                taken.append(s)
                if s < win_e: #read within window
                    c += 1
                if s >= win_e or not positions:
                    taken.reverse()
                    for s in taken:
                        if s + read_size + read_length >= win_s: #consider read in next iteration
                            positions.append(s)
                        else:
                            break #as taken decreases monotonously
                    taken = []
                    break
            if i < len(cov):
                cov[i] = c
            i += 1

        self.coverage.append(np.array(cov))

    self.coverageorig = self.coverage[:]