import os
import re
import sys
import subprocess
import tempfile

import pysam
from pysam import bcftools


def genotype_bam(argd):
    """Call genotypes in a bam file at the given SNPs.

    argd is a dictionary of arguments.
    """
    # Extract reads overlapping the panel SNPs
    aln_f = pysam.AlignmentFile(argd['bam'])
    panel_bam = tempfile.NamedTemporaryFile(
        prefix=re.sub(r'\.bam$', '', os.path.basename(argd['bam'])) + '.',
        suffix='.bam', delete=False, dir=argd['tmpdir'])
    panel_bam.close()
    bname = os.path.splitext(panel_bam.name)[0]
    aln_out_us = pysam.AlignmentFile(panel_bam.name, 'wb', header=aln_f.header)
    panel = pysam.VariantFile(argd['snp_panel'])
    prev_reads = []
    for snp in panel:
        reads = []
        for read in aln_f.fetch(snp.chrom, snp.start, snp.stop):
            reads.append(read)
        # Skip reads already written for the previous SNP
        outreads = [x for x in reads if x not in prev_reads]
        for read in outreads:
            aln_out_us.write(read)
        prev_reads = reads
    aln_f.close()
    aln_out_us.close()
    panel.close()

    # Pileup
    pysam.mpileup('-f', argd['ref'], '-g', '--min-MQ', str(argd['mapq']),
                  '-o', bname + '.all.bcf', bname + '.bam', catch_stdout=False)
    bcftools.index(bname + '.all.bcf', catch_stdout=False)

    # Call genotypes.
    # We use subprocess instead of pysam because of
    # https://github.com/pysam-developers/pysam/issues/693
    cmd = ['bcftools', 'call', '-T', argd['snp_panel'], '-m',
           '--skip-variants', 'indels', '-O', 'z',
           '-o', bname + '.calls.vcf.gz', bname + '.all.bcf']
    sp = subprocess.Popen(cmd, stderr=subprocess.PIPE)
    stdout, stderr = sp.communicate()
    stderr = stderr.decode().split('\n')
    # We ignore this warning
    stderr = '\n'.join([x for x in stderr
                        if 'assuming all sites are diploid' not in x])
    if stderr != '':
        sys.stderr.write(stderr)
    if sp.returncode != 0:
        raise Exception('\n%s exit code from \n\n%s\n'
                        % (sp.returncode, ' '.join(cmd)))

    # Make tabular format
    with open(bname + '.txt', 'w') as txt:
        txt.write('\t'.join(['chrom', 'pos', 'alt', 'gt', 'qual']) + '\n')
        calls = pysam.VariantFile(bname + '.calls.vcf.gz')
        for x in calls:
            if x.qual < argd['min_gq'] or sum(x.info['DP4']) < argd['min_dp4']:
                continue
            if x.alts is None:
                alt = '.'
            else:
                alt = x.alts[0]
            gt = x.samples[0]['GT']
            if gt[0] is None or gt[1] is None:
                continue
            line = [x.chrom, str(x.pos), alt,
                    str(gt[0]) + '/' + str(gt[1]), str(round(x.qual, 1))]
            txt.write('\t'.join(line) + '\n')
        calls.close()
    return {'table': bname + '.txt', 'bam': argd['bam']}

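# A minimal sketch of how genotype_bam might be invoked. The argument keys
# are inferred from the lookups inside the function ('ref', 'mapq', 'min_gq',
# 'min_dp4' and friends); they are assumptions, not a documented interface,
# and the file names are placeholders.
result = genotype_bam({
    'bam': 'sample.bam',          # alignments to genotype
    'snp_panel': 'panel.vcf.gz',  # SNP positions to call
    'ref': 'genome.fa',           # reference used for the pileup
    'mapq': 20,                   # minimum mapping quality
    'min_gq': 20,                 # minimum genotype quality to report
    'min_dp4': 10,                # minimum summed DP4 depth
    'tmpdir': '.',
})
print(result['table'])
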
def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
    # Count linear spliced reads.
    # Split the circle coordinates into left (start) and right (end) positions.
    if header:
        coor = open(circ_coor, 'r').readlines()[1:]
    else:
        coor = open(circ_coor, 'r').readlines()
    start_coor = open(self.tmp_dir + 'tmp_start_coor', 'w')
    start_coor_1 = open(self.tmp_dir + 'tmp_start_coor_1', 'w')
    end_coor = open(self.tmp_dir + 'tmp_end_coor', 'w')
    end_coor_1 = open(self.tmp_dir + 'tmp_end_coor_1', 'w')
    for line in coor:
        tmp = line.split('\t')
        start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
        start_coor_1.write(tmp[0] + '\t' + str(int(tmp[1]) - 1) + '\n')
        end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
        end_coor_1.write(tmp[0] + '\t' + str(int(tmp[2]) + 1) + '\n')
    # close position files
    start_coor.close()
    start_coor_1.close()
    end_coor.close()
    end_coor_1.close()
    print('Started linear spliced read counting for %s' % bamfile)
    # mpileup gets the number of spliced reads at the circle start position
    # and the (start - 1) position.
    print("\t=> running mpileup 1 for start positions [%s]" % bamfile)
    mpileup_start = pysam.mpileup(bamfile, '-f', ref,
                                  '-l', self.tmp_dir + 'tmp_start_coor')
    print("\t=> running mpileup 2 for start positions [%s]" % bamfile)
    mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref,
                                    '-l', self.tmp_dir + 'tmp_start_coor_1')
    # mpileup gets the number of spliced reads at the circle end position
    # and the (end + 1) position.
    print("\t=> running mpileup 1 for end positions [%s]" % bamfile)
    mpileup_end = pysam.mpileup(bamfile, '-f', ref,
                                '-l', self.tmp_dir + 'tmp_end_coor')
    print("\t=> running mpileup 2 for end positions [%s]" % bamfile)
    mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref,
                                  '-l', self.tmp_dir + 'tmp_end_coor_1')
    # get counts
    print("\t=> gathering read counts for start positions [%s]" % bamfile)
    startcount = self.submpileup(self.getreadscount(mpileup_start_1),
                                 self.getreadscount(mpileup_start))
    print("\t=> gathering read counts for end positions [%s]" % bamfile)
    endcount = self.submpileup(self.getreadscount(mpileup_end),
                               self.getreadscount(mpileup_end_1), left=False)
    # remove tmp files
    # os.remove(self.tmp_dir + 'tmp_start_coor')
    # os.remove(self.tmp_dir + 'tmp_start_coor_1')
    # os.remove(self.tmp_dir + 'tmp_end_coor')
    # os.remove(self.tmp_dir + 'tmp_end_coor_1')
    print('Finished linear spliced read counting for %s' % bamfile)
    return startcount, endcount

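# self.getreadscount and self.submpileup are not included in these snippets.
# Plausible sketches, assuming getreadscount tallies the '>'/'<' symbols that
# samtools mpileup emits for reference skips (i.e. spliced reads), keyed by
# "chrom\tpos", and submpileup pairs each junction position with its
# neighbour just outside the circle and subtracts the counts:
def getreadscount(mpileup_out, countmapped=False):
    counts = {}
    for row in mpileup_out.splitlines():
        if not row:
            continue
        chrom, pos, _ref, depth, bases = row.split('\t')[:5]
        key = chrom + '\t' + pos
        # full mapped depth when countmapped, else spliced reads only
        counts[key] = int(depth) if countmapped \
            else bases.count('>') + bases.count('<')
    return counts


def submpileup(counts_a, counts_b, left=True):
    # For left=True, counts_a holds the (start - 1) pileup and counts_b the
    # start pileup; for left=False, counts_a holds the end pileup and
    # counts_b the (end + 1) pileup. Either way, report per junction the
    # spliced reads just outside the circle minus those at it, floored at 0.
    outside, at_site = (counts_a, counts_b) if left else (counts_b, counts_a)
    shift = -1 if left else 1
    result = {}
    for key, n in at_site.items():
        chrom, pos = key.split('\t')
        neighbour = chrom + '\t' + str(int(pos) + shift)
        result[key] = max(outside.get(neighbour, 0) - n, 0)
    return result
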
def genecount(self, circ_coordinates, bamfile, ref, tid):
    """
    @circ_coordinates: quoted string, content with format "chr1\tstart\tend"
    @bamfile: quoted string
    @ref: quoted string
    """
    # process the circ_coordinates to left circ position and right circ position
    coordinates = open(circ_coordinates, 'r').readlines()[1:]
    start_coordinates = open(self.tmp_dir + 'tmp_start_coordinates_' + tid, 'w')
    end_coordinates = open(self.tmp_dir + 'tmp_end_coordinates_' + tid, 'w')
    for line in coordinates:
        tmp = line.split('\t')
        start_coordinates.write(tmp[0] + '\t' + tmp[1] + '\n')
        end_coordinates.write(tmp[0] + '\t' + tmp[2] + '\n')
    # close position files
    start_coordinates.close()
    end_coordinates.close()
    print('Started linear gene expression counting for %s' % bamfile)
    start = time.time()
    # mpileup gets the read counts of the start positions
    print("\t=> running mpileup for start positions [%s]" % bamfile)
    mpileup_start = pysam.mpileup(
        bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_start_coordinates_' + tid)
    end = time.time() - start
    print("\t=> mpileup for start positions for %s took %d seconds" % (bamfile, end))
    start = time.time()
    # mpileup gets the read counts of the end positions
    print("\t=> running mpileup for end positions [%s]" % bamfile)
    mpileup_end = pysam.mpileup(
        bamfile, '-f', ref, '-l', self.tmp_dir + 'tmp_end_coordinates_' + tid)
    end = time.time() - start
    print("\t=> mpileup for end positions for %s took %d seconds" % (bamfile, end))
    print("\t=> gathering read counts for start positions [%s]" % bamfile)
    startcount = self.getreadscount(mpileup_start, countmapped=True)
    print("\t=> gathering read counts for end positions [%s]" % bamfile)
    endcount = self.getreadscount(mpileup_end, countmapped=True)
    # remove tmp files
    # os.remove(self.tmp_dir + 'tmp_start_coordinates_' + tid)
    # os.remove(self.tmp_dir + 'tmp_end_coordinates_' + tid)
    print('Finished linear gene expression counting for %s' % bamfile)
    return startcount, endcount

def linearsplicedreadscount(self, circ_coor, bamfile, ref, header=True):
    # Count linear spliced reads.
    # Split the circle coordinates into left (start) and right (end) positions.
    if header:
        coor = open(circ_coor, 'r').readlines()[1:]
    else:
        coor = open(circ_coor, 'r').readlines()
    start_coor = open('start_coor', 'w')
    start_coor_1 = open('start_coor_1', 'w')
    end_coor = open('end_coor', 'w')
    end_coor_1 = open('end_coor_1', 'w')
    for line in coor:
        tmp = line.split('\t')
        start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
        start_coor_1.write(tmp[0] + '\t' + str(int(tmp[1]) - 1) + '\n')
        end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
        end_coor_1.write(tmp[0] + '\t' + str(int(tmp[2]) + 1) + '\n')
    # close position files
    start_coor.close()
    start_coor_1.close()
    end_coor.close()
    end_coor_1.close()
    print('Started linear spliced read counting for %s' % bamfile)
    # mpileup gets the number of spliced reads at the circle start position
    # and the (start - 1) position.
    mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor')
    mpileup_start_1 = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor_1')
    # mpileup gets the number of spliced reads at the circle end position
    # and the (end + 1) position.
    mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')
    mpileup_end_1 = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor_1')
    # get counts
    startcount = self.submpileup(self.getreadscount(mpileup_start_1),
                                 self.getreadscount(mpileup_start))
    endcount = self.submpileup(self.getreadscount(mpileup_end),
                               self.getreadscount(mpileup_end_1), left=False)
    # remove tmp files
    os.remove('start_coor')
    os.remove('start_coor_1')
    os.remove('end_coor')
    os.remove('end_coor_1')
    return startcount, endcount

def findSiteCoverages(self, bamIn, minDepth=40):
    """
    Runs mpileup on each SUN position and finds allele fractions
    """
    bases = set(["A", "T", "G", "C", "a", "t", "g", "c"])
    resultDict = defaultdict(list)
    nVals = []
    for pos, (para, ref, alt, hg38_pos) in self.wl.items():
        posStr = "chr1:{0}-{0}".format(pos)
        # current pysam returns the pileup as one string, one row per line
        pileUp = pysam.mpileup("-q", "20", "-r", posStr, bamIn).splitlines()
        if len(pileUp) == 0:
            continue
        pileUpStr = pileUp[0].split()
        if len(pileUpStr) != 6:
            continue
        pileUpResult = Counter(x.upper() for x in pileUpStr[4] if x in bases)
        if ref not in pileUpResult or alt not in pileUpResult:
            continue
        if sum(pileUpResult.values()) < minDepth:
            continue
        frac = formatRatio(pileUpResult[alt], sum(pileUpResult.values()))
        # invert the fraction for Notch2 paralogs
        if para == "N":
            frac = 1 - frac
            nVals.append(frac)
        else:
            resultDict[para].append([pos, frac])
    norm = self.findNormalizingFactor(nVals)
    for para, result in resultDict.items():
        resultDict[para] = [[x, 2.0 * y / norm] for x, y in result]
    return resultDict

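# formatRatio is not defined in this snippet; a plausible zero-safe division
# helper matching how it is called above (the later snake_case variant,
# format_ratio, would be the same function):
def formatRatio(numerator, denominator):
    return float(numerator) / denominator if denominator else 0.0
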
def _mpileup_grab(self):
    """
    Get information on this locus from samtools mpileup.

    Assumes self.ival is a single-position interval, so the output is one
    pileup row whose tab-separated fields are returned as a list.
    :return:
    """
    return pysam.mpileup(self.inbam, "-f", self.ref_gen,
                         "-r", self.ival).rstrip('\n').split('\t')

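# For reference, the returned fields follow the six-column samtools mpileup
# layout; a sketch of unpacking them ('locus' here is an assumed instance of
# the surrounding class):
chrom, pos, ref_base, depth, read_bases, base_quals = locus._mpileup_grab()[:6]
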
def get_consensus_report(name, sam_path, ref_path, is_circular,
                         coverage_threshold=0, report_out_dir=None,
                         tmp_files_dir=None):
    basename = os.path.basename(sam_path)
    file_name, ext = os.path.splitext(basename)
    out_dir = tmp_files_dir
    keep_tmp_files = tmp_files_dir is not None
    if not keep_tmp_files:
        out_dir = tempfile.mkdtemp()
    os.makedirs(out_dir, exist_ok=True)
    tmp_sam_path = os.path.join(out_dir, file_name + '_tmp.sam')
    tmp_bam_path = os.path.join(out_dir, file_name + '_tmp.bam')
    bam_path = os.path.join(out_dir, file_name + '.bam')
    mpileup_path = bam_path + '.mpileup'
    logging.info("Splitting long alignments")
    split_aligments_in_sam(sam_path, tmp_sam_path)
    logging.info("Converting sam to bam")
    pysam.view('-S', tmp_sam_path, '-b', '-o', tmp_bam_path, catch_stdout=False)
    logging.info("Sorting bam file")
    pysam.sort(tmp_bam_path, '-o', bam_path, catch_stdout=False)
    logging.info("Creating bam index")
    pysam.index(bam_path, '-b')
    logging.info("Creating mpileup")
    mpileup_flags = ['-A', '-B', '-Q', '0']
    if is_circular:
        # use secondary alignments as well
        mpileup_flags.extend(['--ff', '0'])
    pysam.mpileup(*mpileup_flags, '-f', ref_path, bam_path,
                  '-o', mpileup_path, catch_stdout=False)
    logging.info("Generating consensus and report")
    report = process_mpileup(name, sam_path, ref_path, mpileup_path,
                             coverage_threshold, report_out_dir)
    if not keep_tmp_files:
        logging.info("Cleaning tmp files")
        shutil.rmtree(out_dir)
    return report

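# A hypothetical invocation (file names are placeholders), assuming the reads
# were aligned against a circular reference:
report = get_consensus_report('sample1', 'alignments.sam', 'reference.fa',
                              is_circular=True, coverage_threshold=10)
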
def pileup(self, chr_name, pos, ref, alt, tmp_file=".cnvpytor"):
    if not (chr_name in self.len):
        _logger.warning("Can not find chromosome '%s' in file '%s'."
                        % (chr_name, self.filename))
        return
    _logger.debug("Pileup chromosome %s from filename %s"
                  % (chr_name, self.filename))
    tmp_file += "_" + str(random.randint(0, 10 ** 10)) + "_" + chr_name
    f = open(tmp_file, "w")
    for i in pos:
        print(chr_name, i, file=f)
    f.close()
    if self.reference_filename:
        mpile = pysam.mpileup("-r", chr_name, "-l", tmp_file,
                              "--reference", self.reference_filename,
                              self.filename)
    else:
        mpile = pysam.mpileup("-r", chr_name, "-l", tmp_file, self.filename)
    os.remove(tmp_file)
    pos_seq = dict([(int(x.split("\t")[1]), x.split("\t")[4].upper())
                    for x in mpile.split("\n") if x != ""])
    nref = [0] * len(pos)
    nalt = [0] * len(pos)
    for ix in range(len(pos)):
        if pos[ix] in pos_seq:
            # reference matches appear as '.'/',' in the pileup bases
            nref[ix] = (pos_seq[pos[ix]].count(ref[ix])
                        + pos_seq[pos[ix]].count(".")
                        + pos_seq[pos[ix]].count(","))
            nalt[ix] = pos_seq[pos[ix]].count(alt[ix])
    return nref, nalt

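# Example of turning the returned counts into B-allele frequencies; 'reader'
# stands in for an assumed instance of the surrounding class, and positions
# and alleles are placeholders:
nref, nalt = reader.pileup("1", [10177, 10352], ["A", "T"], ["C", "G"])
baf = [a / (r + a) if r + a else 0.0 for r, a in zip(nref, nalt)]
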
def GetCoverages(line, normal, tumor):
    [chrom, start, end] = line[1:4]
    counts = {}
    region = "%s:%s-%s" % (chrom, str(int(start) + 1), end)
    # pass the flag and its value as separate arguments;
    # current pysam returns the pileup as one string, one row per line
    for p in pysam.mpileup("-r", region, normal).splitlines():
        seq = p.split("\t")[4].upper()
        bases = RemoveIndels(seq)
        for base in bases:
            if base not in counts:
                counts[base] = 0
            counts[base] += 1
        print(counts)
    # note: the counts are only printed; the return value is a placeholder
    return (0, 0)

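import re

# RemoveIndels is not shown in this snippet; a plausible version that strips
# mpileup markup (indels such as '+2AC'/'-3GTT', read starts '^' plus the
# following mapping-quality character, and read ends '$') so that only
# per-base calls remain:
def RemoveIndels(seq):
    # keep any real base calls that follow the inserted/deleted bases
    seq = re.sub(r'[+-](\d+)([ACGTNacgtn]+)',
                 lambda m: m.group(2)[int(m.group(1)):], seq)
    seq = re.sub(r'\^.', '', seq)
    return seq.replace('$', '')
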
def genecount(self, circ_coor, bamfile, ref):
    """
    @circ_coor: quoted string, content with format "chr1\tstart\tend"
    @bamfile: quoted string
    @ref: quoted string
    """
    # process the circ_coordinates to left circ position and right circ position
    coor = open(circ_coor, 'r').readlines()[1:]
    start_coor = open('start_coor', 'w')
    end_coor = open('end_coor', 'w')
    for line in coor:
        tmp = line.split('\t')
        start_coor.write(tmp[0] + '\t' + tmp[1] + '\n')
        end_coor.write(tmp[0] + '\t' + tmp[2] + '\n')
    # close position files
    start_coor.close()
    end_coor.close()
    print('Started linear gene expression counting for %s' % bamfile)
    # mpileup gets the read counts of the start and end positions
    mpileup_start = pysam.mpileup(bamfile, '-f', ref, '-l', 'start_coor')
    mpileup_end = pysam.mpileup(bamfile, '-f', ref, '-l', 'end_coor')
    # get counts
    startcount = self.getreadscount(mpileup_start, countmapped=True)
    endcount = self.getreadscount(mpileup_end, countmapped=True)
    # remove tmp files
    os.remove('start_coor')
    os.remove('end_coor')
    return startcount, endcount

def find_variant_depths(variants, bamfile, logfh, args):
    depths = {}
    if not variants:
        return depths
    logfh.write("Creating pileup from bamfile: {}\n".format(bamfile))
    # current pysam returns the pileup as one string, one row per line
    pysam_pileup = pysam.mpileup("-B", "-d99999", "-Q0",
                                 "-f", RESOURCE['ref_fa'], bamfile).splitlines()
    varpos = sorted(variants.keys())
    for pos in varpos:
        variant = variants[pos]
        for pileline in pysam_pileup:
            pileinfo = PileLine(pileline)
            if pileinfo.pos == pos:
                depths[pos] = get_depths_from_pileline(variant, pileinfo, pos,
                                                       logfh, args.debug)
                break
    return depths

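# PileLine is not included in this snippet; a minimal stand-in that names the
# six mpileup columns, matching how pileinfo.pos is used above (an assumption,
# not the original class):
class PileLine:
    def __init__(self, line):
        fields = line.rstrip('\n').split('\t')
        self.chrom = fields[0]
        self.pos = int(fields[1])
        self.ref = fields[2]
        self.depth = int(fields[3])
        self.bases = fields[4]
        self.quals = fields[5]
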
def _make_vcf_and_read_depths_files(self):
    if not os.path.exists(self.ref_fa + '.fai'):
        pysam.faidx(self.ref_fa)

    tmp_vcf = self.vcf_file + '.tmp'
    with open(tmp_vcf, 'w') as f:
        print(pysam.mpileup(
            '-t', 'INFO/AD,INFO/ADF,INFO/ADR',
            '-L', '99999999',
            '-A',
            '-f', self.ref_fa,
            '-u',
            '-v',
            self.bam,
        ), end='', file=f)

    got = vcfcall_ariba.vcfcall_ariba(tmp_vcf, self.outprefix,
                                      self.min_var_read_depth,
                                      self.min_second_var_read_depth,
                                      self.max_allele_freq)
    if got != 0:
        raise Error('Error parsing vcf file. Cannot continue')

    pysam.tabix_compress(self.outprefix + '.read_depths', self.read_depths_file)
    pysam.tabix_index(self.read_depths_file, seq_col=0, start_col=1, end_col=1)
    os.unlink(self.outprefix + '.read_depths')
    os.unlink(tmp_vcf)

def find_site_coverages(self, bam_in, min_depth=20):
    """
    Runs mpileup on each SUN position and finds allele fractions
    """
    bases = {"A", "T", "G", "C", "a", "t", "g", "c"}
    sun_results = defaultdict(list)
    n_vals = []
    for pos, (para, ref, alt, hg38_pos) in self.whitelist.items():
        pos_str = "chr1:{0}-{0}".format(pos)
        # current pysam returns the pileup as one string, one row per line
        pile_up = pysam.mpileup("-q", "20", "-Q", "20",
                                "-r", pos_str, bam_in).splitlines()
        if len(pile_up) == 0:
            continue
        pile_up_str = pile_up[0].split()
        if len(pile_up_str) != 6:
            continue
        pile_up_result = Counter(x.upper() for x in pile_up_str[4] if x in bases)
        if ref not in pile_up_result or alt not in pile_up_result:
            continue
        if sum(pile_up_result.values()) < min_depth:
            continue
        frac = format_ratio(pile_up_result[alt], sum(pile_up_result.values()))
        # invert the fraction for Notch2 paralogs
        if para == "N":
            frac = 1 - frac
            n_vals.append(frac)
        else:
            sun_results[para].append([hg38_pos, frac])
    # calculate the normalizing factor from the mean of the Notch2 SUNs,
    # then normalize the other SUNs
    normalizing_factor = 1.0 * sum(n_vals) / len(n_vals)
    for para, result in sun_results.items():
        sun_results[para] = [[pos, 2.0 * val / normalizing_factor]
                             for pos, val in result]
    return sun_results

def calculate_mRNA_coverage(convert, transcript_lengths, coordinates_dict,
                            sequence_dict, goi_id, dataset, base_name):
    bam_path = ("./Annotated_size_filtered_reads/%s/%s_annotated_sized.bam"
                % (dataset, base_name))
    bamfile = pysam.AlignmentFile(bam_path, "rb")
    gene = convert[goi_id]
    length = transcript_lengths[gene]

    def zero_coverage(message):
        # fill every transcript position with zero coverage
        coverage = {i: 0 for i in range(length)}
        print(message)
        return coverage

    def coverage_from_mpileup():
        # 'mpileup -a' reports every position of the region in order, so the
        # i-th output row corresponds to transcript position i
        region = goi_id + ":" + str(0) + "-" + str(length)
        rows = pysam.mpileup("-a", "-r", region, bam_path).split("\n")
        if len(rows) == 1 and rows[0] == "":
            return zero_coverage(
                "Finished filling in coverage with zeroes because no reads "
                "aligned to this gene for %s" % base_name)
        mrna_coverage = {}
        for i, row in enumerate(rows):
            if row == "":
                continue
            s = row.split("\t")
            try:
                mrna_position = int(s[1]) - 1
            except ValueError:
                print(s)
                print(base_name)
            mrna_coverage[i] = int(s[3])
        print("Finished calculating mRNA coverage for %s" % base_name,
              "at", time.ctime())
        return mrna_coverage

    if gene in special_cases[dataset]:
        print("Special")
        print(goi_id)
        print(gene)
    # special-case constructs are present only in a subset of the samples;
    # check whether this sample (first character of base_name) carries them
    if gene in ("ADAMTS13WT", "F9WT"):
        print(base_name, gene)
        expected = datasetinfo[dataset].wt
    elif gene in ("ADAMTS13P118P", "F9opt1", "F91A", "F9V107V"):
        print(base_name, gene)
        expected = datasetinfo[dataset].opt1
    elif gene == "F92A":
        print(base_name, gene)
        expected = datasetinfo[dataset].opt2
    else:
        return coverage_from_mpileup()
    if base_name[0] in expected:
        return coverage_from_mpileup()
    return zero_coverage("Finished fudging mRNA coverage for %s at %s"
                         % (base_name, time.ctime()))

def filter(self, in_mutation_file, in_bam, output):
    samfile = pysam.Samfile(in_bam, "rb")
    srcfile = open(in_mutation_file, 'r')
    hResult = open(output, 'w')
    if self.header_flag:
        header = srcfile.readline().rstrip('\n')
        newheader = "mismatch_count\tmismatch_rate"
        print(header + "\t" + newheader, file=hResult)

    for line in srcfile:
        line = line.rstrip()
        itemlist = line.split('\t')
        # input file is annovar format (not zero-based numbering)
        chr = itemlist[0]
        start = (int(itemlist[1]) - 1)
        end = int(itemlist[2])
        ref = itemlist[3]
        alt = itemlist[4]
        max_mismatch_count = 0
        max_mismatch_rate = 0
        # if (ref == '-' or alt == '-'):
        #     self.write_result_file(line, hResult, '---', '---')
        #     continue
        region = (chr + ":" + str(max(0, (start - self.search_length + 1)))
                  + "-" + str(end + self.search_length))

        # current pysam returns the pileup as one string, one row per line
        for mpileup in pysam.mpileup('-BQ', '0', '-d', '10000000',
                                     '-q', self.base_qual_thres,
                                     '-r', region, in_bam).splitlines():
            #
            # Prepare mpileup data
            #
            mp_list = mpileup.split('\t')
            mp_list_len = len(mp_list)
            coordinate = mp_list[0:3]

            # skip if depth is 0
            if mp_list[3] == '0' or (mp_list_len > 6 and mp_list[6] == '0'):
                continue
            # skip if depth < min_depth
            if int(mp_list[3]) < self.min_depth:
                continue

            # mpileup output: 4th column is the number of reads covering
            # the site, 5th column the read bases, 6th column the base
            # qualities
            depth = mp_list[3]
            read_bases = mp_list[4]
            qual_list = mp_list[5]

            indel = auto_vivification()

            #
            # Look for deletions/insertions and save the info in the
            # 'indel' dictionary.
            #
            #   ([\+\-])[0-9]+[ACGTNacgtn]+
            #
            # m.group(1): + or - (insertion/deletion)
            # m.group(2): length of the insertion/deletion
            # m.group(3): nucleotides
            #
            deleted = 0
            for m in self.target.finditer(read_bases):
                site = m.start()
                indel_type = m.group(1)
                num = m.group(2)
                bases = m.group(3)[0:int(num)]
                if bases.islower():
                    strand = ('-', '+')
                else:
                    strand = ('+', '-')

                key = '\t'.join(coordinate + [bases.upper()])
                if indel_type in indel and key in indel[indel_type]:
                    indel[indel_type][key][strand[0]] += 1
                else:
                    indel[indel_type][key][strand[0]] = 1
                    indel[indel_type][key][strand[1]] = 0

                read_bases = (read_bases[0:site - deleted]
                              + read_bases[site + int(num) + len(num) + 1 - deleted:])
                deleted += 1 + len(num) + int(num)

            #
            # Remove '^.' and '$'
            #
            read_bases = self.remove_chr.sub('', read_bases)
            read_bases = read_bases.replace('$', '')

            #
            # Error check
            #
            if len(read_bases) != len(qual_list):
                logging.error("mpileup data is not good: {0}, {1}".format(
                    mpileup, read_bases))
                exit(1)

            for indel_type in ('+', '-'):
                if indel_type not in indel:
                    continue
                for key in indel[indel_type].keys():
                    start_pos = mp_list[1]
                    mismatch_count = (indel[indel_type][key]['-']
                                      + indel[indel_type][key]['+'])
                    mismatch_rate = float(mismatch_count) / float(depth)
                    if mismatch_rate >= max_mismatch_rate:
                        start_pos = int(start_pos)
                        end_pos = int(start_pos)
                        if indel_type == '-':
                            start_pos = int(start_pos) + 1
                            end_pos = int(start_pos) + len((key.split('\t'))[3]) - 1
                        # keep the indel if it lies within 'neighbor' bases
                        # of either end of the candidate mutation
                        if ((start_pos - self.neighbor <= int(start) + 1
                             and int(start) + 1 <= self.neighbor + end_pos)
                                or (start_pos - self.neighbor <= int(end)
                                    and int(end) <= self.neighbor + end_pos)):
                            max_mismatch_count = mismatch_count
                            max_mismatch_rate = mismatch_rate

        if (max_mismatch_count <= self.min_mismatch
                or max_mismatch_rate <= self.af_thres):
            self.write_result_file(line, hResult,
                                   max_mismatch_count, max_mismatch_rate)

    hResult.close()
    srcfile.close()

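# Neither auto_vivification nor self.target / self.remove_chr is defined in
# this snippet. auto_vivification is usually the standard nested-dict recipe,
# and the comment above spells out the indel pattern, so plausible
# definitions are:
import re

class auto_vivification(dict):
    """dict that creates missing nested entries on first access."""
    def __getitem__(self, key):
        try:
            return dict.__getitem__(self, key)
        except KeyError:
            value = self[key] = type(self)()
            return value

target = re.compile(r'([\+\-])([0-9]+)([ACGTNacgtn]+)')
remove_chr = re.compile(r'\^.')
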
print("########### Using a callback object ###### ")

class Counter:
    mCounts = 0

    def __call__(self, alignment):
        self.mCounts += 1

c = Counter()
# callback-style fetch comes from the old (pre-0.8) pysam API
samfile.fetch(region="chr1:10-200", callback=c)
print("counts=", c.mCounts)

print("########### Calling a samtools command line function ############")

for p in pysam.mpileup("-c", "ex1.bam"):
    print(str(p))

print(pysam.mpileup.getMessages())

print("########### Investigating headers #######################")

# playing around with headers
samfile = pysam.Samfile("ex3.sam", "r")
print(samfile.references)
print(samfile.lengths)
print(samfile.text)
print(samfile.header)
header = samfile.header
samfile.close()

def get_coverage(bams, reference, chromosome, merged, length, depth, proportion):
    """
    Uses pysam tools to create mpileup, and determines the total coverage
    for either pairs of merged files, or individual merged files.
    """
    def depths_by_position(bam):
        # map position -> depth for the chromosome of interest
        # (current pysam returns the pileup as one string, one row per line)
        result = {}
        for row in pysam.mpileup('-D', bam).splitlines():
            fields = row.split('\t')
            if fields[0] == chromosome:
                result[int(fields[1])] = int(fields[3])
        return result

    removed_bams = []
    if merged:
        # Must be using pairs of matched files
        for b1, b2 in bams:
            depths_one = depths_by_position(b1)
            depths_two = depths_by_position(b2)
            # count positions whose combined depth meets the threshold
            total_bases = sum(
                1 for p in set(depths_one) | set(depths_two)
                if depths_one.get(p, 0) + depths_two.get(p, 0) >= depth)
            if float(total_bases) / length < proportion:
                print(b1, b2)
                removed_bams.extend([b1, b2])
    else:
        for b in bams:
            depths = depths_by_position(b)
            total_bases = sum(1 for d in depths.values() if d >= depth)
            if float(total_bases) / length < proportion:
                print(b)
                removed_bams.append(b)
    remove_files(removed_bams)

import pysam


def createAncientReads(individuals, snpFile, bamFilePath):
    sFile = open(snpFile, 'r')
    outFile = open("AncientReads.output", 'w')  # will OVERWRITE an existing file
    header = 'Chrom\tPos'
    # build the header columns for every individual in the list
    for i in range(len(individuals)):
        header = f"{header}\t{individuals[i]}_der\t{individuals[i]}_anc\t{individuals[i]}_other"
    outFile.write(f"{header}\n")
    for line in sFile:
        # should be in "rsID | chrom | pos | physPos | refAllele | newAllele" format
        lineList = line.split()
        # start the line we'll write to the AncientReads.output file
        writtenLine = f"{lineList[1]}\t{lineList[3]}"
        for i in range(len(individuals)):
            # run mpileup on each individual, in the form
            # "samtools mpileup -r chromosome:pos-pos bamFile".
            # Even if you only want one position, use pos-pos; otherwise
            # you'll get reads from that position to the end of the
            # chromosome.
            chrom = {'23': 'X', '24': 'Y'}.get(lineList[1], lineList[1])
            reads = pysam.mpileup(
                '-r', f"{chrom}:{lineList[3]}-{lineList[3]}",
                f"{bamFilePath}{individuals[i]}.sorted.bam")
            # list of strings ["chrom", "pos", "N", "total reads", "reads", "read quality"]
            reads = reads.split()
            if not reads:  # no read was found
                ancReads = 0
                derReads = 0
                otherReads = 0
            else:
                totalReads = int(reads[3])
                bases = reads[4].upper()  # reverse-strand calls are lowercase
                ancReads = bases.count(lineList[4].upper())
                derReads = bases.count(lineList[5].upper())
                otherReads = totalReads - ancReads - derReads
            # append the read counts to the line
            writtenLine = f"{writtenLine}\t{derReads}\t{ancReads}\t{otherReads}"
        outFile.write(f"{writtenLine}\n")
    sFile.close()  # for good practice
    outFile.close()

sitedepths = []
totalevents = []

## Iterate through every indel
for indel in indels:
    ## Set the properties of the indel
    CHROM = indel[1].split(":")[0]
    # Pysam coordinates are ZERO-based, so we MUST subtract 1
    POS = int(indel[1].split(":")[1].split("-")[0]) - 1
    logging.debug(str(CHROM) + ":" + str(POS))

    # Set the window size and reference genome
    window = 10
    genome = "/home/chris_w/resources/b37/human_g1k_v37.fasta"

    # Produce a pileup of the bam over the specified window
    pile = pysam.mpileup(
        "-f", genome,
        "-r", CHROM + ":" + str(POS - window) + "-" + str(POS + window),
        args.n1)

    # Count how many SNVs and indels are present.
    # The indel count is unreliable, e.g. a 3bp insertion could be "+3GAC",
    # which is 4 characters. However, this inaccuracy is acceptable, as
    # indels are arguably worse than SNVs when considering the reliability
    # of a region.
    # We can ignore "^" and "$" characters, as they denote the start/end of
    # reads. Note that "^" is ALWAYS followed by another character, so we
    # remove that, too.
    siteevents = 0
    sitedepth = 0
    sites = 0
    for site in pile.splitlines():
        bases = site.split("\t")
        events = (len(bases[4]) - bases[4].count(".") - bases[4].count(",")
                  - 2 * bases[4].count("^") - bases[4].count("$"))
        siteevents = siteevents + events
        sitedepth = sitedepth + int(bases[3])

print "########### Using a callback object ###### " class Counter: mCounts = 0 def __call__(self, alignment): self.mCounts += 1 c = Counter() samfile.fetch( region = "chr1:10-200", callback = c ) print "counts=", c.mCounts print "########### Calling a samtools command line function ############" for p in pysam.mpileup( "-c", "ex1.bam" ): print str(p) print pysam.mpileup.getMessages() print "########### Investigating headers #######################" # playing arount with headers samfile = pysam.Samfile( "ex3.sam", "r" ) print samfile.references print samfile.lengths print samfile.text print samfile.header header = samfile.header samfile.close()
import os
import sys
from collections import OrderedDict

import pysam


def main():
    args = parse_command_line_arguments()
    # defaults and naming
    if not args.max_length:
        args.max_length = 1000
    if not args.max_edit_dist and not args.max_derived:
        args.max_edit_dist = 1000
    if args.max_edit_dist:
        outbam = '%slen%dNM%d.bam' % (args.bam_file[:-3], args.max_length,
                                      args.max_edit_dist)
    elif args.max_derived:
        outbam = '%slen%dder%d.bam' % (args.bam_file[:-3], args.max_length,
                                       args.max_derived)
    with pysam.AlignmentFile(args.bam_file, "rb") as samfile:
        print('%d reads before filtering' % samfile.count())
        i = 0
        with pysam.AlignmentFile(outbam, "wb", template=samfile) as tmpfile:
            for read in samfile.fetch():
                edit_dist = read.get_tag('NM')
                read_length = read.query_length
                if read_length <= args.max_length:
                    if args.max_edit_dist:
                        if edit_dist <= args.max_edit_dist:
                            tmpfile.write(read)
                            i += 1
                    elif args.max_derived:
                        if 100 * edit_dist / read_length < args.max_derived:
                            tmpfile.write(read)
                            i += 1
        print('%d reads after filtering' % i)

    if not args.filter_only:
        # pileup generation
        pileup = pysam.mpileup('-f', args.ref_fasta, outbam)
        os.remove(outbam)
        # pileup parsing
        subst = {'AA': 0, 'TT': 0, 'CC': 0, 'GG': 0,
                 'AC': 0, 'AT': 0, 'AG': 0,
                 'CA': 0, 'CT': 0, 'CG': 0,
                 'TA': 0, 'TC': 0, 'TG': 0,
                 'GA': 0, 'GC': 0, 'GT': 0,
                 'ins': 0, 'del': 0, 'un': 0, 'un_ref': 0}
        for line in pileup.splitlines():
            toks = line.strip('\n').split('\t')
            ref = toks[2].upper()
            # parseString is assumed to return a dict of
            # substitution-type -> count for one pileup row
            alt = parseString(ref, toks[4])
            for alt_type, count in alt.items():
                if ref in list('ACTG'):
                    try:
                        subst[alt_type] += count
                    except KeyError:
                        print(alt, toks[1])
                        sys.exit()
                else:
                    subst['un_ref'] += count
        # output
        keyorder = ('AA\tTT\tCC\tGG\tAC\tAT\tAG\tCA\tCT\tCG'
                    '\tTA\tTC\tTG\tGA\tGC\tGT\tins\tdel\tun\tun_ref')
        print(keyorder)
        subst = OrderedDict(sorted(subst.items(),
                                   key=lambda i: keyorder.index(i[0])))
        print('\t'.join([str(x) for x in subst.values()]))

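# parseString is not defined in this snippet; a rough stand-in, under the
# assumption that it classifies one row of pileup base calls into the
# 'REF then ALT' keys used above ('AC' meaning ref A read as C, plus 'ins',
# 'del' and 'un' buckets):
import re
from collections import defaultdict


def parseString(ref, bases):
    counts = defaultdict(int)

    def _indel(m):
        counts['ins' if m.group(1) == '+' else 'del'] += 1
        # keep base calls that follow the inserted/deleted sequence
        return m.group(3)[int(m.group(2)):]

    bases = re.sub(r'([+-])([0-9]+)([ACGTNacgtn]+)', _indel, bases)
    bases = re.sub(r'\^.', '', bases).replace('$', '')
    for b in bases:
        if b in '.,':
            counts[ref + ref] += 1        # matches the reference
        elif b.upper() in 'ACGT':
            counts[ref + b.upper()] += 1  # substitution
        elif b != '*':
            counts['un'] += 1             # ambiguous call
    return dict(counts)
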
import os
import warnings

import pandas as pd
import pybedtools
import pysam


def calculateCoverage(contig_file, contig, bam_file, bamnam, bedtool, bednam,
                      outdir, typ='samtools'):
    out_tab = "%s/%s_coverage_%s_%s.tsv" % (outdir, contig, bamnam, typ)
    # index the bam file (if needed)
    if not os.path.exists("%s.bai" % bam_file):
        pysam.index(bam_file)
    if typ == 'bedtools':
        with warnings.catch_warnings():
            # this raises a warning in Python 3.8 that is not relevant,
            # so ignore it
            # https://github.com/benoitc/gunicorn/issues/2091
            warnings.filterwarnings("ignore")
            bam = pybedtools.BedTool(bam_file)
            # d=True - calculate the depth of coverage for every position
            # in the bam file
            cov = bedtool.coverage(bam, d=True)
            cov_df = cov.to_dataframe(
                names=['chromosome', 'start', 'end', 'pos', 'coverage'])
            cov_df['bam'] = bamnam
    elif typ == 'samtools':
        # -A - keep anomalous read pairs
        # -B - don't calculate per-base alignment quality
        # -C 0 - don't adjust based on mapping quality
        # -d 0 - don't limit coverage depth
        # -Q 0 - minimum quality score of 0
        # -aa - output every position, even if coverage is 0
        cov = pysam.mpileup(bam_file, "-f", contig_file, "-A", "-B",
                            "-C", "0", "-d", "0", "-Q", "0", "-aa")
        # convert the pileup text into a dataframe; the six mpileup columns
        # are chromosome, position, reference base, depth, read bases and
        # base qualities
        cov = [x.split("\t") for x in cov.split("\n")]
        cov_df = pd.DataFrame(cov, columns=[
            'chromosome', 'pos', 'reference_base', 'coverage',
            'read_bases', 'base_qualities'])
        cov_df['bam'] = bamnam
        # the final position is always NA for some reason
        cov_df['coverage'] = cov_df['coverage'].fillna(0)
        cov_df = cov_df[cov_df['pos'].notnull()]
        # convert to int
        cov_df['coverage'] = cov_df['coverage'].astype(int)
        cov_df['pos'] = cov_df['pos'].astype(int)
        cov_df['reference_base'] = cov_df['reference_base'].str.upper()
    # save the coverage table
    cov_df = cov_df[cov_df['chromosome'] == contig]
    cov_df.to_csv(out_tab, sep="\t", index=None)
    return (cov_df)

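# A hypothetical call (file names are placeholders): per-position coverage of
# 'contig_1' computed via the samtools/mpileup path above; the bedtool
# argument is assumed to be a pybedtools interval set and is only used by the
# 'bedtools' path:
bedtool = pybedtools.BedTool('regions.bed')
cov_df = calculateCoverage('contigs.fa', 'contig_1', 'sample.bam', 'sample',
                           bedtool, 'regions', 'outdir', typ='samtools')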