def makemut(args, hc, avoid, alignopts):
    """Spike one haplotype cluster of SNVs into reads from the input BAM.

    Args:
        args: option namespace. Attributes read here: seed, bamFileName,
            refFasta, ignoreref, cnvfile, outBamFile, tmpdir, mindepth,
            mutfrac, minmutreads, nomut, force, aligner, single, picardjar,
            insane, coverdiff. It is also handed to mutation.mutate().
        hc: sequence of site dicts with keys 'chrom', 'start', 'end', 'vaf'
            and 'altbase'; every site must be on the same chromosome.
        avoid: forwarded unchanged to mutation.mutate().
        alignopts: forwarded unchanged to aligners.remap_bam().

    Returns:
        A list holding the path of the remapped temporary BAM when the
        spike-in passes the coverage check (or args.force is set); an empty
        list when nothing was written because mutation.mutate() reported an
        existing SNP and args.force is unset; None when the site is skipped
        or any exception occurs (the traceback is printed, not raised).

    Every file handle opened here is closed on all exit paths, and the
    temporary BAM (plus its .bai) is deleted unless it is being returned.
    """
    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' +
                          str(site['end']) + '_' + str(site['vaf']) + '_' +
                          str(site['altbase']))

    # Bind everything the cleanup code reads before entering the try block, so
    # an early failure cannot surface as an UnboundLocalError in the handlers.
    tmpbams = []
    tmpoutbamname = None
    bamfile = bammate = reffile = cnv = log = outbam_muts = None

    try:
        if args.seed is not None:
            random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        # second handle on the same BAM: use for mates to avoid iterator problems
        bammate = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)

        chrom = None
        vaf = None

        mutpos_list = []
        altbase_list = []

        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            elif chrom != site['chrom']:
                # explicit raise instead of assert: asserts vanish under -O
                raise ValueError(
                    "haplotype clusters cannot span multiple chromosomes!")

            if vaf is None:
                vaf = site['vaf']
            elif vaf != site['vaf']:
                logger.warning(
                    "multiple VAFs for single haplotype, using first encountered VAF: %f"
                    % vaf)

            # position of mutation in genome
            mutpos = int(random.uniform(site['start'], site['end'] + 1))
            mutpos_list.append(mutpos)
            # FIXME (marker carried over from the original; its subject is not stated)
            altbase_list.append(site['altbase'])

        mutbase_list = []
        mutstr_list = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom, mutpos - 1, mutpos)
            altbase = altbase_list[n]

            if altbase == refbase.upper() and not args.ignoreref:
                logger.warning(
                    "%s specified ALT base matches reference, skipping mutation"
                    % mutid_list[n])
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)
            except ValueError as e:
                logger.warning(mutid_list[n] + " " + ' '.join(
                    ("skipped site:", chrom, str(hc[n]['start']),
                     str(hc[n]['end']), "due to N base:", str(e), "\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        region_lo = min(mutpos_list)
        region_hi = max(mutpos_list)

        hapstr = "_".join(('haplo', chrom, str(region_lo), str(region_hi)))
        log = open(
            'addsnv_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(
            uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, region_lo, region_hi + 1,
            mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True,
            mutbase_list=mutbase_list, reffile=reffile)

        if mutfail:
            return None

        # pick reads to change: only reads whose sequence would actually differ
        readlist = [extqname for extqname, read in outreads.items()
                    if read.seq != mutreads[extqname]]

        logger.info("%s len(readlist): %s" % (hapstr, str(len(readlist))))
        # sort first so the shuffle starts from an order independent of dict order
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s too few reads in region (%s) skipping..." %
                           (hapstr, str(len(readlist))))
            return None

        if vaf is None:
            # default minor allele freq if not otherwise specified
            vaf = float(args.mutfrac)

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, region_lo, region_hi + 1):
                    # expect chrom,start,end,CN
                    cn = float(cnregion.strip().split()[3])
                    logger.info(hapstr + "\t" + ' '.join(
                        ("copy number in snp region:", chrom, str(region_lo),
                         str(region_hi), "=", str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (hapstr, vaf))
        else:
            logger.info("%s selected VAF: %f" % (hapstr, vaf))

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads." % (hapstr, lastread))
            else:
                logger.warning(
                    "%s dropped site with fewer reads than --minmutreads" %
                    hapstr)
                return None

        # Group the extended names ("name,pos,pairend") by original read name.
        readtrack = dd(list)
        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            # Checked per read name, so all instances of a name stay together;
            # the selection can therefore slightly exceed lastread.
            if usedreads >= lastread:
                break

        readlist = newreadlist
        logger.info("%s picked: %d" % (hapstr, len(readlist)))

        # set: constant-time membership test inside the per-read loop below
        picked = set(readlist)
        write_reads = not hasSNP or args.force

        wrote = 0
        nmut = 0
        mut_out = {}

        # change reads from .bam to mutated sequences
        for extqname, read in outreads.items():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in picked:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if write_reads:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (hapstr, wrote, nmut))

        if write_reads:
            # The writer must be closed before the aligner rewrites the file.
            outbam_muts.close()
            outbam_muts = None
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta,
                               alignopts, mutid=hapstr,
                               paired=(not args.single),
                               picardjar=args.picardjar, insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom,
                                        region_lo - coverwindow,
                                        region_hi + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         region_lo - coverwindow,
                                         region_hi + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))

            logger.info("%s avgincover: %f, avgoutcover: %f" %
                        (hapstr, avgincover, avgoutcover))

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n, site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(
                        site['end']) + " (VAF=" + str(vaf) + ")"
                    # input coverage first, then output coverage
                    log.write("\t".join(
                        ("snv", snvstr, str(mutpos_list[n]), mutstr_list[n],
                         str(avgincover), str(avgoutcover),
                         str(spikein_snvfrac), str(maxfrac))) + "\n")
            else:
                logger.warning("%s dropped for outcover/incover < %s" %
                               (hapstr, str(args.coverdiff)))
                return None

        return tmpbams

    except Exception:
        sys.stderr.write("*" * 60 + "\nERROR\t" + now() +
                         "\tencountered error in mutation spikein: " +
                         str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        # Nothing is handed back, so the cleanup below must delete the temp BAM.
        del tmpbams[:]
        return None

    finally:
        # Close whatever is still open. Close errors are reported rather than
        # raised so they cannot replace the value being returned.
        for handle in (outbam_muts, bamfile, bammate, reffile, cnv, log):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    traceback.print_exc(file=sys.stdout)

        # A temp BAM that is not being returned has no other owner: delete it.
        if tmpoutbamname is not None and tmpoutbamname not in tmpbams:
            for leftover in (tmpoutbamname, tmpoutbamname + '.bai'):
                if os.path.exists(leftover):
                    os.remove(leftover)
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    """Spike a single insertion or deletion into reads from the input BAM.

    If ins is a sequence it is inserted at start; if ins is None the
    reference from start to end is deleted.

    Args:
        args: option namespace. Attributes read here: seed, bamFileName,
            refFasta, cnvfile, outBamFile, tmpdir, mindepth, mutfrac,
            minmutreads, nomut, force, aligner, single, picardjar, coverdiff.
            It is also handed to mutation.mutate().
        chrom: chromosome name (string; concatenated into ids and file names).
        start: start coordinate of the indel.
        end: end coordinate of the indel.
        vaf: target variant allele fraction, or None to use args.mutfrac.
        ins: inserted sequence, or None for a deletion.
        avoid: forwarded unchanged to mutation.mutate().
        alignopts: forwarded unchanged to aligners.remap_bam().

    Returns:
        A sorted list holding the path of the remapped temporary BAM when the
        spike-in passes the coverage check (or args.force is set); an empty
        list when nothing was written because mutation.mutate() reported an
        existing SNP and args.force is unset; None when the site is skipped
        or any exception occurs (the traceback is printed, not raised).

    Every file handle opened here is closed on all exit paths, and the
    temporary BAM (plus its .bai) is deleted unless it is being returned.
    """
    if args.seed is not None:
        random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    # Bind everything the cleanup code reads before entering the try block, so
    # an early failure cannot surface as an UnboundLocalError in the handlers.
    tmpbams = []
    tmpoutbamname = None
    bamfile = bammate = reffile = cnv = log = outbam_muts = None

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        # second handle on the same BAM: use for mates to avoid iterator problems
        bammate = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)

        is_insertion = ins is not None
        is_deletion = ins is None

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end - start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open('addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
                   os.path.basename(args.outBamFile) + "." +
                   "_".join((chrom, str(start), str(end))) + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(uuid4()) + ".bam"
        # Messages are written with sys.stdout.write so the block parses on
        # Python 2 and 3; the doubled space matches the old print-statement output.
        sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                         "\tcreating tmp bam:  " + tmpoutbamname + "\n")  # DEBUG
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, mutpos, mutpos + del_ln + 1,
            mutpos_list, avoid=avoid, mutid_list=[mutid],
            is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins,
            reffile=reffile, indel_start=start, indel_end=end)

        if mutfail:
            return None

        # pick reads to change: only reads whose sequence would actually differ
        readlist = [extqname for extqname, read in outreads.items()
                    if read.seq != mutreads[extqname]]

        sys.stdout.write("len(readlist): " + str(len(readlist)) + "\n")
        # sort first so the shuffle starts from an order independent of dict order
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tskipped, too few reads in region: " +
                             str(len(readlist)) + "\n")
            return None

        if vaf is None:
            # default minor allele freq if not otherwise specified
            vaf = float(args.mutfrac)

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    # expect chrom,start,end,CN
                    cn = float(cnregion.strip().split()[3])
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(
                        ("copy number in snp region:", chrom, str(start),
                         str(end), "=", str(cn))) + "\n")
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                                     "\tadjusted VAF: " + str(vaf) + "\n")
        else:
            sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                             "\tselected VAF: " + str(vaf) + "\n")

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + mutid + "\tforced " +
                                 str(lastread) + " reads.\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + mutid +
                                 "\tdropped site with fewer reads than --minmutreads" + "\n")
                return None

        # Group the extended names ("name,pos,pairend") by original read name.
        readtrack = dd(list)
        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            # Checked per read name, so all instances of a name stay together;
            # the selection can therefore slightly exceed lastread.
            if usedreads >= lastread:
                break

        readlist = newreadlist
        sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tpicked: " +
                         str(len(readlist)) + " reads" + "\n")

        # set: constant-time membership test inside the per-read loop below
        picked = set(readlist)
        write_reads = not hasSNP or args.force

        wrote = 0
        nmut = 0
        mut_out = {}

        # change reads from .bam to mutated sequences
        for extqname, read in outreads.items():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in picked:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if write_reads:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\twrote: " + str(wrote) +
                         " reads, mutated: " + str(nmut) + " reads" + "\n")

        if write_reads:
            # The writer must be closed before the aligner rewrites the file.
            outbam_muts.close()
            outbam_muts = None
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta,
                               alignopts, mutid=mutid,
                               paired=(not args.single),
                               picardjar=args.picardjar)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                        mutpos + del_ln + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         mutpos - coverwindow,
                                         mutpos + del_ln + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))
            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                log.write("\t".join(
                    ("indel", indelstr, str(mutpos), mutstr, str(avgincover),
                     str(avgoutcover), str(spikein_frac), str(maxfrac))) + "\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + mutid +
                                 "\tdropped for outcover/incover < " +
                                 str(args.coverdiff) + "\n")
                return None

        return sorted(tmpbams)

    except Exception:
        sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " +
                         mutid + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        # Nothing is handed back, so the cleanup below must delete the temp BAM.
        del tmpbams[:]
        return None

    finally:
        # Close whatever is still open. Close errors are reported rather than
        # raised so they cannot replace the value being returned.
        for handle in (outbam_muts, bamfile, bammate, reffile, cnv, log):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    traceback.print_exc(file=sys.stdout)

        # A temp BAM that is not being returned has no other owner: delete it.
        if tmpoutbamname is not None and tmpoutbamname not in tmpbams:
            for leftover in (tmpoutbamname, tmpoutbamname + '.bai'):
                if os.path.exists(leftover):
                    os.remove(leftover)
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    """Spike a single insertion or deletion into reads from the input BAM.

    If ins is a sequence it is inserted at start; if ins is None the
    reference from start to end is deleted.

    Args:
        args: option namespace. Attributes read here: seed, bamFileName,
            refFasta, cnvfile, outBamFile, tmpdir, mindepth, mutfrac,
            minmutreads, nomut, force, aligner, single, picardjar, insane,
            coverdiff. It is also handed to mutation.mutate().
        chrom: chromosome name (string; concatenated into ids and file names).
        start: start coordinate of the indel.
        end: end coordinate of the indel.
        vaf: target variant allele fraction, or None to use args.mutfrac.
        ins: inserted sequence, or None for a deletion.
        avoid: forwarded unchanged to mutation.mutate().
        alignopts: forwarded unchanged to aligners.remap_bam().

    Returns:
        A sorted list holding the path of the remapped temporary BAM when the
        spike-in passes the coverage check (or args.force is set); an empty
        list when nothing was written because mutation.mutate() reported an
        existing SNP and args.force is unset; None when the site is skipped
        or any exception occurs (the traceback is printed, not raised).

    Every file handle opened here is closed on all exit paths, and the
    temporary BAM (plus its .bai) is deleted unless it is being returned.
    """
    if args.seed is not None:
        random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    # Bind everything the cleanup code reads before entering the try block, so
    # an early failure cannot surface as an UnboundLocalError in the handlers.
    tmpbams = []
    tmpoutbamname = None
    bamfile = bammate = reffile = cnv = log = outbam_muts = None

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        # second handle on the same BAM: use for mates to avoid iterator problems
        bammate = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)

        is_insertion = ins is not None
        is_deletion = ins is None

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end - start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open(
            'addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + "." + "_".join(
                (chrom, str(start), str(end))) + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(
            uuid4()) + ".bam"
        # Messages are written with sys.stdout.write so the block parses on
        # Python 2 and 3; the doubled space matches the old print-statement output.
        sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                         "\tcreating tmp bam:  " + tmpoutbamname + "\n")  # DEBUG
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, mutpos, mutpos + del_ln + 1,
            mutpos_list, avoid=avoid, mutid_list=[mutid],
            is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins,
            reffile=reffile, indel_start=start, indel_end=end)

        if mutfail:
            return None

        # pick reads to change: only reads whose sequence would actually differ
        readlist = [extqname for extqname, read in outreads.items()
                    if read.seq != mutreads[extqname]]

        sys.stdout.write("len(readlist): " + str(len(readlist)) + "\n")
        # sort first so the shuffle starts from an order independent of dict order
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tskipped, too few reads in region: " +
                             str(len(readlist)) + "\n")
            return None

        if vaf is None:
            # default minor allele freq if not otherwise specified
            vaf = float(args.mutfrac)

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    # expect chrom,start,end,CN
                    cn = float(cnregion.strip().split()[3])
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(
                        ("copy number in snp region:", chrom, str(start),
                         str(end), "=", str(cn))) + "\n")
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                                     "\tadjusted VAF: " + str(vaf) + "\n")
        else:
            sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                             "\tselected VAF: " + str(vaf) + "\n")

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + mutid + "\tforced " +
                                 str(lastread) + " reads.\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + mutid +
                                 "\tdropped site with fewer reads than --minmutreads" + "\n")
                return None

        # Group the extended names ("name,pos,pairend") by original read name.
        readtrack = dd(list)
        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            # Checked per read name, so all instances of a name stay together;
            # the selection can therefore slightly exceed lastread.
            if usedreads >= lastread:
                break

        readlist = newreadlist
        sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tpicked: " +
                         str(len(readlist)) + " reads" + "\n")

        # set: constant-time membership test inside the per-read loop below
        picked = set(readlist)
        write_reads = not hasSNP or args.force

        wrote = 0
        nmut = 0
        mut_out = {}

        # change reads from .bam to mutated sequences
        for extqname, read in outreads.items():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in picked:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if write_reads:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\twrote: " + str(wrote) +
                         " reads, mutated: " + str(nmut) + " reads" + "\n")

        if write_reads:
            # The writer must be closed before the aligner rewrites the file.
            outbam_muts.close()
            outbam_muts = None
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta,
                               alignopts, mutid=mutid,
                               paired=(not args.single),
                               picardjar=args.picardjar, insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                        mutpos + del_ln + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         mutpos - coverwindow,
                                         mutpos + del_ln + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))
            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                log.write("\t".join(
                    ("indel", indelstr, str(mutpos), mutstr, str(avgincover),
                     str(avgoutcover), str(spikein_frac), str(maxfrac))) + "\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + mutid +
                                 "\tdropped for outcover/incover < " +
                                 str(args.coverdiff) + "\n")
                return None

        return sorted(tmpbams)

    except Exception:
        sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " +
                         mutid + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        # Nothing is handed back, so the cleanup below must delete the temp BAM.
        del tmpbams[:]
        return None

    finally:
        # Close whatever is still open. Close errors are reported rather than
        # raised so they cannot replace the value being returned.
        for handle in (outbam_muts, bamfile, bammate, reffile, cnv, log):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    traceback.print_exc(file=sys.stdout)

        # A temp BAM that is not being returned has no other owner: delete it.
        if tmpoutbamname is not None and tmpoutbamname not in tmpbams:
            for leftover in (tmpoutbamname, tmpoutbamname + '.bai'):
                if os.path.exists(leftover):
                    os.remove(leftover)
def makemut(args, hc, avoid, alignopts):
    """Spike one haplotype cluster of SNVs into reads from the input BAM.

    Args:
        args: option namespace. Attributes read here: seed, bamFileName,
            refFasta, ignoreref, cnvfile, outBamFile, tmpdir, mindepth,
            mutfrac, minmutreads, nomut, force, aligner, single, picardjar,
            coverdiff. It is also handed to mutation.mutate().
        hc: sequence of site dicts with keys 'chrom', 'start', 'end', 'vaf'
            and 'altbase'; every site must be on the same chromosome.
        avoid: forwarded unchanged to mutation.mutate().
        alignopts: forwarded unchanged to aligners.remap_bam().

    Returns:
        A list holding the path of the remapped temporary BAM when the
        spike-in passes the coverage check (or args.force is set); an empty
        list when nothing was written because mutation.mutate() reported an
        existing SNP and args.force is unset; None when the site is skipped
        or any exception occurs (the traceback is printed, not raised).

    Every file handle opened here is closed on all exit paths, and the
    temporary BAM (plus its .bai) is deleted unless it is being returned.
    """
    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' +
                          str(site['end']) + '_' + str(site['vaf']) + '_' +
                          str(site['altbase']))

    # Bind everything the cleanup code reads before entering the try block, so
    # an early failure cannot surface as an UnboundLocalError in the handlers.
    tmpbams = []
    tmpoutbamname = None
    bamfile = bammate = reffile = cnv = log = outbam_muts = None

    try:
        if args.seed is not None:
            random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        # second handle on the same BAM: use for mates to avoid iterator problems
        bammate = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)

        chrom = None
        vaf = None

        mutpos_list = []
        altbase_list = []

        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            elif chrom != site['chrom']:
                # explicit raise instead of assert: asserts vanish under -O
                raise ValueError(
                    "haplotype clusters cannot span multiple chromosomes!")

            if vaf is None:
                vaf = site['vaf']
            elif vaf != site['vaf']:
                sys.stderr.write(
                    "WARN\t" + now() +
                    "\tmultiple VAFs for single haplotype, using first encountered VAF: " +
                    str(vaf) + "\n")

            # position of mutation in genome
            mutpos = int(random.uniform(site['start'], site['end'] + 1))
            mutpos_list.append(mutpos)
            # FIXME (marker carried over from the original; its subject is not stated)
            altbase_list.append(site['altbase'])

        mutbase_list = []
        mutstr_list = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom, mutpos - 1, mutpos)
            altbase = altbase_list[n]

            if altbase == refbase.upper() and not args.ignoreref:
                sys.stderr.write(
                    "WARN\t" + now() + "\t" + mutid_list[n] +
                    "\tspecified ALT base matches reference, skipping mutation\n")
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)
            except ValueError as e:
                sys.stderr.write("WARN\t" + now() + "\t" + mutid_list[n] + "\t" + ' '.join(
                    ("skipped site:", chrom, str(hc[n]['start']),
                     str(hc[n]['end']), "due to N base:", str(e), "\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        region_lo = min(mutpos_list)
        region_hi = max(mutpos_list)

        hapstr = "_".join(('haplo', chrom, str(region_lo), str(region_hi)))
        log = open('addsnv_logs_' + os.path.basename(args.outBamFile) + '/' +
                   os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(uuid4()) + ".bam"
        # Messages are written with sys.stdout.write so the block parses on
        # Python 2 and 3. The doubled spaces and doubled newlines below
        # reproduce the old print-statement output byte for byte.
        sys.stdout.write("INFO\t" + now() + "\t" + hapstr +
                         "\tcreating tmp bam:  " + tmpoutbamname + "\n")
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args, log, bamfile, bammate, chrom, region_lo, region_hi + 1,
            mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True,
            mutbase_list=mutbase_list, reffile=reffile)

        if mutfail:
            return None

        # pick reads to change: only reads whose sequence would actually differ
        readlist = [extqname for extqname, read in outreads.items()
                    if read.seq != mutreads[extqname]]

        sys.stdout.write("INFO\t" + now() + "\t" + hapstr + "\tlen(readlist): " +
                         str(len(readlist)) + "\n")
        # sort first so the shuffle starts from an order independent of dict order
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            sys.stdout.write("WARN\t" + now() + "\t" + hapstr +
                             "\ttoo few reads in region (" + str(len(readlist)) +
                             ") skipping..." + "\n")
            return None

        if vaf is None:
            # default minor allele freq if not otherwise specified
            vaf = float(args.mutfrac)

        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, region_lo, region_hi + 1):
                    # expect chrom,start,end,CN
                    cn = float(cnregion.strip().split()[3])
                    sys.stdout.write("INFO\t" + now() + "\t" + hapstr + "\t" + ' '.join(
                        ("copy number in snp region:", chrom, str(region_lo),
                         str(region_hi), "=", str(cn))) + "\n\n")
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    sys.stdout.write("adjusted VAF: " + str(vaf) + "\n\n")
        else:
            sys.stdout.write("INFO\t" + now() + "\t" + hapstr +
                             "\tselected VAF: " + str(vaf) + "\n\n")

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + hapstr + "\tforced " +
                                 str(lastread) + " reads.\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + hapstr +
                                 "\tdropped site with fewer reads than --minmutreads" + "\n")
                return None

        # Group the extended names ("name,pos,pairend") by original read name.
        readtrack = dd(list)
        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1
            # Checked per read name, so all instances of a name stay together;
            # the selection can therefore slightly exceed lastread.
            if usedreads >= lastread:
                break

        readlist = newreadlist
        sys.stdout.write("INFO\t" + now() + "\t" + hapstr + "\tpicked: " +
                         str(len(readlist)) + "\n")

        # set: constant-time membership test inside the per-read loop below
        picked = set(readlist)
        write_reads = not hasSNP or args.force

        wrote = 0
        nmut = 0
        mut_out = {}

        # change reads from .bam to mutated sequences
        for extqname, read in outreads.items():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in picked:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if write_reads:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True
                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        sys.stdout.write("INFO\t" + now() + "\t" + hapstr + "\twrote:  " +
                         str(wrote) + " mutated: " + str(nmut) + "\n")

        if write_reads:
            # The writer must be closed before the aligner rewrites the file.
            outbam_muts.close()
            outbam_muts = None
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta,
                               alignopts, mutid=hapstr,
                               paired=(not args.single),
                               picardjar=args.picardjar)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom,
                                        region_lo - coverwindow,
                                        region_hi + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         region_lo - coverwindow,
                                         region_hi + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))

            sys.stdout.write("INFO\t" + now() + "\t" + hapstr + "\tavgincover: " +
                             str(avgincover) + " avgoutcover: " +
                             str(avgoutcover) + "\n")

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and
                    avgoutcover / avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n, site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(
                        site['end']) + " (VAF=" + str(vaf) + ")"
                    # input coverage first, then output coverage
                    log.write("\t".join(
                        ("snv", snvstr, str(mutpos_list[n]), mutstr_list[n],
                         str(avgincover), str(avgoutcover),
                         str(spikein_snvfrac), str(maxfrac))) + "\n")
            else:
                sys.stdout.write("WARN\t" + now() + "\t" + hapstr +
                                 "\tdropped for outcover/incover < " +
                                 str(args.coverdiff) + "\n")
                return None

        return tmpbams

    except Exception:
        sys.stderr.write("*" * 60 + "\nERROR\t" + now() +
                         "\tencountered error in mutation spikein: " +
                         str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        # Nothing is handed back, so the cleanup below must delete the temp BAM.
        del tmpbams[:]
        return None

    finally:
        # Close whatever is still open. Close errors are reported rather than
        # raised so they cannot replace the value being returned.
        for handle in (outbam_muts, bamfile, bammate, reffile, cnv, log):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    traceback.print_exc(file=sys.stdout)

        # A temp BAM that is not being returned has no other owner: delete it.
        if tmpoutbamname is not None and tmpoutbamname not in tmpbams:
            for leftover in (tmpoutbamname, tmpoutbamname + '.bai'):
                if os.path.exists(leftover):
                    os.remove(leftover)