def makemut(args, bedline, alignopts): if args.seed is not None: random.seed(int(args.seed)) mutid = '_'.join(map(str, bedline.strip().split())) try: bamfile = pysam.Samfile(args.bamFileName, 'rb') reffile = pysam.Fastafile(args.refFasta) logfn = '_'.join(map(os.path.basename, bedline.strip().split())) + ".log" logfile = open( 'addsv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + '_' + logfn, 'w') exclfile = args.tmpdir + '/' + '.'.join( (mutid, 'exclude', str(uuid4()), 'txt')) exclude = open(exclfile, 'w') # optional CNV file cnv = None if (args.cnvfile): cnv = pysam.Tabixfile(args.cnvfile, 'r') # temporary file to hold mutated reads outbam_mutsfile = args.tmpdir + '/' + '.'.join( (mutid, str(uuid4()), "muts.bam")) c = bedline.strip().split() chrom = c[0] start = int(c[1]) end = int(c[2]) araw = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP # translocation specific trn_chrom = None trn_start = None trn_end = None is_transloc = c[3] == 'TRN' if is_transloc: start -= 3000 end += 3000 if start < 0: start = 0 trn_chrom = c[4] trn_start = int(c[5]) - 3000 trn_end = int(c[5]) + 3000 if trn_start < 0: trn_start = 0 actions = map(lambda x: x.strip(), ' '.join(araw).split(',')) svfrac = float(args.svfrac) # default, can be overridden by cnv file if cnv: # CNV file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom, start, end): cn = float(cnregion.strip().split() [3]) # expect chrom,start,end,CN sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(("copy number in sv region:", chrom, str(start), str(end), "=", str(cn))) + "\n") svfrac = 1.0 / float(cn) assert svfrac <= 1.0 sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n") print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c print "INFO\t" + now() + "\t" + mutid + "\tlength:", end - start # modify start and end if interval is too long maxctglen = int(args.maxctglen) assert maxctglen > 3 * 
int(args.maxlibsize) # maxctglen is too short if end - start > maxctglen: adj = (end - start) - maxctglen rndpt = random.randint(0, adj) start = start + rndpt end = end - (adj - rndpt) print "INFO\t" + now( ) + "\t" + mutid + "\tnote: interval size too long, adjusted:", chrom, start, end dfrac = discordant_fraction(args.bamFileName, chrom, start, end) print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac maxdfrac = 0.1 # FIXME make a parameter if dfrac > .1: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tdiscordant fraction > " + str(maxdfrac) + " aborting mutation!\n") return None, None contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug) trn_contigs = None if is_transloc: trn_contigs = ar.asm(trn_chrom, trn_start, trn_end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug) maxcontig = sorted(contigs)[-1] trn_maxcontig = None if is_transloc: trn_maxcontig = sorted(trn_contigs)[-1] # be strict about contig quality if re.search('N', maxcontig.seq): sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n" ) return None, None if is_transloc and re.search('N', trn_maxcontig.seq): sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n" ) return None, None if maxcontig is None: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tmaxcontig has length 0, aborting mutation!\n") return None, None if is_transloc and trn_maxcontig is None: sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\ttransloc maxcontig has length 0, aborting mutation!\n") return None, None print "INFO\t" + now( ) + "\t" + mutid + "\tbest contig length:", sorted(contigs)[-1].len if is_transloc: print "INFO\t" + now( ) + "\t" + mutid + "\tbest transloc contig length:", sorted( 
trn_contigs)[-1].len # trim contig to get best ungapped aligned region to ref. maxcontig, refseq, alignstats, refstart, refend, qrystart, qryend, tgtstart, tgtend = trim_contig( mutid, chrom, start, end, maxcontig, reffile) print "INFO\t" + now( ) + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend if is_transloc: trn_maxcontig, trn_refseq, trn_alignstats, trn_refstart, trn_refend, trn_qrystart, trn_qryend, trn_tgtstart, trn_tgtend = trim_contig( mutid, trn_chrom, trn_start, trn_end, trn_maxcontig, reffile) print "INFO\t" + now( ) + "\t" + mutid + "\ttrn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend:", trn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend # is there anough room to make mutations? if maxcontig.len < 3 * int(args.maxlibsize): sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest contig too short to make mutation!\n") return None, None if is_transloc and trn_maxcontig.len < 3 * int(args.maxlibsize): sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tbest transloc contig too short to make mutation!\n") return None, None # make mutation in the largest contig mutseq = ms.MutableSeq(maxcontig.seq) if is_transloc: trn_mutseq = ms.MutableSeq(trn_maxcontig.seq) # support for multiple mutations for actionstr in actions: a = actionstr.split() action = a[0] print "INFO\t" + now( ) + "\t" + mutid + "\taction: ", actionstr, action insseqfile = None insseq = '' tsdlen = 0 # target site duplication length ndups = 0 # number of tandem dups dsize = 0.0 # deletion size fraction dlen = 0 if action == 'INS': assert len( a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen] insseqfile = a[1] if not ( os.path.exists(insseqfile) or insseqfile == 'RND' ): # not a file... is it a sequence? (support indel ins.) 
assert re.search('^[ATGCatgc]*$', insseqfile) # make sure it's a sequence insseq = insseqfile.upper() insseqfile = None if len(a) > 2: tsdlen = int(a[2]) if action == 'DUP': if len(a) > 1: ndups = int(a[1]) else: ndups = 1 if action == 'DEL': if len(a) > 1: dsize = float(a[1]) if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion dlen = int(dsize) dsize = 1.0 else: dsize = 1.0 if action == 'TRN': pass logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n") if action == 'INS': if insseqfile: # seq in file if insseqfile == 'RND': assert args.inslib is not None # insertion library needs to exist insseqfile = random.choice(args.inslib.keys()) print "INFO\t" + now( ) + "\t" + mutid + "\tchose sequence from insertion library: " + insseqfile mutseq.insertion(mutseq.length() / 2, args.inslib[insseqfile], tsdlen) else: mutseq.insertion(mutseq.length() / 2, singleseqfa(insseqfile, mutid=mutid), tsdlen) else: # seq is input mutseq.insertion(mutseq.length() / 2, insseq, tsdlen) logfile.write("\t".join( ('ins', chrom, str(refstart), str(refend), action, str(mutseq.length()), str(mutseq.length() / 2), str(insseqfile), str(tsdlen))) + "\n") elif action == 'INV': invstart = int(args.maxlibsize) invend = mutseq.length() - invstart mutseq.inversion(invstart, invend) logfile.write("\t".join( ('inv', chrom, str(refstart), str(refend), action, str(mutseq.length()), str(invstart), str(invend))) + "\n") elif action == 'DEL': delstart = int(args.maxlibsize) delend = mutseq.length() - delstart if dlen == 0: # bp size not specified, delete fraction of contig dlen = int((float(delend - delstart) * dsize) + 0.5) dadj = delend - delstart - dlen if dadj < 0: dadj = 0 sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\twarning: deletion of length 0\n") delstart += dadj / 2 delend -= dadj / 2 mutseq.deletion(delstart, delend) logfile.write("\t".join(('del', 
chrom, str(refstart), str(refend), action, str(mutseq.length()), str(delstart), str(delend), str(dlen))) + "\n") elif action == 'DUP': dupstart = int(args.maxlibsize) dupend = mutseq.length() - dupstart mutseq.duplication(dupstart, dupend, ndups) logfile.write("\t".join(('dup', chrom, str(refstart), str(refend), action, str(mutseq.length()), str(dupstart), str(dupend), str(ndups))) + "\n") elif action == 'TRN': mutseq.fusion(mutseq.length() / 2, trn_mutseq, trn_mutseq.length() / 2) logfile.write("\t".join( ('trn', chrom, str(refstart), str(refend), action, str(mutseq.length()), trn_chrom, str(trn_refstart), str(trn_refend), str(trn_mutseq.length()))) + "\n") else: raise ValueError( "ERROR\t" + now() + "\t" + mutid + "\t: mutation not one of: INS,INV,DEL,DUP,TRN\n") logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " AFTER\n" + str(mutseq) + "\n") pemean, pesd = float(args.ismean), float(args.issd) print "INFO\t" + now( ) + "\t" + mutid + "\tset paired end mean distance: " + str( args.ismean) print "INFO\t" + now( ) + "\t" + mutid + "\tset paired end distance stddev: " + str( args.issd) # simulate reads (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, actions, exclude, pemean, pesd, args.tmpdir, mutid=mutid, seed=args.seed, trn_contig=trn_maxcontig) outreads = aligners.remap_fastq(args.aligner, fq1, fq2, args.refFasta, outbam_mutsfile, alignopts, mutid=mutid, threads=1) if outreads == 0: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\toutbam " + outbam_mutsfile + " has no mapped reads!\n") return None, None print "INFO\t" + now( ) + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile exclude.close() bamfile.close() return outbam_mutsfile, exclfile except Exception, e: sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " + bedline + "\n") traceback.print_exc(file=sys.stderr) sys.stderr.write("*" * 60 + "\n") return None, None
def makemut(args, bedline, alignopts): if args.seed is not None: random.seed(int(args.seed)) mutid = "_".join(map(str, bedline.strip().split())) try: bamfile = pysam.Samfile(args.bamFileName, "rb") reffile = pysam.Fastafile(args.refFasta) logfn = "_".join(map(os.path.basename, bedline.strip().split())) + ".log" logfile = open( "addsv_logs_" + os.path.basename(args.outBamFile) + "/" + os.path.basename(args.outBamFile) + "_" + logfn, "w", ) exclfile = args.tmpdir + "/" + ".".join((mutid, "exclude", str(uuid4()), "txt")) exclude = open(exclfile, "w") # optional CNV file cnv = None if args.cnvfile: cnv = pysam.Tabixfile(args.cnvfile, "r") # temporary file to hold mutated reads outbam_mutsfile = args.tmpdir + "/" + ".".join((mutid, str(uuid4()), "muts.bam")) c = bedline.strip().split() chrom = c[0] start = int(c[1]) end = int(c[2]) araw = c[3 : len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP # translocation specific trn_chrom = None trn_start = None trn_end = None is_transloc = c[3] == "TRN" if is_transloc: start -= 3000 end += 3000 if start < 0: start = 0 trn_chrom = c[4] trn_start = int(c[5]) - 3000 trn_end = int(c[5]) + 3000 if trn_start < 0: trn_start = 0 actions = map(lambda x: x.strip(), " ".join(araw).split(",")) svfrac = float(args.svfrac) # default, can be overridden by cnv file if cnv: # CNV file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom, start, end): cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN sys.stdout.write( "INFO\t" + now() + "\t" + mutid + "\t" + " ".join(("copy number in sv region:", chrom, str(start), str(end), "=", str(cn))) + "\n" ) svfrac = 1.0 / float(cn) assert svfrac <= 1.0 sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n") print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c print "INFO\t" + now() + "\t" + mutid + "\tlength:", end - start # modify start and end if interval is too long maxctglen = int(args.maxctglen) assert maxctglen > 3 * 
int(args.maxlibsize) # maxctglen is too short if end - start > maxctglen: adj = (end - start) - maxctglen rndpt = random.randint(0, adj) start = start + rndpt end = end - (adj - rndpt) print "INFO\t" + now() + "\t" + mutid + "\tnote: interval size too long, adjusted:", chrom, start, end dfrac = discordant_fraction(args.bamFileName, chrom, start, end) print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac maxdfrac = 0.1 # FIXME make a parameter if dfrac > 0.1: sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tdiscordant fraction > " + str(maxdfrac) + " aborting mutation!\n" ) return None, None contigs = ar.asm( chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug, ) trn_contigs = None if is_transloc: trn_contigs = ar.asm( trn_chrom, trn_start, trn_end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug, ) maxcontig = sorted(contigs)[-1] trn_maxcontig = None if is_transloc: trn_maxcontig = sorted(trn_contigs)[-1] # be strict about contig quality if re.search("N", maxcontig.seq): sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n" ) return None, None if is_transloc and re.search("N", trn_maxcontig.seq): sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n" ) return None, None if maxcontig is None: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tmaxcontig has length 0, aborting mutation!\n") return None, None if is_transloc and trn_maxcontig is None: sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\ttransloc maxcontig has length 0, aborting mutation!\n" ) return None, None print "INFO\t" + now() + "\t" + mutid + "\tbest contig length:", sorted(contigs)[-1].len if is_transloc: print "INFO\t" + now() + "\t" + mutid + "\tbest transloc contig length:", 
sorted(trn_contigs)[-1].len # trim contig to get best ungapped aligned region to ref. maxcontig, refseq, alignstats, refstart, refend, qrystart, qryend, tgtstart, tgtend = trim_contig( mutid, chrom, start, end, maxcontig, reffile ) print "INFO\t" + now() + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend if is_transloc: trn_maxcontig, trn_refseq, trn_alignstats, trn_refstart, trn_refend, trn_qrystart, trn_qryend, trn_tgtstart, trn_tgtend = trim_contig( mutid, trn_chrom, trn_start, trn_end, trn_maxcontig, reffile ) print "INFO\t" + now() + "\t" + mutid + "\ttrn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend:", trn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend # is there anough room to make mutations? if maxcontig.len < 3 * int(args.maxlibsize): sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest contig too short to make mutation!\n") return None, None if is_transloc and trn_maxcontig.len < 3 * int(args.maxlibsize): sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest transloc contig too short to make mutation!\n") return None, None # make mutation in the largest contig mutseq = ms.MutableSeq(maxcontig.seq) if is_transloc: trn_mutseq = ms.MutableSeq(trn_maxcontig.seq) # support for multiple mutations for actionstr in actions: a = actionstr.split() action = a[0] print "INFO\t" + now() + "\t" + mutid + "\taction: ", actionstr, action insseqfile = None insseq = "" tsdlen = 0 # target site duplication length ndups = 0 # number of tandem dups dsize = 0.0 # deletion size fraction dlen = 0 if action == "INS": assert len(a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen] insseqfile = a[1] if not ( os.path.exists(insseqfile) or insseqfile == "RND" ): # not a file... is it a sequence? (support indel ins.) 
assert re.search("^[ATGCatgc]*$", insseqfile) # make sure it's a sequence insseq = insseqfile.upper() insseqfile = None if len(a) > 2: tsdlen = int(a[2]) if action == "DUP": if len(a) > 1: ndups = int(a[1]) else: ndups = 1 if action == "DEL": if len(a) > 1: dsize = float(a[1]) if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion dlen = int(dsize) dsize = 1.0 else: dsize = 1.0 if action == "TRN": pass logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n") if action == "INS": if insseqfile: # seq in file if insseqfile == "RND": assert args.inslib is not None # insertion library needs to exist insseqfile = random.choice(args.inslib.keys()) print "INFO\t" + now() + "\t" + mutid + "\tchose sequence from insertion library: " + insseqfile mutseq.insertion(mutseq.length() / 2, args.inslib[insseqfile], tsdlen) else: mutseq.insertion(mutseq.length() / 2, singleseqfa(insseqfile, mutid=mutid), tsdlen) else: # seq is input mutseq.insertion(mutseq.length() / 2, insseq, tsdlen) logfile.write( "\t".join( ( "ins", chrom, str(refstart), str(refend), action, str(mutseq.length()), str(mutseq.length() / 2), str(insseqfile), str(tsdlen), ) ) + "\n" ) elif action == "INV": invstart = int(args.maxlibsize) invend = mutseq.length() - invstart mutseq.inversion(invstart, invend) logfile.write( "\t".join( ( "inv", chrom, str(refstart), str(refend), action, str(mutseq.length()), str(invstart), str(invend), ) ) + "\n" ) elif action == "DEL": delstart = int(args.maxlibsize) delend = mutseq.length() - delstart if dlen == 0: # bp size not specified, delete fraction of contig dlen = int((float(delend - delstart) * dsize) + 0.5) dadj = delend - delstart - dlen if dadj < 0: dadj = 0 sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\twarning: deletion of length 0\n") delstart += dadj / 2 delend -= dadj / 2 mutseq.deletion(delstart, delend) logfile.write( 
"\t".join( ( "del", chrom, str(refstart), str(refend), action, str(mutseq.length()), str(delstart), str(delend), str(dlen), ) ) + "\n" ) elif action == "DUP": dupstart = int(args.maxlibsize) dupend = mutseq.length() - dupstart mutseq.duplication(dupstart, dupend, ndups) logfile.write( "\t".join( ( "dup", chrom, str(refstart), str(refend), action, str(mutseq.length()), str(dupstart), str(dupend), str(ndups), ) ) + "\n" ) elif action == "TRN": mutseq.fusion(mutseq.length() / 2, trn_mutseq, trn_mutseq.length() / 2) logfile.write( "\t".join( ( "trn", chrom, str(refstart), str(refend), action, str(mutseq.length()), trn_chrom, str(trn_refstart), str(trn_refend), str(trn_mutseq.length()), ) ) + "\n" ) else: raise ValueError("ERROR\t" + now() + "\t" + mutid + "\t: mutation not one of: INS,INV,DEL,DUP,TRN\n") logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " AFTER\n" + str(mutseq) + "\n") pemean, pesd = float(args.ismean), float(args.issd) print "INFO\t" + now() + "\t" + mutid + "\tset paired end mean distance: " + str(args.ismean) print "INFO\t" + now() + "\t" + mutid + "\tset paired end distance stddev: " + str(args.issd) # simulate reads (fq1, fq2) = runwgsim( maxcontig, mutseq.seq, svfrac, actions, exclude, pemean, pesd, args.tmpdir, mutid=mutid, seed=args.seed, trn_contig=trn_maxcontig, ) outreads = aligners.remap_fastq( args.aligner, fq1, fq2, args.refFasta, outbam_mutsfile, alignopts, mutid=mutid, threads=1 ) if outreads == 0: sys.stderr.write( "WARN\t" + now() + "\t" + mutid + "\toutbam " + outbam_mutsfile + " has no mapped reads!\n" ) return None, None print "INFO\t" + now() + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile exclude.close() bamfile.close() return outbam_mutsfile, exclfile except Exception, e: sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " + bedline + "\n") traceback.print_exc(file=sys.stderr) sys.stderr.write("*" * 60 + "\n") return None, None
def main(args): """ needs refactoring """ varfile = open(args.varFileName, 'r') bamfile = pysam.Samfile(args.bamFileName, 'rb') reffile = pysam.Fastafile(args.refFasta) logfile = open(args.outBamFile + ".log", 'w') exclude = open(args.exclfile, 'w') # optional CNV file cnv = None if (args.cnvfile): cnv = pysam.Tabixfile(args.cnvfile, 'r') # temporary file to hold mutated reads outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam" nmuts = 0 for bedline in varfile: if re.search('^#',bedline): continue if args.maxmuts and nmuts >= int(args.maxmuts): break c = bedline.strip().split() chrom = c[0] start = int(c[1]) end = int(c[2]) araw = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP actions = map(lambda x: x.strip(),' '.join(araw).split(',')) svfrac = float(args.svfrac) # default, can be overridden by cnv file if cnv: # CNV file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom,start,end): cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN sys.stderr.write(' '.join(("copy number in snp region:",chrom,str(start),str(end),"=",str(cn))) + "\n") svfrac = 1.0/float(cn) sys.stderr.write("adjusted MAF: " + str(svfrac) + "\n") print "interval:",c # modify start and end if interval is too long maxctglen = int(args.maxctglen) assert maxctglen > 3*int(args.maxlibsize) # maxctglen is too short if end-start > maxctglen: adj = (end-start) - maxctglen rndpt = random.randint(0,adj) start = start + rndpt end = end - (adj-rndpt) print "note: interval size too long, adjusted:",chrom,start,end contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.noref, args.recycle) # find the largest contig maxlen = 0 maxcontig = None for contig in contigs: if contig.len > maxlen: maxlen = contig.len maxcontig = contig # is there anough room to make mutations? 
if maxlen > 3*int(args.maxlibsize): # make mutation in the largest contig mutseq = ms.MutableSeq(maxcontig.seq) # if we're this far along, we're making a mutation nmuts += 1 # support for multiple mutations for actionstr in actions: a = actionstr.split() action = a[0] print actionstr,action insseqfile = None insseq = '' tsdlen = 0 # target site duplication length ndups = 0 # number of tandem dups dsize = 0.0 # deletion size fraction dlen = 0 if action == 'INS': assert len(a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen] insseqfile = a[1] if not os.path.exists(insseqfile): # not a file... is it a sequence? (support indel ins.) assert re.search('^[ATGCatgc]*$',insseqfile) # make sure it's a sequence insseq = insseqfile.upper() insseqfile = None if len(a) > 2: tsdlen = int(a[2]) if action == 'DUP': if len(a) > 1: ndups = int(a[1]) else: ndups = 1 if action == 'DEL': if len(a) > 1: dsize = float(a[1]) if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion dlen = int(dsize) dsize = 1.0 else: dsize = 1.0 print "BEFORE:",mutseq if action == 'INS': if insseqfile: # seq in file mutseq.insertion(mutseq.length()/2,singleseqfa(insseqfile),tsdlen) else: # seq is input mutseq.insertion(mutseq.length()/2,insseq,tsdlen) logfile.write("\t".join(('ins',chrom,str(start),str(end),action,str(mutseq.length()),str(mutseq.length()/2),str(insseqfile),str(tsdlen))) + "\n") elif action == 'INV': invstart = int(args.maxlibsize) invend = mutseq.length() - invstart mutseq.inversion(invstart,invend) logfile.write("\t".join(('inv',chrom,str(start),str(end),action,str(mutseq.length()),str(invstart),str(invend))) + "\n") elif action == 'DEL': delstart = int(args.maxlibsize) delend = mutseq.length() - delstart if dlen == 0: # bp size not specified, delete fraction of contig dlen = int((float(delend-delstart) * dsize)+0.5) dadj = delend-delstart-dlen if dadj < 0: dadj = 0 print "warning: deletion of 
length 0" delstart += dadj/2 delend -= dadj/2 mutseq.deletion(delstart,delend) logfile.write("\t".join(('del',chrom,str(start),str(end),action,str(mutseq.length()),str(delstart),str(delend),str(dlen))) + "\n") elif action == 'DUP': dupstart = int(args.maxlibsize) dupend = mutseq.length() - dupstart mutseq.duplication(dupstart,dupend,ndups) logfile.write("\t".join(('dup',chrom,str(start),str(end),action,str(mutseq.length()),str(dupstart),str(dupend),str(ndups))) + "\n") else: raise ValueError(bedline.strip() + ": mutation not one of: INS,INV,DEL,DUP") print "AFTER:",mutseq # simulate reads (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, exclude) # remap reads remap(fq1, fq2, 4, args.refFasta, outbam_mutsfile) else: print "best contig too short to make mutation: ",bedline.strip() print "addsv.py finished, made", nmuts, "mutations." exclude.close() varfile.close() bamfile.close() logfile.close() print "merging mutations into", args.bamFileName, "-->", args.outBamFile replace(args.bamFileName, outbam_mutsfile, args.outBamFile, args.exclfile) # cleanup os.remove(outbam_mutsfile)
def makemut(args, bedline, alignopts): mutid = '_'.join(map(str, bedline.strip().split())) try: bamfile = pysam.Samfile(args.bamFileName, 'rb') reffile = pysam.Fastafile(args.refFasta) logfn = '_'.join(map(os.path.basename, bedline.strip().split())) + ".log" logfile = open('addsv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + '_' + logfn, 'w') exclfile = args.tmpdir + '/' + '.'.join((mutid, 'exclude', str(uuid4()), 'txt')) exclude = open(exclfile, 'w') # optional CNV file cnv = None if (args.cnvfile): cnv = pysam.Tabixfile(args.cnvfile, 'r') # temporary file to hold mutated reads outbam_mutsfile = args.tmpdir + '/' + '.'.join((mutid, str(uuid4()), "muts.bam")) c = bedline.strip().split() chrom = c[0] start = int(c[1]) end = int(c[2]) araw = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP actions = map(lambda x: x.strip(),' '.join(araw).split(',')) svfrac = float(args.svfrac) # default, can be overridden by cnv file if cnv: # CNV file is present if chrom in cnv.contigs: for cnregion in cnv.fetch(chrom,start,end): cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(("copy number in sv region:",chrom,str(start),str(end),"=",str(cn))) + "\n") svfrac = 1.0/float(cn) assert svfrac <= 1.0 sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n") print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c print "INFO\t" + now() + "\t" + mutid + "\tlength:", end-start # modify start and end if interval is too long maxctglen = int(args.maxctglen) assert maxctglen > 3*int(args.maxlibsize) # maxctglen is too short if end-start > maxctglen: adj = (end-start) - maxctglen rndpt = random.randint(0,adj) start = start + rndpt end = end - (adj-rndpt) print "INFO\t" + now() + "\t" + mutid + "\tnote: interval size too long, adjusted:",chrom,start,end dfrac = discordant_fraction(args.bamFileName, chrom, start, end) print "INFO\t" 
+ now() + "\t" + mutid + "\tdiscordant fraction:", dfrac maxdfrac = 0.1 # FIXME make a parameter if dfrac > .1: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tdiscordant fraction > " + str(maxdfrac) + " aborting mutation!\n") return None, None contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug) # find the largest contig maxlen = 0 maxcontig = None for contig in contigs: if contig.len > maxlen: maxlen = contig.len maxcontig = contig # be strict about contig quality if re.search('N', maxcontig.seq): sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n") return None, None if maxcontig is None: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tmaxcontig has length 0, aborting mutation!\n") return None, None # trim contig to get best ungapped aligned region to ref. refseq = reffile.fetch(chrom,start,end) alignstats = align(maxcontig.seq, refseq) if len(alignstats) < 6: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\talignstats:" + str(alignstats) + "\n") sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tNo good alignment between mutated contig and original, aborting mutation!\n") return None, None qrystart, qryend = map(int, alignstats[2:4]) tgtstart, tgtend = map(int, alignstats[4:6]) refseq = refseq[tgtstart:tgtend] print "INFO\t" + now() + "\t" + mutid + "\tbest contig length:", maxlen print "INFO\t" + now() + "\t" + mutid + "\talignment result:", alignstats maxcontig.trimseq(qrystart, qryend) print "INFO\t" + now() + "\t" + mutid + "\ttrimmed contig length:", maxcontig.len refstart = start + tgtstart refend = start + tgtend if refstart > refend: refstart, refend = refend, refstart print "INFO\t" + now() + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend # is there anough room to make mutations? 
if maxcontig.len > 3*int(args.maxlibsize): # make mutation in the largest contig mutseq = ms.MutableSeq(maxcontig.seq) # support for multiple mutations for actionstr in actions: a = actionstr.split() action = a[0] print "INFO\t" + now() + "\t" + mutid + "\taction: ", actionstr, action insseqfile = None insseq = '' tsdlen = 0 # target site duplication length ndups = 0 # number of tandem dups dsize = 0.0 # deletion size fraction dlen = 0 if action == 'INS': assert len(a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen] insseqfile = a[1] if not (os.path.exists(insseqfile) or insseqfile == 'RND'): # not a file... is it a sequence? (support indel ins.) assert re.search('^[ATGCatgc]*$',insseqfile) # make sure it's a sequence insseq = insseqfile.upper() insseqfile = None if len(a) > 2: tsdlen = int(a[2]) if action == 'DUP': if len(a) > 1: ndups = int(a[1]) else: ndups = 1 if action == 'DEL': if len(a) > 1: dsize = float(a[1]) if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion dlen = int(dsize) dsize = 1.0 else: dsize = 1.0 logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n") if action == 'INS': if insseqfile: # seq in file if insseqfile == 'RND': assert args.inslib is not None # insertion library needs to exist mutseq.insertion(mutseq.length()/2,pickseq(args.inslib, mutid=mutid),tsdlen) else: mutseq.insertion(mutseq.length()/2,singleseqfa(insseqfile, mutid=mutid),tsdlen) else: # seq is input mutseq.insertion(mutseq.length()/2,insseq,tsdlen) logfile.write("\t".join(('ins',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(mutseq.length()/2),str(insseqfile),str(tsdlen))) + "\n") elif action == 'INV': invstart = int(args.maxlibsize) invend = mutseq.length() - invstart mutseq.inversion(invstart,invend) 
logfile.write("\t".join(('inv',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(invstart),str(invend))) + "\n") elif action == 'DEL': delstart = int(args.maxlibsize) delend = mutseq.length() - delstart if dlen == 0: # bp size not specified, delete fraction of contig dlen = int((float(delend-delstart) * dsize)+0.5) dadj = delend-delstart-dlen if dadj < 0: dadj = 0 sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\twarning: deletion of length 0\n") delstart += dadj/2 delend -= dadj/2 mutseq.deletion(delstart,delend) logfile.write("\t".join(('del',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(delstart),str(delend),str(dlen))) + "\n") elif action == 'DUP': dupstart = int(args.maxlibsize) dupend = mutseq.length() - dupstart mutseq.duplication(dupstart,dupend,ndups) logfile.write("\t".join(('dup',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(dupstart),str(dupend),str(ndups))) + "\n") else: raise ValueError("ERROR\t" + now() + "\t" + mutid + "\t: mutation not one of: INS,INV,DEL,DUP\n") logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) +" AFTER\n" + str(mutseq) + "\n") pemean, pesd = float(args.ismean), float(args.issd) print "INFO\t" + now() + "\t" + mutid + "\tset paired end mean distance: " + str(args.ismean) print "INFO\t" + now() + "\t" + mutid + "\tset paired end distance stddev: " + str(args.issd) # simulate reads (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, actions, exclude, pemean, pesd, args.tmpdir, mutid=mutid) outreads = aligners.remap_fastq(args.aligner, fq1, fq2, args.refFasta, outbam_mutsfile, alignopts, mutid=mutid, threads=1) if outreads == 0: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\toutbam " + outbam_mutsfile + " has no mapped reads!\n") return None, None else: sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest contig too short to make mutation!\n") return None, None print "INFO\t" + now() + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile 
exclude.close() bamfile.close() return outbam_mutsfile, exclfile except Exception, e: sys.stderr.write("*"*60 + "\nencountered error in mutation spikein: " + bedline + "\n") traceback.print_exc(file=sys.stderr) sys.stderr.write("*"*60 + "\n") return None, None
def makemut(args, bedline):
    """Apply the mutation(s) named on one BED-style line and remap the reads.

    Steps, in order: pick the longest contig returned by ``ar.asm`` for the
    interval, trim it to the query span reported by ``align`` against the
    reference, apply every requested action (INS, INV, DEL, DUP) to a
    ``ms.MutableSeq`` built from it, generate reads with ``runwgsim`` and
    remap them into a temporary BAM with ``remap`` / ``remap_bwamem``.

    Args:
        args: option namespace.  Attributes read here: bamFileName, refFasta,
            outBamFile, cnvfile, svfrac, maxctglen, maxlibsize, kmersize,
            noref, recycle, inslib, ismean, issd, bwamem.
        bedline: whitespace-separated ``chrom start end ACTION [params]``.
            Several actions may follow the coordinates, separated by commas;
            they are applied in the order given.

    Returns:
        ``(outbam_mutsfile, exclfile)``: the name of the temporary BAM and the
        name of the file whose open handle was passed to ``runwgsim``.
        ``(None, None)`` when the mutation is aborted (discordant fraction
        above the threshold, no usable contig or alignment, contig too short,
        no remapped reads) or when any exception is raised; in the latter
        case the traceback is written to stderr.
    """
    mutid = ':'.join(map(str, bedline.strip().split()))

    def info(*fields):
        # Emits the same bytes as the former `print prefix + first, rest...`
        # statements: each field is str()-ed and joined by a single space.
        sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" +
                         ' '.join(map(str, fields)) + "\n")

    def warn(message):
        sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\t" +
                         message + "\n")

    # Declared up front so the cleanup in `finally` knows what was opened.
    bamfile = reffile = logfile = exclude = cnv = None

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn = '_'.join(map(os.path.basename,
                             bedline.strip().split())) + ".log"
        logfile = open('addsv_logs_' + os.path.basename(args.outBamFile) +
                       '/' + os.path.basename(args.outBamFile) + '_' + logfn,
                       'w')
        exclfile = 'exclude.' + str(random.random()) + '.txt'
        exclude = open(exclfile, 'w')

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam"

        c = bedline.strip().split()
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        araw = c[3:]  # INV, DEL, INS seqfile.fa TSDlength, DUP

        actions = [spec.strip() for spec in ' '.join(araw).split(',')]

        svfrac = float(args.svfrac)  # default, can be overridden by cnv file

        if cnv and chrom in cnv.contigs:  # CNV file is present
            for cnregion in cnv.fetch(chrom, start, end):
                # expect chrom,start,end,CN
                cn = float(cnregion.strip().split()[3])
                info("copy number in snp region:", chrom, start, end, "=", cn)
                svfrac = 1.0 / float(cn)
                # A copy number of exactly 1 gives svfrac == 1.0, which is a
                # legitimate fraction, so the bound is inclusive.
                assert svfrac <= 1.0
                info("adjusted MAF: " + str(svfrac))

        info("interval:", c)
        info("length:", end - start)

        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        maxlibsize = int(args.maxlibsize)
        assert maxctglen > 3 * maxlibsize  # maxctglen is too short
        if end - start > maxctglen:
            # Shrink to maxctglen by trimming a random share off each side.
            adj = (end - start) - maxctglen
            rndpt = random.randint(0, adj)
            start = start + rndpt
            end = end - (adj - rndpt)
            info("note: interval size too long, adjusted:", chrom, start, end)

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        info("discordant fraction:", dfrac)

        maxdfrac = 0.1  # FIXME make a parameter
        if dfrac > maxdfrac:
            warn("discordant fraction > " + str(maxdfrac) +
                 " aborting mutation!")
            return None, None

        contigs = ar.asm(chrom, start, end, args.bamFileName, reffile,
                         int(args.kmersize), args.noref, args.recycle,
                         mutid=mutid)

        # find the largest contig (the first one wins on ties; contigs of
        # length 0 are never selected)
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        if maxcontig is None:
            warn("maxcontig has length 0, aborting mutation!")
            return None, None

        # trim contig to get best ungapped aligned region to ref.
        refseq = reffile.fetch(chrom, start, end)
        alignstats = align(maxcontig.seq, refseq)

        # Fields 2..5 are read below, so anything shorter is unusable.
        if len(alignstats) < 6:
            warn("alignstats:" + str(alignstats))
            warn("No good alignment between mutated contig and original, "
                 "aborting mutation!")
            return None, None

        qrystart, qryend = map(int, alignstats[2:4])
        tgtstart, tgtend = map(int, alignstats[4:6])

        info("best contig length:", maxlen)
        info("alignment result:", alignstats)

        maxcontig.trimseq(qrystart, qryend)
        info("trimmed contig length:", maxcontig.len)

        refstart = start + tgtstart
        refend = start + tgtend
        if refstart > refend:
            refstart, refend = refend, refstart

        info("start, end, tgtstart, tgtend, refstart, refend:",
             start, end, tgtstart, tgtend, refstart, refend)

        # is there enough room to make mutations?
        if maxcontig.len <= 3 * maxlibsize:
            warn("best contig too short to make mutation!")
            return None, None

        # make mutation in the largest contig
        mutseq = ms.MutableSeq(maxcontig.seq)
        region = ">" + chrom + ":" + str(refstart) + "-" + str(refend)

        # support for multiple mutations
        for actionstr in actions:
            a = actionstr.split()
            action = a[0]

            info("action: ", actionstr, action)

            insseqfile = None
            insseq = ''
            tsdlen = 0  # target site duplication length
            ndups = 0  # number of tandem dups
            dsize = 0.0  # deletion size fraction
            dlen = 0  # deletion size in bp; 0 means "derive from dsize"

            # --- parse the action's parameters ---
            if action == 'INS':
                # insertion syntax: INS <file.fa> [optional TSDlen]
                assert len(a) > 1
                insseqfile = a[1]
                if not (os.path.exists(insseqfile) or insseqfile == 'RND'):
                    # not a file... is it a sequence? (support indel ins.)
                    # make sure it's a sequence
                    assert re.search('^[ATGCatgc]*$', insseqfile)
                    insseq = insseqfile.upper()
                    insseqfile = None
                if len(a) > 2:
                    tsdlen = int(a[2])
            elif action == 'DUP':
                ndups = int(a[1]) if len(a) > 1 else 1
            elif action == 'DEL':
                if len(a) > 1:
                    dsize = float(a[1])
                    if dsize >= 1.0:
                        # if DEL size is not a fraction, interpret as bp
                        # since DEL 1 is default, if DEL 1 is specified,
                        # interpret as 1 bp deletion
                        dlen = int(dsize)
                        dsize = 1.0
                else:
                    dsize = 1.0

            logfile.write(region + " BEFORE\n" + str(mutseq) + "\n")

            # --- apply it ---
            # `//` keeps the integer arithmetic that `/` performed on
            # Python 2 (assumes mutseq.length() returns an int -- TODO
            # confirm against ms.MutableSeq).
            if action == 'INS':
                if insseqfile:  # seq in file
                    if insseqfile == 'RND':
                        # insertion library needs to exist
                        assert args.inslib is not None
                        mutseq.insertion(mutseq.length() // 2,
                                         pickseq(args.inslib, mutid=mutid),
                                         tsdlen)
                    else:
                        mutseq.insertion(mutseq.length() // 2,
                                         singleseqfa(insseqfile, mutid=mutid),
                                         tsdlen)
                else:  # seq is input
                    mutseq.insertion(mutseq.length() // 2, insseq, tsdlen)
                # Both lengths logged here are taken after the insertion.
                logfile.write("\t".join(
                    ('ins', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(mutseq.length() // 2),
                     str(insseqfile), str(tsdlen))) + "\n")

            elif action == 'INV':
                invstart = maxlibsize
                invend = mutseq.length() - invstart
                mutseq.inversion(invstart, invend)
                logfile.write("\t".join(
                    ('inv', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(invstart),
                     str(invend))) + "\n")

            elif action == 'DEL':
                delstart = maxlibsize
                delend = mutseq.length() - delstart
                if dlen == 0:
                    # bp size not specified, delete fraction of contig
                    dlen = int((float(delend - delstart) * dsize) + 0.5)

                # Split the part that is NOT deleted evenly between both
                # sides of the deletion.
                dadj = delend - delstart - dlen
                if dadj < 0:
                    dadj = 0
                    warn("warning: deletion of length 0")

                delstart += dadj // 2
                delend -= dadj // 2

                mutseq.deletion(delstart, delend)
                logfile.write("\t".join(
                    ('del', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(delstart), str(delend),
                     str(dlen))) + "\n")

            elif action == 'DUP':
                dupstart = maxlibsize
                dupend = mutseq.length() - dupstart
                mutseq.duplication(dupstart, dupend, ndups)
                logfile.write("\t".join(
                    ('dup', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(dupstart), str(dupend),
                     str(ndups))) + "\n")

            else:
                raise ValueError(
                    "ERROR\t" + now() + "\t" + mutid +
                    "\t: mutation not one of: INS,INV,DEL,DUP\n")

            logfile.write(region + " AFTER\n" + str(mutseq) + "\n")

        pemean, pesd = float(args.ismean), float(args.issd)
        info("set paired end mean distance: " + str(args.ismean))
        info("set paired end distance stddev: " + str(args.issd))

        # simulate reads
        fq1, fq2 = runwgsim(maxcontig, mutseq.seq, svfrac, exclude,
                            pemean, pesd, mutid=mutid)

        # remap reads
        if args.bwamem:
            outreads = remap_bwamem(fq1, fq2, 4, args.refFasta,
                                    outbam_mutsfile, mutid=mutid)
        else:
            outreads = remap(fq1, fq2, 4, args.refFasta, outbam_mutsfile,
                             mutid=mutid)

        if outreads == 0:
            warn("outbam " + outbam_mutsfile + " has no mapped reads!")
            return None, None

        info("temporary bam: " + outbam_mutsfile)

        # Closed here, inside the try, so that a failure to flush the file
        # is still reported as a failed mutation.
        exclude.close()

        return outbam_mutsfile, exclfile

    except Exception:
        sys.stderr.write("*" * 60 +
                         "\nencountered error in mutation spikein: " +
                         bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*" * 60 + "\n")
        return None, None

    finally:
        # Release every handle on every exit path (success, abort, error).
        for handle in (exclude, logfile, bamfile, reffile, cnv):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Cleanup must not turn a finished or failed job into a crash.
                traceback.print_exc(file=sys.stderr)
def makemut(args, bedline):
    """Apply the mutation(s) named on one BED-style line and remap the reads.

    Steps, in order: pick the longest contig returned by ``ar.asm`` for the
    interval, trim it to the query span reported by ``align`` against the
    reference, apply every requested action (INS, INV, DEL, DUP) to a
    ``ms.MutableSeq`` built from it, obtain the paired-end mean/sd from
    ``estimate_pedist``, generate reads with ``runwgsim`` and remap them
    into a temporary BAM with ``remap``.

    Args:
        args: option namespace.  Attributes read here: bamFileName, refFasta,
            outBamFile, cnvfile, svfrac, maxctglen, maxlibsize, kmersize,
            noref, recycle, ismean, issd.
        bedline: whitespace-separated ``chrom start end ACTION [params]``.
            Several actions may follow the coordinates, separated by commas;
            they are applied in the order given.

    Returns:
        ``(outbam_mutsfile, exclfile)``: the name of the temporary BAM and the
        name of the file whose open handle was passed to ``runwgsim``.
        ``(None, None)`` when the mutation is aborted (discordant fraction
        above the threshold, no usable contig or alignment, contig too short,
        no remapped reads) or when any exception is raised; in the latter
        case the traceback is written to stderr.
    """

    def say(*fields):
        # Emits the same bytes as the former `print a, b, ...` statements:
        # each field is str()-ed and joined by a single space.
        sys.stdout.write(' '.join(map(str, fields)) + "\n")

    # Declared up front so the cleanup in `finally` knows what was opened.
    bamfile = reffile = logfile = exclude = cnv = None

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn = '_'.join(map(os.path.basename,
                             bedline.strip().split())) + ".log"
        logfile = open('addsv_logs_' + os.path.basename(args.outBamFile) +
                       '/' + os.path.basename(args.outBamFile) + '_' + logfn,
                       'w')
        exclfile = 'exclude.' + str(random.random()) + '.txt'
        exclude = open(exclfile, 'w')

        # optional CNV file
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam"

        c = bedline.strip().split()
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        araw = c[3:]  # INV, DEL, INS seqfile.fa TSDlength, DUP

        actions = [spec.strip() for spec in ' '.join(araw).split(',')]

        svfrac = float(args.svfrac)  # default, can be overridden by cnv file

        if cnv and chrom in cnv.contigs:  # CNV file is present
            for cnregion in cnv.fetch(chrom, start, end):
                # expect chrom,start,end,CN
                cn = float(cnregion.strip().split()[3])
                sys.stderr.write(' '.join(
                    ("copy number in snp region:", chrom, str(start),
                     str(end), "=", str(cn))) + "\n")
                svfrac = 1.0 / float(cn)
                sys.stderr.write("adjusted MAF: " + str(svfrac) + "\n")

        say("interval:", c)
        say("length:", end - start)

        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        maxlibsize = int(args.maxlibsize)
        assert maxctglen > 3 * maxlibsize  # maxctglen is too short
        if end - start > maxctglen:
            # Shrink to maxctglen by trimming a random share off each side.
            adj = (end - start) - maxctglen
            rndpt = random.randint(0, adj)
            start = start + rndpt
            end = end - (adj - rndpt)
            say("note: interval size too long, adjusted:", chrom, start, end)

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        say("discordant fraction:", dfrac)

        maxdfrac = 0.1  # FIXME make a parameter
        if dfrac > maxdfrac:
            say("discordant fraction >", maxdfrac, "aborting mutation!")
            return None, None

        contigs = ar.asm(chrom, start, end, args.bamFileName, reffile,
                         int(args.kmersize), args.noref, args.recycle)

        # find the largest contig (the first one wins on ties; contigs of
        # length 0 are never selected)
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        if maxcontig is None:
            say("maxcontig has length 0, aborting mutation!")
            return None, None

        # trim contig to get best ungapped aligned region to ref.
        refseq = reffile.fetch(chrom, start, end)
        alignstats = align(maxcontig.seq, refseq)

        # Fields 2..5 are read below; abort cleanly instead of failing on
        # the tuple unpacking when the result is shorter than that.
        if len(alignstats) < 6:
            say("alignment result:", alignstats)
            say("No good alignment between mutated contig and original, "
                "aborting mutation!")
            return None, None

        qrystart, qryend = map(int, alignstats[2:4])
        tgtstart, tgtend = map(int, alignstats[4:6])

        say("best contig length:", maxlen)
        say("alignment result:", alignstats)

        maxcontig.trimseq(qrystart, qryend)
        say("trimmed contig length:", maxcontig.len)

        refstart = start + tgtstart
        refend = start + tgtend
        if refstart > refend:
            refstart, refend = refend, refstart

        say('start, end, tgtstart, tgtend, refstart, refend:',
            start, end, tgtstart, tgtend, refstart, refend)

        # FIXME: the trimmed contig is used as is; a correction pass
        # (check_asmvariants) was planned here but is not enabled.
        fixedseq = maxcontig.seq

        # is there enough room to make mutations?
        if maxcontig.len <= 3 * maxlibsize:
            say("best contig too short to make mutation: ", bedline.strip())
            return None, None

        # make mutation in the largest contig
        mutseq = ms.MutableSeq(fixedseq)
        region = ">" + chrom + ":" + str(refstart) + "-" + str(refend)

        # support for multiple mutations
        for actionstr in actions:
            a = actionstr.split()
            action = a[0]

            say(actionstr, action)

            insseqfile = None
            insseq = ''
            tsdlen = 0  # target site duplication length
            ndups = 0  # number of tandem dups
            dsize = 0.0  # deletion size fraction
            dlen = 0  # deletion size in bp; 0 means "derive from dsize"

            # --- parse the action's parameters ---
            if action == 'INS':
                # insertion syntax: INS <file.fa> [optional TSDlen]
                assert len(a) > 1
                insseqfile = a[1]
                if not os.path.exists(insseqfile):
                    # not a file... is it a sequence? (support indel ins.)
                    # make sure it's a sequence
                    assert re.search('^[ATGCatgc]*$', insseqfile)
                    insseq = insseqfile.upper()
                    insseqfile = None
                if len(a) > 2:
                    tsdlen = int(a[2])
            elif action == 'DUP':
                ndups = int(a[1]) if len(a) > 1 else 1
            elif action == 'DEL':
                if len(a) > 1:
                    dsize = float(a[1])
                    if dsize >= 1.0:
                        # if DEL size is not a fraction, interpret as bp
                        # since DEL 1 is default, if DEL 1 is specified,
                        # interpret as 1 bp deletion
                        dlen = int(dsize)
                        dsize = 1.0
                else:
                    dsize = 1.0

            logfile.write(region + " BEFORE\n" + str(mutseq) + "\n")

            # --- apply it ---
            # `//` keeps the integer arithmetic that `/` performed on
            # Python 2 (assumes mutseq.length() returns an int -- TODO
            # confirm against ms.MutableSeq).
            if action == 'INS':
                if insseqfile:  # seq in file
                    mutseq.insertion(mutseq.length() // 2,
                                     singleseqfa(insseqfile), tsdlen)
                else:  # seq is input
                    mutseq.insertion(mutseq.length() // 2, insseq, tsdlen)
                # Both lengths logged here are taken after the insertion.
                logfile.write("\t".join(
                    ('ins', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(mutseq.length() // 2),
                     str(insseqfile), str(tsdlen))) + "\n")

            elif action == 'INV':
                invstart = maxlibsize
                invend = mutseq.length() - invstart
                mutseq.inversion(invstart, invend)
                logfile.write("\t".join(
                    ('inv', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(invstart),
                     str(invend))) + "\n")

            elif action == 'DEL':
                delstart = maxlibsize
                delend = mutseq.length() - delstart
                if dlen == 0:
                    # bp size not specified, delete fraction of contig
                    dlen = int((float(delend - delstart) * dsize) + 0.5)

                # Split the part that is NOT deleted evenly between both
                # sides of the deletion.
                dadj = delend - delstart - dlen
                if dadj < 0:
                    dadj = 0
                    say("warning: deletion of length 0")

                delstart += dadj // 2
                delend -= dadj // 2

                mutseq.deletion(delstart, delend)
                logfile.write("\t".join(
                    ('del', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(delstart), str(delend),
                     str(dlen))) + "\n")

            elif action == 'DUP':
                dupstart = maxlibsize
                dupend = mutseq.length() - dupstart
                mutseq.duplication(dupstart, dupend, ndups)
                logfile.write("\t".join(
                    ('dup', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(dupstart), str(dupend),
                     str(ndups))) + "\n")

            else:
                raise ValueError(bedline.strip() +
                                 ": mutation not one of: INS,INV,DEL,DUP")

            logfile.write(region + " AFTER\n" + str(mutseq) + "\n")

        # estimate paired-end distribution
        say("estimating paired-end insert size mean, stdev...")
        pemean, pesd = estimate_pedist(bamfile, chrom, start, end,
                                       window=10000, setmean=args.ismean,
                                       setsd=args.issd)

        # simulate reads
        fq1, fq2 = runwgsim(maxcontig, mutseq.seq, svfrac, exclude,
                            pemean, pesd)

        # remap reads
        outreads = remap(fq1, fq2, 4, args.refFasta, outbam_mutsfile)

        if outreads == 0:
            say("outbam", outbam_mutsfile, "has no mapped reads!")
            return None, None

        sys.stderr.write("temporary bam: " + outbam_mutsfile + "\n")

        # Closed here, inside the try, so that a failure to flush the file
        # is still reported as a failed mutation.
        exclude.close()

        return outbam_mutsfile, exclfile

    except Exception:
        # Banner and traceback go to the same stream so they stay together.
        sys.stderr.write("*" * 60 +
                         "\nencountered error in mutation spikein: " +
                         bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*" * 60 + "\n")
        return None, None

    finally:
        # Release every handle on every exit path (success, abort, error).
        for handle in (exclude, logfile, bamfile, reffile, cnv):
            if handle is None:
                continue
            try:
                handle.close()
            except Exception:
                # Cleanup must not turn a finished or failed job into a crash.
                traceback.print_exc(file=sys.stderr)