예제 #1
0
def makemut(args, bedline, alignopts):

    if args.seed is not None: random.seed(int(args.seed))

    mutid = '_'.join(map(str, bedline.strip().split()))
    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn = '_'.join(map(os.path.basename,
                             bedline.strip().split())) + ".log"
        logfile = open(
            'addsv_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + '_' + logfn, 'w')
        exclfile = args.tmpdir + '/' + '.'.join(
            (mutid, 'exclude', str(uuid4()), 'txt'))
        exclude = open(exclfile, 'w')

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = args.tmpdir + '/' + '.'.join(
            (mutid, str(uuid4()), "muts.bam"))

        c = bedline.strip().split()
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        araw = c[3:len(c)]  # INV, DEL, INS seqfile.fa TSDlength, DUP

        # translocation specific
        trn_chrom = None
        trn_start = None
        trn_end = None

        is_transloc = c[3] == 'TRN'

        if is_transloc:
            start -= 3000
            end += 3000
            if start < 0: start = 0

            trn_chrom = c[4]
            trn_start = int(c[5]) - 3000
            trn_end = int(c[5]) + 3000
            if trn_start < 0: trn_start = 0

        actions = map(lambda x: x.strip(), ' '.join(araw).split(','))

        svfrac = float(args.svfrac)  # default, can be overridden by cnv file

        if cnv:  # CNV file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    cn = float(cnregion.strip().split()
                               [3])  # expect chrom,start,end,CN
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" +
                                     ' '.join(("copy number in sv region:",
                                               chrom, str(start), str(end),
                                               "=", str(cn))) + "\n")
                    svfrac = 1.0 / float(cn)
                    assert svfrac <= 1.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                                     "\tadjusted MAF: " + str(svfrac) + "\n")

        print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c
        print "INFO\t" + now() + "\t" + mutid + "\tlength:", end - start

        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        assert maxctglen > 3 * int(args.maxlibsize)  # maxctglen is too short
        if end - start > maxctglen:
            adj = (end - start) - maxctglen
            rndpt = random.randint(0, adj)
            start = start + rndpt
            end = end - (adj - rndpt)
            print "INFO\t" + now(
            ) + "\t" + mutid + "\tnote: interval size too long, adjusted:", chrom, start, end

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac

        maxdfrac = 0.1  # FIXME make a parameter
        if dfrac > .1:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tdiscordant fraction > " + str(maxdfrac) +
                             " aborting mutation!\n")
            return None, None

        contigs = ar.asm(chrom,
                         start,
                         end,
                         args.bamFileName,
                         reffile,
                         int(args.kmersize),
                         args.tmpdir,
                         args.noref,
                         args.recycle,
                         mutid=mutid,
                         debug=args.debug)

        trn_contigs = None
        if is_transloc:
            trn_contigs = ar.asm(trn_chrom,
                                 trn_start,
                                 trn_end,
                                 args.bamFileName,
                                 reffile,
                                 int(args.kmersize),
                                 args.tmpdir,
                                 args.noref,
                                 args.recycle,
                                 mutid=mutid,
                                 debug=args.debug)

        maxcontig = sorted(contigs)[-1]

        trn_maxcontig = None
        if is_transloc: trn_maxcontig = sorted(trn_contigs)[-1]

        # be strict about contig quality
        if re.search('N', maxcontig.seq):
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid +
                "\tcontig dropped due to ambiguous base (N), aborting mutation.\n"
            )
            return None, None

        if is_transloc and re.search('N', trn_maxcontig.seq):
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid +
                "\tcontig dropped due to ambiguous base (N), aborting mutation.\n"
            )
            return None, None

        if maxcontig is None:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tmaxcontig has length 0, aborting mutation!\n")
            return None, None

        if is_transloc and trn_maxcontig is None:
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid +
                "\ttransloc maxcontig has length 0, aborting mutation!\n")
            return None, None

        print "INFO\t" + now(
        ) + "\t" + mutid + "\tbest contig length:", sorted(contigs)[-1].len

        if is_transloc:
            print "INFO\t" + now(
            ) + "\t" + mutid + "\tbest transloc contig length:", sorted(
                trn_contigs)[-1].len

        # trim contig to get best ungapped aligned region to ref.
        maxcontig, refseq, alignstats, refstart, refend, qrystart, qryend, tgtstart, tgtend = trim_contig(
            mutid, chrom, start, end, maxcontig, reffile)

        print "INFO\t" + now(
        ) + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend

        if is_transloc:
            trn_maxcontig, trn_refseq, trn_alignstats, trn_refstart, trn_refend, trn_qrystart, trn_qryend, trn_tgtstart, trn_tgtend = trim_contig(
                mutid, trn_chrom, trn_start, trn_end, trn_maxcontig, reffile)
            print "INFO\t" + now(
            ) + "\t" + mutid + "\ttrn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend:", trn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend

        # is there anough room to make mutations?
        if maxcontig.len < 3 * int(args.maxlibsize):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tbest contig too short to make mutation!\n")
            return None, None

        if is_transloc and trn_maxcontig.len < 3 * int(args.maxlibsize):
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid +
                "\tbest transloc contig too short to make mutation!\n")
            return None, None

        # make mutation in the largest contig
        mutseq = ms.MutableSeq(maxcontig.seq)

        if is_transloc: trn_mutseq = ms.MutableSeq(trn_maxcontig.seq)

        # support for multiple mutations
        for actionstr in actions:
            a = actionstr.split()
            action = a[0]

            print "INFO\t" + now(
            ) + "\t" + mutid + "\taction: ", actionstr, action

            insseqfile = None
            insseq = ''
            tsdlen = 0  # target site duplication length
            ndups = 0  # number of tandem dups
            dsize = 0.0  # deletion size fraction
            dlen = 0

            if action == 'INS':
                assert len(
                    a) > 1  # insertion syntax: INS <file.fa> [optional TSDlen]
                insseqfile = a[1]
                if not (
                        os.path.exists(insseqfile) or insseqfile == 'RND'
                ):  # not a file... is it a sequence? (support indel ins.)
                    assert re.search('^[ATGCatgc]*$',
                                     insseqfile)  # make sure it's a sequence
                    insseq = insseqfile.upper()
                    insseqfile = None
                if len(a) > 2:
                    tsdlen = int(a[2])

            if action == 'DUP':
                if len(a) > 1:
                    ndups = int(a[1])
                else:
                    ndups = 1

            if action == 'DEL':
                if len(a) > 1:
                    dsize = float(a[1])
                    if dsize >= 1.0:  # if DEL size is not a fraction, interpret as bp
                        # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                        dlen = int(dsize)
                        dsize = 1.0
                else:
                    dsize = 1.0

            if action == 'TRN':
                pass

            logfile.write(">" + chrom + ":" + str(refstart) + "-" +
                          str(refend) + " BEFORE\n" + str(mutseq) + "\n")

            if action == 'INS':
                if insseqfile:  # seq in file
                    if insseqfile == 'RND':
                        assert args.inslib is not None  # insertion library needs to exist
                        insseqfile = random.choice(args.inslib.keys())
                        print "INFO\t" + now(
                        ) + "\t" + mutid + "\tchose sequence from insertion library: " + insseqfile
                        mutseq.insertion(mutseq.length() / 2,
                                         args.inslib[insseqfile], tsdlen)
                    else:
                        mutseq.insertion(mutseq.length() / 2,
                                         singleseqfa(insseqfile, mutid=mutid),
                                         tsdlen)
                else:  # seq is input
                    mutseq.insertion(mutseq.length() / 2, insseq, tsdlen)
                logfile.write("\t".join(
                    ('ins', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(mutseq.length() / 2),
                     str(insseqfile), str(tsdlen))) + "\n")

            elif action == 'INV':
                invstart = int(args.maxlibsize)
                invend = mutseq.length() - invstart
                mutseq.inversion(invstart, invend)
                logfile.write("\t".join(
                    ('inv', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), str(invstart), str(invend))) + "\n")

            elif action == 'DEL':
                delstart = int(args.maxlibsize)
                delend = mutseq.length() - delstart
                if dlen == 0:  # bp size not specified, delete fraction of contig
                    dlen = int((float(delend - delstart) * dsize) + 0.5)

                dadj = delend - delstart - dlen
                if dadj < 0:
                    dadj = 0
                    sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                                     "\twarning: deletion of length 0\n")

                delstart += dadj / 2
                delend -= dadj / 2

                mutseq.deletion(delstart, delend)
                logfile.write("\t".join(('del', chrom, str(refstart),
                                         str(refend), action,
                                         str(mutseq.length()), str(delstart),
                                         str(delend), str(dlen))) + "\n")

            elif action == 'DUP':
                dupstart = int(args.maxlibsize)
                dupend = mutseq.length() - dupstart
                mutseq.duplication(dupstart, dupend, ndups)
                logfile.write("\t".join(('dup', chrom, str(refstart),
                                         str(refend), action,
                                         str(mutseq.length()), str(dupstart),
                                         str(dupend), str(ndups))) + "\n")

            elif action == 'TRN':
                mutseq.fusion(mutseq.length() / 2, trn_mutseq,
                              trn_mutseq.length() / 2)
                logfile.write("\t".join(
                    ('trn', chrom, str(refstart), str(refend), action,
                     str(mutseq.length()), trn_chrom, str(trn_refstart),
                     str(trn_refend), str(trn_mutseq.length()))) + "\n")

            else:
                raise ValueError(
                    "ERROR\t" + now() + "\t" + mutid +
                    "\t: mutation not one of: INS,INV,DEL,DUP,TRN\n")

            logfile.write(">" + chrom + ":" + str(refstart) + "-" +
                          str(refend) + " AFTER\n" + str(mutseq) + "\n")

        pemean, pesd = float(args.ismean), float(args.issd)
        print "INFO\t" + now(
        ) + "\t" + mutid + "\tset paired end mean distance: " + str(
            args.ismean)
        print "INFO\t" + now(
        ) + "\t" + mutid + "\tset paired end distance stddev: " + str(
            args.issd)

        # simulate reads
        (fq1, fq2) = runwgsim(maxcontig,
                              mutseq.seq,
                              svfrac,
                              actions,
                              exclude,
                              pemean,
                              pesd,
                              args.tmpdir,
                              mutid=mutid,
                              seed=args.seed,
                              trn_contig=trn_maxcontig)

        outreads = aligners.remap_fastq(args.aligner,
                                        fq1,
                                        fq2,
                                        args.refFasta,
                                        outbam_mutsfile,
                                        alignopts,
                                        mutid=mutid,
                                        threads=1)

        if outreads == 0:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\toutbam " +
                             outbam_mutsfile + " has no mapped reads!\n")
            return None, None

        print "INFO\t" + now(
        ) + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile

        exclude.close()
        bamfile.close()

        return outbam_mutsfile, exclfile

    except Exception, e:
        sys.stderr.write("*" * 60 +
                         "\nencountered error in mutation spikein: " +
                         bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*" * 60 + "\n")
        return None, None
예제 #2
0
def makemut(args, bedline, alignopts):

    if args.seed is not None:
        random.seed(int(args.seed))

    mutid = "_".join(map(str, bedline.strip().split()))
    try:
        bamfile = pysam.Samfile(args.bamFileName, "rb")
        reffile = pysam.Fastafile(args.refFasta)
        logfn = "_".join(map(os.path.basename, bedline.strip().split())) + ".log"
        logfile = open(
            "addsv_logs_" + os.path.basename(args.outBamFile) + "/" + os.path.basename(args.outBamFile) + "_" + logfn,
            "w",
        )
        exclfile = args.tmpdir + "/" + ".".join((mutid, "exclude", str(uuid4()), "txt"))
        exclude = open(exclfile, "w")

        # optional CNV file
        cnv = None
        if args.cnvfile:
            cnv = pysam.Tabixfile(args.cnvfile, "r")

        # temporary file to hold mutated reads
        outbam_mutsfile = args.tmpdir + "/" + ".".join((mutid, str(uuid4()), "muts.bam"))

        c = bedline.strip().split()
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        araw = c[3 : len(c)]  # INV, DEL, INS seqfile.fa TSDlength, DUP

        # translocation specific
        trn_chrom = None
        trn_start = None
        trn_end = None

        is_transloc = c[3] == "TRN"

        if is_transloc:
            start -= 3000
            end += 3000
            if start < 0:
                start = 0

            trn_chrom = c[4]
            trn_start = int(c[5]) - 3000
            trn_end = int(c[5]) + 3000
            if trn_start < 0:
                trn_start = 0

        actions = map(lambda x: x.strip(), " ".join(araw).split(","))

        svfrac = float(args.svfrac)  # default, can be overridden by cnv file

        if cnv:  # CNV file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    cn = float(cnregion.strip().split()[3])  # expect chrom,start,end,CN
                    sys.stdout.write(
                        "INFO\t"
                        + now()
                        + "\t"
                        + mutid
                        + "\t"
                        + " ".join(("copy number in sv region:", chrom, str(start), str(end), "=", str(cn)))
                        + "\n"
                    )
                    svfrac = 1.0 / float(cn)
                    assert svfrac <= 1.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n")

        print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c
        print "INFO\t" + now() + "\t" + mutid + "\tlength:", end - start

        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        assert maxctglen > 3 * int(args.maxlibsize)  # maxctglen is too short
        if end - start > maxctglen:
            adj = (end - start) - maxctglen
            rndpt = random.randint(0, adj)
            start = start + rndpt
            end = end - (adj - rndpt)
            print "INFO\t" + now() + "\t" + mutid + "\tnote: interval size too long, adjusted:", chrom, start, end

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac

        maxdfrac = 0.1  # FIXME make a parameter
        if dfrac > 0.1:
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid + "\tdiscordant fraction > " + str(maxdfrac) + " aborting mutation!\n"
            )
            return None, None

        contigs = ar.asm(
            chrom,
            start,
            end,
            args.bamFileName,
            reffile,
            int(args.kmersize),
            args.tmpdir,
            args.noref,
            args.recycle,
            mutid=mutid,
            debug=args.debug,
        )

        trn_contigs = None
        if is_transloc:
            trn_contigs = ar.asm(
                trn_chrom,
                trn_start,
                trn_end,
                args.bamFileName,
                reffile,
                int(args.kmersize),
                args.tmpdir,
                args.noref,
                args.recycle,
                mutid=mutid,
                debug=args.debug,
            )

        maxcontig = sorted(contigs)[-1]

        trn_maxcontig = None
        if is_transloc:
            trn_maxcontig = sorted(trn_contigs)[-1]

        # be strict about contig quality
        if re.search("N", maxcontig.seq):
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n"
            )
            return None, None

        if is_transloc and re.search("N", trn_maxcontig.seq):
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n"
            )
            return None, None

        if maxcontig is None:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tmaxcontig has length 0, aborting mutation!\n")
            return None, None

        if is_transloc and trn_maxcontig is None:
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid + "\ttransloc maxcontig has length 0, aborting mutation!\n"
            )
            return None, None

        print "INFO\t" + now() + "\t" + mutid + "\tbest contig length:", sorted(contigs)[-1].len

        if is_transloc:
            print "INFO\t" + now() + "\t" + mutid + "\tbest transloc contig length:", sorted(trn_contigs)[-1].len

        # trim contig to get best ungapped aligned region to ref.
        maxcontig, refseq, alignstats, refstart, refend, qrystart, qryend, tgtstart, tgtend = trim_contig(
            mutid, chrom, start, end, maxcontig, reffile
        )

        print "INFO\t" + now() + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend

        if is_transloc:
            trn_maxcontig, trn_refseq, trn_alignstats, trn_refstart, trn_refend, trn_qrystart, trn_qryend, trn_tgtstart, trn_tgtend = trim_contig(
                mutid, trn_chrom, trn_start, trn_end, trn_maxcontig, reffile
            )
            print "INFO\t" + now() + "\t" + mutid + "\ttrn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend:", trn_start, trn_end, trn_tgtstart, trn_tgtend, trn_refstart, trn_refend

        # is there anough room to make mutations?
        if maxcontig.len < 3 * int(args.maxlibsize):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest contig too short to make mutation!\n")
            return None, None

        if is_transloc and trn_maxcontig.len < 3 * int(args.maxlibsize):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest transloc contig too short to make mutation!\n")
            return None, None

        # make mutation in the largest contig
        mutseq = ms.MutableSeq(maxcontig.seq)

        if is_transloc:
            trn_mutseq = ms.MutableSeq(trn_maxcontig.seq)

        # support for multiple mutations
        for actionstr in actions:
            a = actionstr.split()
            action = a[0]

            print "INFO\t" + now() + "\t" + mutid + "\taction: ", actionstr, action

            insseqfile = None
            insseq = ""
            tsdlen = 0  # target site duplication length
            ndups = 0  # number of tandem dups
            dsize = 0.0  # deletion size fraction
            dlen = 0

            if action == "INS":
                assert len(a) > 1  # insertion syntax: INS <file.fa> [optional TSDlen]
                insseqfile = a[1]
                if not (
                    os.path.exists(insseqfile) or insseqfile == "RND"
                ):  # not a file... is it a sequence? (support indel ins.)
                    assert re.search("^[ATGCatgc]*$", insseqfile)  # make sure it's a sequence
                    insseq = insseqfile.upper()
                    insseqfile = None
                if len(a) > 2:
                    tsdlen = int(a[2])

            if action == "DUP":
                if len(a) > 1:
                    ndups = int(a[1])
                else:
                    ndups = 1

            if action == "DEL":
                if len(a) > 1:
                    dsize = float(a[1])
                    if dsize >= 1.0:  # if DEL size is not a fraction, interpret as bp
                        # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                        dlen = int(dsize)
                        dsize = 1.0
                else:
                    dsize = 1.0

            if action == "TRN":
                pass

            logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n")

            if action == "INS":
                if insseqfile:  # seq in file
                    if insseqfile == "RND":
                        assert args.inslib is not None  # insertion library needs to exist
                        insseqfile = random.choice(args.inslib.keys())
                        print "INFO\t" + now() + "\t" + mutid + "\tchose sequence from insertion library: " + insseqfile
                        mutseq.insertion(mutseq.length() / 2, args.inslib[insseqfile], tsdlen)
                    else:
                        mutseq.insertion(mutseq.length() / 2, singleseqfa(insseqfile, mutid=mutid), tsdlen)
                else:  # seq is input
                    mutseq.insertion(mutseq.length() / 2, insseq, tsdlen)
                logfile.write(
                    "\t".join(
                        (
                            "ins",
                            chrom,
                            str(refstart),
                            str(refend),
                            action,
                            str(mutseq.length()),
                            str(mutseq.length() / 2),
                            str(insseqfile),
                            str(tsdlen),
                        )
                    )
                    + "\n"
                )

            elif action == "INV":
                invstart = int(args.maxlibsize)
                invend = mutseq.length() - invstart
                mutseq.inversion(invstart, invend)
                logfile.write(
                    "\t".join(
                        (
                            "inv",
                            chrom,
                            str(refstart),
                            str(refend),
                            action,
                            str(mutseq.length()),
                            str(invstart),
                            str(invend),
                        )
                    )
                    + "\n"
                )

            elif action == "DEL":
                delstart = int(args.maxlibsize)
                delend = mutseq.length() - delstart
                if dlen == 0:  # bp size not specified, delete fraction of contig
                    dlen = int((float(delend - delstart) * dsize) + 0.5)

                dadj = delend - delstart - dlen
                if dadj < 0:
                    dadj = 0
                    sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\twarning: deletion of length 0\n")

                delstart += dadj / 2
                delend -= dadj / 2

                mutseq.deletion(delstart, delend)
                logfile.write(
                    "\t".join(
                        (
                            "del",
                            chrom,
                            str(refstart),
                            str(refend),
                            action,
                            str(mutseq.length()),
                            str(delstart),
                            str(delend),
                            str(dlen),
                        )
                    )
                    + "\n"
                )

            elif action == "DUP":
                dupstart = int(args.maxlibsize)
                dupend = mutseq.length() - dupstart
                mutseq.duplication(dupstart, dupend, ndups)
                logfile.write(
                    "\t".join(
                        (
                            "dup",
                            chrom,
                            str(refstart),
                            str(refend),
                            action,
                            str(mutseq.length()),
                            str(dupstart),
                            str(dupend),
                            str(ndups),
                        )
                    )
                    + "\n"
                )

            elif action == "TRN":
                mutseq.fusion(mutseq.length() / 2, trn_mutseq, trn_mutseq.length() / 2)
                logfile.write(
                    "\t".join(
                        (
                            "trn",
                            chrom,
                            str(refstart),
                            str(refend),
                            action,
                            str(mutseq.length()),
                            trn_chrom,
                            str(trn_refstart),
                            str(trn_refend),
                            str(trn_mutseq.length()),
                        )
                    )
                    + "\n"
                )

            else:
                raise ValueError("ERROR\t" + now() + "\t" + mutid + "\t: mutation not one of: INS,INV,DEL,DUP,TRN\n")

            logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " AFTER\n" + str(mutseq) + "\n")

        pemean, pesd = float(args.ismean), float(args.issd)
        print "INFO\t" + now() + "\t" + mutid + "\tset paired end mean distance: " + str(args.ismean)
        print "INFO\t" + now() + "\t" + mutid + "\tset paired end distance stddev: " + str(args.issd)

        # simulate reads
        (fq1, fq2) = runwgsim(
            maxcontig,
            mutseq.seq,
            svfrac,
            actions,
            exclude,
            pemean,
            pesd,
            args.tmpdir,
            mutid=mutid,
            seed=args.seed,
            trn_contig=trn_maxcontig,
        )

        outreads = aligners.remap_fastq(
            args.aligner, fq1, fq2, args.refFasta, outbam_mutsfile, alignopts, mutid=mutid, threads=1
        )

        if outreads == 0:
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid + "\toutbam " + outbam_mutsfile + " has no mapped reads!\n"
            )
            return None, None

        print "INFO\t" + now() + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile

        exclude.close()
        bamfile.close()

        return outbam_mutsfile, exclfile

    except Exception, e:
        sys.stderr.write("*" * 60 + "\nencountered error in mutation spikein: " + bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*" * 60 + "\n")
        return None, None
예제 #3
0
def main(args):
    """ needs refactoring
    """
    varfile = open(args.varFileName, 'r')
    bamfile = pysam.Samfile(args.bamFileName, 'rb')
    reffile = pysam.Fastafile(args.refFasta)
    logfile = open(args.outBamFile + ".log", 'w')
    exclude = open(args.exclfile, 'w')

    # optional CNV file
    cnv = None
    if (args.cnvfile):
        cnv = pysam.Tabixfile(args.cnvfile, 'r')

    # temporary file to hold mutated reads
    outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam"

    nmuts = 0

    for bedline in varfile:
        if re.search('^#',bedline):
            continue
   
        if args.maxmuts and nmuts >= int(args.maxmuts):
            break
 
        c = bedline.strip().split()
        chrom    = c[0]
        start  = int(c[1])
        end    = int(c[2])
        araw   = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP
        actions = map(lambda x: x.strip(),' '.join(araw).split(','))

        svfrac = float(args.svfrac) # default, can be overridden by cnv file

        if cnv: # CNV file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom,start,end):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    sys.stderr.write(' '.join(("copy number in snp region:",chrom,str(start),str(end),"=",str(cn))) + "\n")
                    svfrac = 1.0/float(cn)
                    sys.stderr.write("adjusted MAF: " + str(svfrac) + "\n")

        print "interval:",c
        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        assert maxctglen > 3*int(args.maxlibsize) # maxctglen is too short
        if end-start > maxctglen:
            adj   = (end-start) - maxctglen
            rndpt = random.randint(0,adj)
            start = start + rndpt
            end   = end - (adj-rndpt)
            print "note: interval size too long, adjusted:",chrom,start,end

        contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.noref, args.recycle)

        # find the largest contig        
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        # is there anough room to make mutations?
        if maxlen > 3*int(args.maxlibsize):
            # make mutation in the largest contig
            mutseq = ms.MutableSeq(maxcontig.seq)

            # if we're this far along, we're making a mutation
            nmuts += 1 

            # support for multiple mutations
            for actionstr in actions:
                a = actionstr.split()
                action = a[0]

                print actionstr,action

                insseqfile = None
                insseq = ''
                tsdlen = 0  # target site duplication length
                ndups = 0   # number of tandem dups
                dsize = 0.0 # deletion size fraction
                dlen = 0
                if action == 'INS':
                    assert len(a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen]
                    insseqfile = a[1]
                    if not os.path.exists(insseqfile): # not a file... is it a sequence? (support indel ins.)
                        assert re.search('^[ATGCatgc]*$',insseqfile) # make sure it's a sequence
                        insseq = insseqfile.upper()
                        insseqfile = None
                    if len(a) > 2:
                        tsdlen = int(a[2])

                if action == 'DUP':
                    if len(a) > 1:
                        ndups = int(a[1])
                    else:
                        ndups = 1

                if action == 'DEL':
                    if len(a) > 1:
                        dsize = float(a[1])
                        if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp
                            # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                            dlen = int(dsize)
                            dsize = 1.0
                    else:
                        dsize = 1.0

                print "BEFORE:",mutseq

                if action == 'INS':
                    if insseqfile: # seq in file
                        mutseq.insertion(mutseq.length()/2,singleseqfa(insseqfile),tsdlen)
                    else: # seq is input
                        mutseq.insertion(mutseq.length()/2,insseq,tsdlen)
                    logfile.write("\t".join(('ins',chrom,str(start),str(end),action,str(mutseq.length()),str(mutseq.length()/2),str(insseqfile),str(tsdlen))) + "\n")

                elif action == 'INV':
                    invstart = int(args.maxlibsize)
                    invend = mutseq.length() - invstart
                    mutseq.inversion(invstart,invend)
                    logfile.write("\t".join(('inv',chrom,str(start),str(end),action,str(mutseq.length()),str(invstart),str(invend))) + "\n")

                elif action == 'DEL':
                    delstart = int(args.maxlibsize)
                    delend = mutseq.length() - delstart
                    if dlen == 0: # bp size not specified, delete fraction of contig
                        dlen = int((float(delend-delstart) * dsize)+0.5) 

                    dadj = delend-delstart-dlen
                    if dadj < 0:
                        dadj = 0
                        print "warning: deletion of length 0"

                    delstart += dadj/2
                    delend   -= dadj/2

                    mutseq.deletion(delstart,delend)
                    logfile.write("\t".join(('del',chrom,str(start),str(end),action,str(mutseq.length()),str(delstart),str(delend),str(dlen))) + "\n")

                elif action == 'DUP':
                    dupstart = int(args.maxlibsize)
                    dupend = mutseq.length() - dupstart
                    mutseq.duplication(dupstart,dupend,ndups)
                    logfile.write("\t".join(('dup',chrom,str(start),str(end),action,str(mutseq.length()),str(dupstart),str(dupend),str(ndups))) + "\n")

                else:
                    raise ValueError(bedline.strip() + ": mutation not one of: INS,INV,DEL,DUP")

                print "AFTER:",mutseq

            # simulate reads
            (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, exclude)

            # remap reads
            remap(fq1, fq2, 4, args.refFasta, outbam_mutsfile)

        else:
            print "best contig too short to make mutation: ",bedline.strip()

    print "addsv.py finished, made", nmuts, "mutations."

    exclude.close()
    varfile.close()
    bamfile.close()
    logfile.close()

    print "merging mutations into", args.bamFileName, "-->", args.outBamFile
    replace(args.bamFileName, outbam_mutsfile, args.outBamFile, args.exclfile)

    # cleanup
    os.remove(outbam_mutsfile)
예제 #4
0
파일: addsv.py 프로젝트: CSB5/bamsurgeon
def makemut(args, bedline, alignopts):
    mutid = '_'.join(map(str, bedline.strip().split()))
    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn = '_'.join(map(os.path.basename, bedline.strip().split())) + ".log"
        logfile = open('addsv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + '_' + logfn, 'w')
        exclfile = args.tmpdir + '/' + '.'.join((mutid, 'exclude', str(uuid4()), 'txt'))
        exclude = open(exclfile, 'w')

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = args.tmpdir + '/' + '.'.join((mutid, str(uuid4()), "muts.bam"))

        c = bedline.strip().split()
        chrom  = c[0]
        start  = int(c[1])
        end    = int(c[2])
        araw   = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP
 
        actions = map(lambda x: x.strip(),' '.join(araw).split(','))

        svfrac = float(args.svfrac) # default, can be overridden by cnv file

        if cnv: # CNV file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom,start,end):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(("copy number in sv region:",chrom,str(start),str(end),"=",str(cn))) + "\n")
                    svfrac = 1.0/float(cn)
                    assert svfrac <= 1.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted MAF: " + str(svfrac) + "\n")

        print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c
        print "INFO\t" + now() + "\t" + mutid + "\tlength:", end-start

        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        assert maxctglen > 3*int(args.maxlibsize) # maxctglen is too short
        if end-start > maxctglen:
            adj   = (end-start) - maxctglen
            rndpt = random.randint(0,adj)
            start = start + rndpt
            end   = end - (adj-rndpt)
            print "INFO\t" + now() + "\t" + mutid + "\tnote: interval size too long, adjusted:",chrom,start,end

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac

        maxdfrac = 0.1 # FIXME make a parameter
        if dfrac > .1: 
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tdiscordant fraction > " + str(maxdfrac) + " aborting mutation!\n")
            return None, None

        contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.tmpdir, args.noref, args.recycle, mutid=mutid, debug=args.debug)

        # find the largest contig        
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        # be strict about contig quality
        if re.search('N', maxcontig.seq):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tcontig dropped due to ambiguous base (N), aborting mutation.\n")
            return None, None

        if maxcontig is None:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tmaxcontig has length 0, aborting mutation!\n")
            return None, None

        # trim contig to get best ungapped aligned region to ref.

        refseq = reffile.fetch(chrom,start,end)
        alignstats = align(maxcontig.seq, refseq)
        
        if len(alignstats) < 6:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\talignstats:" + str(alignstats) + "\n")
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tNo good alignment between mutated contig and original, aborting mutation!\n")
            return None, None
        
        qrystart, qryend = map(int, alignstats[2:4])
        tgtstart, tgtend = map(int, alignstats[4:6])

        refseq = refseq[tgtstart:tgtend]

        print "INFO\t" + now() + "\t" + mutid + "\tbest contig length:", maxlen
        print "INFO\t" + now() + "\t" + mutid + "\talignment result:", alignstats

        maxcontig.trimseq(qrystart, qryend)
        print "INFO\t" + now() + "\t" + mutid + "\ttrimmed contig length:", maxcontig.len

        refstart = start + tgtstart
        refend = start + tgtend

        if refstart > refend:
            refstart, refend = refend, refstart
    
        print "INFO\t" + now() + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend

        # is there anough room to make mutations?
        if maxcontig.len > 3*int(args.maxlibsize):
            # make mutation in the largest contig
            mutseq = ms.MutableSeq(maxcontig.seq)

            # support for multiple mutations
            for actionstr in actions:
                a = actionstr.split()
                action = a[0]

                print "INFO\t" + now() + "\t" + mutid + "\taction: ", actionstr, action

                insseqfile = None
                insseq = ''
                tsdlen = 0  # target site duplication length
                ndups = 0   # number of tandem dups
                dsize = 0.0 # deletion size fraction
                dlen = 0
                if action == 'INS':
                    assert len(a) > 1 # insertion syntax: INS <file.fa> [optional TSDlen]
                    insseqfile = a[1]
                    if not (os.path.exists(insseqfile) or insseqfile == 'RND'): # not a file... is it a sequence? (support indel ins.)
                        assert re.search('^[ATGCatgc]*$',insseqfile) # make sure it's a sequence
                        insseq = insseqfile.upper()
                        insseqfile = None
                    if len(a) > 2:
                        tsdlen = int(a[2])

                if action == 'DUP':
                    if len(a) > 1:
                        ndups = int(a[1])
                    else:
                        ndups = 1

                if action == 'DEL':
                    if len(a) > 1:
                        dsize = float(a[1])
                        if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp
                            # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                            dlen = int(dsize)
                            dsize = 1.0
                    else:
                        dsize = 1.0

                logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n")

                if action == 'INS':
                    if insseqfile: # seq in file
                        if insseqfile == 'RND':
                            assert args.inslib is not None # insertion library needs to exist
                            mutseq.insertion(mutseq.length()/2,pickseq(args.inslib, mutid=mutid),tsdlen)
                        else:
                            mutseq.insertion(mutseq.length()/2,singleseqfa(insseqfile, mutid=mutid),tsdlen)
                    else: # seq is input
                        mutseq.insertion(mutseq.length()/2,insseq,tsdlen)
                    logfile.write("\t".join(('ins',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(mutseq.length()/2),str(insseqfile),str(tsdlen))) + "\n")

                elif action == 'INV':
                    invstart = int(args.maxlibsize)
                    invend = mutseq.length() - invstart
                    mutseq.inversion(invstart,invend)
                    logfile.write("\t".join(('inv',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(invstart),str(invend))) + "\n")

                elif action == 'DEL':
                    delstart = int(args.maxlibsize)
                    delend = mutseq.length() - delstart
                    if dlen == 0: # bp size not specified, delete fraction of contig
                        dlen = int((float(delend-delstart) * dsize)+0.5) 

                    dadj = delend-delstart-dlen
                    if dadj < 0:
                        dadj = 0
                        sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\twarning: deletion of length 0\n")
    
                    delstart += dadj/2
                    delend   -= dadj/2

                    mutseq.deletion(delstart,delend)
                    logfile.write("\t".join(('del',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(delstart),str(delend),str(dlen))) + "\n")

                elif action == 'DUP':
                    dupstart = int(args.maxlibsize)
                    dupend = mutseq.length() - dupstart
                    mutseq.duplication(dupstart,dupend,ndups)
                    logfile.write("\t".join(('dup',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(dupstart),str(dupend),str(ndups))) + "\n")

                else:
                    raise ValueError("ERROR\t" + now() + "\t" + mutid + "\t: mutation not one of: INS,INV,DEL,DUP\n")

                logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) +" AFTER\n" + str(mutseq) + "\n")

            pemean, pesd = float(args.ismean), float(args.issd) 
            print "INFO\t" + now() + "\t" + mutid + "\tset paired end mean distance: " + str(args.ismean)
            print "INFO\t" + now() + "\t" + mutid + "\tset paired end distance stddev: " + str(args.issd)

            # simulate reads
            (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, actions, exclude, pemean, pesd, args.tmpdir, mutid=mutid)

            outreads = aligners.remap_fastq(args.aligner, fq1, fq2, args.refFasta, outbam_mutsfile, alignopts, mutid=mutid, threads=1)

            if outreads == 0:
                sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\toutbam " + outbam_mutsfile + " has no mapped reads!\n")
                return None, None

        else:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tbest contig too short to make mutation!\n")
            return None, None

        print "INFO\t" + now() + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile

        exclude.close()
        bamfile.close()

        return outbam_mutsfile, exclfile

    except Exception, e:
        sys.stderr.write("*"*60 + "\nencountered error in mutation spikein: " + bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*"*60 + "\n")
        return None, None
예제 #5
0
def makemut(args, bedline):
    mutid = ':'.join(map(str, bedline.strip().split()))
    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn = '_'.join(map(os.path.basename,
                             bedline.strip().split())) + ".log"
        logfile = open(
            'addsv_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + '_' + logfn, 'w')
        exclfile = 'exclude.' + str(random.random()) + '.txt'
        exclude = open(exclfile, 'w')

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam"

        c = bedline.strip().split()
        chrom = c[0]
        start = int(c[1])
        end = int(c[2])
        araw = c[3:len(c)]  # INV, DEL, INS seqfile.fa TSDlength, DUP

        actions = map(lambda x: x.strip(), ' '.join(araw).split(','))

        svfrac = float(args.svfrac)  # default, can be overridden by cnv file

        if cnv:  # CNV file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    cn = float(cnregion.strip().split()
                               [3])  # expect chrom,start,end,CN
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" +
                                     ' '.join(("copy number in snp region:",
                                               chrom, str(start), str(end),
                                               "=", str(cn))) + "\n")
                    svfrac = 1.0 / float(cn)
                    assert svfrac < 1.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                                     "\tadjusted MAF: " + str(svfrac) + "\n")

        print "INFO\t" + now() + "\t" + mutid + "\tinterval:", c
        print "INFO\t" + now() + "\t" + mutid + "\tlength:", end - start
        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        assert maxctglen > 3 * int(args.maxlibsize)  # maxctglen is too short
        if end - start > maxctglen:
            adj = (end - start) - maxctglen
            rndpt = random.randint(0, adj)
            start = start + rndpt
            end = end - (adj - rndpt)
            print "INFO\t" + now(
            ) + "\t" + mutid + "\tnote: interval size too long, adjusted:", chrom, start, end

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        print "INFO\t" + now() + "\t" + mutid + "\tdiscordant fraction:", dfrac

        maxdfrac = 0.1  # FIXME make a parameter
        if dfrac > .1:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tdiscordant fraction > " + str(maxdfrac) +
                             " aborting mutation!\n")
            return None, None

        contigs = ar.asm(chrom,
                         start,
                         end,
                         args.bamFileName,
                         reffile,
                         int(args.kmersize),
                         args.noref,
                         args.recycle,
                         mutid=mutid)

        # find the largest contig
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        if maxcontig is None:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tmaxcontig has length 0, aborting mutation!\n")
            return None, None

        # trim contig to get best ungapped aligned region to ref.
        refseq = reffile.fetch(chrom, start, end)
        alignstats = align(maxcontig.seq, refseq)

        if len(alignstats) < 6:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\talignstats:" + str(alignstats) + "\n")
            sys.stderr.write(
                "WARN\t" + now() + "\t" + mutid +
                "\tNo good alignment between mutated contig and original, aborting mutation!\n"
            )
            return None, None

        qrystart, qryend = map(int, alignstats[2:4])
        tgtstart, tgtend = map(int, alignstats[4:6])

        refseq = refseq[tgtstart:tgtend]

        print "INFO\t" + now() + "\t" + mutid + "\tbest contig length:", maxlen
        print "INFO\t" + now(
        ) + "\t" + mutid + "\talignment result:", alignstats

        maxcontig.trimseq(qrystart, qryend)
        print "INFO\t" + now(
        ) + "\t" + mutid + "\ttrimmed contig length:", maxcontig.len

        refstart = start + tgtstart
        refend = start + tgtend

        if refstart > refend:
            refstart, refend = refend, refstart

        print "INFO\t" + now(
        ) + "\t" + mutid + "\tstart, end, tgtstart, tgtend, refstart, refend:", start, end, tgtstart, tgtend, refstart, refend

        # is there anough room to make mutations?
        if maxcontig.len > 3 * int(args.maxlibsize):
            # make mutation in the largest contig
            mutseq = ms.MutableSeq(maxcontig.seq)

            # support for multiple mutations
            for actionstr in actions:
                a = actionstr.split()
                action = a[0]

                print "INFO\t" + now(
                ) + "\t" + mutid + "\taction: ", actionstr, action

                insseqfile = None
                insseq = ''
                tsdlen = 0  # target site duplication length
                ndups = 0  # number of tandem dups
                dsize = 0.0  # deletion size fraction
                dlen = 0
                if action == 'INS':
                    assert len(
                        a
                    ) > 1  # insertion syntax: INS <file.fa> [optional TSDlen]
                    insseqfile = a[1]
                    if not (
                            os.path.exists(insseqfile) or insseqfile == 'RND'
                    ):  # not a file... is it a sequence? (support indel ins.)
                        assert re.search(
                            '^[ATGCatgc]*$',
                            insseqfile)  # make sure it's a sequence
                        insseq = insseqfile.upper()
                        insseqfile = None
                    if len(a) > 2:
                        tsdlen = int(a[2])

                if action == 'DUP':
                    if len(a) > 1:
                        ndups = int(a[1])
                    else:
                        ndups = 1

                if action == 'DEL':
                    if len(a) > 1:
                        dsize = float(a[1])
                        if dsize >= 1.0:  # if DEL size is not a fraction, interpret as bp
                            # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                            dlen = int(dsize)
                            dsize = 1.0
                    else:
                        dsize = 1.0

                logfile.write(">" + chrom + ":" + str(refstart) + "-" +
                              str(refend) + " BEFORE\n" + str(mutseq) + "\n")

                if action == 'INS':
                    if insseqfile:  # seq in file
                        if insseqfile == 'RND':
                            assert args.inslib is not None  # insertion library needs to exist
                            mutseq.insertion(mutseq.length() / 2,
                                             pickseq(args.inslib, mutid=mutid),
                                             tsdlen)
                        else:
                            mutseq.insertion(
                                mutseq.length() / 2,
                                singleseqfa(insseqfile, mutid=mutid), tsdlen)
                    else:  # seq is input
                        mutseq.insertion(mutseq.length() / 2, insseq, tsdlen)
                    logfile.write("\t".join(
                        ('ins', chrom, str(refstart), str(refend), action,
                         str(mutseq.length()), str(mutseq.length() / 2),
                         str(insseqfile), str(tsdlen))) + "\n")

                elif action == 'INV':
                    invstart = int(args.maxlibsize)
                    invend = mutseq.length() - invstart
                    mutseq.inversion(invstart, invend)
                    logfile.write("\t".join(
                        ('inv', chrom, str(refstart), str(refend), action,
                         str(mutseq.length()), str(invstart), str(invend))) +
                                  "\n")

                elif action == 'DEL':
                    delstart = int(args.maxlibsize)
                    delend = mutseq.length() - delstart
                    if dlen == 0:  # bp size not specified, delete fraction of contig
                        dlen = int((float(delend - delstart) * dsize) + 0.5)

                    dadj = delend - delstart - dlen
                    if dadj < 0:
                        dadj = 0
                        sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                                         "\twarning: deletion of length 0\n")

                    delstart += dadj / 2
                    delend -= dadj / 2

                    mutseq.deletion(delstart, delend)
                    logfile.write("\t".join(
                        ('del', chrom, str(refstart), str(refend), action,
                         str(mutseq.length()), str(delstart), str(delend),
                         str(dlen))) + "\n")

                elif action == 'DUP':
                    dupstart = int(args.maxlibsize)
                    dupend = mutseq.length() - dupstart
                    mutseq.duplication(dupstart, dupend, ndups)
                    logfile.write("\t".join(
                        ('dup', chrom, str(refstart), str(refend), action,
                         str(mutseq.length()), str(dupstart), str(dupend),
                         str(ndups))) + "\n")

                else:
                    raise ValueError(
                        "ERROR\t" + now() + "\t" + mutid +
                        "\t: mutation not one of: INS,INV,DEL,DUP\n")

                logfile.write(">" + chrom + ":" + str(refstart) + "-" +
                              str(refend) + " AFTER\n" + str(mutseq) + "\n")

            pemean, pesd = float(args.ismean), float(args.issd)
            print "INFO\t" + now(
            ) + "\t" + mutid + "\tset paired end mean distance: " + str(
                args.ismean)
            print "INFO\t" + now(
            ) + "\t" + mutid + "\tset paired end distance stddev: " + str(
                args.issd)

            # simulate reads
            (fq1, fq2) = runwgsim(maxcontig,
                                  mutseq.seq,
                                  svfrac,
                                  exclude,
                                  pemean,
                                  pesd,
                                  mutid=mutid)

            # remap reads
            if args.bwamem:
                outreads = remap_bwamem(fq1,
                                        fq2,
                                        4,
                                        args.refFasta,
                                        outbam_mutsfile,
                                        mutid=mutid)
            else:
                outreads = remap(fq1,
                                 fq2,
                                 4,
                                 args.refFasta,
                                 outbam_mutsfile,
                                 mutid=mutid)

            if outreads == 0:
                sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                                 "\toutbam " + outbam_mutsfile +
                                 " has no mapped reads!\n")
                return None, None

        else:
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tbest contig too short to make mutation!\n")
            return None, None

        print "INFO\t" + now(
        ) + "\t" + mutid + "\ttemporary bam: " + outbam_mutsfile

        exclude.close()
        bamfile.close()

        return outbam_mutsfile, exclfile

    except Exception, e:
        sys.stderr.write("*" * 60 +
                         "\nencountered error in mutation spikein: " +
                         bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*" * 60 + "\n")
        return None, None
예제 #6
0
def makemut(args, bedline):
    """Spike the structural variant(s) described by one BED line into reads.

    The interval is assembled into contigs, the longest contig is trimmed to
    its best alignment against the reference, the requested edits are applied
    to that contig, reads are simulated from the edited sequence and remapped
    into a temporary BAM file.

    Parameters:
        args: parsed options object.  Attributes read here: bamFileName,
            refFasta, outBamFile, cnvfile, svfrac, maxctglen, maxlibsize,
            kmersize, noref, recycle, ismean, issd.
        bedline: whitespace-separated ``chrom start end ACTION [params]``.
            Several actions may be given, separated by commas.  Recognised
            actions:
                INS <file.fa | sequence> [TSDlen]
                INV
                DEL [fraction < 1.0 | length in bp]
                DUP [number of duplications]

    Returns:
        (outbam_mutsfile, exclfile) on success: the name of the temporary BAM
        handed to remap() and the name of the exclude file handed to
        runwgsim().  (None, None) when the mutation is aborted (discordant
        fraction too high, no usable contig, contig too short, no mapped
        reads) or when any exception occurs; the exception is reported on
        stderr and never propagates to the caller.

    Side effects:
        Writes a log under 'addsv_logs_<basename of outBamFile>/' (the
        directory must already exist, otherwise open() fails and the mutation
        is reported as an error) and creates the exclude file in the current
        working directory.
    """
    # Pre-declare every handle so the ``finally`` clause can release exactly
    # the ones that were opened, whichever exit path is taken.
    bamfile = reffile = cnv = logfile = exclude = None
    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        reffile = pysam.Fastafile(args.refFasta)
        logfn   = '_'.join(map(os.path.basename, bedline.strip().split())) + ".log"
        logfile = open('addsv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + '_' + logfn, 'w')
        exclfile = 'exclude.' + str(random.random()) + '.txt'
        exclude = open(exclfile, 'w')

        # optional CNV file
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        # temporary file to hold mutated reads
        outbam_mutsfile = "tmp." + str(random.random()) + ".muts.bam"

        c = bedline.strip().split()
        chrom  = c[0]
        start  = int(c[1])
        end    = int(c[2])
        araw   = c[3:len(c)] # INV, DEL, INS seqfile.fa TSDlength, DUP

        # comma-separated list of actions, each with its own parameters
        actions = [x.strip() for x in ' '.join(araw).split(',')]

        svfrac = float(args.svfrac) # default, can be overridden by cnv file

        if cnv: # CNV file is present
            if chrom in cnv.contigs:
                # if several CNV records overlap the interval, the last one wins
                for cnregion in cnv.fetch(chrom,start,end):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    sys.stderr.write(' '.join(("copy number in snp region:",chrom,str(start),str(end),"=",str(cn))) + "\n")
                    # a copy number of 0 raises ZeroDivisionError, which is
                    # reported by the handler below and aborts this mutation
                    svfrac = 1.0/float(cn)
                    sys.stderr.write("adjusted MAF: " + str(svfrac) + "\n")

        print("interval: " + str(c))
        print("length: " + str(end-start))
        # modify start and end if interval is too long
        maxctglen = int(args.maxctglen)
        # explicit check instead of assert, which is stripped under -O
        if maxctglen <= 3*int(args.maxlibsize):
            raise ValueError("maxctglen is too short: must be greater than 3 * maxlibsize")
        if end-start > maxctglen:
            # shrink to exactly maxctglen, placed at a random offset inside
            # the original interval
            adj   = (end-start) - maxctglen
            rndpt = random.randint(0,adj)
            start = start + rndpt
            end   = end - (adj-rndpt)
            print(' '.join(("note: interval size too long, adjusted:", chrom, str(start), str(end))))

        dfrac = discordant_fraction(args.bamFileName, chrom, start, end)
        print("discordant fraction: " + str(dfrac))

        maxdfrac = 0.1 # FIXME make a parameter
        if dfrac > maxdfrac:
            print("discordant fraction > " + str(maxdfrac) + " aborting mutation!")
            return None, None

        contigs = ar.asm(chrom, start, end, args.bamFileName, reffile, int(args.kmersize), args.noref, args.recycle)

        # find the largest contig (zero-length contigs are never selected)
        maxlen = 0
        maxcontig = None
        for contig in contigs:
            if contig.len > maxlen:
                maxlen = contig.len
                maxcontig = contig

        if maxcontig is None:
            print("maxcontig has length 0, aborting mutation!")
            return None, None

        # trim contig to get best ungapped aligned region to ref.
        refseq = reffile.fetch(chrom,start,end)
        alignstats = align(maxcontig.seq, refseq)
        # fields 2-3 are used as query coordinates, 4-5 as target coordinates
        qrystart, qryend = map(int, alignstats[2:4])
        tgtstart, tgtend = map(int, alignstats[4:6])

        refseq = refseq[tgtstart:tgtend]

        print("best contig length: " + str(maxlen))
        print("alignment result: " + str(alignstats))

        maxcontig.trimseq(qrystart, qryend)
        print("trimmed contig length: " + str(maxcontig.len))

        refstart = start + tgtstart
        refend = start + tgtend

        if refstart > refend:
            refstart, refend = refend, refstart

        print(' '.join(['start, end, tgtstart, tgtend, refstart, refend:'] +
                       [str(v) for v in (start, end, tgtstart, tgtend, refstart, refend)]))

        #fixedseq = check_asmvariants(args.bamFileName, maxcontig.seq, reffile, chrom, refstart, refend)
        fixedseq = maxcontig.seq  # FIXME

        # is there enough room to make mutations?
        if maxcontig.len > 3*int(args.maxlibsize):
            # make mutation in the largest contig
            mutseq = ms.MutableSeq(fixedseq)

            # support for multiple mutations
            for actionstr in actions:
                a = actionstr.split()
                action = a[0]

                print(actionstr + " " + action)

                insseqfile = None
                insseq = ''
                tsdlen = 0  # target site duplication length
                ndups = 0   # number of tandem dups
                dsize = 0.0 # deletion size fraction
                dlen = 0
                if action == 'INS':
                    if len(a) < 2:
                        raise ValueError(bedline.strip() + ": insertion syntax: INS <file.fa> [optional TSDlen]")
                    insseqfile = a[1]
                    if not os.path.exists(insseqfile): # not a file... is it a sequence? (support indel ins.)
                        if not re.search('^[ATGCatgc]*$',insseqfile): # make sure it's a sequence
                            raise ValueError(bedline.strip() + ": INS argument is neither an existing file nor a DNA sequence")
                        insseq = insseqfile.upper()
                        insseqfile = None
                    if len(a) > 2:
                        tsdlen = int(a[2])

                if action == 'DUP':
                    if len(a) > 1:
                        ndups = int(a[1])
                    else:
                        ndups = 1

                if action == 'DEL':
                    if len(a) > 1:
                        dsize = float(a[1])
                        if dsize >= 1.0: # if DEL size is not a fraction, interpret as bp
                            # since DEL 1 is default, if DEL 1 is specified, interpret as 1 bp deletion
                            dlen = int(dsize)
                            dsize = 1.0
                    else:
                        dsize = 1.0

                logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) + " BEFORE\n" + str(mutseq) + "\n")

                if action == 'INS':
                    # insert at the contig midpoint; // keeps the position an
                    # integer regardless of interpreter version
                    if insseqfile: # seq in file
                        mutseq.insertion(mutseq.length()//2,singleseqfa(insseqfile),tsdlen)
                    else: # seq is input
                        mutseq.insertion(mutseq.length()//2,insseq,tsdlen)
                    # note: length and midpoint logged here are read after the insertion
                    logfile.write("\t".join(('ins',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(mutseq.length()//2),str(insseqfile),str(tsdlen))) + "\n")

                elif action == 'INV':
                    # leave maxlibsize bases untouched on each flank
                    invstart = int(args.maxlibsize)
                    invend = mutseq.length() - invstart
                    mutseq.inversion(invstart,invend)
                    logfile.write("\t".join(('inv',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(invstart),str(invend))) + "\n")

                elif action == 'DEL':
                    delstart = int(args.maxlibsize)
                    delend = mutseq.length() - delstart
                    if dlen == 0: # bp size not specified, delete fraction of contig
                        dlen = int((float(delend-delstart) * dsize)+0.5)

                    # dadj = bases to keep inside the flanked region; the
                    # deletion is centred by trimming half from each side
                    dadj = delend-delstart-dlen
                    if dadj < 0:
                        # NOTE(review): requested length exceeds the available
                        # span, so the whole flanked region is deleted; the
                        # message text looks misleading - confirm intent
                        dadj = 0
                        print("warning: deletion of length 0")

                    # NOTE(review): with an odd dadj the span passed to
                    # deletion() is dlen+1 wide - confirm whether intended
                    delstart += dadj//2
                    delend   -= dadj//2

                    mutseq.deletion(delstart,delend)
                    logfile.write("\t".join(('del',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(delstart),str(delend),str(dlen))) + "\n")

                elif action == 'DUP':
                    dupstart = int(args.maxlibsize)
                    dupend = mutseq.length() - dupstart
                    mutseq.duplication(dupstart,dupend,ndups)
                    logfile.write("\t".join(('dup',chrom,str(refstart),str(refend),action,str(mutseq.length()),str(dupstart),str(dupend),str(ndups))) + "\n")

                else:
                    raise ValueError(bedline.strip() + ": mutation not one of: INS,INV,DEL,DUP")

                logfile.write(">" + chrom + ":" + str(refstart) + "-" + str(refend) +" AFTER\n" + str(mutseq) + "\n")

            # estimate paired-end distribution
            print("estimating paired-end insert size mean, stdev...")
            pemean, pesd = estimate_pedist(bamfile, chrom, start, end, window=10000, setmean=args.ismean, setsd=args.issd)
            # simulate reads
            (fq1, fq2) = runwgsim(maxcontig, mutseq.seq, svfrac, exclude, pemean, pesd)

            # remap reads
            outreads = remap(fq1, fq2, 4, args.refFasta, outbam_mutsfile)

            if outreads == 0:
                print("outbam " + outbam_mutsfile + " has no mapped reads!")
                return None, None

        else:
            # two spaces after the colon reproduce the original print output
            print("best contig too short to make mutation:  " + bedline.strip())
            return None, None

        sys.stderr.write("temporary bam: " + outbam_mutsfile + "\n")

        # flush the exclude file before its name is handed to the caller; a
        # failure here is still reported as a failed mutation
        exclude.close()

        return outbam_mutsfile, exclfile

    except Exception:
        # report on stderr only, so the banner and the traceback stay together
        sys.stderr.write("*"*60 + "\nencountered error in mutation spikein: " + bedline + "\n")
        traceback.print_exc(file=sys.stderr)
        sys.stderr.write("*"*60 + "\n")
        return None, None

    finally:
        # Release every handle that was opened, on success, early abort and
        # error alike.  Close failures are reported but never raised, so the
        # function keeps its "never raises" contract.
        for handle in (exclude, logfile, bamfile, reffile, cnv):
            if handle is not None:
                try:
                    handle.close()
                except Exception:
                    sys.stderr.write("warning: could not close " + repr(handle) + "\n")