예제 #1
0
def makemut(args, hc, avoid, alignopts):

    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' +
                          str(site['end']) + '_' + str(site['vaf']) + '_' +
                          str(site['altbase']))

    try:
        if args.seed is not None:
            random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(
            args.bamFileName, 'rb')  # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        #snvfrac = float(args.snvfrac)

        chrom = None
        vaf = None

        mutpos_list = []
        altbase_list = []

        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            else:
                assert chrom == site[
                    'chrom'], "haplotype clusters cannot span multiple chromosomes!"

            if vaf is None:
                vaf = site['vaf']

            elif vaf != site['vaf']:
                logger.warning(
                    "multiple VAFs for single haplotype, using first encountered VAF: %f"
                    % vaf)

            mutpos = int(random.uniform(site['start'], site['end'] +
                                        1))  # position of mutation in genome
            mutpos_list.append(mutpos)  # FIXME
            altbase_list.append(site['altbase'])

        mutbase_list = []
        refbase_list = []
        mutstr_list = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom, mutpos - 1, mutpos)
            altbase = altbase_list[n]
            refbase_list.append(refbase)

            if altbase == refbase.upper() and not args.ignoreref:
                logger.warning(
                    "%s specified ALT base matches reference, skipping mutation"
                    % mutid_list[n])
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)

            except ValueError as e:
                logger.warning(mutid_list[n] + " " + ' '.join(
                    ("skipped site:", chrom, str(hc[n]['start']),
                     str(hc[n]['end']), "due to N base:", str(e), "\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        hapstr = "_".join(
            ('haplo', chrom, str(min(mutpos_list)), str(max(mutpos_list))))
        log = open(
            'addsnv_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + "." + hapstr + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(
            uuid4()) + ".bam"
        logger.info("%s creating tmp bam: %s" % (hapstr, tmpoutbamname))
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args,
            log,
            bamfile,
            bammate,
            chrom,
            min(mutpos_list),
            max(mutpos_list) + 1,
            mutpos_list,
            avoid=avoid,
            mutid_list=mutid_list,
            is_snv=True,
            mutbase_list=mutbase_list,
            reffile=reffile)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        logger.info("%s len(readlist): %s" % (hapstr, str(len(readlist))))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            logger.warning("%s too few reads in region (%s) skipping..." %
                           (hapstr, str(len(readlist))))
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(
                args.mutfrac
            )  # default minor allele freq if not otherwise specified
        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, min(mutpos_list),
                                          max(mutpos_list) + 1):
                    cn = float(cnregion.strip().split()
                               [3])  # expect chrom,start,end,CN
                    logger.info(hapstr + "\t" +
                                ' '.join(("copy number in snp region:", chrom,
                                          str(min(mutpos_list)),
                                          str(max(mutpos_list)), "=",
                                          str(cn))))
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    logger.info("%s adjusted VAF: %f" % (hapstr, vaf))
        else:
            logger.info("%s selected VAF: %f" % (hapstr, vaf))

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                logger.warning("%s forced %d reads." % (hapstr, lastread))
            else:
                logger.warning(
                    "%s dropped site with fewer reads than --minmutreads" %
                    hapstr)
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        logger.info("%s picked: %d" % (hapstr, len(readlist)))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        logger.info("%s wrote: %d, mutated: %d" % (hapstr, wrote, nmut))

        if not hasSNP or args.force:
            outbam_muts.close()

            aligners.remap_bam(args.aligner,
                               tmpoutbamname,
                               args.refFasta,
                               alignopts,
                               mutid=hapstr,
                               paired=(not args.single),
                               picardjar=args.picardjar,
                               insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom,
                                        min(mutpos_list) - coverwindow,
                                        max(mutpos_list) + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         min(mutpos_list) - coverwindow,
                                         max(mutpos_list) + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))

            logger.info("%s avgincover: %f, avgoutcover: %f" %
                        (hapstr, avgincover, avgoutcover))

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover
                    >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n, site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(
                        site['end']) + " (VAF=" + str(vaf) + ")"
                    log.write("\t".join(("snv", snvstr,
                                         str(mutpos_list[n]), mutstr_list[n],
                                         str(avgoutcover), str(avgoutcover),
                                         str(spikein_snvfrac), str(maxfrac))) +
                              "\n")
            else:

                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                logger.warning("%s dropped for outcover/incover < %s" %
                               (hapstr, str(args.coverdiff)))
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close()

        return tmpbams

    except Exception, e:
        sys.stderr.write("*" * 60 + "\nERROR\t" + now() +
                         "\tencountered error in mutation spikein: " +
                         str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
예제 #2
0
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    ''' is ins is a sequence, it will is inserted at start, otherwise delete from start to end'''

    if args.seed is not None: random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(args.bamFileName, 'rb') # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        is_insertion = ins is not None
        is_deletion  = ins is None

        snvfrac = float(args.snvfrac)

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end-start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open('addindel_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + "_".join((chrom,str(start),str(end))) + ".log",'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(uuid4()) + ".bam"
        print "INFO\t" + now() + "\t" + mutid + "\tcreating tmp bam: ",tmpoutbamname #DEBUG
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(args, log, bamfile, bammate, chrom, mutpos, mutpos+del_ln+1, mutpos_list, avoid=avoid, mutid_list=[mutid], is_insertion=is_insertion, is_deletion=is_deletion, ins_seq=ins, reffile=reffile, indel_start=start, indel_end=end)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        print "len(readlist):",str(len(readlist))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid + "\tskipped, too few reads in region: " + str(len(readlist)) + "\n")
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(args.mutfrac) # default minor allele freq if not otherwise specified
        if cnv: # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom,start,end):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" + ' '.join(("copy number in snp region:",chrom,str(start),str(end),"=",str(cn))) + "\n")
                    if float(cn) > 0.0:
                        vaf = 1.0/float(cn)
                    else:
                        vaf = 0.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tadjusted VAF: " + str(vaf) + "\n")
        else:
            sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\tselected VAF: " + str(vaf) + "\n")

        lastread = int(len(readlist)*vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + mutid + "\tforced " + str(lastread) + " reads.\n")
            else:
                print "WARN\t" + now() + "\t" + mutid + "\tdropped site with fewer reads than --minmutreads"
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        print "INFO\t" + now() + "\t" + mutid + "\tpicked: " + str(len(readlist)) + " reads"

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname] # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F' # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S' # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U' # read is unpaired

                    mateqname = ','.join((mate_read.qname,str(mate_read.pos),pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        print "INFO\t" + now() + "\t" + mutid + "\twrote: " + str(wrote) + " reads, mutated: " + str(nmut) + " reads"

        if not hasSNP or args.force:
            outbam_muts.close()
            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts, mutid=mutid, paired=(not args.single), picardjar=args.picardjar)

            outbam_muts = pysam.Samfile(tmpoutbamname,'rb')
            coverwindow = 1
            incover  = countReadCoverage(bamfile,chrom,mutpos-coverwindow,mutpos+del_ln+coverwindow)
            outcover = countReadCoverage(outbam_muts,chrom,mutpos-coverwindow,mutpos+del_ln+coverwindow)

            avgincover  = float(sum(incover))/float(len(incover)) 
            avgoutcover = float(sum(outcover))/float(len(outcover))
            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut)/float(wrote)

            # qc cutoff for final snv depth 
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover/avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                indelstr = ''
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                snvstr = chrom + ":" + str(start) + "-" + str(end) + " (VAF=" + str(vaf) + ")"
                log.write("\t".join(("indel",indelstr,str(mutpos),mutstr,str(avgincover),str(avgoutcover),str(spikein_frac),str(maxfrac)))+"\n")
            else:
                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                    
                print "WARN\t" + now() + "\t" + mutid + "\tdropped for outcover/incover < " + str(args.coverdiff)
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close() 

        return sorted(tmpbams)
        
    except Exception, e:
        sys.stderr.write("*"*60 + "\nencountered error in mutation spikein: " + mutid + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*"*60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
예제 #3
0
def makemut(args, chrom, start, end, vaf, ins, avoid, alignopts):
    ''' is ins is a sequence, it will is inserted at start, otherwise delete from start to end'''

    if args.seed is not None: random.seed(int(args.seed) + int(start))

    mutid = chrom + '_' + str(start) + '_' + str(end) + '_' + str(vaf)
    if ins is None:
        mutid += ':DEL'
    else:
        mutid += ':INS:' + ins

    try:
        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(
            args.bamFileName, 'rb')  # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        is_insertion = ins is not None
        is_deletion = ins is None

        snvfrac = float(args.snvfrac)

        mutstr = get_mutstr(chrom, start, end, ins, reffile)

        del_ln = 0
        if is_deletion:
            del_ln = end - start

        mutpos = start
        mutpos_list = [start]

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        log = open(
            'addindel_logs_' + os.path.basename(args.outBamFile) + '/' +
            os.path.basename(args.outBamFile) + "." + "_".join(
                (chrom, str(start), str(end))) + ".log", 'w')

        tmpoutbamname = args.tmpdir + "/" + mutid + ".tmpbam." + str(
            uuid4()) + ".bam"
        print "INFO\t" + now(
        ) + "\t" + mutid + "\tcreating tmp bam: ", tmpoutbamname  #DEBUG
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(
            args,
            log,
            bamfile,
            bammate,
            chrom,
            mutpos,
            mutpos + del_ln + 1,
            mutpos_list,
            avoid=avoid,
            mutid_list=[mutid],
            is_insertion=is_insertion,
            is_deletion=is_deletion,
            ins_seq=ins,
            reffile=reffile,
            indel_start=start,
            indel_end=end)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        print "len(readlist):", str(len(readlist))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            sys.stderr.write("WARN\t" + now() + "\t" + mutid +
                             "\tskipped, too few reads in region: " +
                             str(len(readlist)) + "\n")
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(
                args.mutfrac
            )  # default minor allele freq if not otherwise specified
        if cnv:  # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom, start, end):
                    cn = float(cnregion.strip().split()
                               [3])  # expect chrom,start,end,CN
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid + "\t" +
                                     ' '.join(("copy number in snp region:",
                                               chrom, str(start), str(end),
                                               "=", str(cn))) + "\n")
                    if float(cn) > 0.0:
                        vaf = 1.0 / float(cn)
                    else:
                        vaf = 0.0
                    sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                                     "\tadjusted VAF: " + str(vaf) + "\n")
        else:
            sys.stdout.write("INFO\t" + now() + "\t" + mutid +
                             "\tselected VAF: " + str(vaf) + "\n")

        lastread = int(len(readlist) * vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + mutid +
                                 "\tforced " + str(lastread) + " reads.\n")
            else:
                print "WARN\t" + now(
                ) + "\t" + mutid + "\tdropped site with fewer reads than --minmutreads"
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        print "INFO\t" + now() + "\t" + mutid + "\tpicked: " + str(
            len(readlist)) + " reads"

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname, read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual  # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname]  # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F'  # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S'  # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U'  # read is unpaired

                    mateqname = ','.join(
                        (mate_read.qname, str(mate_read.pos), pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        print "INFO\t" + now() + "\t" + mutid + "\twrote: " + str(
            wrote) + " reads, mutated: " + str(nmut) + " reads"

        if not hasSNP or args.force:
            outbam_muts.close()
            aligners.remap_bam(args.aligner,
                               tmpoutbamname,
                               args.refFasta,
                               alignopts,
                               mutid=mutid,
                               paired=(not args.single),
                               picardjar=args.picardjar,
                               insane=args.insane)

            outbam_muts = pysam.Samfile(tmpoutbamname, 'rb')
            coverwindow = 1
            incover = countReadCoverage(bamfile, chrom, mutpos - coverwindow,
                                        mutpos + del_ln + coverwindow)
            outcover = countReadCoverage(outbam_muts, chrom,
                                         mutpos - coverwindow,
                                         mutpos + del_ln + coverwindow)

            avgincover = float(sum(incover)) / float(len(incover))
            avgoutcover = float(sum(outcover)) / float(len(outcover))
            spikein_frac = 0.0
            if wrote > 0:
                spikein_frac = float(nmut) / float(wrote)

            # qc cutoff for final snv depth
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover / avgincover
                    >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                indelstr = ''
                if is_insertion:
                    indelstr = ':'.join(('INS', chrom, str(start), ins))
                else:
                    indelstr = ':'.join(('DEL', chrom, str(start), str(end)))

                snvstr = chrom + ":" + str(start) + "-" + str(
                    end) + " (VAF=" + str(vaf) + ")"
                log.write("\t".join(("indel", indelstr, str(mutpos), mutstr,
                                     str(avgincover), str(avgoutcover),
                                     str(spikein_frac), str(maxfrac))) + "\n")
            else:
                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')

                print "WARN\t" + now(
                ) + "\t" + mutid + "\tdropped for outcover/incover < " + str(
                    args.coverdiff)
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close()

        return sorted(tmpbams)

    except Exception, e:
        sys.stderr.write("*" * 60 +
                         "\nencountered error in mutation spikein: " + mutid +
                         "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*" * 60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None
예제 #4
0
def makemut(args, hc, avoid, alignopts):


    mutid_list = []
    for site in hc:
        mutid_list.append(site['chrom'] + '_' + str(site['start']) + '_' + str(site['end']) + '_' + str(site['vaf']) + '_' + str(site['altbase']))

    try:
        if args.seed is not None: random.seed(int(args.seed) + int(hc[0]['start']))

        bamfile = pysam.Samfile(args.bamFileName, 'rb')
        bammate = pysam.Samfile(args.bamFileName, 'rb') # use for mates to avoid iterator problems
        reffile = pysam.Fastafile(args.refFasta)
        tmpbams = []

        #snvfrac = float(args.snvfrac)

        chrom = None
        vaf   = None

        mutpos_list = []
        altbase_list = []
        
        for site in hc:
            if chrom is None:
                chrom = site['chrom']
            else:
                assert chrom == site['chrom'], "haplotype clusters cannot span multiple chromosomes!"

            if vaf is None:
                vaf = site['vaf']
                
            elif vaf != site['vaf']:
                sys.stderr.write("WARN\t" + now() + "\tmultiple VAFs for single haplotype, using first encountered VAF: " + str(vaf) + "\n")

            mutpos = int(random.uniform(site['start'],site['end']+1)) # position of mutation in genome
            mutpos_list.append(mutpos) # FIXME
            altbase_list.append(site['altbase'])

        mutbase_list = []
        refbase_list = []
        mutstr_list  = []

        for n, mutpos in enumerate(mutpos_list):
            refbase = reffile.fetch(chrom,mutpos-1,mutpos)
            altbase = altbase_list[n]
            refbase_list.append(refbase)

            if altbase == refbase.upper() and not args.ignoreref:
                sys.stderr.write("WARN\t" + now() + "\t" + mutid_list[n] + "\tspecified ALT base matches reference, skipping mutation\n")
                return None

            try:
                mutbase = mut(refbase, altbase)
                mutbase_list.append(mutbase)

            except ValueError as e:
                sys.stderr.write("WARN\t" + now() + "\t" + mutid_list[n] + "\t" + ' '.join(("skipped site:",chrom,str(hc[n]['start']),str(hc[n]['end']),"due to N base:",str(e),"\n")))
                return None

            mutstr_list.append(refbase + "-->" + str(mutbase))

        # optional CNV file
        cnv = None
        if (args.cnvfile):
            cnv = pysam.Tabixfile(args.cnvfile, 'r')

        hapstr = "_".join(('haplo',chrom,str(min(mutpos_list)),str(max(mutpos_list))))
        log = open('addsnv_logs_' + os.path.basename(args.outBamFile) + '/' + os.path.basename(args.outBamFile) + "." + hapstr + ".log",'w')

        tmpoutbamname = args.tmpdir + "/" + hapstr + ".tmpbam." + str(uuid4()) + ".bam"
        print "INFO\t" + now() + "\t" + hapstr + "\tcreating tmp bam: ",tmpoutbamname
        outbam_muts = pysam.Samfile(tmpoutbamname, 'wb', template=bamfile)

        mutfail, hasSNP, maxfrac, outreads, mutreads, mutmates = mutation.mutate(args, log, bamfile, bammate, chrom, min(mutpos_list), max(mutpos_list)+1, mutpos_list, avoid=avoid, mutid_list=mutid_list, is_snv=True, mutbase_list=mutbase_list, reffile=reffile)

        if mutfail:
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        # pick reads to change
        readlist = []
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                readlist.append(extqname)

        print "INFO\t" + now() + "\t" + hapstr + "\tlen(readlist): " + str(len(readlist))
        readlist.sort()
        random.shuffle(readlist)

        if len(readlist) < int(args.mindepth):
            print "WARN\t" + now() + "\t" + hapstr + "\ttoo few reads in region (" + str(len(readlist)) + ") skipping..."
            outbam_muts.close()
            os.remove(tmpoutbamname)
            return None

        if vaf is None:
            vaf = float(args.mutfrac) # default minor allele freq if not otherwise specified
        if cnv: # cnv file is present
            if chrom in cnv.contigs:
                for cnregion in cnv.fetch(chrom,min(mutpos_list),max(mutpos_list)+1):
                    cn = float(cnregion.strip().split()[3]) # expect chrom,start,end,CN
                    print "INFO\t" + now() + "\t" + hapstr + "\t" + ' '.join(("copy number in snp region:",chrom,str(min(mutpos_list)),str(max(mutpos_list)),"=",str(cn))) + "\n"
                    if float(cn) > 0.0:
                        vaf = 1.0/float(cn)
                    else:
                        vaf = 0.0
                    print "adjusted VAF: " + str(vaf) + "\n"
        else:
            print "INFO\t" + now() + "\t" + hapstr + "\tselected VAF: " + str(vaf) + "\n"

        lastread = int(len(readlist)*vaf)

        # pick at least args.minmutreads if possible
        if lastread < int(args.minmutreads):
            if len(readlist) > int(args.minmutreads):
                lastread = int(args.minmutreads)
                sys.stdout.write("WARN\t" + now() + "\t" + hapstr + "\tforced " + str(lastread) + " reads.\n")
            else:
                print "WARN\t" + now() + "\t" + hapstr + "\tdropped site with fewer reads than --minmutreads"
                os.remove(tmpoutbamname)
                return None

        readtrack = dd(list)

        for readname in readlist:
            orig_name, readpos, pairend = readname.split(',')
            readtrack[orig_name].append('%s,%s' % (readpos, pairend))

        usedreads = 0
        newreadlist = []

        for orig_name in readtrack:
            for read_instance in readtrack[orig_name]:
                newreadlist.append(orig_name + ',' + read_instance)
                usedreads += 1

            if usedreads >= lastread:
                break

        readlist = newreadlist

        print "INFO\t" + now() + "\t" + hapstr + "\tpicked:",str(len(readlist))

        wrote = 0
        nmut = 0
        mut_out = {}
        # change reads from .bam to mutated sequences
        for extqname,read in outreads.iteritems():
            if read.seq != mutreads[extqname]:
                if not args.nomut and extqname in readlist:
                    qual = read.qual # changing seq resets qual (see pysam API docs)
                    read.seq = mutreads[extqname] # make mutation
                    read.qual = qual
                    nmut += 1
            if not hasSNP or args.force:
                wrote += 1
                mut_out[extqname] = read

        muts_written = {}

        for extqname in mut_out:
            if extqname not in muts_written:
                outbam_muts.write(mut_out[extqname])
                muts_written[extqname] = True

                if mutmates[extqname] is not None:
                    # is mate also in mutated list?
                    mate_read = mutmates[extqname]

                    pairname = 'F' # read is first in pair
                    if mate_read.is_read2:
                        pairname = 'S' # read is second in pair
                    if not mate_read.is_paired:
                        pairname = 'U' # read is unpaired

                    mateqname = ','.join((mate_read.qname,str(mate_read.pos),pairname))

                    if mateqname in mut_out:
                        # yes: output mutated mate
                        outbam_muts.write(mut_out[mateqname])
                        muts_written[mateqname] = True

                    else:
                        # no: output original mate
                        outbam_muts.write(mate_read)

        print "INFO\t" + now() + "\t" + hapstr + "\twrote: ",wrote,"mutated:",nmut

        if not hasSNP or args.force:
            outbam_muts.close()

            aligners.remap_bam(args.aligner, tmpoutbamname, args.refFasta, alignopts, mutid=hapstr, paired=(not args.single), picardjar=args.picardjar)

            outbam_muts = pysam.Samfile(tmpoutbamname,'rb')
            coverwindow = 1
            incover  = countReadCoverage(bamfile,chrom,min(mutpos_list)-coverwindow,max(mutpos_list)+coverwindow)
            outcover = countReadCoverage(outbam_muts,chrom,min(mutpos_list)-coverwindow,max(mutpos_list)+coverwindow)

            avgincover  = float(sum(incover))/float(len(incover)) 
            avgoutcover = float(sum(outcover))/float(len(outcover))

            print "INFO\t" + now() + "\t" + hapstr + "\tavgincover: " + str(avgincover) + " avgoutcover: " + str(avgoutcover)

            spikein_snvfrac = 0.0
            if wrote > 0:
                spikein_snvfrac = float(nmut)/float(wrote)

            # qc cutoff for final snv depth 
            if (avgoutcover > 0 and avgincover > 0 and avgoutcover/avgincover >= float(args.coverdiff)) or args.force:
                tmpbams.append(tmpoutbamname)
                for n,site in enumerate(hc):
                    snvstr = chrom + ":" + str(site['start']) + "-" + str(site['end']) + " (VAF=" + str(vaf) + ")"
                    log.write("\t".join(("snv",snvstr,str(mutpos_list[n]),mutstr_list[n],str(avgoutcover),str(avgoutcover),str(spikein_snvfrac),str(maxfrac)))+"\n")
            else:

                outbam_muts.close()
                os.remove(tmpoutbamname)
                if os.path.exists(tmpoutbamname + '.bai'):
                    os.remove(tmpoutbamname + '.bai')
                print "WARN\t" + now() + "\t" + hapstr + "\tdropped for outcover/incover < " + str(args.coverdiff)
                return None

        outbam_muts.close()
        bamfile.close()
        bammate.close()
        log.close() 

        return tmpbams

    except Exception, e:
        sys.stderr.write("*"*60 + "\nERROR\t" + now() + "\tencountered error in mutation spikein: " + str(mutid_list) + "\n")
        traceback.print_exc(file=sys.stdout)
        sys.stderr.write("*"*60 + "\n")
        if os.path.exists(tmpoutbamname):
            os.remove(tmpoutbamname)
        if os.path.exists(tmpoutbamname + '.bai'):
            os.remove(tmpoutbamname + '.bai')
        return None