Example #1
0
def bcfmpileup(flist, ref, bcname, regionrestrict, diploid, q, threads, cmdfile, logfile):
    """ Call genotypes using BCFTools.

    Builds a single ``bcftools mpileup | bcftools call`` pipeline over every
    ``<sample>.bam`` in ``flist`` and writes ``<bcname>-samples.vcf.gz``.

    :param flist: File list (sample names without the ``.bam`` extension).
    :param ref: Reference genome.
    :param bcname: Base name of input file; prefix of the output VCF.
    :param regionrestrict: Region passed to ``bcftools mpileup -r``; skipped when empty/None.
    :param diploid: When false, ``--ploidy 1`` is appended to the command.
    :param q: Value passed to ``bcftools mpileup -q`` (string or number).
    :param threads: Value passed to ``--threads`` of both tools (string or number).
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF.
    """
    print("\nCreating mpileup consensus using BCFTools and writing as a VCF...")

    # Accept numeric as well as string values; plain concatenation would raise
    # TypeError for ints.
    threads = str(threads)
    outname = bcname + "-samples.vcf.gz"

    parts = ["bcftools mpileup --threads " + threads + " -C 50 -d 8000 -Ov -f " + ref + " -q " + str(q) + " "]
    parts.extend(sample + ".bam " for sample in flist)
    if regionrestrict:
        parts.append(" -r " + regionrestrict)
    parts.append(" | bcftools call --threads " + threads + " -Oz -m -o " + outname + " - ")
    if not diploid:
        parts.append(" --ploidy 1 ")

    # The second argument is the slot other callers in this file fill with their
    # ``verbose`` flag; it is fixed to False here.
    upa_util.bash_command("".join(parts), False, cmdfile, logfile)
    return outname
Example #2
0
def haplogrep_gen_hsd(flist, ref, bcname, regdic, cmdfile, logfile):
    """ Call haploid SNP variants for all samples with BCFtools mpileup/call.

    Runs one ``bcftools mpileup | bcftools call`` pipeline over every
    ``<sample>.bam`` and writes the result to ``<bcname>-4hgrp.vcf``.

    :param flist: File list (sample names without the ``.bam`` extension).
    :param ref: Reference genome.
    :param bcname: Base name of input file; prefix of the output VCF.
    :param regdic: Region dictionary; not inspected here.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: ``regdic``, handed back unchanged.
    """
    nsamples = len(flist)

    print("\nCreating mpileup consensus and writing as a VCF...")
    progress = progressbar.ProgressBar()
    bamnames = [flist[idx] + ".bam" for idx in progress(range(nsamples))]

    pileup = "bcftools mpileup -I -d 8000 -Ov -f " + ref + " " + "".join(name + " " for name in bamnames)
    caller = "| bcftools call -V indels --ploidy 1 -Ov -m -v -o " + bcname + "-4hgrp.vcf"

    upa_util.bash_command(pileup + caller, False, cmdfile, logfile)
    return regdic
Example #3
0
def addreadgroup(flist, binloc, verbose, cmdfile, logfile):
    """ Adds the read group information by using Picard

    For each sample, Picard ``AddOrReplaceReadGroups`` writes
    ``<sample>.intermediate.bam``; the original BAM is then kept as
    ``<sample>.norg.bam`` and the new file takes over the ``<sample>.bam`` name.

    :param flist: File list (sample names without the ``.bam`` extension).
    :param binloc: Prefix prepended directly to ``picard.jar`` (no separator is added).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return:
    """
    for sample in flist:
        bam = sample + ".bam"
        tmpbam = sample + ".intermediate.bam"
        picard_args = [
            "java -jar " + binloc + "picard.jar AddOrReplaceReadGroups",
            "I=" + bam,
            "O=" + tmpbam,
            "RGID=4",
            "RGLB=" + sample,
            "RGPL=illumina",
            "RGPU=BerkeleyHiSeq",
            "RGSM=" + upa_util.name_strip(sample),
            "VALIDATION_STRINGENCY=LENIENT",
        ]
        upa_util.bash_command(" ".join(picard_args), verbose, cmdfile, logfile)

        # Keep the untouched input as a backup, then promote the new BAM.
        shutil.move(bam, sample + ".norg.bam")
        shutil.move(tmpbam, bam)
Example #4
0
def stripchr(flist, verbose, cmdfile, logfile):
    """ Strips 'chr' from BAM files

    The header of each ``<sample>.bam`` is piped through one ``sed`` stage per
    contig name and re-applied with ``samtools reheader``. The original BAM is
    kept as ``<sample>.wchr.bam`` and the reheadered file takes over the
    ``<sample>.bam`` name.

    :param flist: File list (sample names without the ``.bam`` extension).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return:
    """
    print("\nStripping chr from chromosome names...")

    # chr1..chr22, chrX and chrY simply lose the prefix; chrM is renamed to MT.
    renames = [(str(num), str(num)) for num in range(1, 23)]
    renames += [("X", "X"), ("Y", "Y"), ("M", "MT")]
    sedchain = "".join(" | sed -e 's/SN:chr" + old + "/SN:" + new + "/'" for old, new in renames)

    for sample in flist:
        bam = sample + ".bam"
        tmpbam = sample + ".intermediate.bam"
        reheadercmd = "samtools view -H " + bam + sedchain + " | samtools reheader - " + bam + " > " + tmpbam
        upa_util.bash_command(reheadercmd, verbose, cmdfile, logfile)

        # Keep the untouched input as a backup, then promote the new BAM.
        shutil.move(bam, sample + ".wchr.bam")
        shutil.move(tmpbam, bam)
Example #5
0
def gen_reg_line(sample, mindepth, maxgap, cmdfile, logfile):
    """ Generate line describing regions covered by sample

    Runs ``samtools depth`` on ``sample`` and collapses the per-position output
    into a string of ``start-end;`` ranges.

    :param sample: Alignment file name, appended verbatim to ``samtools depth``.
    :param mindepth: Minimum depth of coverage to include.
    :param maxgap: Maximum gap size to allow a region to continue.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Line describing genomic regions covered by sample ("" when nothing
        reaches ``mindepth``).
    """
    depthinfo = upa_util.bash_command("samtools depth " + sample, False, cmdfile, logfile)
    return _depth_to_regline(depthinfo, mindepth, maxgap)


def _depth_to_regline(depthinfo, mindepth, maxgap):
    """ Collapse tab-separated ``name, position, depth`` lines into ``start-end;`` ranges.

    A region is a run of positions whose depth is >= ``mindepth`` and whose
    consecutive reported positions are at most ``maxgap`` apart. A reported
    position below ``mindepth`` always ends the open region.
    """
    # NOTE(review): the reference-name column is ignored, so this assumes the
    # depth output covers a single reference sequence - confirm with callers.
    ranges = []
    start = None  # first position of the open region; None while no region is open
    end = None    # last accepted position of the open region

    for dline in depthinfo.split("\n"):
        dcols = dline.split("\t")
        if len(dcols) < 3:
            continue  # blank or malformed line
        pos = int(dcols[1])
        deep_enough = int(dcols[2]) >= mindepth

        # Close the open region on a depth drop or on a jump larger than maxgap.
        if start is not None and (not deep_enough or pos > end + maxgap):
            ranges.append(str(start) + "-" + str(end) + ";")
            start = None

        # Only positions that meet the threshold may open or extend a region.
        if deep_enough:
            if start is None:
                start = pos
            end = pos

    # Emit the region still open when the input ends.
    if start is not None:
        ranges.append(str(start) + "-" + str(end) + ";")
    return "".join(ranges)
Example #6
0
def genocaller(flist, bedfile, bcname, indent, ref, regionrestrict, threads, verbose, cmdfile, logfile):
    """ Calls genotypes using Krishna Veeramah's GenoCaller_indent

    Each sample is called separately, its VCF is BGZF-compressed and indexed,
    and all per-sample VCFs are merged into ``<bcname>-samples.vcf.gz``.

    :param flist: File list (sample names without the ``.bam`` extension).
    :param bedfile: UCSC-style BED file.
    :param bcname: Base name of input file; prefix of the merged VCF.
    :param indent: Indent depth to each end of read (string or number).
    :param ref: Reference genome.
    :param regionrestrict: Area of genome to limit calling; skipped when empty/None.
    :param threads: Number of multiprocessing threads to use (string or number).
    :param verbose: Verbose output to log.
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return: Name of merged sample VCF.
    """
    print("\nGenoCaller...")

    # Accept numeric as well as string values; plain concatenation would raise
    # TypeError for ints.
    indent = str(indent)
    threads = str(threads)
    mergedname = bcname + "-samples.vcf.gz"

    samplevcfnames = []
    for sample in flist:
        gccmd = "GenoCaller_indent.py " + sample + ".bam " + bedfile + " " + ref + " " + indent
        upa_util.bash_command(gccmd, verbose, cmdfile, logfile)

        # Must compress to allow bcftools to merge
        rawvcfname = sample + "." + bedfile + ".indent" + indent + ".vcf"
        samplevcfname = rawvcfname + ".gz"
        with open(rawvcfname, 'r') as f_in, bgzf.open(samplevcfname, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

        if os.path.isfile(samplevcfname):
            upa_util.vcf_name_strip(samplevcfname)
            upa_util.bash_command("bcftools index --threads " + threads + " " + samplevcfname, verbose, cmdfile, logfile)
            samplevcfnames.append(samplevcfname)
        else:
            print("ERROR: Cannot find " + samplevcfname)

    # Merge the resulting VCFs together using bcftools
    bcfmergecmd = "bcftools merge --threads " + threads + " -Oz -o " + mergedname + " "
    if regionrestrict:
        # The trailing space keeps the region from fusing with the first VCF name.
        bcfmergecmd = bcfmergecmd + " -r " + regionrestrict + " "
    bcfmergecmd = bcfmergecmd + "".join(name + " " for name in samplevcfnames)
    upa_util.bash_command(bcfmergecmd, verbose, cmdfile, logfile)

    # Return the file bcftools actually wrote (-Oz output, ".vcf.gz").
    return mergedname
Example #7
0
def run_ad():
    """ Invokes ADMIXTURE software for the user-specified number of reps, between chosen low and high K.

    Reads these module-level names: ``file``, ``filebase``, ``rdir``,
    ``bestdir``, ``lowk``, ``hik``, ``reps``, ``threads``, ``rng``,
    ``termcrit``, ``optmethod``, ``verbose``, ``cmdfile``, ``logfile`` and
    ``start_time``.

    Files written: ``<filebase>/<filebase>.ind2pop``; per-replicate ``.log``,
    ``.P`` and ``.Q`` files and ``pong_filemap`` under ``rdir``;
    ``<filebase>-cvout.csv`` under ``rdir``; and ``bests.csv`` plus the best
    replicate's ``.P``/``.Q``/``.log`` for each K under ``bestdir``. The best
    replicate for a K is the one with the highest log-likelihood, compared
    numerically.

    :return:
    """
    print("\nRunning Admixture...")

    # ind2pop holds the first column of the .fam file, one entry per line.
    famfilename = file + ".h.fam"
    ind2popname = filebase + "/" + filebase + ".ind2pop"
    with open(famfilename, 'r') as famfile, open(ind2popname, 'w') as ind2pop:
        for famline in famfile:
            ind2pop.write(famline.split()[0])
            ind2pop.write("\n")

    kcvss = []
    kbests = {}

    with open(rdir + "pong_filemap", 'w') as pongfile:
        for k in xrange(lowk, (hik + 1)):
            print("\n*********** K:" + str(k))
            kreplist = []
            kcvs = []
            logls = []
            bar = progressbar.ProgressBar()
            # NOTE(review): this runs reps + 1 replicates, while the best-run
            # selection and the CV table below only look at the first `reps`.
            # Left as-is because it is unclear which count is intended.
            for j in bar(xrange(0, reps + 1)):
                kbase = filebase + ".h." + str(k)
                runbase = kbase + ".r" + str(j)
                stdoutfilename = rdir + runbase + ".log"

                with open(stdoutfilename, 'w') as stdoutfile:
                    stdoutfile.write(
                        upa_util.bash_command(
                            "admixture --cv " + file + ".h.bed " + str(k) + " -j" +
                            str(threads) + " -s " + str(rng.getrandbits(32)) + " -C " +
                            str(termcrit) + " -m " + optmethod, verbose, cmdfile,
                            logfile))

                # The .P/.Q outputs land in the working directory without a
                # replicate tag; move them aside before the next run.
                shutil.move(kbase + ".P", rdir + runbase + ".P")
                shutil.move(kbase + ".Q", rdir + runbase + ".Q")

                grepline = upa_util.bash_command("grep -h CV " + stdoutfilename, True, cmdfile, logfile)
                cverror = float(grepline.split()[3])
                kcvs.append(cverror)
                jreplist = [cverror, runbase]

                # NOTE(review): one entry is appended per "Loglikelihood:" line;
                # logls[i] only lines up with replicate i if each log has
                # exactly one such line - confirm against ADMIXTURE output.
                with open(stdoutfilename, 'r') as stdreadout:
                    for stdoutline in stdreadout:
                        if stdoutline.startswith("Loglikelihood:"):
                            logltext = stdoutline.split()[1]
                            logls.append(float(logltext))  # numeric, for comparison
                            jreplist.append(logltext)      # original text, for bests.csv
                jreplist.append(str(j))
                kreplist.append(jreplist)

                pongfile.write("run" + str(j) + "_K" + str(k))
                pongfile.write("\t")
                pongfile.write(str(k))
                pongfile.write("\t")
                pongfile.write(runbase + ".Q")
                pongfile.write("\n")

            # Best replicate = highest log-likelihood; ties keep the earliest.
            llbest = 0
            for i in xrange(reps):
                if logls[i] > logls[llbest]:
                    llbest = i

            print("For K = " + str(k) + " best index: " + str(llbest))
            bestbase = rdir + filebase + ".h." + str(k) + ".r" + str(llbest)
            shutil.copy(bestbase + ".P", bestdir)
            shutil.copy(bestbase + ".Q", bestdir)
            shutil.move(bestbase + ".log", bestdir)

            kbests[k] = kreplist[llbest]
            kcvss.append(kcvs)

    # CV table: one column per K, one row per replicate.
    cvfileoutname = rdir + filebase + "-cvout.csv"
    with open(cvfileoutname, 'w') as cvfileout:
        for k in xrange(lowk, (hik + 1)):
            cvfileout.write(str(k))
            cvfileout.write(",")
        cvfileout.write("\n")
        for j in xrange(reps):
            for i in xrange(len(kcvss)):
                cvfileout.write(str(kcvss[i][j]))
                cvfileout.write(",")
            cvfileout.write("\n")
    upa_util.bash_command("Rscript cvsplot.R " + cvfileoutname, verbose,
                          cmdfile, logfile)

    bestname = bestdir + "/bests.csv"
    with open(bestname, 'w') as bestout:
        bestout.write("K,CV,filebase,logL, Rep\n")
        for k, l in kbests.iteritems():
            bestout.write(str(k))
            bestout.write(",")
            for i in l:
                bestout.write(str(i))
                bestout.write(",")
            bestout.write("\n")

    pongcmd = "pong -f -m " + rdir + "pong_filemap -i " + ind2popname
    print("\n\n\n\n Copy and paste the following to run pong:\n")
    print(pongcmd)
    print("\nYou will need a separate terminal logged in with ssh -X to visualize.")
    print("On that terminal type :")
    print("\nfirefox &\n")
    print("And go to http://localhost:4000")
    print("\nAlternatively, you can simply download the folder " + filebase + " to your own machine")
    print("and run pong there. See http://brown.edu/Research/Ramachandran_Lab/projects/")

    elapsed_time = time.time() - start_time

    print("Number threads: " + str(threads) + " Elapsed time: " + str(
        elapsed_time))
Example #8
0
def converttohaploid():
    """ Convert a tped file to haploid

    Recodes the ``file`` bed/bim/fam set to transposed PED with plink, then
    for every marker draws one of the two alleles of each genotype pair at
    random and writes it twice. When the module-level ``tvonly`` flag is set,
    markers showing both A and G, or both C and T, are dropped. The result is
    written to ``<file>.h.tped`` / ``<file>.h.tfam`` and converted back to bed
    format with plink.

    Reads the module-level names ``file``, ``tvonly``, ``verbose``,
    ``cmdfile`` and ``logfile``.

    :return:
    """
    print("\nConverting to tped format...")
    recodecmd = ("plink --bed " + file + ".bed --bim " + file + ".bim --fam " + file +
                 ".fam  --alleleACGT --recode transpose --out " + file)
    upa_util.bash_command(recodecmd, verbose, cmdfile, logfile)

    print("\nConverting to haploid...")
    with open(file + ".tped", 'r') as tpedfile:
        tpedlines = tpedfile.readlines()

    allele_pairs = (('A', 'G'), ('C', 'T'))
    haploidlines = []

    progress = progressbar.ProgressBar()
    for idx in progress(range(len(tpedlines))):
        cols = tpedlines[idx].rstrip().split()
        genotypes = cols[4:]

        # Alleles observed at this marker, ignoring the '0' code.
        observed = set(genotypes)
        observed.discard('0')
        if tvonly and any(a in observed and b in observed for a, b in allele_pairs):
            continue

        # First four columns are copied through; each genotype pair becomes
        # one randomly drawn allele written twice.
        fields = list(cols[0:4])
        for j in xrange(0, len(genotypes), 2):
            drawn = random.choice([genotypes[j], genotypes[(j + 1)]])
            fields.extend([drawn, drawn])
        fields.append("\n")
        haploidlines.append(' '.join(fields))

    with open(file + ".h.tped", 'w') as toutfile:
        toutfile.writelines(haploidlines)

    shutil.copy(file + ".fam", file + ".h.tfam")

    print("\nConverting to bed format...")
    makebedcmd = "plink --tfile " + file + ".h --make-bed --out " + file + ".h"
    upa_util.bash_command(makebedcmd, True, cmdfile, logfile)
Example #9
0
def haplogrep_java(invcf, scriptsloc, cmdfile, logfile):
    """ Run the local Haplogrep 2.1.1 jar on a VCF file.

    The output is written next to the input as ``<invcf without extension>.hsd``.

    :param invcf: Input VCF file.
    :param scriptsloc: Prefix prepended directly to ``haplogrep-2.1.1.jar`` (no separator is added).
    :param cmdfile: File storing external commands invoked.
    :param logfile: Output log.
    :return:
    """
    filebase = os.path.splitext(invcf)[0]

    hgcmd = " ".join([
        "java -jar " + scriptsloc + "haplogrep-2.1.1.jar",
        "--format vcf",
        "--in " + invcf,
        "--out " + filebase + ".hsd",
        "--phylotree 17",
    ])
    upa_util.bash_command(hgcmd, False, cmdfile, logfile)

    # Not used by the live code; only the disabled draft below refers to it.
    firstfile = filebase + "-FIRST.hsd"
    # upa_util.bash_command("java -jar " + scriptsloc + "haplogrep-2.1.1.jar --format vcf --in " + invcf + " --out " + firstfile + " --phylotree 17", False, cmdfile, logfile)

    # # then edit that HSD file and do it again
    # hsdoutlines = []
    # hsdfirstfile = open(firstfile, 'r')
    # for hsdline in hsdfirstfile:
    #
    #     hsdnewline = ""
    #     hsdcols = hsdline.strip().split("\t")
    #     if hsdcols[0] == "SampleID":
    #         hsdheadline = hsdline.strip()
    #     else:
    #         if len(regdic[hsdcols[0]]) > 1:  # LEAVE OUT indivs for whom where are no viable regions
    #
    #
    #
    #             for i in range(len(hsdcols)):
    #                 if i == 0 or i == 5:
    #
    #                     variantcols = hsdcols[i].split(";")
    #                     for variant in variantcols:
    #                         varvar = variant.strip()
    #                         print varvar
    #
    #                     hsdnewline = hsdnewline + hsdcols[i] + "\t"
    #                 elif i == 1:
    #                     hsdnewline = hsdnewline + regdic[hsdcols[0]] + "\t"
    #                 elif i == 2:
    #                     hsdnewline = hsdnewline + "?\t"
    #                 else:
    #                     pass
    #             hsdoutlines.append(hsdnewline)
    # hsdfirstfile.close()
    #
    # secondfile = filebase + "-SECOND.hsd"
    #
    # hsdsecondfile = open(secondfile, 'w')
    # hsdsecondfile.write("SampleID\tRange\tHaplogroup\tPolymorphisms")
    # hsdsecondfile.write("\n")
    # for hsdoutline in hsdoutlines:
    #     hsdoutcols = hsdoutline.split("\t")
    #     if len(hsdoutcols[3]) > 0:  # LEAVE OUT indivs for whom where are no viable regions
    #         hsdsecondfile.write(hsdoutline)
    #         hsdsecondfile.write("\n")
    # hsdsecondfile.close()
    #
    # finalfile = filebase + "-FINAL.hsd"
    #
    # upa_util.bash_command("java -jar /data/scripts/haplogrep-2.1.1.jar --format hsd --in " + secondfile + " --out " + finalfile + " --phylotree 17", False, cmdfile, logfile)
Example #10
0
File: upa.py Project: mjobin/UPA
    # -----------------------------

    print "\nProcessing input files..."
    if bampreprocess:
        if stripchr:
            print("\nStripping chr")
            upa_input.stripchr(flist, verbose, cmdfile, logfile)
        if addreadgroup:
            print("\nAdding read group...")
            upa_input.addreadgroup(flist, binloc, verbose, cmdfile, logfile)
        if samindex:
            print "\nIndexing..."
            bar = progressbar.ProgressBar()
            for i in bar(range(flength)):
                sample = flist[i]
                upa_util.bash_command("samtools index " + sample + ".bam",
                                      verbose, cmdfile, logfile)

    if vcf_file:
        if os.path.isfile(vcf_file):
            samplevcffile = vcf_file  #User submitting a VCF file
        else:
            print "ERROR: Cannot find " + vcf_file
            exit(1)
    elif callmethod == 'bcf':
        samplevcffile = upa_input.bcfmpileup(flist, ref, bcname,
                                             regionrestrict, diploid, q,
                                             threads, cmdfile, logfile)
    elif callmethod == 'genocaller':
        samplevcffile = upa_input.genocaller(flist, gcbedfile, bcname,
                                             gcindent, ref, regionrestrict,
                                             threads, verbose, cmdfile,