Example #1
def main():
    parser = OptionParser(prog="pyRAD", usage="%prog [options]", version="%prog 3.0.6")
    parser.add_option('-p', action="store", type="string", dest="params",
                      help="input file for within sample filtering and clustering\n")
    parser.add_option('-s', action="store", dest="steps",
                      help="""perform step-wise parts of within analysis\n
                      1 = barcode sorting                        \
                      2 = filter/edit raw sequences              \
                      3 = within-sample clustering               \
                      4 = estimate pi and e                      \
                      5 = consensus calling                      \
                      6 = cluster consensus                      \
                      7 = align & create output files """ )
    parser.add_option('-d', action="store", type="string", dest="dtest",
                      help="""input file for D-test of introgression,
                              can iterate over multiple samples """ )
    parser.add_option('-n', action="store_true", dest="newparamsfile",
                      help="""creates a new empty input params.txt file """ )
    parser.add_option('-D', action="store_true", dest="newDtestfile",
                      help="""creates a new empty Dtest input file """ )


    (options, args) = parser.parse_args()

    if not any([options.params,options.dtest,options.newparamsfile,options.newDtestfile]):
        print "\n\tmust include option of -p, -d, -D or -n\n"
        sys.exit()

    if options.params:
        sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\
                         ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\
                         ' '*5+'---'*20+'\n\n')
        
        readin = [line.strip().split('##')[0].strip() for line in open(options.params).readlines()]
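        ## params.txt is read positionally: anything after "##" on a line is
        ## stripped as a comment, and values are pulled by line index below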
        if "==** " not in str(readin[0]):
            print "\n\twarning: update params input file format to latest version\n"; sys.exit()

        WORK     = str(readin[1])
        GLOB     = str(readin[2])
        Bcode    = str(readin[3])
        vsearch  = str(readin[4])
        muscle   = str(readin[5])
        CUT      = str(readin[6])  
        parallel = int(readin[7])
        mindepth = int(readin[8])
        pN       = str(readin[9])    
        wclust   = str(readin[10])   
        datatype = str(readin[11])   
        minsamp  = int(readin[12])
        maxpoly  = str(readin[13])
        outname  = str(readin[14])
        ###########################
        ## 15 is separator line
        ###########################
        subset   = str(readin[16])
        outgroup = str(readin[17])
        exclude  = str(readin[18])
        Floc     = str(readin[19])
        try: maxmismatch = int(readin[20])
        except (ValueError,IndexError): maxmismatch = 1
        try: Q = int(readin[21])
        except (ValueError,IndexError): Q = 33
        try: strict     = int(readin[22])
        except (ValueError, IndexError): strict = 0
        try: E,H      = str(readin[23]).strip().split(",")
        except ValueError: E = ""; H = ""
        try: maxN     = int(readin[24])
        except ValueError: maxN = 5
        try: maxH     = int(readin[25])
        except ValueError: maxH = 5
        try: haplos   = int(readin[26])
        except ValueError: haplos = 2
        maxSNP   = str(readin[27])
        if maxSNP == "": maxSNP = "99"
        max_inserts = str(readin[28])
        if max_inserts == "": max_inserts = "3"
        try: seed     = int(readin[29])
        except ValueError: seed = 112233
        try: overhang    = [int(i) for i in str(readin[30]).strip().split(',')]
        except (ValueError,IndexError): overhang = [0,0]
        try: outform   = str(readin[31])
        except (ValueError,IndexError): outform = ""
        try: lowcounts   = int(readin[32])
        except (ValueError, IndexError): lowcounts = mindepth
        ##mergepairs = str(readin[31])
        ##if mergepairs in [0,""]: mergepairs = 0
        try: trimkeep = int(readin[33])
        except ValueError: trimkeep = 0
        try: maxstack = int(readin[34])  
        except ValueError: maxstack = "2SD"
        try: minuniq = int(readin[35])  
        except ValueError: minuniq = 0
        try: hierarch = int(readin[36])  
        except ValueError: hierarch = 0
        try: MASK = int(readin[37])
        except ValueError: MASK = 1   ## default to dust masking
        if MASK == 1: MASK = 'dust'
        else: MASK = 'none'
        try: threads = int(readin[38])
        except ValueError: threads = 6
        ###############################
        ## 39 is separator line
        ###############################
        clustprefix = [i for i in readin[40:] if i]
        

        """ expand ./ ~ and ../ designators in location names """
        def expander(namepath):
            if "~" in namepath:
                namepath = namepath.replace("~",os.path.expanduser("~"))
            if "../" in namepath:
                a,b = namepath.split("../")
                namepath = os.path.abspath(os.path.join(os.path.dirname( "" ), '..', b))
            elif "./" in namepath:
                a,b = namepath.split("./")
                namepath = os.path.abspath("")+"/"+b
            return namepath
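        ## illustrative behavior (hypothetical paths): expander("~/data")
        ## gives "/home/<user>/data", expander("./raw") gives "<cwd>/raw",
        ## and absolute paths pass through unchanged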
            
        if WORK == "":
            WORK = os.path.abspath("")+"/"
        else:
            WORK = expander(WORK) 
        if WORK[-1] != "/":
            WORK = WORK+"/"
        stripped = 0
        if Floc:
            if Floc[0] == "@":
                stripped = 1
                Floc = expander(Floc[1:])
            else:
                Floc = expander(Floc)
        if GLOB:   GLOB = expander(GLOB)
        if Bcode:  Bcode = expander(Bcode)
        if vsearch: vsearch = expander(vsearch)
        if options.dtest: options.dtest = expander(options.dtest)

        """ find location of vsearch (or usearch) and muscle """
        def cmd_exists(cmd):
            return subprocess.call("type " + cmd, shell=True, 
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
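        ## e.g. cmd_exists("muscle") is True only when `type muscle` exits 0
        ## in a shell, i.e. the name (or the full path given in the params
        ## file) resolves to an executable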

        # " check platform: mac v linux "
        # if 'linux' in sys.platform:
        #     vsearch = "vsearch-1.0.3-linux-x86_64"
        # else:
        #     vsearch = "vsearch-1.0.3-mac-x86_64"

        # " find vsearch and muscle in user's lib/"
        # PYRADPATH = os.path.dirname(os.path.realpath(__file__))
        # vsearch = PYRADPATH+"/lib/"+vsearch
        # muscle = PYRADPATH+"/lib/muscle"

        " threads = 1 for usearch"
        if 'vsearch' not in vsearch:
            threads = 1
    
        if not cmd_exists(vsearch):
            print "\tcannot find vsearch (or usearch), edit path in param file"
            sys.exit()
        if not cmd_exists(muscle):
            print "\tcannot find muscle, edit path in input file"
            sys.exit()

        """ expand clustprefix cluster groups """
        gids = []
        groups = []
        minhits = []
        "hierarchical clustering "
        for line in clustprefix:
            gid, hits, inds = line.strip().split()
            gids.append(gid)
            minhits.append(hits)
            if "," in inds:
                thisgroup = []
                ii = inds.split(",")
                for i in ii:
                    if "*" in i:
                        expanded = glob.glob(WORK+"clust"+wclust+"/"+i+".consens*")
                        thisgroup.extend(expanded)
                    else:
                        thisgroup.append(WORK+"clust"+wclust+"/"+i+".consens.gz")
                groups.append(thisgroup)
            else:
                if "*" in inds:
                    expanded = glob.glob(WORK+"clust"+wclust+"/"+inds+".consens*")
                    groups.append(expanded)
                else:
                    groups.append([WORK+"clust"+wclust+"/"+inds+".consens.gz"])
            "TODO check for size=1 "
        if not gids:
            gids = ""


        " step of the analysis "
        k = tuple('1234567')
        if options.steps:
            k = tuple(str(options.steps))

        " check that the data type was entered correctly "
        datopts = ['rad','gbs','ddrad','pairgbs','pairddrad','merged','2brad']
        if datatype not in datopts:
            print "\t datatype argument (line 11) not recognized "
            sys.exit()
        # if datatype == 'merged':
        #     print "specify mergetype in params file, ex: mergeddrad or mergegbs "
        #     sys.exit()

        " parse max_inserts argument "
        w1=3
        w2=6
        a1=a2=99
        if 'pair' in datatype:
            if "," in max_inserts:
                wargs = max_inserts.strip().split(",")
                if len(wargs) == 2:
                    w1 = w2 = int(wargs[0])
                    a1 = a2 = int(wargs[1])
                elif len(wargs) == 4:
                    w1,w2,a1,a2 = map(int, wargs)
                else:
                    print "\n\tmax_inserts parameter not recognized. see documentation"
                    sys.exit()
        else:
            if "," in max_inserts:
                w1,a1 = map(int,max_inserts.split(","))


        #########  Begin analysis  ###################################################
        if '1' in k:
            " expand Barcode file name if necessary "
            if "*" in Bcode:
                try: Bcode = glob.glob(Bcode)[0]
                except IndexError:
                    print "\tcould not find barcodes file ",Bcode,
                    "\n\tcomment out line 3 of params file or edit path to barcodes file"
                    sys.exit()
            if Floc:
                print "\tskipping step 1: line 18 of input file shows seqs already sorted"
            else:
                " if directory as input select all inside"
                if GLOB:
                    if GLOB[-1] == "/":
                        GLOB = GLOB+"*"
                sortandcheck2.main(Bcode,GLOB,CUT,datatype,parallel,maxmismatch,WORK)


        ### step 2 ###################
        if '2' in k:
            if Floc:
                print >>sys.stderr, "\tsorted .fastq from %s being used" % Floc
                if len(glob.glob(Floc))<1:
                    sys.stderr.write("\t... no files found in line 18 location, check required file name formatting\n")
                    sys.exit()
                FQs = Floc
                if stripped:
                    print "\tbarcode & restriction site are already stripped off of sequences"
                    CUT = ""
                    if strict:
                        print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n"
            else:
                " default location "
                FQs = WORK+"fastq/"+subset+"*.fq.gz"

            " if directory as input select all inside"
            if FQs[-1] == "/":
                FQs = FQs+"*"

            " if not paired filter only read 1 "
            if 'pair' not in datatype:  # in ['rad','ddrad','gbs','merged','2brad']:
                editraw_rads.main(parallel, WORK, FQs, CUT,
                                  pN, Q, strict, trimkeep, datatype)

            else:   #elif datatype in ['pairddrad','pairgbs']:
                " check for both CUT sites in pairddrad"
                if datatype == 'pairddrad':
                    if "," not in CUT:
                        print "\n\tyou must enter two restriction sites for pair ddRAD data"
                        sys.exit()
                editraw_pairs.main(parallel, WORK, FQs, CUT, 
                                   pN, Q, strict, trimkeep, datatype)

            #elif "merge" in datatype:
            #    editraw_merges.main(parallel, WORK, FQs, CUT,
            #                       pN, Q, strict, trimkeep)



        ### step 3  ####################
        if '3' in k:
            cluster7dp.main(WORK, parallel, wclust, mindepth,
                            subset, datatype, w1, w2, minuniq,
                            MASK, muscle, vsearch, threads, remake=0)


        ### step 4  ####################
        if '4' in k:
            " if using low depth option still use a reasonable limit for parameter estimates"
            if mindepth < 5:
                tempmindepth = 5
            else:
                tempmindepth = mindepth
            H_err_dp.main(parallel, wclust, tempmindepth, subset,
                          haplos, WORK, CUT, datatype)


        ### step 5  ####################
        if '5' in k:
            if not E:
                try: Pi = open(WORK+"stats/Pi_E_estimate.txt").readlines()
                except IOError: Pi = ""
                if Pi:
                    El = []
                    Hl = []
                    for line in Pi[1:]:
                        try: _,h,e = line.strip().split("\t")
                        except ValueError:
                            continue
                        Hl.append(float(h))
                        El.append(float(e))
                    if len(Hl) == 0:
                        print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt"
                        sys.exit()
                    H = sum(Hl)/len(Hl)
                    E = sum(El)/len(El)
                else:
                    E = 0.001
                    H = 0.01
                    print "\n\tstep 4 values not detected, using E=0.001, H=0.01"
            if 'pair' in datatype:
                " call consensus on each pair separately "
                consens_pairs.main(parallel, float(E), float(H), wclust, mindepth, subset+"*",
                                   maxN, maxH, haplos, CUT, datatype,
                                   lowcounts, strict, WORK, maxstack)
            else:
                " call consensus on single end clusters "
                consensdp.main(parallel, float(E), float(H), wclust, mindepth, subset+"*",
                               maxN, maxH, haplos, CUT, datatype,
                               lowcounts, strict, WORK, maxstack)


        ### step 6  ####################
        if '6' in k:
            if not hierarch:
                gids = ""
                if "," in subset:
                    inlist = [WORK+"clust"+wclust+"/"+i+".consens*" for i in subset.strip().split(",")]
                else:
                    inlist = glob.glob(WORK+"clust"+wclust+"/"+subset+"*.consens*")
                cluster_cons7_shuf.main(vsearch, wclust, datatype, 
                                        outgroup, seed, gids, minhits, 
                                        inlist, WORK, MASK, 0)
                print "\n\tfinished clustering"
            else:
                """ re-expand clustprefix cluster groups in case no -s """
                Hgids = []
                Hgroups = {}
                Hminhits = []
                "hierarchical clustering "
                for line in clustprefix:
                    Hgid, Hhits, Hinds = line.strip().split()
                    Hgids.append(Hgid)
                    Hminhits.append(Hhits)
                    Hgroups[Hgid] = []
                    if "," in Hinds:
                        Hinds = Hinds.split(",")
                        for Hind in Hinds:
                            if "*" in Hind:
                                expanded = glob.glob(WORK+"clust"+wclust+"/"+Hind+".consens*")
                                Hgroups[Hgid] += expanded #.append(expanded)
                            else:
                                Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hind+".consens.gz")
                    else:
                        if "*" in Hinds:
                            expanded = glob.glob(WORK+"clust"+wclust+"/"+Hinds+".consens*")
                            Hgroups[Hgid] += expanded #.append(expanded)
                        else:
                            Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hinds+".consens.gz")

                for i in Hgids:
                    for cons in Hgroups[i]:
                        if cons not in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"):
                            print "\n\tsample name",cons,"in group",i,"does not match any filenames"
                            sys.exit()

                preclusts = []
                for i in Hgroups.values():
                    preclusts += i

                for cons in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"):
                    if cons not in preclusts:
                        print "\n\twarning: sample",cons,"not assigned to a cluster group"

                #if not gids:
                #    gids = ""
                    
                " make prefix directory "
                if not os.path.exists(WORK+'prefix/'):
                    os.makedirs(WORK+'prefix')


                ########### TODO ####################################
                # if os.path.exists(WORK+"prefix/cat.clust_.gz"):
                #     print "\tRemaking clusters from existing clustprefix files "+\
                #           "using minmatches: ",minmatch
                #     print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n"
                #    
                #     for (gid,minhit,inlist) in zip(gids,minhits,groups):
                #         handle = WORK+"clust"+wclust+"/cat.haplos_"+gid
                #         #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1)
                #     #tier2clust.makeclust(wclust, datatype, WORK)
                #######################################################

                " queue up jobs "
                work_queue = multiprocessing.Queue()
                result_queue = multiprocessing.Queue()

                " submit jobs "
                for (Hgid,Hminhit) in zip(Hgids,Hminhits):
                    inlist = Hgroups[Hgid]
                    work_queue.put([vsearch, wclust, datatype, 
                                    outgroup, seed,
                                    Hgid, Hminhit, inlist,
                                    WORK, MASK, 1 ])
                        
                " execute first tier jobs "    
                jobs = []
                for i in range(parallel):
                    worker = Worker(work_queue, result_queue, cluster_cons7_shuf.main)
                    jobs.append(worker)
                    worker.start()
                for j in jobs:
                    j.join()

                " cluster second tier "
                tier2clust.main(vsearch, wclust, datatype,
                                Hgids, seed, WORK, MASK)

                print "\n\tfinished clustering\n"

            " cleanup "
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"):
            #    os.remove(ff)
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"):
            #    os.remove(ff)


        if '7' in k:
            if minsamp < 2:
                print "\n\tminimum minCov setting is <2: changing to 2"
                minsamp = 2
                
            if gids:
                inclustfile = WORK+"prefix/cat.clust_.gz"
            else:
                inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz"

            if not os.path.exists(inclustfile):
                #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile)
                #sys.stderr.write("\n\t looking for default full cluster file")
                if os.path.exists(WORK+'clust'+wclust+"/cat.clust_.gz"):
                    inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz"
                    sys.stderr.write("\n\tCluster input file: using \n\t"+inclustfile+"\n\n")
                else:
                    print "\tnot found"
                    #print "\tcat.clust_ file is selected based on line 15 subset argument "
                    #print "\n\t if you wish to exclude samples from an existing cat.clust file "+\
                    #      "\n\t in your output alignments list exclude names on line 17 of the params file.\n "
                    sys.exit()
            #if any([i in outform for i in ['t','m']]):
            #    if gids:
            #        print "\tgroups for 't' or 'm' outputs:", gids
            taxadict = OrderedDict(zip(gids,groups))
            alignable.main(outgroup, minsamp, outname,
                           inclustfile, maxpoly, parallel,
                           maxSNP, muscle, exclude, overhang,
                           outform, WORK, gids, CUT,
                           a1, a2, datatype, subset,
                           parser.version.split(" ")[1],
                           mindepth, taxadict, minhits, seed, haplos)

        if '8' in k:
            cluster7dp.main(WORK, parallel, wclust, mindepth,
                            subset, datatype, w1, w2, minuniq,
                            MASK, muscle, vsearch, threads, remake=1)

    if options.dtest:
        readin = [line.strip() for line in open(options.dtest).readlines()]

        nboots =    int(readin[0].split("##")[0].strip())
        alignfile = str(readin[1].split("##")[0].strip())
        outfile   = str(readin[2].split("##")[0].strip())
        ntax =      str(readin[3].split("##")[0].strip())
        nproc =     int(readin[4].split("##")[0].strip())
        makesort =  int(readin[5].split("##")[0].strip())
        makeboots = int(readin[6].split("##")[0].strip())
        
        tests = []
        for line in readin[8:]:
            if line:
                notes = ""
                if "##" in line:
                    tax,notes = line.strip().split("##")[0], line.strip().split("##")[-1], 
                    if tax:
                        tests.append([tax.strip().split(), notes.strip()])   #.split("\t"),notes.strip()])
                else:
                    tests.append(line.strip().split()) # "\t"))
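        ## illustrative test lines (hypothetical sample names): whitespace-
        ## separated taxon names, bracketed lists to pool samples, and an
        ## optional "## note" suffix, e.g.
        ##   sampA sampB sampC sampOut              ## a simple 4-taxon test
        ##   [sampA1,sampA2] sampB sampC sampOut    ## pooled P1 samples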
        if ntax == '4':
            Dtest.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots)
        elif ntax == 'part':
            Dtest_5.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots)
        elif ntax == 'foil':
            Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,0)
        elif ntax == 'foilalt':
            Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,1)
        else:
            print "error in input file"

    if options.newparamsfile:
        if os.path.exists("./params.txt"):
            print "\tfile params.txt already exists"
            sys.exit()
        else:
            createfile.main(parser.version.split(" ")[1])

    if options.newDtestfile:
        outstring = """200                          ## N bootstrap replicates
test.loci                    ## loc/path to input .loci file
dstats/test1_res             ## output file path/name (no suffix)
4                            ## which test: 4,part,foil,foilalt
2                            ## N cores (execute jobs [lines below] in parallel)
0                            ## output ABBA/BABA loci to files (0=no,1,2=verbose)
0                            ## output bootstrap Ds to files (0=no,1=yes)
-----------------------------------------------------------\n"""
        sys.stdout.write(outstring)
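
# The Worker class used throughout these examples is defined elsewhere in
# pyRAD. A minimal sketch consistent with how it is called here (an
# assumption, not pyRAD's actual implementation): a multiprocessing.Process
# subclass that drains argument lists from work_queue, applies the given
# function, and pushes each return value onto result_queue.
import multiprocessing
import Queue

class Worker(multiprocessing.Process):
    def __init__(self, work_queue, result_queue, func):
        multiprocessing.Process.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.func = func

    def run(self):
        while True:
            try:
                args = self.work_queue.get(block=False)
            except Queue.Empty:
                break
            self.result_queue.put(self.func(*args))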
Example #2
def writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK):
    "create barcode dictionary"
    codetable = open(Bcode, 'r')
    codes = [line.strip().split() for line in codetable.readlines()]
    C = {}
    for line in codes:
        if line[0]:
            C[line[1].strip().upper()] = line[0]

    " find longest barcode "
    keylens = map(len, C.keys())
    if len(set(keylens)) == 1:
        longB = (keylens[0], 'same')
    else:
        longB = (max(keylens), 'diff')

    " check for CUT in barcodes "
    CCC = unambig(CUT)
    cuts = CCC if len(CCC) > 1 else [CUT]
    for cut in cuts:
        if any([cut in bar for bar in C.keys()]):
            print "\n\twarning: CUT site matches within one of the barcodes, "+\
                  "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
            break

    " read in sequence files "
    FS = glob.glob(GLOB)
    if 'pair' in datatype:
        Raws = combinefiles(GLOB)
    else:
        Raws = FS

    "send jobs to multiprocess queue"
    num = 0
    work_queue = multiprocessing.Queue()
    submitted = 0
    for fs in Raws:
        if 'pair' in datatype:
            work_queue.put([
                C, [fs[0], fs[1]], CUT, datatype, num, maxmismatch, WORK, longB
            ])
            submitted += 1
        else:
            work_queue.put(
                [C, fs, CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        num += 1

    result_queue = multiprocessing.Queue()

    "spawn workers, give function"
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, barmatch)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    Ms = {}

    if len(glob.glob(WORK + "fastq/.*.pickle")) > 1:
        for pick in glob.glob(WORK + "fastq/.*.pickle"):
            pickin = open(pick, "rb")
            M = pickle.load(pickin)
            pickin.close()
            for key in M:
                if key not in Ms:
                    Ms[key] = M[key]
                else:
                    Ms[key] += M[key]
            os.remove(pick)
    elif len(glob.glob(WORK + "fastq/.*.pickle")) == 1:
        pick = glob.glob(WORK + "fastq/.*.pickle")[0]
        pickin = open(pick, 'rb')
        Ms = pickle.load(pickin)
        pickin.close()
        os.remove(pick)
    else:
        print "\nno stats file generated"

    Mkeys = Ms.keys()
    Mkeys.sort(key=lambda x: Ms[x], reverse=True)

    statout = open(WORK + "stats/s1.sorting.txt", 'a')
    statout.write("\n\n")
    statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")

    Cnames = C.keys()
    Cnames.sort()
    try:
        maxl = max(map(len, map(str, Ms.values())))
    except ValueError:
        maxl = 5

    hits = []
    for bar in Cnames:
        for barcode in Mkeys:
            if matching(bar, barcode, maxmismatch):
                print >> statout, "%s    \t%s    \t%s\t%s" % (
                    C[bar], bar, barcode, str(Ms[barcode]) + " " *
                    (maxl + 3 - len(str(Ms[barcode]))))
                hits.append(barcode)

    statout.write("\n")
    maxl = max(map(len, Mkeys))
    for barcode in Mkeys:
        if barcode not in hits:
            print >> statout, "nomatch  \t%s    \t%i" % (
                barcode + " " * (maxl + 3 - len(barcode)), Ms[barcode])
    statout.close()
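
# matching() and unambig() are defined elsewhere in pyRAD. A minimal sketch
# of the matching() behavior relied on above (an assumption, not the actual
# implementation): two barcodes match when they have the same length and
# differ at no more than maxmismatch positions.
def matching(bar1, bar2, maxmismatch):
    if len(bar1) != len(bar2):
        return 0
    return sum([a != b for a, b in zip(bar1, bar2)]) <= maxmismatch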
Example #3
def main(WORK, parallel, wclust, mindepth,
         subset, datatype, w1, w2, minuniq,
         MASK, muscle, vsearch, threads, remake):

    " find .edit files in edits/ directory "
    if not os.path.exists(WORK+'edits/'):
        print "\terror: could not find edits/ folder in working directory"
        sys.exit()

    " make output folder for clusters" 
    if not os.path.exists(WORK+'clust'+wclust):
        os.makedirs(WORK+'clust'+wclust)
    outfolder = WORK+'clust'+str(wclust)
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')

    " remake option... in development"
    if remake:
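        ## drop the last (possibly truncated) line from each .u file so
        ## clustering can be resumed; keep a .backup copy of the original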
        for ufile in glob.glob(outfolder+"/*.u"):
            cmd = "/bin/sed '$d' < " + ufile + " > tempfile"
            os.system(cmd)
            cmd = "/bin/mv "+ufile+" "+ufile+".backup"
            os.system(cmd)
            cmd = "/bin/mv tempfile "+ufile
            os.system(cmd)

    FS = []

    " if not only 1 sample "
    if len(glob.glob(WORK+"edits/"+subset+"*.edit*")) > 1:  
        for f in glob.glob(WORK+"edits/"+subset+"*.edit*"):
            " append files to list if not already clustered or empty"
            if not os.path.exists(outfolder+"/"+f.replace(".edit",".clustS.gz")):
                size = os.stat(f)
                if size.st_size > 0:
                    FS.append(f)
                else:
                    print "excluding "+str(f)+" file is empty"
            else:
                print f.replace(".edit",".clustS")+" already exists"
        " arranges files by decreasing size for fast clustering order"
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i],statinfo.st_size
        FS.sort(key=operator.itemgetter(1), reverse = True)
        FS = [i[0] for i in FS]

    elif len(glob.glob(WORK+"edits/"+subset+"*.edit*")) == 1:
        f = glob.glob(WORK+"edits/"+subset+"*.edit*")
        size = os.stat(f[0])
        if size.st_size > 0:
            FS = f
        else:
            print "excluding "+f[0]+" file is empty"
    else:
        print "\tNo .edit files found in edits/ dir."

    sys.stderr.write("\n\tde-replicating files for clustering...\n")

    """ do not split big files if using 64-bit Usearch,
    or if using Vsearch, else do it to avoid 4GB limit of 32-bit usearch"""

    if "vsearch" not in vsearch:
        print '\n\tsplitting big files'
        splitbigfilesforderep(FS, vsearch, datatype, minuniq)

    " load work queue"
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()

    " perform function 'final' on files in FS list "
    submitted = {}
    fileno = 1

    if not remake:
        if threads == 0:
            nthreads = 'all'
        else:
            nthreads = threads
        np = min(parallel, len(FS))
        sys.stderr.write("\n\tstep 3: within-sample clustering of "+\
                         str(len(FS))+" samples at \n\t        "+str(wclust)+\
                         " similarity. Running "+str(np)+" parallel jobs\n\t"+\
                         " \twith up to "+str(nthreads)+" threads per job."+\
                         " If needed, \n\t\tadjust to avoid CPU and MEM limits\n\n")
    else:
        sys.stderr.write("\n\tstep 3: rebuilding clusters from unfinished step 3 files\n")

    for handle in FS:
        if outfolder+"/"+handle.split("/")[-1].replace(".edit",".clustS.gz") not in glob.glob(outfolder+"/*"):
            work_queue.put([vsearch,outfolder,handle,wclust,mindepth,
                            parallel,muscle,datatype,fileno, w1, w2, 
                            WORK, minuniq, MASK, threads, remake])
            submitted[handle] = 1
            fileno += 1
        else:
            print "\tskipping "+handle.split("/")[-1].replace(".edit",".clustS.gz")+\
                  ' already exists in '+WORK+outfolder.split("/")[-1]

    " create a queue to pass to workers to store the results"
    jobs = []
    for i in range( min(len(submitted), parallel) ):
        worker = Worker(work_queue, result_queue, final)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " output statistics on depth of coverage"
    outstats = open(WORK+"stats/s3.clusters.txt",'a')
    print >>outstats, '\n'+'\t'.join(['taxa','total','dpt.me',
                                      'dpt.sd','d>'+str(mindepth-1)+'.tot',
                                      'd>'+str(mindepth-1)+'.me',
                                      'd>'+str(mindepth-1)+'.sd',
                                      'badpairs'])

    RES = []
    HISTO = []
    #for ff in glob.glob(outfolder+"/.temp.*"):
    for ff in FS:
        end = ff.split("/")[-1].replace(".edit","")
        ff = outfolder+"/.temp."+end
        if os.path.exists(ff):
            line = open(ff).readlines()
            RES.append(line[0].strip().split("\t"))
            HISTO.append([line[0].split("\t")[0],"".join(line[1:])])
            os.remove(ff)
    RES.sort(key=lambda x:x[0])
    HISTO.sort(key=lambda x:x[0])
    
    for i in RES:
        print >>outstats, "\t".join(i)
    
    print >>outstats, """
    ## total = total number of clusters, including singletons
    ## dpt.me = mean depth of clusters
    ## dpt.sd = standard deviation of cluster depth
    ## >N.tot = number of clusters with depth greater than N
    ## >N.me = mean depth of clusters with depth greater than N
    ## >N.sd = standard deviation of cluster depth for clusters with depth greater than N
    ## badpairs = mismatched 1st & 2nd reads (only for paired ddRAD data)\n\nHISTOGRAMS\n
    """

    for i in HISTO:
        print >>outstats, "sample: "+i[0]+"\n"+i[1]
    
    
    outstats.close()
    for handle in FS:
        if submitted.get(handle):
            clustfile = outfolder+"/"+handle.split("/")[-1].replace(".edit",".clust.gz")
            if os.path.exists(clustfile):
                subprocess.call("/bin/rm "+clustfile, shell=True)
Example #4
def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort, makeboots):

    " submit jobs to processors "
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in tests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3,o = rep
        if any(["[" in i for i in rep]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3 = p3[1:-1].split(",")
            o =   o[1:-1].split(",")
            taxalist = list(itertools.chain(*[p1+p2+p3+o]))
            if checktaxa(taxalist,alignfile):
                work_queue.put([alignfile,[p1,p2,p3,o],nboots,1, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3,o],alignfile):
                work_queue.put([alignfile,[p1,p2,p3,o],nboots,0, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'

        Notes.append(notes)
    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [pickle.load(open(".save.D4temp"+str(i),'rb')) for i in xrange(submitted)]
    Results.sort(key = lambda x:x[8])

    "setup results file "
    outs = open(outfile+".D4.txt", 'w')
    header = "\t".join([ 'P1'+" "*(namelen[0]-2),
                         'P2'+" "*(namelen[1]-2),
                         'P3'+" "*(namelen[2]-2),
                         'O'+" "*(namelen[3]-1),
                         'D','std(D)','Z',
                         'BABA','ABBA',
                         'nloci','nboot','pdisc', 'notes'])
    print >>outs, header

    for i in range(len(Results)):
        ps,D,STD,Z,nloci,ABBA,BABA,pdisc,sub,ABBAloci,BABAloci,boots = Results[i]
        ps = [str(x).replace("['","[").replace("']","]").replace("', '",",").replace(">","") for x in ps]
        print >>outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (ps[0]+" "*(namelen[0]-len(ps[0])),
                                                                                          ps[1]+" "*(namelen[1]-len(ps[1])),
                                                                                          ps[2]+" "*(namelen[2]-len(ps[2])),
                                                                                          ps[3]+" "*(namelen[3]-len(ps[3])),
                                                                                          D,STD,Z,
                                                                                          BABA,ABBA,
                                                                                          nloci,nboots,
                                                                                          pdisc,Notes[i])



        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles('ABBA',ABBAloci,4,loci,outfile,makesort,sub,ps)
            makesortfiles('BABA',BABAloci,4,loci,outfile,makesort,sub,ps)            

        if makeboots:
            with open(outfile+"_"+str(sub+1)+".boots",'w') as out:
                out.write(",".join(map(str,boots)))

    for oldpickle in glob.glob(".save.D4temp*"):
        os.remove(oldpickle)
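
# A quick way to read one of the bootstrap files written above back into
# Python (the file name here is hypothetical); each .boots file holds a
# single comma-separated line of bootstrap D values:
#   boots = map(float, open("dstats/test1_res_1.boots").read().split(","))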
Example #5
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots,noterminals):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3a,p3b,o = rep
        if all(["[" in i for i in rep[1:]]):
            p1  = p1[1:-1].split(",")
            p2  = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o   = o[1:-1].split(",")
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted, noterminals])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted, noterminals])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted,nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    Results = [result_queue.get() for i in range(submitted)]
    Results.sort(key = lambda x:x[23])   ## sort by submission order (sub)



    " setup results file "
    if noterminals: 
        outs = open(outfile+".Dfoilalt.txt", 'w')
    else:
        outs = open(outfile+".Dfoil.txt", 'w')
    header = "\t".join([ 'p1'+" "*(namelen[0]-2),
                         'p2'+" "*(namelen[1]-2),
                         'p3'+" "*(namelen[2]-2),
                         'p4'+" "*(namelen[3]-2),
                         'O'+" "*(namelen[4]-1),
                         'Dfo','Dil','Dfi','Dol',
                         'Z_fo','Z_il','Z_fi','Z_ol',
                         'BABBA','ABBBA',
                         'BABAA','ABBAA',
                         'BAABA','ABABA',
                         'BBBAA','BBABA',
                         'AABAA','AAABA',
                         'BAAAA','ABAAA',
                         'nloci','sign', 'notes'])
    print >>outs, header

    for i in range(len(Results)):
        L,DFO,ZFO,DIL,ZIL,DFI,ZFI,DOL,ZOL,nloc,BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,pdisc,sub,BBFO,BBIL,BBFI,BBOL = Results[i]
        L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]

        sign = []
        for s,d in zip([ZFO,ZIL,ZFI,ZOL],[DFO,DIL,DFI,DOL]):
            if s>3.5:
                if d>0:
                    sign.append("+")
                else:
                    sign.append("-")
            else:
                sign.append("0")
        #print sign

        resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
                       str(L[1])+" "*(namelen[1]-len(str(L[1]))),
                       str(L[2])+" "*(namelen[2]-len(str(L[2]))),
                       str(L[3])+" "*(namelen[3]-len(str(L[3]))),
                       str(L[4])+" "*(namelen[4]-len(str(L[4]))),
                       DFO,DIL,DFI,DOL,
                       ZFO,ZIL,ZFI,ZOL,
                       BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,
                       nloc, "".join(sign), Notes[i]])
        
        print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin 

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            pass   ## per-pattern loci output not implemented for Dfoil here
            # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)            
            # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)            
            # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)

        if makeboots:
            pass   ## bootstrap file output not implemented for Dfoil here
Example #6
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3a,p3b,o = rep
        if all(["[" in i for i in rep[1:]]):
            p1  = p1[1:-1].split(",")
            p2  = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o   = o[1:-1].split(",")
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted,nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()


    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [pickle.load(open(".save."+str(i),'rb')) for i in range(submitted)]
    Results.sort(key = lambda x:x[15])


    " setup results file "
    outs = open(outfile+".partD.txt", 'w')
    header = "\t".join([ 'p1'+" "*(namelen[0]-2),
                         'p2'+" "*(namelen[1]-2),
                         'p3_1'+" "*(namelen[2]-4),
                         'p3_2'+" "*(namelen[3]-4),
                         'O'+" "*(namelen[4]-1),
                         'D_12','D_1','D_2',
                         'Z_12','Z_1','Z_2',
                         'BABBA','ABBBA',
                         'BABAA','ABBAA',
                         'BAABA','ABABA',
                         'nloci','pdisc', 'notes'])

    print >>outs, header


    for i in range(len(Results)):
        L,D12,Z12,D1,Z1,D2,Z2,nloc,ABBBA,BABBA,ABBAA,BABAA,ABABA,BAABA,pdisc,sub,ABBBAloci,BABBAloci,ABBAAloci,BABAAloci,ABABAloci,BAABAloci,BB12,BB1,BB2 = Results[i]
        L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]

        resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
                       str(L[1])+" "*(namelen[1]-len(str(L[1]))),
                       str(L[2])+" "*(namelen[2]-len(str(L[2]))),
                       str(L[3])+" "*(namelen[3]-len(str(L[3]))),
                       str(L[4])+" "*(namelen[4]-len(str(L[4]))),
                       D12, D1, D2, Z12, Z1, Z2, 
                       BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA,
                       nloc, pdisc, Notes[i]])
        
        print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin 

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)            
            makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)            
            makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)

        if makeboots:
            with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out:
                out.write(",".join(map(str,BB12)))
            with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out:
                out.write(",".join(map(str,BB1)))
            with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out:
                out.write(",".join(map(str,BB2)))
Example #7
def main(Parallel, E, H, ID, mindepth, subset,
         maxN, maxH, haplos, CUT, datatype,
         lowcounts, strict, WORK, maxstack):

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    " load up work queue"
    work_queue = multiprocessing.Queue()

    " iterate over files"
    outfolder = WORK+'clust'+str(ID)
    HH = glob.glob(outfolder+"/"+subset+".clustS*")
    stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
    sys.stderr.write(stringout)
    
    if len(HH) > 1:
        " sort files by size"
        for i in xrange(len(HH)):
            statinfo = os.stat(HH[i])
            HH[i] = HH[i],statinfo.st_size
        HH.sort(key=operator.itemgetter(1))
        FS = [f[0] for f in HH][::-1]
    else: FS = HH
    REMOVE = glob.glob(outfolder+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    submitted = 0
    for handle in FS:
        if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
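            ## set an upper bound on stack depth: the mean depth plus 2.5
            ## standard deviations (floored at 500) under the "2SD" default,
            ## otherwise a fixed integer from the params file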
            m,sd = upSD(handle,mindepth)
            if maxstack == "2SD":
                upperSD = max(500,m+(sd*2.5))
            else:
                upperSD = int(maxstack)
            work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
                            haplos,CUT,upperSD,strict,lowcounts])
            submitted += 1
        else:
            print "\tskipping "+handle.replace(".clustS",".consens")+\
                  ', it already exists in '+outfolder+"/"


    " create a queue to pass to workers to store the results"
    result_queue = multiprocessing.Queue()

    " spawn workers"
    jobs = []
    for i in xrange( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, consensus)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " get results"
    stats = open(WORK+'stats/s5.consens.txt','a+')
    print >>stats,  "taxon          \tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
    for i in range(submitted):
        a,b,c,d,e,f,g = result_queue.get()
        print >> stats, "\t".join(map(str,[a.replace(".clustS.gz","")+" "*(10-len(a)),b,c,d,e,f,g]))
    print >>stats, """
    ## nloci = number of loci
    ## f1loci = number of loci with >N depth coverage
    ## f2loci = number of loci with >N depth and passed paralog filter
    ## nsites = number of sites across f loci
    ## npoly = number of polymorphic sites in nsites
    ## poly = frequency of polymorphic sites"""
    stats.close()
Example #8
def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort,
                 makeboots):

    " submit jobs to processors "
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in tests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3, o = rep
        if any(["[" in i for i in rep]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3 = p3[1:-1].split(",")
            o = o[1:-1].split(",")
            taxalist = list(itertools.chain(*[p1 + p2 + p3 + o]))
            if checktaxa(taxalist, alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3, o], nboots, 1, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3, o], nboots, 0, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'

        Notes.append(notes)
    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [
        pickle.load(open(".save.D4temp" + str(i), 'rb'))
        for i in xrange(submitted)
    ]
    Results.sort(key=lambda x: x[8])

    "setup results file "
    outs = open(outfile + ".D4.txt", 'w')
    header = "\t".join([
        'P1' + " " * (namelen[0] - 2), 'P2' + " " * (namelen[1] - 2),
        'P3' + " " * (namelen[2] - 2), 'O' + " " * (namelen[3] - 1), 'D',
        'std(D)', 'Z', 'BABA', 'ABBA', 'nloci', 'nboot', 'pdisc', 'notes'
    ])
    print >> outs, header

    for i in range(len(Results)):
        ps, D, STD, Z, nloci, ABBA, BABA, pdisc, sub, ABBAloci, BABAloci, boots = Results[
            i]
        ps = [
            str(x).replace("['",
                           "[").replace("']",
                                        "]").replace("', '",
                                                     ",").replace(">", "")
            for x in ps
        ]
        print >> outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (
            ps[0] + " " * (namelen[0] - len(ps[0])), ps[1] + " " *
            (namelen[1] - len(ps[1])), ps[2] + " " *
            (namelen[2] - len(ps[2])), ps[3] + " " * (namelen[3] - len(ps[3])),
            D, STD, Z, BABA, ABBA, nloci, nboots, pdisc, Notes[i])

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles('ABBA', ABBAloci, 4, loci, outfile, makesort, sub,
                          ps)
            makesortfiles('BABA', BABAloci, 4, loci, outfile, makesort, sub,
                          ps)

        if makeboots:
            with open(outfile + "_" + str(sub + 1) + ".boots", 'w') as out:
                out.write(",".join(map(str, boots)))

    for oldpickle in glob.glob(".save.D4temp*"):
        os.remove(oldpickle)
Example #9
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
    print >>sys.stderr, "\tstep 2: editing raw reads \n\t",

    " create output directories "
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')
    if not os.path.exists(WORK+'edits'):
        os.makedirs(WORK+'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()
    if len(glob.glob(FQs)) > 1:
        FS = glob.glob(FQs)

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i],statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            finder = WORK+'edits/'+handle.split("/")[-1]
            while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
                finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","")
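            ## e.g. "edits/1A_R1.fastq.gz" -> "edits/1A" (illustrative name):
            ## recognized fastq extensions and the _R1 suffix are stripped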
            if finder+".edit" not in glob.glob(WORK+"edits/*"):
                if os.stat(handle).st_size > 0:   ## exclude empty files
                    args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print "skipping",handle,", file is empty"
            else:
                print "\t"+finder+" already in edits/"

    elif len(glob.glob(FQs)) == 1:
        " if only one file "
        work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype])
        submitted += 1

    else:
        print "\tNo demultiplexed files found. Check path."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()


    " collect the results off the queue "
    outstats = open(WORK+"stats/s2.rawedit.txt",'a')
    print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"])
    STATS = []
    for i in range(submitted):
        STATS.append(result_queue.get())

    STATS.sort(key = lambda x: x[0])
    for i in range(submitted):
        a,b,c,d = STATS[i]
        print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))])

    print >>outstats, """
    Nreads = total number of reads for a sample
    passed = retained reads that passed quality filtering at full length
    passed.w.trim= retained reads that were trimmed due to detection of adapters
    passed.total  = total kept reads of sufficient length
    note: you can set the option in params file to include trimmed reads of xx length. """
    outstats.close()
Example #10
def main(Parallel, E, H, ID, mindepth, subset,
         maxN, maxH, ploidy, CUT, datatype,
         lowcounts, strict, WORK, maxstack):

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    " create work queue"
    work_queue = multiprocessing.Queue()

    " iterate over files"
    outfolder = WORK+'clust'+str(ID)
    HH = glob.glob(outfolder+"/"+subset+".clustS*")
    stringout = "\n\tstep 5: created consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
    sys.stderr.write(stringout)
    
    if len(HH) > 1:
        " sort files by size"
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            HH[i] = HH[i],statinfo.st_size
        HH.sort(key=operator.itemgetter(1))
        FS = [f[0] for f in HH][::-1]
    else: FS = HH
    REMOVE = glob.glob(outfolder+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    submitted = 0
    for handle in FS:
        if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
            m,sd = upSD(handle,mindepth)
            if maxstack == "2SD":
                upperSD = max(500,m+(sd*2.5))
            else:
                upperSD = int(maxstack)
            work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
                            ploidy,CUT,upperSD,strict,lowcounts])
            submitted += 1
        else:
            print "\tskipping "+handle.replace(".clustS",".consens")+\
                  ', it already exists in '+outfolder+"/"


    " create a queue to pass to workers to store the results"
    result_queue = multiprocessing.Queue()

    " spawn workers"
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, consensus)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " get results"
    stats = open(WORK+'stats/s5.consens.txt','a+')
    print >>stats,  "taxon\tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
    for i in range(submitted):
        a,b,c,d,e,f,g = result_queue.get()
        nn = a.replace(".clustS.gz","")
        print >> stats, "\t".join(map(str,[nn,b,c,d,e,f,g]))
    print >>stats, """
    ## nloci = number of loci
    ## f1loci = number of loci with >N depth coverage
    ## f2loci = number of loci with >N depth and passed paralog filter
    ## nsites = number of sites across f loci
    ## npoly = number of polymorphic sites in nsites
    ## poly = frequency of polymorphic sites"""
    stats.close()
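
Note: upSD, called above to set the stack-depth ceiling, is not defined in this excerpt. Assuming .clustS.gz files separate clusters with "//" lines and mark read depth with vsearch-style ";size=N;" fields, a sketch of the mean/SD computation it presumably performs (the body is a reconstruction, not the original):

import gzip
import numpy

def upSD(handle, mindepth):
    " hypothetical: mean and SD of per-cluster read depths in a .clustS.gz file "
    depths = []
    for clust in gzip.open(handle, 'rb').read().strip().split("//\n"):
        depth = 0
        for line in clust.split("\n"):
            if line.startswith(">") and ";size=" in line:
                depth += int(line.split(";size=")[1].split(";")[0])
        if depth >= mindepth:
            depths.append(depth)
    return numpy.mean(depths), numpy.std(depths)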
Example #11
def multiproc_it(subtests, alignfile, outfile, nboots, nproc, namelen,
                 makesort, makeboots, noterminals):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3a, p3b, o = rep
        if all(["[" in i for i in rep[1:]]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o = o[1:-1].split(",")
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put([
                    alignfile, [p1, p2, p3a, p3b, o], nboots, 1, submitted,
                    noterminals
                ])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put([
                    alignfile, [p1, p2, p3a, p3b, o], nboots, 0, submitted,
                    noterminals
                ])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    Results = [result_queue.get() for i in range(submitted)]
    Results.sort(key=lambda x: x[23])   ## sort by submission index (sub)

    " setup results file "
    if noterminals:
        outs = open(outfile + ".Dfoilalt.txt", 'w')
    else:
        outs = open(outfile + ".Dfoil.txt", 'w')
    header = "\t".join([
        'p1' + " " * (namelen[0] - 2), 'p2' + " " * (namelen[1] - 2),
        'p3' + " " * (namelen[2] - 2), 'p4' + " " * (namelen[3] - 2),
        'O' + " " * (namelen[4] - 1), 'Dfo', 'Dil', 'Dfi', 'Dol', 'Z_fo',
        'Z_il', 'Z_fi', 'Z_ol', 'BABBA', 'ABBBA', 'BABAA', 'ABBAA', 'BAABA',
        'ABABA', 'BBBAA', 'BBABA', 'AABAA', 'AAABA', 'BAAAA', 'ABAAA', 'nloci',
        'sign', 'notes'
    ])
    print >> outs, header

    for i in range(len(Results)):
        (L, DFO, ZFO, DIL, ZIL, DFI, ZFI, DOL, ZOL, nloc, BABBA, ABBBA,
         BABAA, ABBAA, BAABA, ABABA, BBBAA, BBABA, AABAA, AAABA, BAAAA,
         ABAAA, pdisc, sub, BBFO, BBIL, BBFI, BBOL) = Results[i]
        L = [
            str(x).replace("['", "[").replace("']", "]").replace("', '", ",")
            for x in L
        ]

        sign = []
        for s, d in zip([ZFO, ZIL, ZFI, ZOL], [DFO, DIL, DFI, DOL]):
            if s > 3.5:
                if d > 0:
                    sign.append("+")
                else:
                    sign.append("-")
            else:
                sign.append("0")
        #print sign

        resin = tuple([
            str(L[0]) + " " * (namelen[0] - len(str(L[0]))),
            str(L[1]) + " " * (namelen[1] - len(str(L[1]))),
            str(L[2]) + " " * (namelen[2] - len(str(L[2]))),
            str(L[3]) + " " * (namelen[3] - len(str(L[3]))),
            str(L[4]) + " " * (namelen[4] - len(str(L[4]))), DFO, DIL, DFI,
            DOL, ZFO, ZIL, ZFI, ZOL, BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA,
            BBBAA, BBABA, AABAA, AAABA, BAAAA, ABAAA, nloc, "".join(sign),
            Notes[i]
        ])

        print >> outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin

        if makesort:
            ## sorted-loci output is disabled in this Dfoil variant; the
            ## alignment would be re-read and split into loci here, e.g.:
            # loci = open(alignfile).read().strip().split("|")[:-1]
            # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)
            pass

        if makeboots:
            pass
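
Note: the sign column built above codes each D-statistic as significantly positive, significantly negative, or non-significant at |Z| > 3.5. A small worked illustration with made-up values:

Zs = [4.2, 0.9, 3.6, 1.1]
Ds = [0.12, -0.03, -0.08, 0.02]
sign = []
for s, d in zip(Zs, Ds):
    if s > 3.5:
        sign.append("+" if d > 0 else "-")
    else:
        sign.append("0")
print "".join(sign)   ## -> "+0-0"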
Example #12
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):

    print >> sys.stderr, "\n\tstep 2: quality filtering \n\t",

    " create output directories "
    if not os.path.exists(WORK + 'stats'):
        os.makedirs(WORK + 'stats')
    if not os.path.exists(WORK + 'edits'):
        os.makedirs(WORK + 'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()

    " do not select merged or discarded reads if PEAR was used on data"
    FQs = glob.glob(FQs)
    fqs = [
        i for i in FQs
        if not any([j in i for j in ["discarded", ".assembled."]])
    ]

    if len(fqs) > 1:
        " subselect only the first reads "
        if any([".unassembled.forward." in i for i in fqs]):
            FS = [i for i in fqs if '.forward.' in i]
        else:
            FS = [i for i in fqs if '_R1.' in i]

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i], statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            n = handle.split('/')[-1]
            while n.split(".")[-1] in [
                    "fastq", "fastQ", "gz", "fq", "FastQ", "nomerge"
            ]:
                n = n.replace('.' + n.split(".")[-1], "")
            if '.forward.' in n:
                n = n.split(".forward")[0]
            else:
                n = "_".join(n.split('_R')[:-1])
            if WORK + "edits/" + n + ".edit" not in glob.glob(WORK +
                                                              "edits/*"):
                if os.stat(handle).st_size > 0:  ## exclude empty files
                    args = [
                        WORK, handle, CUT,
                        float(pN), trimkeep, strict, Q, datatype
                    ]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print 'skipping', handle, ", file is empty"
            else:
                print "\t" + n + '.edit' + " already in edits/"
    elif len(fqs) == 1:
        " if only one file "
        work_queue.put([
            WORK,
            glob.glob(FQs)[0], CUT,
            float(pN), trimkeep, strict, Q, datatype
        ])
        submitted += 1

    else:
        print "no _paired_ de-multiplexed files found in this location."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " collect the results off the queue "
    outstats = open(WORK + "stats/s2.rawedit.txt", 'a')
    print >> outstats, "\t".join(
        ["sample", "Nreads", "exclude", "trimmed", "passed"])
    for i in range(submitted):
        a, b, c, d = result_queue.get()
        print >> outstats, "\t".join([a, b, str(int(b) - int(d)), c, d])

    print >> outstats, """
    Nreads = total number of reads for a sample
    exclude = reads that were excluded
    trimmed = reads that had adapter trimmed but were kept
    passed = total kept reads
    """
    outstats.close()
Example #13
def main(WORK, UCLUST, FQs, match, Q, Parallel):

    " create output directories " 
    if not os.path.exists(WORK+'fastq/'):
        os.makedirs(WORK+'fastq')
    if not os.path.exists(WORK+'mergedreads'):
        os.makedirs(WORK+'mergedreads')
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')


    submitted = 0
    work_queue = multiprocessing.Queue()

    names = [i for i in glob.glob(FQs) if "_R1.fq" in i]

    " submit jobs to queue "
    if len(names) > 1:
        for handle in names:
            if "nomerge." not in handle:
                n = str(handle.split('/')[-1]).replace("_R1.",".")
                while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
                    n = n.replace('.'+n.split(".")[-1], "")
                finder = WORK+'edits/'+n+".edit"
                if finder not in glob.glob(WORK+"edits/*"):
                    if os.stat(handle).st_size > 0:   ## exclude empty files
                        if os.path.exists(handle.replace("_R1.","_R2.")):
                            if not os.path.exists(handle.replace(".fq",".nomerge.fq")):
                                args = [WORK, UCLUST, handle, match, Q]
                                work_queue.put(args)
                                submitted += 1
                            else:
                                print "merge file already created for", handle.split("/")[-1]
                        else:
                            print "cannot find 2nd read file for", handle.split("/")[-1]
                    else:
                        print "skipping", handle, ", file is empty"
                else:
                    print "\t"+finder+" already in edits/"
    else:
        if not names:
            if [i for i in glob.glob(FQs) if "_R1_." in i]:
                print "\n\tfile names should have _R1. not _R1_."
            print "\n\tcannot find input files"
            sys.exit()
        else:
            work_queue.put([WORK, UCLUST, names[0], match, Q])
            submitted += 1

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()


    " spawn workers, give function "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, mergepairs)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    if submitted > 0:
        statout = open(WORK+"stats/s2.mergedreads.txt",'w')
        print >>statout, "\t".join(["taxon","mergedreads"])

        for i in range(submitted):
            stat = result_queue.get()
            a,b = stat
            n = a.strip().split("/")[-1].replace(".nomerge.gz","")
            print >>statout, "\t".join([n,str(b)])
        print >>statout, "\nmerged reads written to", WORK+"mergedreads/ "
        statout.close()
Example #14
def main(Parallel,ID,minsamp,subset,haplos,WORK,CUT,datatype):
    sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t")

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()


    # warning message for low minsamp
    if minsamp < 5:
        sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n
                            If you intend to make low coverage base calls use a high mindepth in
                            step 4 to accurately infer H & E parameters, and then use a low mindepth
                            in conjunction with the line 31 params file option to make low coverage
                            base calls""")
        
    # if haploid data
    if haplos == 1:
        sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t")

    # if double digest use first cut site
    if "," in CUT:
        CUT1, CUT2 = CUT.strip().split(",")
    else:
        CUT1 = CUT2 = CUT

    # load up work queue
    work_queue = multiprocessing.Queue()

    # iterate over files
    HH = glob.glob(WORK+"clust"+ID+"/"+subset+"*.clustS*")
    submitted = 0
    FS = []
    if len(HH) > 1:
        ## sort files by size
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            if statinfo.st_size > 1000:
                FS.append((HH[i],statinfo.st_size))
            else:
                print "excluding ",HH[i],"file is too small\n"
        FS.sort(key=lambda x: x[1])
        FS = [i[0] for i in FS]
    else:
        FS = HH
    REMOVE = glob.glob(WORK+'clust'+ID+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    for handle in FS:
        work_queue.put([WORK,handle, minsamp, CUT1, CUT2, datatype, haplos])
        submitted += 1

    " remove temp files if previous run "
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz","") 
        ff = WORK+"stats/."+end+".temp"
        if os.path.exists(ff):
            os.remove(ff)

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()
    results = []
    
    " spawn workers "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, optim)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " write results to stats file "
    if not os.path.exists(WORK+"stats/Pi_E_estimate.txt"):
        outstats = open(WORK+"stats/Pi_E_estimate.txt",'w')
        outstats.write("taxa\tH\tE\n")
    else:
        outstats = open(WORK+"stats/Pi_E_estimate.txt",'a')
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz","")
        ft = WORK+"stats/."+end+".temp"
        line = open(ft).readlines()
        outstats.write(line[0])
        os.remove(ft)
        # n,h,e = line[0].strip().split("\t")
        # H.append(float(h))
        # E.append(float(e))
    #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
    #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
    outstats.close()
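
Note: each hidden stats/.<sample>.temp file read above is assumed to hold a single tab-separated line of sample name, H, and E, matching the commented-out parsing. A hypothetical example line:

taxonA	0.01023	0.00075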
Example #15
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
    print >> sys.stderr, "\tstep 2: editing raw reads \n\t",

    " create output directories "
    if not os.path.exists(WORK + 'stats'):
        os.makedirs(WORK + 'stats')
    if not os.path.exists(WORK + 'edits'):
        os.makedirs(WORK + 'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()
    if len(glob.glob(FQs)) > 1:
        FS = glob.glob(FQs)

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i], statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            finder = WORK + 'edits/' + handle.split("/")[-1]
            while finder.split(".")[-1] in [
                    "fastq", "fastQ", "gz", "fq", "FastQ"
            ]:
                finder = finder.replace('.' + finder.split(".")[-1],
                                        "").replace("_R1", "")
            if finder + ".edit" not in glob.glob(WORK + "edits/*"):
                if os.stat(handle).st_size > 0:  ## exclude empty files
                    args = [
                        WORK, handle, CUT,
                        float(pN), trimkeep, strict, Q, datatype
                    ]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print "skipping", handle, ", file is empty"
            else:
                print "\t" + finder + " already in edits/"

    elif len(glob.glob(FQs)) == 1:
        " if only one file "
        work_queue.put([
            WORK,
            glob.glob(FQs)[0], CUT,
            float(pN), trimkeep, strict, Q, datatype
        ])
        submitted += 1

    else:
        print "\tNo demultiplexed files found. Check path."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " collect the results off the queue "
    outstats = open(WORK + "stats/s2.rawedit.txt", 'a')
    print >> outstats, "\t".join(
        ["sample ", "Nreads", "passed", "passed.w.trim", "passed.total"])
    STATS = []
    for i in range(submitted):
        STATS.append(result_queue.get())

    STATS.sort(key=lambda x: x[0])
    for i in range(submitted):
        a, b, c, d = STATS[i]
        print >> outstats, "\t".join([a, b, c, d, str(int(c) + int(d))])

    print >> outstats, """
    Nreads = total number of reads for a sample
    passed = retained reads that passed quality filtering at full length
    passed.w.trim = retained reads that were trimmed due to detection of adapters
    passed.total = total kept reads of sufficient length
    note: the trimkeep option in the params file can be set to keep trimmed reads above a minimum length. """
    outstats.close()
Example #16
def multiproc_it(subtests, alignfile, outfile, nboots, nproc, namelen,
                 makesort, makeboots):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3a, p3b, o = rep
        if all(["[" in i for i in rep[1:]]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o = o[1:-1].split(",")
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3a, p3b, o], nboots, 1, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3a, p3b, o], nboots, 0, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [
        pickle.load(open(".save." + str(i), 'rb')) for i in range(submitted)
    ]
    Results.sort(key=lambda x: x[15])

    " setup results file "
    outs = open(outfile + ".partD.txt", 'w')
    header = "\t".join([
        'p1' + " " * (namelen[0] - 2), 'p2' + " " * (namelen[1] - 2),
        'p3_1' + " " * (namelen[2] - 4), 'p3_2' + " " * (namelen[3] - 4),
        'O' + " " * (namelen[4] - 1), 'D_12', 'D_1', 'D_2', 'Z_12', 'Z_1',
        'Z_2', 'BABBA', 'ABBBA', 'BABAA', 'ABBAA', 'BAABA', 'ABABA', 'nloci',
        'pdisc', 'notes'
    ])

    print >> outs, header

    for i in range(len(Results)):
        (L, D12, Z12, D1, Z1, D2, Z2, nloc, ABBBA, BABBA, ABBAA, BABAA,
         ABABA, BAABA, pdisc, sub, ABBBAloci, BABBAloci, ABBAAloci,
         BABAAloci, ABABAloci, BAABAloci, BB12, BB1, BB2) = Results[i]
        L = [
            str(x).replace("['", "[").replace("']", "]").replace("', '", ",")
            for x in L
        ]

        resin = tuple([
            str(L[0]) + " " * (namelen[0] - len(str(L[0]))),
            str(L[1]) + " " * (namelen[1] - len(str(L[1]))),
            str(L[2]) + " " * (namelen[2] - len(str(L[2]))),
            str(L[3]) + " " * (namelen[3] - len(str(L[3]))),
            str(L[4]) + " " * (namelen[4] - len(str(L[4]))), D12, D1, D2, Z12,
            Z1, Z2, BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA, nloc, pdisc,
            Notes[i]
        ])

        print >> outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles("ABBBA", ABBBAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BABBA", BABBAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("ABBAA", ABBAAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BABAA", BABAAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("ABABA", ABABAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BAABA", BAABAloci, 5, loci, outfile, makesort, sub,
                          L)

        if makeboots:
            with open(outfile + "_" + str(sub + 1) + ".boots_D12", 'w') as out:
                out.write(",".join(map(str, BB12)))
            with open(outfile + "_" + str(sub + 1) + ".boots_D1", 'w') as out:
                out.write(",".join(map(str, BB1)))
            with open(outfile + "_" + str(sub + 1) + ".boots_D2", 'w') as out:
                out.write(",".join(map(str, BB2)))
Example #17
def main(Parallel, ID, minsamp, subset, haplos, WORK, CUT, datatype):
    sys.stderr.write(
        "\n\tstep 4: estimating error rate and heterozygosity\n\t")

    " find clust.xx directory "
    if not os.path.exists(WORK + 'clust' + ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    # warning message for low minsamp
    if minsamp < 5:
        sys.stderr.write(
            """\n\t warning: Mindepth < 5 is not recommended for this step.\n
                            If you intend to make low coverage base calls use a high mindepth in
                            step 4 to accurately infer H & E parameters, and then use a low mindepth
                            in conjunction with the line 31 params file option to make low coverage
                            base calls""")

    # if haploid data
    if haplos == 1:
        sys.stderr.write(
            "\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t"
        )

    # if double digest use first cut site
    if "," in CUT:
        CUT1, CUT2 = CUT.strip().split(",")
    else:
        CUT1 = CUT2 = CUT

    # load up work queue
    work_queue = multiprocessing.Queue()

    # iterate over files
    HH = glob.glob(WORK + "clust" + ID + "/" + subset + "*.clustS*")
    submitted = 0
    FS = []
    if len(HH) > 1:
        ## sort files by size
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            if statinfo.st_size > 1000:
                FS.append((HH[i], statinfo.st_size))
            else:
                print "excluding ", HH[i], "file is too small\n"
        FS.sort(key=lambda x: x[1])
        FS = [i[0] for i in FS]
    else:
        FS = HH
    REMOVE = glob.glob(WORK + 'clust' + ID + "/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    for handle in FS:
        work_queue.put([WORK, handle, minsamp, CUT1, CUT2, datatype, haplos])
        submitted += 1

    " remove temp files if previous run "
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ff = WORK + "stats/." + end + ".temp"
        if os.path.exists(ff):
            os.remove(ff)

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()
    results = []

    " spawn workers "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, optim)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " write results to stats file "
    if not os.path.exists(WORK + "stats/Pi_E_estimate.txt"):
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'w')
        outstats.write("taxa\tH\tE\n")
    else:
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'a')
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ft = WORK + "stats/." + end + ".temp"
        line = open(ft).readlines()
        outstats.write(line[0])
        os.remove(ft)
        # n,h,e = line[0].strip().split("\t")
        # H.append(float(h))
        # E.append(float(e))
    #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
    #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
    outstats.close()
Example #18
def main():
    parser = OptionParser(prog="pyRAD",
                          usage="%prog [options]",
                          version="%prog 3.0.61")
    parser.add_option(
        '-p',
        action="store",
        type="string",
        dest="params",
        help="input file for within sample filtering and clustering\n")
    parser.add_option('-s',
                      action="store",
                      dest="steps",
                      help="""perform step-wise parts of within analysis\n
                      1 = barcode sorting                        \
                      2 = filter/edit raw sequences              \
                      3 = within-sample clustering               \
                      4 = estimate pi and e                      \
                      5 = consensus calling                      \
                      6 = cluster consensus                      \
                      7 = align & create output files """)
    parser.add_option('-d',
                      action="store",
                      type="string",
                      dest="dtest",
                      help="""input file for D-test of introgression,
                              can iterate over multiple samples """)
    parser.add_option('-n',
                      action="store_true",
                      dest="newparamsfile",
                      help="""creates a new empty input params.txt file """)
    parser.add_option('-D',
                      action="store_true",
                      dest="newDtestfile",
                      help="""creates a new empty Dtest input file """)

    (options, args) = parser.parse_args()

    if not any([
            options.params, options.dtest, options.newparamsfile,
            options.newDtestfile
    ]):
        print "\n\tmust include option of -p, -d, -D or -n\n"
        sys.exit()

    if options.params:
        sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\
                         ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\
                         ' '*5+'---'*20+'\n\n')

        readin = [
            line.strip().split('##')[0].strip()
            for line in open(options.params).readlines()
        ]
        if "==** " not in str(readin[0]):
            print "\n\twarning: update params input file format to latest version\n"
            sys.exit()

        WORK = str(readin[1])
        GLOB = str(readin[2])
        Bcode = str(readin[3])
        vsearch = str(readin[4])
        muscle = str(readin[5])
        CUT = str(readin[6])
        parallel = int(readin[7])
        mindepth = int(readin[8])
        pN = str(readin[9])
        wclust = str(readin[10])
        datatype = str(readin[11])
        minsamp = int(readin[12])
        maxpoly = str(readin[13])
        outname = str(readin[14])
        ###########################
        ## 15 is separator line
        ###########################
        subset = str(readin[16])
        outgroup = str(readin[17])
        exclude = str(readin[18])
        Floc = str(readin[19])
        try:
            maxmismatch = int(readin[20])
        except (ValueError, IndexError):
            maxmismatch = 1
        try:
            Q = int(readin[21])
        except (ValueError, IndexError):
            Q = 33
        try:
            strict = int(readin[22])
        except (ValueError, IndexError):
            strict = 0
        try:
            E, H = str(readin[23]).strip().split(",")
        except ValueError:
            E = ""
            H = ""
        try:
            maxN = int(readin[24])
        except ValueError:
            maxN = 5
        try:
            maxH = int(readin[25])
        except ValueError:
            maxH = 5
        try:
            haplos = int(readin[26])
        except ValueError:
            haplos = 2
        maxSNP = str(readin[27])
        if maxSNP == "": maxSNP = "99"
        max_inserts = str(readin[28])
        if max_inserts == "": max_inserts = "3"
        try:
            seed = int(readin[29])
        except ValueError:
            seed = 112233
        try:
            overhang = [int(i) for i in str(readin[30]).strip().split(',')]
        except (ValueError, IndexError):
            overhang = [0, 0]
        try:
            outform = str(readin[31])
        except (ValueError, IndexError):
            outform = ""
        try:
            lowcounts = int(readin[32])
        except (ValueError, IndexError):
            lowcounts = mindepth
        ##mergepairs = str(readin[31])
        ##if mergepairs in [0,""]: mergepairs = 0
        try:
            trimkeep = int(readin[33])
        except ValueError:
            trimkeep = 0
        try:
            maxstack = int(readin[34])
        except ValueError:
            maxstack = "2SD"
        try:
            minuniq = int(readin[35])
        except ValueError:
            minuniq = 0
        try:
            hierarch = int(readin[36])
        except ValueError:
            hierarch = 0
        try:
            MASK = int(readin[37])
        except ValueError:
            MASK = 1   ## a blank field defaults to dust masking
        if MASK == 1: MASK = 'dust'
        else: MASK = 'none'
        try:
            threads = int(readin[38])
        except ValueError:
            threads = 6
        ###############################
        ## 39 is separator line
        ###############################
        clustprefix = readin[40:]   ## empty list if the params file is short
        clustprefix = [i for i in clustprefix if i]
        """ expand ./ ~ and ../ designators in location names """
        def expander(namepath):
            if "~" in namepath:
                namepath = namepath.replace("~", os.path.expanduser("~"))
            if "../" in namepath:
                a, b = namepath.split("../")
                namepath = os.path.abspath(
                    os.path.join(os.path.dirname(""), '..', b))
            elif "./" in namepath:
                a, b = namepath.split("./")
                namepath = os.path.abspath("") + "/" + b
            return namepath
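        ## e.g. expander("~/rad/") expands "~" via os.path.expanduser, and
        ## expander("./fastq") resolves to os.path.abspath("")+"/fastq"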

        if WORK == "":
            WORK = os.path.abspath("") + "/"
        else:
            WORK = expander(WORK)
        if WORK[-1] != "/":
            WORK = WORK + "/"
        stripped = 0
        if Floc:
            if Floc[0] == "@":
                stripped = 1
                Floc = expander(Floc[1:])
            else:
                Floc = expander(Floc)
        if GLOB: GLOB = expander(GLOB)
        if Bcode: Bcode = expander(Bcode)
        if vsearch: vsearch = expander(vsearch)
        if options.dtest: options.dtest = expander(options.dtest)
        """ find location of vsearch (or usearch) and muscle """

        def cmd_exists(cmd):
            return subprocess.call("type " + cmd,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE) == 0
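        ## e.g. cmd_exists("muscle") is True when the shell builtin
        ## `type muscle` exits 0, i.e. the binary is on PATH or was
        ## given as a valid path in the params file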

        # " check platform: mac v linux "
        # if 'linux' in sys.platform:
        #     vsearch = "vsearch-1.0.3-linux-x86_64"
        # else:
        #     vsearch = "vsearch-1.0.3-mac-x86_64"

        # " find vsearch and muscle in user's lib/"
        # PYRADPATH = os.path.dirname(os.path.realpath(__file__))
        # vsearch = PYRADPATH+"/lib/"+vsearch
        # muscle = PYRADPATH+"/lib/muscle"

        " threads = 1 for usearch"
        if 'vsearch' not in vsearch:
            threads = 1

        if not cmd_exists(vsearch):
            print "\tcannot find vsearch (or usearch), edit path in param file"
            sys.exit()
        if not cmd_exists(muscle):
            print "\tcannot find muscle, edit path in input file"
            sys.exit()
        """ expand clustprefix cluster groups """
        gids = []
        groups = []
        minhits = []
        "hierarchical clustering "
        for line in clustprefix:
            gid, hits, inds = line.strip().split()
            gids.append(gid)
            minhits.append(hits)
            if "," in inds:
                thisgroup = []
                ii = inds.split(",")
                for i in ii:
                    if "*" in i:
                        expanded = glob.glob(WORK + "clust" + wclust + "/" +
                                             i + ".consens*")
                        thisgroup += expanded
                    else:
                        thisgroup.append(WORK + "clust" + wclust + "/" + i +
                                         ".consens.gz")
                groups.append(thisgroup)
            else:
                if "*" in inds:
                    expanded = glob.glob(WORK + "clust" + wclust + "/" + inds +
                                         ".consens*")
                    groups.append(expanded)
                else:
                    inds = inds.split(",")
                    groups.append([
                        WORK + "clust" + wclust + "/" + i + ".consens.gz"
                        for i in inds
                    ])
            "TODO check for size=1 "
        if not gids:
            gids = ""

        " step of the analysis "
        k = tuple('1234567')
        if options.steps:
            k = tuple(str(options.steps))

        " check that the data type was entered correctly "
        datopts = [
            'rad', 'gbs', 'ddrad', 'pairgbs', 'pairddrad', 'merged', '2brad'
        ]
        if datatype not in datopts:
            print "\t datatype argument (line 11) not recognized "
            sys.exit()
        # if datatype == 'merged':
        #     print "specify mergetype in params file, ex: mergeddrad or mergegbs "
        #     sys.exit()

        " parse max_inserts argument "
        w1 = 3
        w2 = 6
        a1 = a2 = 99
        if 'pair' in datatype:
            if "," in max_inserts:
                wargs = max_inserts.strip().split(",")
                if len(wargs) == 2:
                    w1 = w2 = wargs[0]
                    a1 = a2 = wargs[1]
                elif len(wargs) == 4:
                    w1, w2, a1, a2 = wargs
                else:
                    print "\n\tmax_inserts parameter not recognized. see documentation"
                    sys.exit()
        else:
            if "," in max_inserts:
                w1, a1 = map(int, max_inserts.split(","))

        #########  Begin analysis  ###################################################
        if '1' in k:
            " expand Barcode file name if necessary "
            if "*" in Bcode:
                try:
                    Bcode = glob.glob(Bcode)[0]
                except IndexError:
                    print "\tcould not find barcodes file ", Bcode,
                    "\n\tcomment out line 3 of params file or edit path to barcodes file"
                    sys.exit()
            if Floc:
                print "\tskipping step 1: line 18 of input file shows seqs already sorted"
            else:
                " if directory as input select all inside"
                if GLOB:
                    if GLOB[-1] == "/":
                        GLOB = GLOB + "*"
                sortandcheck2.main(Bcode, GLOB, CUT, datatype, parallel,
                                   maxmismatch, WORK)

        ### step 2 ###################
        if '2' in k:
            if Floc:
                print >> sys.stderr, "\tsorted .fastq from %s being used" % Floc
                if len(glob.glob(Floc)) < 1:
                    sys.stderr.write(
                        "\t... no files found in line 18 location, check required file name formatting\n"
                    )
                    sys.exit()
                FQs = Floc
                if stripped:
                    print "\tbarcode & restriction site are already stripped off of sequences"
                    CUT = ""
                    if strict:
                        print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n"
            else:
                " default location "
                FQs = WORK + "fastq/" + subset + "*.fq.gz"

            " if directory as input select all inside"
            if FQs[-1] == "/":
                FQs = FQs + "*"

            " if not paired filter only read 1 "
            if 'pair' not in datatype:  # in ['rad','ddrad','gbs','merged','2brad']:
                editraw_rads.main(parallel, WORK, FQs, CUT, pN, Q, strict,
                                  trimkeep, datatype)

            else:  #elif datatype in ['pairddrad','pairgbs']:
                " check for both CUT sites in pairddrad"
                if datatype == 'pairddrad':
                    if "," not in CUT:
                        print "\n\tyou must enter two restriction sites for pair ddRAD data"
                        sys.exit()
                editraw_pairs.main(parallel, WORK, FQs, CUT, pN, Q, strict,
                                   trimkeep, datatype)

            #elif "merge" in datatype:
            #    editraw_merges.main(parallel, WORK, FQs, CUT,
            #                       pN, Q, strict, trimkeep)

        ### step 3  ####################
        if '3' in k:
            cluster7dp.main(WORK,
                            parallel,
                            wclust,
                            mindepth,
                            subset,
                            datatype,
                            w1,
                            w2,
                            minuniq,
                            MASK,
                            muscle,
                            vsearch,
                            threads,
                            remake=0)

        ### step 4  ####################
        if '4' in k:
            " if using low depth option still use a reasonable limit for parameter estimates"
            if mindepth < 5:
                tempmindepth = 5
            else:
                tempmindepth = mindepth
            H_err_dp.main(parallel, wclust, tempmindepth, subset, haplos, WORK,
                          CUT, datatype)

        ### step 5  ####################
        if '5' in k:
            if not E:
                try:
                    Pi = open(WORK + "stats/Pi_E_estimate.txt").readlines()
                except IOError:
                    Pi = ""
                if Pi:
                    El = []
                    Hl = []
                    for line in Pi[1:]:
                        try:
                            _, h, e = line.strip().split("\t")
                        except ValueError:
                            continue
                        Hl.append(float(h))
                        El.append(float(e))
                    if len(Hl) == 0:
                        print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt"
                        sys.exit()
                    H = sum(Hl) / len(Hl)
                    E = sum(El) / len(El)
                else:
                    E = 0.001
                    H = 0.01
                    print "\n\tstep 4 values not detected, using E=0.001, H=0.01"
            if 'pair' in datatype:
                " call consensus on each pair separately "
                consens_pairs.main(parallel, float(E), float(H), wclust,
                                   mindepth, subset + "*", maxN, maxH, haplos,
                                   CUT, datatype, lowcounts, strict, WORK,
                                   maxstack)
            else:
                " call consensus on single end clusters "
                consensdp.main(parallel, float(E), float(H), wclust, mindepth,
                               subset + "*", maxN, maxH, haplos, CUT, datatype,
                               lowcounts, strict, WORK, maxstack)

        ### step 6  ####################
        if '6' in k:
            if not hierarch:
                gids = ""
                if "," in subset:
                    inlist = [
                        WORK + "clust" + wclust + "/" + i + ".consens*"
                        for i in subset.strip().split(",")
                    ]
                else:
                    inlist = glob.glob(WORK + "clust" + wclust + "/" + subset +
                                       "*.consens*")
                cluster_cons7_shuf.main(vsearch, wclust, datatype, outgroup,
                                        seed, gids, minhits, inlist, WORK,
                                        MASK, 0)
                print "\n\tfinished clustering"
            else:
                """ re-expand clustprefix cluster groups in case no -s """
                Hgids = []
                Hgroups = {}
                Hminhits = []
                "hierarchical clustering "
                for line in clustprefix:
                    Hgid, Hhits, Hinds = line.strip().split()
                    Hgids.append(Hgid)
                    Hminhits.append(Hhits)
                    Hgroups[Hgid] = []
                    if "," in Hinds:
                        Hinds = Hinds.split(",")
                        for Hind in Hinds:
                            if "*" in Hind:
                                expanded = glob.glob(WORK + "clust" + wclust +
                                                     "/" + Hind + ".consens*")
                                Hgroups[Hgid] += expanded
                            else:
                                Hgroups[Hgid].append(WORK + "clust" + wclust +
                                                     "/" + Hind +
                                                     ".consens.gz")
                    else:
                        if "*" in Hinds:
                            expanded = glob.glob(WORK + "clust" + wclust +
                                                 "/" + Hinds + ".consens*")
                            Hgroups[Hgid] += expanded
                        else:
                            Hgroups[Hgid].append(WORK + "clust" + wclust +
                                                 "/" + Hinds + ".consens.gz")

                for i, j in zip(Hgids, Hminhits):
                    for cons in Hgroups[i]:
                        if cons not in glob.glob(WORK + "clust" + wclust +
                                                 "/*.consens.gz"):
                            print "\n\tsample name", cons, "in group", i, "does not match any filenames"
                            sys.exit()

                preclusts = []
                for i in Hgroups.values():
                    preclusts += i

                for cons in glob.glob(WORK + "clust" + wclust +
                                      "/*.consens.gz"):
                    if cons not in preclusts:
                        print "\n\twarning: sample", cons, "not assigned to a cluster group"

                #if not gids:
                #    gids = ""

                " make prefix directory "
                if not os.path.exists(WORK + 'prefix/'):
                    os.makedirs(WORK + 'prefix')

                ########### TODO ####################################
                # if os.path.exists(WORK+"prefix/cat.clust_.gz"):
                #     print "\tRemaking clusters from existing clustprefix files "+\
                #           "using minmatches: ",minmatch
                #     print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n"
                #
                #     for (gid,minhit,inlist) in zip(gids,minhits,groups):
                #         handle = WORK+"clust"+wclust+"/cat.haplos_"+gid
                #         #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1)
                #     #tier2clust.makeclust(wclust, datatype, WORK)
                #######################################################

                " queue up jobs "
                work_queue = multiprocessing.Queue()
                result_queue = multiprocessing.Queue()

                " submit jobs "
                for (Hgid, Hminhit) in zip(Hgids, Hminhits):
                    inlist = Hgroups[Hgid]
                    work_queue.put([
                        vsearch, wclust, datatype, outgroup, seed, Hgid,
                        Hminhit, inlist, WORK, MASK, 1
                    ])

                " execute first tier jobs "
                jobs = []
                for i in range(parallel):
                    worker = Worker(work_queue, result_queue,
                                    cluster_cons7_shuf.main)
                    jobs.append(worker)
                    worker.start()
                for j in jobs:
                    j.join()

                " cluster second tier "
                tier2clust.main(vsearch, wclust, datatype, Hgids, seed, WORK,
                                MASK)

                print "\n\tfinished clustering\n"

            " cleanup "
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"):
            #    os.remove(ff)
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"):
            #    os.remove(ff)

        if '7' in k:
            if minsamp < 2:
                print "\n\tminimum minCov setting is <2: changing to 2"
                minsamp = 2

            if gids:
                inclustfile = WORK + "prefix/cat.clust_.gz"
            else:
                inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"

            if not os.path.exists(inclustfile):
                #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile)
                #sys.stderr.write("\n\t looking for default full cluster file")
                if os.path.exists(WORK + 'clust' + wclust + "/cat.clust_.gz"):
                    inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"
                    sys.stderr.write("\n\tCluster input file: using \n\t" +
                                     inclustfile + "\n\n")
                else:
                    print "\tnot found"
                    #print "\tcat.clust_ file is selected based on line 15 subset argument "
                    #print "\n\t if you wish to exclude samples from an existing cat.clust file "+\
                    #      "\n\t in your output alignments list exclude names on line 17 of the params file.\n "
                    sys.exit()
            #if any([i in outform for i in ['t','m']]):
            #    if gids:
            #        print "\tgroups for 't' or 'm' outputs:", gids
            taxadict = OrderedDict(zip(gids, groups))
            alignable.main(outgroup, minsamp, outname, inclustfile, maxpoly,
                           parallel, maxSNP, muscle, exclude, overhang,
                           outform, WORK, gids, CUT, a1, a2, datatype, subset,
                           parser.version.split(" ")[1], mindepth, taxadict,
                           minhits, seed, haplos)

        if '8' in k:
            cluster7dp.main(WORK,
                            parallel,
                            wclust,
                            mindepth,
                            subset,
                            datatype,
                            w1,
                            w2,
                            minuniq,
                            MASK,
                            muscle,
                            vsearch,
                            threads,
                            remake=1)

    if options.dtest:
        readin = [line.strip() for line in open(options.dtest).readlines()]

        nboots = int(readin[0].split("##")[0].strip())
        alignfile = str(readin[1].split("##")[0].strip())
        outfile = str(readin[2].split("##")[0].strip())
        ntax = str(readin[3].split("##")[0].strip())
        nproc = int(readin[4].split("##")[0].strip())
        makesort = int(readin[5].split("##")[0].strip())
        makeboots = int(readin[6].split("##")[0].strip())

        tests = []
        for line in readin[8:]:
            if line:
                notes = ""
                if "##" in line:
                    tax, notes = line.strip().split(
                        "##")[0], line.strip().split("##")[-1],
                    if tax:
                        tests.append([tax.strip().split(),
                                      notes.strip()
                                      ])  #.split("\t"),notes.strip()])
                else:
                    tests.append(line.strip().split())  # "\t"))
        if ntax == '4':
            Dtest.main(tests, alignfile, outfile, nboots, nproc, makesort,
                       makeboots)
        elif ntax == 'part':
            Dtest_5.main(tests, alignfile, outfile, nboots, nproc, makesort,
                         makeboots)
        elif ntax == 'foil':
            Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort,
                            makeboots, 0)
        elif ntax == 'foilalt':
            Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort,
                            makeboots, 1)
        else:
            print "error in input file"

    if options.newparamsfile:
        if os.path.exists("./params.txt"):
            print "\tfile params.txt already exists"
            sys.exit()
        else:
            createfile.main(parser.version.split(" ")[1])

    if options.newDtestfile:
        outstring = """200                          ## N bootstrap replicates
test.loci                    ## loc/path to input .loci file
dstats/test1_res             ## output file path/name (no suffix)
4                            ## which test: 4,part,foil,foilalt
2                            ## N cores (execute jobs [lines below] in parallel)
0                            ## output ABBA/BABA loci to files (0=no, 1=yes, 2=verbose)
0                            ## output bootstrap Ds to files (0=no,1=yes)
-----------------------------------------------------------\n"""
        sys.stdout.write(outstring)
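
Note: the taxon test lines that follow the "-----" separator in this template are parsed by the dtest branch above as whitespace-separated taxa with an optional "##" note. A hypothetical 4-taxon line:

sample1  sample2  sample3  outg   ## optional note carried through to output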
Example #19
def writefunc(GLOB,Parallel,Bcode,CUT,datatype,maxmismatch,WORK):
    "create barcode dictionary"
    codetable = open(Bcode, 'r')
    codes = [line.strip().split() for line in codetable.readlines()]
    C = {}
    for line in codes:
        if line:
            C[line[1].strip().upper()] = line[0]

    " find longest barcode "
    keylens = map(len,C.keys())
    if len(set(keylens)) == 1:
        longB = (keylens[0],'same')
    else:
        longB = (max(keylens),'diff')

    " check for CUT in barcodes "
    CCC = unambig(CUT)
    if len(CCC)>1:
        for cut in CCC:
            if any([cut in i for i in C.keys()]):
                print "\n\twarning: CUT site matches within one of the barcodes, "+\
                "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
    else:
        if any([CUT in i for i in C.keys()]):
            print "\n\twarning: CUT site matches within one of the barcodes, "+\
            "I suggest double \n\tchecking the file to make sure it properly demultiplexes"

    " read in sequence files "
    if len(glob.glob(GLOB)) > 1:
        FS = [f for f in glob.glob(GLOB)]
    else:
        FS = glob.glob(GLOB)
    if 'pair' in datatype:
        Raws = combinefiles(GLOB)
    else:
        Raws = FS

    "send jobs to multiprocess queue"
    num = 0
    work_queue = multiprocessing.Queue()
    submitted = 0
    for fs in Raws:
        if 'pair' in datatype:
            work_queue.put([C, [fs[0],fs[1]], CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        else:
            work_queue.put([C, fs, CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        num += 1

    result_queue = multiprocessing.Queue()

    "spawn workers, give function"
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, barmatch)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    Ms = {}

    if len(glob.glob(WORK+"fastq/.*.pickle")) > 1:
        for pick in glob.glob(WORK+"fastq/.*.pickle"):
            pickin = open(pick, "rb")
            M = pickle.load( pickin )
            pickin.close()
            for key in M:
                if key not in Ms:
                    Ms[key] = M[key]
                else:
                    Ms[key] += M[key]
            os.remove(pick)
    elif len(glob.glob(WORK+"fastq/.*.pickle")) == 1:
        pick = glob.glob(WORK+"fastq/.*.pickle")[0]
        pickin = open(pick, 'rb')
        Ms = pickle.load( pickin )
        pickin.close()
        os.remove(pick)
    else:
        print "\nno stats file generated"

    Mkeys = Ms.keys()
    Mkeys.sort(key=lambda x: Ms[x], reverse=True)

    statout = open(WORK+"stats/s1.sorting.txt",'a')
    statout.write("\n\n")
    statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")

    Cnames = C.keys()
    Cnames.sort()
    try: maxl = max(map(len,map(str,Ms.values())))
    except ValueError: maxl = 5

    hits = []
    for bar in Cnames:
        for barcode in Mkeys:
            if matching(bar, barcode, maxmismatch):
                print >>statout, "%s    \t%s    \t%s\t%s" % (C[bar], bar, barcode,
                                                             str(Ms[barcode])+" "*(maxl+3-len(str(Ms[barcode]))))
                hits.append(barcode)

    statout.write("\n")
    maxl = max(map(len,Mkeys))
    for barcode in Mkeys:
        if barcode not in hits:
            print >>statout, "nomatch  \t%s    \t%i" % (barcode+" "*(maxl+3-len(barcode)), Ms[barcode])
    statout.close()
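
Note: matching, used above to group observed barcodes under their true barcode, is not defined in this excerpt. A minimal sketch consistent with its use, assuming a Hamming comparison that tolerates up to maxmismatch differing bases (hypothetical reconstruction):

def matching(bar, barcode, maxmismatch):
    " hypothetical: True if barcode differs from bar at <= maxmismatch sites "
    if len(bar) != len(barcode):
        return False
    mismatches = sum(a != b for a, b in zip(bar, barcode))
    return mismatches <= maxmismatch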