Example #1
def main():
    parser = OptionParser(prog="pyRAD", usage="%prog [options]", version="%prog 3.0.6")
    parser.add_option('-p', action="store", type="string", dest="params",
                      help="input file for within sample filtering and clustering\n")
    parser.add_option('-s', action="store", dest="steps",
                      help="""perform step-wise parts of within analysis\n
                      1 = barcode sorting                        \
                      2 = filter/edit raw sequences              \
                      3 = within-sample clustering               \
                      4 = estimate pi and e                      \
                      5 = consensus calling                      \
                      6 = cluster consensus                      \
                      7 = align & create output files """ )
    parser.add_option('-d', action="store", type="string", dest="dtest",
                      help="""input file for D-test of introgression,
                              can iterate over multiple samples """ )
    parser.add_option('-n', action="store_true", dest="newparamsfile",
                      help="""creates a new empty input params.txt file """ )
    parser.add_option('-D', action="store_true", dest="newDtestfile",
                      help="""creates a new empty Dtest input file """ )


    (options, args) = parser.parse_args()

    if not any([options.params,options.dtest,options.newparamsfile,options.newDtestfile]):
        print "\n\tmust include option of -p, -d, -D or -n\n"
        sys.exit()

    if options.params:
        sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\
                         ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\
                         ' '*5+'---'*20+'\n\n')
        
        readin = [line.strip().split('##')[0].strip() for line in open(options.params).readlines()]
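        ## params.txt is read positionally: anything after "##" on a line is
        ## stripped as a comment, and values are pulled by line index below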
        if "==** " not in str(readin[0]):
            print "\n\twarning: update params input file format to latest version\n"; sys.exit()

        WORK     = str(readin[1])
        GLOB     = str(readin[2])
        Bcode    = str(readin[3])
        vsearch  = str(readin[4])
        muscle   = str(readin[5])
        CUT      = str(readin[6])  
        parallel = int(readin[7])
        mindepth = int(readin[8])
        pN       = str(readin[9])    
        wclust   = str(readin[10])   
        datatype = str(readin[11])   
        minsamp  = int(readin[12])
        maxpoly  = str(readin[13])
        outname  = str(readin[14])
        ###########################
        ## 15 is separator line
        ###########################
        subset   = str(readin[16])
        outgroup = str(readin[17])
        exclude  = str(readin[18])
        Floc     = str(readin[19])
        try: maxmismatch = int(readin[20])
        except (ValueError,IndexError): maxmismatch = 1
        try: Q = int(readin[21])
        except (ValueError,IndexError): Q = 33
        try: strict     = int(readin[22])
        except (ValueError, IndexError): strict = 0
        try: E,H      = str(readin[23]).strip().split(",")
        except ValueError: E = ""; H = ""
        try: maxN     = int(readin[24])
        except ValueError: maxN = 5
        try: maxH     = int(readin[25])
        except ValueError: maxH = 5
        try: haplos   = int(readin[26])
        except ValueError: haplos = 2
        maxSNP   = str(readin[27])
        if maxSNP == "": maxSNP = "99"
        max_inserts = str(readin[28])
        if max_inserts == "": max_inserts = "3"
        try: seed     = int(readin[29])
        except ValueError: seed = 112233
        try: overhang    = [int(i) for i in str(readin[30]).strip().split(',')]
        except (ValueError,IndexError): overhang = [0,0]
        try: outform   = str(readin[31])
        except (ValueError,IndexError): outform = ""
        try: lowcounts   = int(readin[32])
        except (ValueError, IndexError): lowcounts = mindepth
        ##mergepairs = str(readin[31])
        ##if mergepairs in [0,""]: mergepairs = 0
        try: trimkeep = int(readin[33])
        except ValueError: trimkeep = 0
        try: maxstack = int(readin[34])  
        except ValueError: maxstack = "2SD"
        try: minuniq = int(readin[35])  
        except ValueError: minuniq = 0
        try: hierarch = int(readin[36])  
        except ValueError: hierarch = 0
        try: MASK = int(readin[37])
        except ValueError: MASK = 1   ## default to dust masking
        if MASK == 1: MASK = 'dust'
        else: MASK = 'none'
        try: threads = int(readin[38])
        except ValueError: threads = 6
        ###############################
        ## 39 is separator line
        ###############################
        clustprefix = [i for i in readin[40:] if i]
        

        """ expand ./ ~ and ../ designators in location names """
        def expander(namepath):
            if "~" in namepath:
                namepath = namepath.replace("~",os.path.expanduser("~"))
            if "../" in namepath:
                a,b = namepath.split("../")
                namepath = os.path.abspath(os.path.join(os.path.dirname( "" ), '..', b))
            elif "./" in namepath:
                a,b = namepath.split("./")
                namepath = os.path.abspath("")+"/"+b
            return namepath
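        ## illustrative behavior (hypothetical paths): expander("~/data")
        ## gives "/home/<user>/data", expander("./raw") gives "<cwd>/raw",
        ## and absolute paths pass through unchanged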
            
        if WORK == "":
            WORK = os.path.abspath("")+"/"
        else:
            WORK = expander(WORK) 
        if WORK[-1] != "/":
            WORK = WORK+"/"
        stripped = 0
        if Floc:
            if Floc[0] == "@":
                stripped = 1
                Floc = expander(Floc[1:])
            else:
                Floc = expander(Floc)
        if GLOB:   GLOB = expander(GLOB)
        if Bcode:  Bcode = expander(Bcode)
        if vsearch: vsearch = expander(vsearch)
        if options.dtest: options.dtest = expander(options.dtest)

        """ find location of vsearch (or usearch) and muscle """
        def cmd_exists(cmd):
            return subprocess.call("type " + cmd, shell=True, 
                                   stdout=subprocess.PIPE, stderr=subprocess.PIPE) == 0
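        ## e.g. cmd_exists("muscle") is True only when `type muscle` exits 0
        ## in a shell, i.e. the name (or the full path given in the params
        ## file) resolves to an executable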

        # " check platform: mac v linux "
        # if 'linux' in sys.platform:
        #     vsearch = "vsearch-1.0.3-linux-x86_64"
        # else:
        #     vsearch = "vsearch-1.0.3-mac-x86_64"

        # " find vsearch and muscle in user's lib/"
        # PYRADPATH = os.path.dirname(os.path.realpath(__file__))
        # vsearch = PYRADPATH+"/lib/"+vsearch
        # muscle = PYRADPATH+"/lib/muscle"

        " threads = 1 for usearch"
        if 'vsearch' not in vsearch:
            threads = 1
    
        if not cmd_exists(vsearch):
            print "\tcannot find vsearch (or usearch), edit path in param file"
            sys.exit()
        if not cmd_exists(muscle):
            print "\tcannot find muscle, edit path in input file"
            sys.exit()

        """ expand clustprefix cluster groups """
        gids = []
        groups = []
        minhits = []
        "hierarchical clustering "
        for line in clustprefix:
            gid, hits, inds = line.strip().split()
            gids.append(gid)
            minhits.append(hits)
            if "," in inds:
                thisgroup = []
                ii = inds.split(",")
                for i in ii:
                    if "*" in i:
                        expanded = glob.glob(WORK+"clust"+wclust+"/"+i+".consens*")
                        thisgroup.extend(expanded)
                    else:
                        thisgroup.append(WORK+"clust"+wclust+"/"+i+".consens.gz")
                groups.append(thisgroup)
            else:
                if "*" in inds:
                    expanded = glob.glob(WORK+"clust"+wclust+"/"+inds+".consens*")
                    groups.append(expanded)
                else:
                    groups.append([WORK+"clust"+wclust+"/"+inds+".consens.gz"])
            "TODO check for size=1 "
        if not gids:
            gids = ""


        " step of the analysis "
        k = tuple('1234567')
        if options.steps:
            k = tuple(str(options.steps))

        " check that the data type was entered correctly "
        datopts = ['rad','gbs','ddrad','pairgbs','pairddrad','merged','2brad']
        if datatype not in datopts:
            print "\t datatype argument (line 11) not recognized "
            sys.exit()
        # if datatype == 'merged':
        #     print "specify mergetype in params file, ex: mergeddrad or mergegbs "
        #     sys.exit()

        " parse max_inserts argument "
        w1=3
        w2=6
        a1=a2=99
        if 'pair' in datatype:
            if "," in max_inserts:
                wargs = max_inserts.strip().split(",")
                if len(wargs) == 2:
                    w1 = w2 = int(wargs[0])
                    a1 = a2 = int(wargs[1])
                elif len(wargs) == 4:
                    w1,w2,a1,a2 = map(int, wargs)
                else:
                    print "\n\tmax_inserts parameter not recognized. see documentation"
                    sys.exit()
        else:
            if "," in max_inserts:
                w1,a1 = map(int,max_inserts.split(","))


        #########  Begin analysis  ###################################################
        if '1' in k:
            " expand Barcode file name if necessary "
            if "*" in Bcode:
                try: Bcode = glob.glob(Bcode)[0]
                except IndexError:
                    print "\tcould not find barcodes file ",Bcode,
                    "\n\tcomment out line 3 of params file or edit path to barcodes file"
                    sys.exit()
            if Floc:
                print "\tskipping step 1: line 18 of input file shows seqs already sorted"
            else:
                " if directory as input select all inside"
                if GLOB:
                    if GLOB[-1] == "/":
                        GLOB = GLOB+"*"
                sortandcheck2.main(Bcode,GLOB,CUT,datatype,parallel,maxmismatch,WORK)


        ### step 2 ###################
        if '2' in k:
            if Floc:
                print >>sys.stderr, "\tsorted .fastq from %s being used" % Floc
                if len(glob.glob(Floc))<1:
                    sys.stderr.write("\t... no files found in line 18 location, check required file name formatting\n")
                    sys.exit()
                FQs = Floc
                if stripped:
                    print "\tbarcode & restriction site are already stripped off of sequences"
                    CUT = ""
                    if strict:
                        print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n"
            else:
                " default location "
                FQs = WORK+"fastq/"+subset+"*.fq.gz"

            " if directory as input select all inside"
            if FQs[-1] == "/":
                FQs = FQs+"*"

            " if not paired filter only read 1 "
            if 'pair' not in datatype:  # in ['rad','ddrad','gbs','merged','2brad']:
                editraw_rads.main(parallel, WORK, FQs, CUT,
                                  pN, Q, strict, trimkeep, datatype)

            else:   #elif datatype in ['pairddrad','pairgbs']:
                " check for both CUT sites in pairddrad"
                if datatype == 'pairddrad':
                    if "," not in CUT:
                        print "\n\tyou must enter two restriction sites for pair ddRAD data"
                        sys.exit()
                editraw_pairs.main(parallel, WORK, FQs, CUT, 
                                   pN, Q, strict, trimkeep, datatype)

            #elif "merge" in datatype:
            #    editraw_merges.main(parallel, WORK, FQs, CUT,
            #                       pN, Q, strict, trimkeep)



        ### step 3  ####################
        if '3' in k:
            cluster7dp.main(WORK, parallel, wclust, mindepth,
                            subset, datatype, w1, w2, minuniq,
                            MASK, muscle, vsearch, threads, remake=0)


        ### step 4  ####################
        if '4' in k:
            " if using low depth option still use a reasonable limit for parameter estimates"
            if mindepth < 5:
                tempmindepth = 5
            else:
                tempmindepth = mindepth
            H_err_dp.main(parallel, wclust, tempmindepth, subset,
                          haplos, WORK, CUT, datatype)


        ### step 5  ####################
        if '5' in k:
            if not E:
                try: Pi = open(WORK+"stats/Pi_E_estimate.txt").readlines()
                except IOError: Pi = ""
                if Pi:
                    El = []
                    Hl = []
                    for line in Pi[1:]:
                        try: _,h,e = line.strip().split("\t")
                        except ValueError:
                            continue
                        Hl.append(float(h))
                        El.append(float(e))
                    if len(Hl) == 0:
                        print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt"
                        sys.exit()
                    H = sum(Hl)/len(Hl)
                    E = sum(El)/len(El)
                else:
                    E = 0.001
                    H = 0.01
                    print "\n\tstep 4 values not detected, using E=0.001, H=0.01"
            if 'pair' in datatype:
                " call consensus on each pair separately "
                consens_pairs.main(parallel, float(E), float(H), wclust, mindepth, subset+"*",
                                   maxN, maxH, haplos, CUT, datatype,
                                   lowcounts, strict, WORK, maxstack)
            else:
                " call consensus on single end clusters "
                consensdp.main(parallel, float(E), float(H), wclust, mindepth, subset+"*",
                               maxN, maxH, haplos, CUT, datatype,
                               lowcounts, strict, WORK, maxstack)


        ### step 6  ####################
        if '6' in k:
            if not hierarch:
                gids = ""
                if "," in subset:
                    inlist = [WORK+"clust"+wclust+"/"+i+".consens*" for i in subset.strip().split(",")]
                else:
                    inlist = glob.glob(WORK+"clust"+wclust+"/"+subset+"*.consens*")
                cluster_cons7_shuf.main(vsearch, wclust, datatype, 
                                        outgroup, seed, gids, minhits, 
                                        inlist, WORK, MASK, 0)
                print "\n\tfinished clustering"
            else:
                """ re-expand clustprefix cluster groups in case no -s """
                Hgids = []
                Hgroups = {}
                Hminhits = []
                "hierarchical clustering "
                for line in clustprefix:
                    Hgid, Hhits, Hinds = line.strip().split()
                    Hgids.append(Hgid)
                    Hminhits.append(Hhits)
                    Hgroups[Hgid] = []
                    if "," in Hinds:
                        Hinds = Hinds.split(",")
                        for Hind in Hinds:
                            if "*" in Hind:
                                expanded = glob.glob(WORK+"clust"+wclust+"/"+Hind+".consens*")
                                Hgroups[Hgid] += expanded #.append(expanded)
                            else:
                                Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hind+".consens.gz")
                    else:
                        if "*" in Hinds:
                            expanded = glob.glob(WORK+"clust"+wclust+"/"+Hinds+".consens*")
                            Hgroups[Hgid] += expanded #.append(expanded)
                        else:
                            Hgroups[Hgid].append(WORK+"clust"+wclust+"/"+Hinds+".consens.gz")

                for i in Hgids:
                    for cons in Hgroups[i]:
                        if cons not in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"):
                            print "\n\tsample name",cons,"in group",i,"does not match any filenames"
                            sys.exit()

                preclusts = []
                for i in Hgroups.values():
                    preclusts += i

                for cons in glob.glob(WORK+"clust"+wclust+"/*.consens.gz"):
                    if cons not in preclusts:
                        print "\n\twarning: sample",cons,"not assigned to a cluster group"

                #if not gids:
                #    gids = ""
                    
                " make prefix directory "
                if not os.path.exists(WORK+'prefix/'):
                    os.makedirs(WORK+'prefix')


                ########### TODO ####################################
                # if os.path.exists(WORK+"prefix/cat.clust_.gz"):
                #     print "\tRemaking clusters from existing clustprefix files "+\
                #           "using minmatches: ",minmatch
                #     print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n"
                #    
                #     for (gid,minhit,inlist) in zip(gids,minhits,groups):
                #         handle = WORK+"clust"+wclust+"/cat.haplos_"+gid
                #         #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1)
                #     #tier2clust.makeclust(wclust, datatype, WORK)
                #######################################################

                " queue up jobs "
                work_queue = multiprocessing.Queue()
                result_queue = multiprocessing.Queue()

                " submit jobs "
                for (Hgid,Hminhit) in zip(Hgids,Hminhits):
                    inlist = Hgroups[Hgid]
                    work_queue.put([vsearch, wclust, datatype, 
                                    outgroup, seed,
                                    Hgid, Hminhit, inlist,
                                    WORK, MASK, 1 ])
                        
                " execute first tier jobs "    
                jobs = []
                for i in range(parallel):
                    worker = Worker(work_queue, result_queue, cluster_cons7_shuf.main)
                    jobs.append(worker)
                    worker.start()
                for j in jobs:
                    j.join()

                " cluster second tier "
                tier2clust.main(vsearch, wclust, datatype,
                                Hgids, seed, WORK, MASK)

                print "\n\tfinished clustering\n"

            " cleanup "
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"):
            #    os.remove(ff)
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"):
            #    os.remove(ff)


        if '7' in k:
            if minsamp < 2:
                print "\n\tminimum minCov setting is <2: changing to 2"
                minsamp = 2
                
            if gids:
                inclustfile = WORK+"prefix/cat.clust_.gz"
            else:
                inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz"

            if not os.path.exists(inclustfile):
                #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile)
                #sys.stderr.write("\n\t looking for default full cluster file")
                if os.path.exists(WORK+'clust'+wclust+"/cat.clust_.gz"):
                    inclustfile = WORK+'clust'+wclust+"/cat.clust_.gz"
                    sys.stderr.write("\n\tCluster input file: using \n\t"+inclustfile+"\n\n")
                else:
                    print "\tnot found"
                    #print "\tcat.clust_ file is selected based on line 15 subset argument "
                    #print "\n\t if you wish to exclude samples from an existing cat.clust file "+\
                    #      "\n\t in your output alignments list exclude names on line 17 of the params file.\n "
                    sys.exit()
            #if any([i in outform for i in ['t','m']]):
            #    if gids:
            #        print "\tgroups for 't' or 'm' outputs:", gids
            taxadict = OrderedDict(zip(gids,groups))
            alignable.main(outgroup, minsamp, outname,
                           inclustfile, maxpoly, parallel,
                           maxSNP, muscle, exclude, overhang,
                           outform, WORK, gids, CUT,
                           a1, a2, datatype, subset,
                           parser.version.split(" ")[1],
                           mindepth, taxadict, minhits, seed, haplos)

        if '8' in k:
            cluster7dp.main(WORK, parallel, wclust, mindepth,
                            subset, datatype, w1, w2, minuniq,
                            MASK, muscle, vsearch, threads, remake=1)

    if options.dtest:
        readin = [line.strip() for line in open(options.dtest).readlines()]

        nboots =    int(readin[0].split("##")[0].strip())
        alignfile = str(readin[1].split("##")[0].strip())
        outfile   = str(readin[2].split("##")[0].strip())
        ntax =      str(readin[3].split("##")[0].strip())
        nproc =     int(readin[4].split("##")[0].strip())
        makesort =  int(readin[5].split("##")[0].strip())
        makeboots = int(readin[6].split("##")[0].strip())
        
        tests = []
        for line in readin[8:]:
            if line:
                notes = ""
                if "##" in line:
                    tax,notes = line.strip().split("##")[0], line.strip().split("##")[-1], 
                    if tax:
                        tests.append([tax.strip().split(), notes.strip()])   #.split("\t"),notes.strip()])
                else:
                    tests.append(line.strip().split()) # "\t"))
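        ## illustrative test lines (hypothetical sample names): whitespace-
        ## separated taxon names, bracketed lists to pool samples, and an
        ## optional "## note" suffix, e.g.
        ##   sampA sampB sampC sampOut              ## a simple 4-taxon test
        ##   [sampA1,sampA2] sampB sampC sampOut    ## pooled P1 samples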
        if ntax == '4':
            Dtest.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots)
        elif ntax == 'part':
            Dtest_5.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots)
        elif ntax == 'foil':
            Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,0)
        elif ntax == 'foilalt':
            Dtest_foil.main(tests,alignfile,outfile,nboots,nproc,makesort,makeboots,1)
        else:
            print "error in input file"

    if options.newparamsfile:
        if os.path.exists("./params.txt"):
            print "\tfile params.txt already exists"
            sys.exit()
        else:
            createfile.main(parser.version.split(" ")[1])

    if options.newDtestfile:
        outstring = """200                          ## N bootstrap replicates
test.loci                    ## loc/path to input .loci file
dstats/test1_res             ## output file path/name (no suffix)
4                            ## which test: 4,part,foil,foilalt
2                            ## N cores (execute jobs [lines below] in parallel)
0                            ## output ABBA/BABA loci to files (0=no,1,2=verbose)
0                            ## output bootstrap Ds to files (0=no,1=yes)
-----------------------------------------------------------\n"""
        sys.stdout.write(outstring)
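
# The Worker class used throughout these examples is defined elsewhere in
# pyRAD. A minimal sketch consistent with how it is called here (an
# assumption, not pyRAD's actual implementation): a multiprocessing.Process
# subclass that drains argument lists from work_queue, applies the given
# function, and pushes each return value onto result_queue.
import multiprocessing
import Queue

class Worker(multiprocessing.Process):
    def __init__(self, work_queue, result_queue, func):
        multiprocessing.Process.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.func = func

    def run(self):
        while True:
            try:
                args = self.work_queue.get(block=False)
            except Queue.Empty:
                break
            self.result_queue.put(self.func(*args))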
Example #2
def writefunc(GLOB, Parallel, Bcode, CUT, datatype, maxmismatch, WORK):
    "create barcode dictionary"
    codetable = open(Bcode, 'r')
    codes = [line.strip().split() for line in codetable.readlines()]
    C = {}
    for line in codes:
        if line[0]:
            C[line[1].strip().upper()] = line[0]

    " find longest barcode "
    keylens = map(len, C.keys())
    if len(set(keylens)) == 1:
        longB = (keylens[0], 'same')
    else:
        longB = (max(keylens), 'diff')

    " check for CUT in barcodes "
    CCC = unambig(CUT)
    cuts = CCC if len(CCC) > 1 else [CUT]
    for cut in cuts:
        if any([cut in bar for bar in C.keys()]):
            print "\n\twarning: CUT site matches within one of the barcodes, "+\
                  "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
            break

    " read in sequence files "
    FS = glob.glob(GLOB)
    if 'pair' in datatype:
        Raws = combinefiles(GLOB)
    else:
        Raws = FS

    "send jobs to multiprocess queue"
    num = 0
    work_queue = multiprocessing.Queue()
    submitted = 0
    for fs in Raws:
        if 'pair' in datatype:
            work_queue.put([
                C, [fs[0], fs[1]], CUT, datatype, num, maxmismatch, WORK, longB
            ])
            submitted += 1
        else:
            work_queue.put(
                [C, fs, CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        num += 1

    result_queue = multiprocessing.Queue()

    "spawn workers, give function"
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, barmatch)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    Ms = {}

    if len(glob.glob(WORK + "fastq/.*.pickle")) > 1:
        for pick in glob.glob(WORK + "fastq/.*.pickle"):
            pickin = open(pick, "rb")
            M = pickle.load(pickin)
            pickin.close()
            for key in M:
                if key not in Ms:
                    Ms[key] = M[key]
                else:
                    Ms[key] += M[key]
            os.remove(pick)
    elif len(glob.glob(WORK + "fastq/.*.pickle")) == 1:
        pick = glob.glob(WORK + "fastq/.*.pickle")[0]
        pickin = open(pick, 'rb')
        Ms = pickle.load(pickin)
        pickin.close()
        os.remove(pick)
    else:
        print "\nno stats file generated"

    Mkeys = Ms.keys()
    Mkeys.sort(key=lambda x: Ms[x], reverse=True)

    statout = open(WORK + "stats/s1.sorting.txt", 'a')
    statout.write("\n\n")
    statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")

    Cnames = C.keys()
    Cnames.sort()
    try:
        maxl = max(map(len, map(str, Ms.values())))
    except ValueError:
        maxl = 5

    hits = []
    for bar in Cnames:
        for barcode in Mkeys:
            if matching(bar, barcode, maxmismatch):
                print >> statout, "%s    \t%s    \t%s\t%s" % (
                    C[bar], bar, barcode, str(Ms[barcode]) + " " *
                    (maxl + 3 - len(str(Ms[barcode]))))
                hits.append(barcode)

    statout.write("\n")
    maxl = max(map(len, Mkeys))
    for barcode in Mkeys:
        if barcode not in hits:
            print >> statout, "nomatch  \t%s    \t%i" % (
                barcode + " " * (maxl + 3 - len(barcode)), Ms[barcode])
    statout.close()
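
# matching() and unambig() are defined elsewhere in pyRAD. A minimal sketch
# of the matching() behavior relied on above (an assumption, not the actual
# implementation): two barcodes match when they have the same length and
# differ at no more than maxmismatch positions.
def matching(bar1, bar2, maxmismatch):
    if len(bar1) != len(bar2):
        return 0
    return sum([a != b for a, b in zip(bar1, bar2)]) <= maxmismatch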
Example #3
def main(WORK, parallel, wclust, mindepth,
         subset, datatype, w1, w2, minuniq,
         MASK, muscle, vsearch, threads, remake):

    " find .edit files in edits/ directory "
    if not os.path.exists(WORK+'edits/'):
        print "\terror: could not find edits/ folder in working directory"
        sys.exit()

    " make output folder for clusters" 
    if not os.path.exists(WORK+'clust'+wclust):
        os.makedirs(WORK+'clust'+wclust)
    outfolder = WORK+'clust'+str(wclust)
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')

    " remake option... in development"
    if remake:
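        ## drop the last (possibly truncated) line from each .u file so
        ## clustering can be resumed; keep a .backup copy of the original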
        for ufile in glob.glob(outfolder+"/*.u"):
            cmd = "/bin/sed '$d' < " + ufile + " > tempfile"
            os.system(cmd)
            cmd = "/bin/mv "+ufile+" "+ufile+".backup"
            os.system(cmd)
            cmd = "/bin/mv tempfile "+ufile
            os.system(cmd)

    FS = []

    " if not only 1 sample "
    if len(glob.glob(WORK+"edits/"+subset+"*.edit*")) > 1:  
        for f in glob.glob(WORK+"edits/"+subset+"*.edit*"):
            " append files to list if not already clustered or empty"
            if not os.path.exists(outfolder+"/"+f.replace(".edit",".clustS.gz")):
                size = os.stat(f)
                if size.st_size > 0:
                    FS.append(f)
                else:
                    print "excluding "+str(f)+" file is empty"
            else:
                print f.replace(".edit",".clustS")+" already exists"
        " arranges files by decreasing size for fast clustering order"
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i],statinfo.st_size
        FS.sort(key=operator.itemgetter(1), reverse = True)
        FS = [i[0] for i in FS]

    elif len(glob.glob(WORK+"edits/"+subset+"*.edit*")) == 1:
        f = glob.glob(WORK+"edits/"+subset+"*.edit*")
        size = os.stat(f[0])
        if size.st_size > 0:
            FS = f
        else:
            print "excluding "+f[0]+" file is empty"
    else:
        print "\tNo .edit files found in edits/ dir."

    sys.stderr.write("\n\tde-replicating files for clustering...\n")

    """ do not split big files if using 64-bit Usearch,
    or if using Vsearch, else do it to avoid 4GB limit of 32-bit usearch"""

    if "vsearch" not in vsearch:
        print '\n\tsplitting big files'
        splitbigfilesforderep(FS, vsearch, datatype, minuniq)

    " load work queue"
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()

    " perform function 'final' on files in FS list "
    submitted = {}
    fileno = 1

    if not remake:
        if threads == 0:
            nthreads = 'all'
        else:
            nthreads = threads
        np = min(parallel, len(FS))
        sys.stderr.write("\n\tstep 3: within-sample clustering of "+\
                         str(len(FS))+" samples at \n\t        "+str(wclust)+\
                         " similarity. Running "+str(np)+" parallel jobs\n\t"+\
                         " \twith up to "+str(nthreads)+" threads per job."+\
                         " If needed, \n\t\tadjust to avoid CPU and MEM limits\n\n")
    else:
        sys.stderr.write("\n\tstep 3: rebuilding clusters from unfinished step 3 files\n")

    for handle in FS:
        if outfolder+"/"+handle.split("/")[-1].replace(".edit",".clustS.gz") not in glob.glob(outfolder+"/*"):
            work_queue.put([vsearch,outfolder,handle,wclust,mindepth,
                            parallel,muscle,datatype,fileno, w1, w2, 
                            WORK, minuniq, MASK, threads, remake])
            submitted[handle] = 1
            fileno += 1
        else:
            print "\tskipping "+handle.split("/")[-1].replace(".edit",".clustS.gz")+\
                  ' already exists in '+WORK+outfolder.split("/")[-1]

    " create a queue to pass to workers to store the results"
    jobs = []
    for i in range( min(len(submitted), parallel) ):
        worker = Worker(work_queue, result_queue, final)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " output statistics on depth of coverage"
    outstats = open(WORK+"stats/s3.clusters.txt",'a')
    print >>outstats, '\n'+'\t'.join(['taxa','total','dpt.me',
                                      'dpt.sd','d>'+str(mindepth-1)+'.tot',
                                      'd>'+str(mindepth-1)+'.me',
                                      'd>'+str(mindepth-1)+'.sd',
                                      'badpairs'])

    RES = []
    HISTO = []
    #for ff in glob.glob(outfolder+"/.temp.*"):
    for ff in FS:
        end = ff.split("/")[-1].replace(".edit","")
        ff = outfolder+"/.temp."+end
        if os.path.exists(ff):
            line = open(ff).readlines()
            RES.append(line[0].strip().split("\t"))
            HISTO.append([line[0].split("\t")[0],"".join(line[1:])])
            os.remove(ff)
    RES.sort(key=lambda x:x[0])
    HISTO.sort(key=lambda x:x[0])
    
    for i in RES:
        print >>outstats, "\t".join(i)
    
    print >>outstats, """
    ## total = total number of clusters, including singletons
    ## dpt.me = mean depth of clusters
    ## dpt.sd = standard deviation of cluster depth
    ## >N.tot = number of clusters with depth greater than N
    ## >N.me = mean depth of clusters with depth greater than N
    ## >N.sd = standard deviation of cluster depth for clusters with depth greater than N
    ## badpairs = mismatched 1st & 2nd reads (only for paired ddRAD data)\n\nHISTOGRAMS\n
    """

    for i in HISTO:
        print >>outstats, "sample: "+i[0]+"\n"+i[1]
    
    
    outstats.close()
    for handle in FS:
        if submitted.get(handle):
            clustfile = outfolder+"/"+handle.split("/")[-1].replace(".edit",".clust.gz")
            if os.path.exists(clustfile):
                subprocess.call("/bin/rm "+clustfile, shell=True)
Example #4
def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort, makeboots):

    " submit jobs to processors "
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in tests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3,o = rep
        if any(["[" in i for i in rep]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3 = p3[1:-1].split(",")
            o =   o[1:-1].split(",")
            taxalist = list(itertools.chain(*[p1+p2+p3+o]))
            if checktaxa(taxalist,alignfile):
                work_queue.put([alignfile,[p1,p2,p3,o],nboots,1, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3,o],alignfile):
                work_queue.put([alignfile,[p1,p2,p3,o],nboots,0, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'

        Notes.append(notes)
    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [pickle.load(open(".save.D4temp"+str(i),'rb')) for i in xrange(submitted)]
    Results.sort(key = lambda x:x[8])

    "setup results file "
    outs = open(outfile+".D4.txt", 'w')
    header = "\t".join([ 'P1'+" "*(namelen[0]-2),
                         'P2'+" "*(namelen[1]-2),
                         'P3'+" "*(namelen[2]-2),
                         'O'+" "*(namelen[3]-1),
                         'D','std(D)','Z',
                         'BABA','ABBA',
                         'nloci','nboot','pdisc', 'notes'])
    print >>outs, header

    for i in range(len(Results)):
        ps,D,STD,Z,nloci,ABBA,BABA,pdisc,sub,ABBAloci,BABAloci,boots = Results[i]
        ps = [str(x).replace("['","[").replace("']","]").replace("', '",",").replace(">","") for x in ps]
        print >>outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (ps[0]+" "*(namelen[0]-len(ps[0])),
                                                                                          ps[1]+" "*(namelen[1]-len(ps[1])),
                                                                                          ps[2]+" "*(namelen[2]-len(ps[2])),
                                                                                          ps[3]+" "*(namelen[3]-len(ps[3])),
                                                                                          D,STD,Z,
                                                                                          BABA,ABBA,
                                                                                          nloci,nboots,
                                                                                          pdisc,Notes[i])



        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles('ABBA',ABBAloci,4,loci,outfile,makesort,sub,ps)
            makesortfiles('BABA',BABAloci,4,loci,outfile,makesort,sub,ps)            

        if makeboots:
            with open(outfile+"_"+str(sub+1)+".boots",'w') as out:
                out.write(",".join(map(str,boots)))

    for oldpickle in glob.glob(".save.D4temp*"):
        os.remove(oldpickle)
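
# A quick way to read one of the bootstrap files written above back into
# Python (the file name here is hypothetical); each .boots file holds a
# single comma-separated line of bootstrap D values:
#   boots = map(float, open("dstats/test1_res_1.boots").read().split(","))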
Example #5
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots,noterminals):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3a,p3b,o = rep
        if all(["[" in i for i in rep[1:]]):
            p1  = p1[1:-1].split(",")
            p2  = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o   = o[1:-1].split(",")
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted, noterminals])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted, noterminals])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted,nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    Results = [result_queue.get() for i in range(submitted)]
    Results.sort(key = lambda x:x[23])   ## sort by submission order (sub)



    " setup results file "
    if noterminals: 
        outs = open(outfile+".Dfoilalt.txt", 'w')
    else:
        outs = open(outfile+".Dfoil.txt", 'w')
    header = "\t".join([ 'p1'+" "*(namelen[0]-2),
                         'p2'+" "*(namelen[1]-2),
                         'p3'+" "*(namelen[2]-2),
                         'p4'+" "*(namelen[3]-2),
                         'O'+" "*(namelen[4]-1),
                         'Dfo','Dil','Dfi','Dol',
                         'Z_fo','Z_il','Z_fi','Z_ol',
                         'BABBA','ABBBA',
                         'BABAA','ABBAA',
                         'BAABA','ABABA',
                         'BBBAA','BBABA',
                         'AABAA','AAABA',
                         'BAAAA','ABAAA',
                         'nloci','sign', 'notes'])
    print >>outs, header

    for i in range(len(Results)):
        L,DFO,ZFO,DIL,ZIL,DFI,ZFI,DOL,ZOL,nloc,BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,pdisc,sub,BBFO,BBIL,BBFI,BBOL = Results[i]
        L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]

        sign = []
        for s,d in zip([ZFO,ZIL,ZFI,ZOL],[DFO,DIL,DFI,DOL]):
            if s>3.5:
                if d>0:
                    sign.append("+")
                else:
                    sign.append("-")
            else:
                sign.append("0")
        #print sign

        resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
                       str(L[1])+" "*(namelen[1]-len(str(L[1]))),
                       str(L[2])+" "*(namelen[2]-len(str(L[2]))),
                       str(L[3])+" "*(namelen[3]-len(str(L[3]))),
                       str(L[4])+" "*(namelen[4]-len(str(L[4]))),
                       DFO,DIL,DFI,DOL,
                       ZFO,ZIL,ZFI,ZOL,
                       BABBA,ABBBA,BABAA,ABBAA,BAABA,ABABA,BBBAA,BBABA,AABAA,AAABA,BAAAA,ABAAA,
                       nloc, "".join(sign), Notes[i]])
        
        print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin 

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            pass   ## per-pattern loci output not implemented for Dfoil here
            # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)            
            # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)            
            # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)

        if makeboots:
            pass   ## bootstrap file output not implemented for Dfoil here
Example #6
def multiproc_it(subtests,alignfile,outfile, nboots,nproc,namelen,makesort,makeboots):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep,notes = rep
        p1,p2,p3a,p3b,o = rep
        if all(["[" in i for i in rep[1:]]):
            p1  = p1[1:-1].split(",")
            p2  = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o   = o[1:-1].split(",")
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 1, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1,p2,p3a,p3b,o],alignfile):
                work_queue.put([alignfile, [p1,p2,p3a,p3b,o], nboots, 0, submitted])
                submitted += 1
            else: 
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted,nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()


    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [pickle.load(open(".save."+str(i),'rb')) for i in range(submitted)]
    Results.sort(key = lambda x:x[15])


    " setup results file "
    outs = open(outfile+".partD.txt", 'w')
    header = "\t".join([ 'p1'+" "*(namelen[0]-2),
                         'p2'+" "*(namelen[1]-2),
                         'p3_1'+" "*(namelen[2]-4),
                         'p3_2'+" "*(namelen[3]-4),
                         'O'+" "*(namelen[4]-1),
                         'D_12','D_1','D_2',
                         'Z_12','Z_1','Z_2',
                         'BABBA','ABBBA',
                         'BABAA','ABBAA',
                         'BAABA','ABABA',
                         'nloci','pdisc', 'notes'])

    print >>outs, header


    for i in range(len(Results)):
        L,D12,Z12,D1,Z1,D2,Z2,nloc,ABBBA,BABBA,ABBAA,BABAA,ABABA,BAABA,pdisc,sub,ABBBAloci,BABBAloci,ABBAAloci,BABAAloci,ABABAloci,BAABAloci,BB12,BB1,BB2 = Results[i]
        L = [str(x).replace("['","[").replace("']","]").replace("', '",",") for x in L]

        resin = tuple([str(L[0])+" "*(namelen[0]-len(str(L[0]))),
                       str(L[1])+" "*(namelen[1]-len(str(L[1]))),
                       str(L[2])+" "*(namelen[2]-len(str(L[2]))),
                       str(L[3])+" "*(namelen[3]-len(str(L[3]))),
                       str(L[4])+" "*(namelen[4]-len(str(L[4]))),
                       D12, D1, D2, Z12, Z1, Z2, 
                       BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA,
                       nloc, pdisc, Notes[i]])
        
        print >>outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin 

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)            
            makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)            
            makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)

        if makeboots:
            with open(outfile+"_"+str(sub+1)+".boots_D12",'w') as out:
                out.write(",".join(map(str,BB12)))
            with open(outfile+"_"+str(sub+1)+".boots_D1",'w') as out:
                out.write(",".join(map(str,BB1)))
            with open(outfile+"_"+str(sub+1)+".boots_D2",'w') as out:
                out.write(",".join(map(str,BB2)))
Example #7
def main(Parallel, E, H, ID, mindepth, subset,
         maxN, maxH, haplos, CUT, datatype,
         lowcounts, strict, WORK, maxstack):

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    " load up work queue"
    work_queue = multiprocessing.Queue()

    " iterate over files"
    outfolder = WORK+'clust'+str(ID)
    HH = glob.glob(outfolder+"/"+subset+".clustS*")
    stringout = "\n\tstep 5: creating consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
    sys.stderr.write(stringout)
    
    if len(HH) > 1:
        " sort files by size"
        for i in xrange(len(HH)):
            statinfo = os.stat(HH[i])
            HH[i] = HH[i],statinfo.st_size
        HH.sort(key=operator.itemgetter(1))
        FS = [f[0] for f in HH][::-1]
    else: FS = HH
    REMOVE = glob.glob(outfolder+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    submitted = 0
    for handle in FS:
        if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
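            ## set an upper bound on stack depth: the mean depth plus 2.5
            ## standard deviations (floored at 500) under the "2SD" default,
            ## otherwise a fixed integer from the params file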
            m,sd = upSD(handle,mindepth)
            if maxstack == "2SD":
                upperSD = max(500,m+(sd*2.5))
            else:
                upperSD = int(maxstack)
            work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
                            haplos,CUT,upperSD,strict,lowcounts])
            submitted += 1
        else:
            print "\tskipping "+handle.replace(".clustS",".consens")+\
                  ', it already exists in '+outfolder+"/"


    " create a queue to pass to workers to store the results"
    result_queue = multiprocessing.Queue()

    " spawn workers"
    jobs = []
    for i in xrange( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, consensus)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " get results"
    stats = open(WORK+'stats/s5.consens.txt','a+')
    print >>stats,  "taxon          \tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
    for i in range(submitted):
        a,b,c,d,e,f,g = result_queue.get()
        print >> stats, "\t".join(map(str,[a.replace(".clustS.gz","")+" "*(10-len(a)),b,c,d,e,f,g]))
    print >>stats, """
    ## nloci = number of loci
    ## f1loci = number of loci with >N depth coverage
    ## f2loci = number of loci with >N depth and passed paralog filter
    ## nsites = number of sites across f loci
    ## npoly = number of polymorphic sites in nsites
    ## poly = frequency of polymorphic sites"""
    stats.close()
Example #8
def multiproc_it(tests, alignfile, outfile, nboots, nproc, namelen, makesort,
                 makeboots):

    " submit jobs to processors "
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in tests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3, o = rep
        if any(["[" in i for i in rep]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3 = p3[1:-1].split(",")
            o = o[1:-1].split(",")
            taxalist = list(itertools.chain(*[p1 + p2 + p3 + o]))
            if checktaxa(taxalist, alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3, o], nboots, 1, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3, o], nboots, 0, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'

        Notes.append(notes)
    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [
        pickle.load(open(".save.D4temp" + str(i), 'rb'))
        for i in xrange(submitted)
    ]
    Results.sort(key=lambda x: x[8])

    "setup results file "
    outs = open(outfile + ".D4.txt", 'w')
    header = "\t".join([
        'P1' + " " * (namelen[0] - 2), 'P2' + " " * (namelen[1] - 2),
        'P3' + " " * (namelen[2] - 2), 'O' + " " * (namelen[3] - 1), 'D',
        'std(D)', 'Z', 'BABA', 'ABBA', 'nloci', 'nboot', 'pdisc', 'notes'
    ])
    print >> outs, header

    for i in range(len(Results)):
        ps, D, STD, Z, nloci, ABBA, BABA, pdisc, sub, ABBAloci, BABAloci, boots = Results[
            i]
        ps = [
            str(x).replace("['",
                           "[").replace("']",
                                        "]").replace("', '",
                                                     ",").replace(">", "")
            for x in ps
        ]
        print >> outs, "%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%d\t%d\t%.2f\t%s" % (
            ps[0] + " " * (namelen[0] - len(ps[0])), ps[1] + " " *
            (namelen[1] - len(ps[1])), ps[2] + " " *
            (namelen[2] - len(ps[2])), ps[3] + " " * (namelen[3] - len(ps[3])),
            D, STD, Z, BABA, ABBA, nloci, nboots, pdisc, Notes[i])

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles('ABBA', ABBAloci, 4, loci, outfile, makesort, sub,
                          ps)
            makesortfiles('BABA', BABAloci, 4, loci, outfile, makesort, sub,
                          ps)

        if makeboots:
            with open(outfile + "_" + str(sub + 1) + ".boots", 'w') as out:
                out.write(",".join(map(str, boots)))

    for oldpickle in glob.glob(".save.D4temp*"):
        os.remove(oldpickle)
Example #9
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
    print >>sys.stderr, "\tstep 2: editing raw reads \n\t",

    " create output directories "
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')
    if not os.path.exists(WORK+'edits'):
        os.makedirs(WORK+'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()
    if len(glob.glob(FQs)) > 1:
        FS = glob.glob(FQs)

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i],statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            finder = WORK+'edits/'+handle.split("/")[-1]
            while finder.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
                finder = finder.replace('.'+finder.split(".")[-1], "").replace("_R1","")
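            ## e.g. "edits/1A_R1.fastq.gz" -> "edits/1A" (illustrative name):
            ## recognized fastq extensions and the _R1 suffix are stripped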
            if finder+".edit" not in glob.glob(WORK+"edits/*"):
                if os.stat(handle).st_size > 0:   ## exclude empty files
                    args = [WORK, handle, CUT, float(pN), trimkeep, strict, Q, datatype]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print "skipping",handle,", file is empty"
            else:
                print "\t"+finder+" already in edits/"

    elif len(glob.glob(FQs)) == 1:
        " if only one file "
        work_queue.put([WORK, glob.glob(FQs)[0], CUT, float(pN), trimkeep, strict, Q, datatype])
        submitted += 1

    else:
        print "\tNo demultiplexed files found. Check path."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()


    " collect the results off the queue "
    outstats = open(WORK+"stats/s2.rawedit.txt",'a')
    print >> outstats, "\t".join(["sample ","Nreads","passed","passed.w.trim","passed.total"])
    STATS = []
    for i in range(submitted):
        STATS.append(result_queue.get())

    STATS.sort(key = lambda x: x[0])
    for i in range(submitted):
        a,b,c,d = STATS[i]
        print >> outstats, "\t".join([a,b,c,d,str(int(c)+int(d))])

    print >>outstats, """
    Nreads = total number of reads for a sample
    passed = retained reads that passed quality filtering at full length
    passed.w.trim= retained reads that were trimmed due to detection of adapters
    passed.total  = total kept reads of sufficient length
    note: you can set the option in params file to include trimmed reads of xx length. """
    outstats.close()
Example #10
def main(Parallel, E, H, ID, mindepth, subset,
         maxN, maxH, ploidy, CUT, datatype,
         lowcounts, strict, WORK, maxstack):

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    " create work queue"
    work_queue = multiprocessing.Queue()

    " iterate over files"
    outfolder = WORK+'clust'+str(ID)
    HH = glob.glob(outfolder+"/"+subset+".clustS*")
    stringout = "\n\tstep 5: created consensus seqs for %i samples, using H=%.5f E=%.5f\n\t" % (len(HH),round(H,5),round(E,5))
    sys.stderr.write(stringout)
    
    if len(HH) > 1:
        " sort files by size"
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            HH[i] = HH[i],statinfo.st_size
        HH.sort(key=operator.itemgetter(1))
        FS = [f[0] for f in HH][::-1]
    else: FS = HH
    REMOVE = glob.glob(outfolder+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    submitted = 0
    for handle in FS:
        if handle.replace('.clustS','.consens').replace('.clust','.consens') not in glob.glob(outfolder+"/*"):
            m,sd = upSD(handle,mindepth)
            if maxstack == "2SD":
                upperSD = max(500,m+(sd*2.5))
            else:
                upperSD = int(maxstack)
            work_queue.put([handle,E,H,mindepth,maxN,maxH,datatype,
                            ploidy,CUT,upperSD,strict,lowcounts])
            submitted += 1
        else:
            print "\tskipping "+handle.replace(".clustS",".consens")+\
                  ', it already exists in '+outfolder+"/"


    " create a queue to pass to workers to store the results"
    result_queue = multiprocessing.Queue()

    " spawn workers"
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, consensus)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " get results"
    stats = open(WORK+'stats/s5.consens.txt','a+')
    print >>stats,  "taxon\tnloci\tf1loci\tf2loci\tnsites\tnpoly\tpoly"
    for i in range(submitted):
        a,b,c,d,e,f,g = result_queue.get()
        nn = a.replace(".clustS.gz","")
        print >> stats, "\t".join(map(str,[nn,b,c,d,e,f,g]))
    print >>stats, """
    ## nloci = number of loci
    ## f1loci = number of loci with >N depth coverage
    ## f2loci = number of loci with >N depth and passed paralog filter
    ## nsites = number of sites across f loci
    ## npoly = number of polymorphic sites in nsites
    ## poly = frequency of polymorphic sites"""
    stats.close()
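
Note: upSD, called above to set the stack-depth ceiling, is not defined in this excerpt. Assuming .clustS.gz files separate clusters with "//" lines and mark read depth with vsearch-style ";size=N;" fields, a sketch of the mean/SD computation it presumably performs (the body is a reconstruction, not the original):

import gzip
import numpy

def upSD(handle, mindepth):
    " hypothetical: mean and SD of per-cluster read depths in a .clustS.gz file "
    depths = []
    for clust in gzip.open(handle, 'rb').read().strip().split("//\n"):
        depth = 0
        for line in clust.split("\n"):
            if line.startswith(">") and ";size=" in line:
                depth += int(line.split(";size=")[1].split(";")[0])
        if depth >= mindepth:
            depths.append(depth)
    return numpy.mean(depths), numpy.std(depths)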
Example #11
def multiproc_it(subtests, alignfile, outfile, nboots, nproc, namelen,
                 makesort, makeboots, noterminals):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3a, p3b, o = rep
        if all(["[" in i for i in rep[1:]]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o = o[1:-1].split(",")
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put([
                    alignfile, [p1, p2, p3a, p3b, o], nboots, 1, submitted,
                    noterminals
                ])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put([
                    alignfile, [p1, p2, p3a, p3b, o], nboots, 0, submitted,
                    noterminals
                ])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    Results = [result_queue.get() for i in range(submitted)]
    Results.sort(key=lambda x: x[23])   ## sort by submission index (sub)

    " setup results file "
    if noterminals:
        outs = open(outfile + ".Dfoilalt.txt", 'w')
    else:
        outs = open(outfile + ".Dfoil.txt", 'w')
    header = "\t".join([
        'p1' + " " * (namelen[0] - 2), 'p2' + " " * (namelen[1] - 2),
        'p3' + " " * (namelen[2] - 2), 'p4' + " " * (namelen[3] - 2),
        'O' + " " * (namelen[4] - 1), 'Dfo', 'Dil', 'Dfi', 'Dol', 'Z_fo',
        'Z_il', 'Z_fi', 'Z_ol', 'BABBA', 'ABBBA', 'BABAA', 'ABBAA', 'BAABA',
        'ABABA', 'BBBAA', 'BBABA', 'AABAA', 'AAABA', 'BAAAA', 'ABAAA', 'nloci',
        'sign', 'notes'
    ])
    print >> outs, header

    for i in range(len(Results)):
        (L, DFO, ZFO, DIL, ZIL, DFI, ZFI, DOL, ZOL, nloc, BABBA, ABBBA,
         BABAA, ABBAA, BAABA, ABABA, BBBAA, BBABA, AABAA, AAABA, BAAAA,
         ABAAA, pdisc, sub, BBFO, BBIL, BBFI, BBOL) = Results[i]
        L = [
            str(x).replace("['", "[").replace("']", "]").replace("', '", ",")
            for x in L
        ]

        sign = []
        for s, d in zip([ZFO, ZIL, ZFI, ZOL], [DFO, DIL, DFI, DOL]):
            if s > 3.5:
                if d > 0:
                    sign.append("+")
                else:
                    sign.append("-")
            else:
                sign.append("0")
        #print sign

        resin = tuple([
            str(L[0]) + " " * (namelen[0] - len(str(L[0]))),
            str(L[1]) + " " * (namelen[1] - len(str(L[1]))),
            str(L[2]) + " " * (namelen[2] - len(str(L[2]))),
            str(L[3]) + " " * (namelen[3] - len(str(L[3]))),
            str(L[4]) + " " * (namelen[4] - len(str(L[4]))), DFO, DIL, DFI,
            DOL, ZFO, ZIL, ZFI, ZOL, BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA,
            BBBAA, BBABA, AABAA, AAABA, BAAAA, ABAAA, nloc, "".join(sign),
            Notes[i]
        ])

        print >> outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%s\t%s" % resin

        if makesort:
            ## sorted-loci output is disabled in this Dfoil variant; the
            ## alignment would be re-read and split into loci here, e.g.:
            # loci = open(alignfile).read().strip().split("|")[:-1]
            # makesortfiles("ABBBA",ABBBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABBA",BABBAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABBAA",ABBAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BABAA",BABAAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("ABABA",ABABAloci,5,loci,outfile,makesort,sub,L)
            # makesortfiles("BAABA",BAABAloci,5,loci,outfile,makesort,sub,L)
            pass

        if makeboots:
            pass
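
Note: the sign column built above codes each D-statistic as significantly positive, significantly negative, or non-significant at |Z| > 3.5. A small worked illustration with made-up values:

Zs = [4.2, 0.9, 3.6, 1.1]
Ds = [0.12, -0.03, -0.08, 0.02]
sign = []
for s, d in zip(Zs, Ds):
    if s > 3.5:
        sign.append("+" if d > 0 else "-")
    else:
        sign.append("0")
print "".join(sign)   ## -> "+0-0"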
Example #12
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):

    print >> sys.stderr, "\n\tstep 2: quality filtering \n\t",

    " create output directories "
    if not os.path.exists(WORK + 'stats'):
        os.makedirs(WORK + 'stats')
    if not os.path.exists(WORK + 'edits'):
        os.makedirs(WORK + 'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()

    " do not select merged or discarded reads if PEAR was used on data"
    FQs = glob.glob(FQs)
    fqs = [
        i for i in FQs
        if not any([j in i for j in ["discarded", ".assembled."]])
    ]

    if len(fqs) > 1:
        " subselect only the first reads "
        if any([".unassembled.forward." in i for i in fqs]):
            FS = [i for i in fqs if '.forward.' in i]
        else:
            FS = [i for i in fqs if '_R1.' in i]

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i], statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            n = handle.split('/')[-1]
            while n.split(".")[-1] in [
                    "fastq", "fastQ", "gz", "fq", "FastQ", "nomerge"
            ]:
                n = n.replace('.' + n.split(".")[-1], "")
            if '.forward.' in n:
                n = n.split(".forward")[0]
            else:
                n = "_".join(n.split('_R')[:-1])
            if WORK + "edits/" + n + ".edit" not in glob.glob(WORK +
                                                              "edits/*"):
                if os.stat(handle).st_size > 0:  ## exclude empty files
                    args = [
                        WORK, handle, CUT,
                        float(pN), trimkeep, strict, Q, datatype
                    ]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print 'skipping', handle, ", file is empty"
            else:
                print "\t" + n + '.edit' + " already in edits/"
    elif len(fqs) == 1:
        " if only one file "
        work_queue.put([
            WORK,
            glob.glob(FQs)[0], CUT,
            float(pN), trimkeep, strict, Q, datatype
        ])
        submitted += 1

    else:
        print "no _paired_ de-multiplexed files found in this location."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " collect the results off the queue "
    outstats = open(WORK + "stats/s2.rawedit.txt", 'a')
    print >> outstats, "\t".join(
        ["sample", "Nreads", "exclude", "trimmed", "passed"])
    for i in range(submitted):
        a, b, c, d = result_queue.get()
        print >> outstats, "\t".join([a, b, str(int(b) - int(d)), c, d])

    print >> outstats, """
    Nreads = total number of reads for a sample
    exclude = reads that were excluded
    trimmed = reads that had adapter trimmed but were kept
    passed = total kept reads
    """
    outstats.close()
Example #13
def main(WORK, UCLUST, FQs, match, Q, Parallel):

    " create output directories " 
    if not os.path.exists(WORK+'fastq/'):
        os.makedirs(WORK+'fastq')
    if not os.path.exists(WORK+'mergedreads'):
        os.makedirs(WORK+'mergedreads')
    if not os.path.exists(WORK+'stats'):
        os.makedirs(WORK+'stats')


    submitted = 0
    work_queue = multiprocessing.Queue()

    names = [i for i in glob.glob(FQs) if "_R1.fq" in i]

    " submit jobs to queue "
    if len(names) > 1:
        for handle in names:
            if "nomerge." not in handle:
                n = str(handle.split('/')[-1]).replace("_R1.",".")
                while n.split(".")[-1] in ["fastq","fastQ","gz","fq","FastQ"]:
                    n = n.replace('.'+n.split(".")[-1], "")
                finder = WORK+'edits/'+n+".edit"
                if finder not in glob.glob(WORK+"edits/*"):
                    if os.stat(handle).st_size > 0:   ## exclude empty files
                        if os.path.exists(handle.replace("_R1.","_R2.")):
                            if not os.path.exists(handle.replace(".fq",".nomerge.fq")):
                                args = [WORK, UCLUST, handle, match, Q]
                                work_queue.put(args)
                                submitted += 1
                            else:
                                print "merge file already created for", handle.split("/")[-1]
                        else:
                            print "cannot find 2nd read file for", handle.split("/")[-1]
                    else:
                        print "skipping", handle, ", file is empty"
                else:
                    print "\t"+finder+" already in edits/"
    else:
        if not names:
            if [i for i in glob.glob(FQs) if "_R1_." in i]:
                print "\n\tfile names should have _R1. not _R1_."
            print "\n\tcannot find input files"
            sys.exit()
        else:
            work_queue.put([WORK, UCLUST, names[0], match, Q])
            submitted += 1

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()


    " spawn workers, give function "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, mergepairs)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    if submitted > 0:
        statout = open(WORK+"stats/s2.mergedreads.txt",'w')
        print >>statout, "\t".join(["taxon","mergedreads"])

        for i in range(submitted):
            stat = result_queue.get()
            a,b = stat
            n = a.strip().split("/")[-1].replace(".nomerge.gz","")
            print >>statout, "\t".join([n,str(b)])
        print >>statout, "\nmerged reads written to", WORK+"mergedreads/ "
        statout.close()
Example #14
def main(Parallel,ID,minsamp,subset,haplos,WORK,CUT,datatype):
    sys.stderr.write("\n\tstep 4: estimating error rate and heterozygosity\n\t")

    " find clust.xx directory "
    if not os.path.exists(WORK+'clust'+ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()


    # warning message for low minsamp
    if minsamp < 5:
        sys.stderr.write("""\n\t warning: Mindepth < 5 is not recommended for this step.\n
                            If you intend to make low coverage base calls use a high mindepth in
                            step 4 to accurately infer H & E parameters, and then use a low mindepth
                            in conjunction with the line 31 params file option to make low coverage
                            base calls""")
        
    # if haploid data
    if haplos == 1:
        sys.stderr.write("\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t")

    # if double digest use first cut site
    if "," in CUT:
        CUT1, CUT2 = CUT.strip().split(",")
    else:
        CUT1 = CUT2 = CUT

    # load up work queue
    work_queue = multiprocessing.Queue()

    # iterate over files
    HH = glob.glob(WORK+"clust"+ID+"/"+subset+"*.clustS*")
    submitted = 0
    FS = []
    if len(HH) > 1:
        ## sort files by size
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            if statinfo.st_size > 1000:
                FS.append((HH[i],statinfo.st_size))
            else:
                print "excluding ",HH[i],"file is too small\n"
        FS.sort(key=lambda x: x[1])
        FS = [i[0] for i in FS]
    else:
        FS = HH
    REMOVE = glob.glob(WORK+'clust'+ID+"/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    for handle in FS:
        work_queue.put([WORK,handle, minsamp, CUT1, CUT2, datatype, haplos])
        submitted += 1

    " remove temp files if previous run "
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz","") 
        ff = WORK+"stats/."+end+".temp"
        if os.path.exists(ff):
            os.remove(ff)

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()
    results = []
    
    " spawn workers "
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, optim)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " write results to stats file "
    if not os.path.exists(WORK+"stats/Pi_E_estimate.txt"):
        outstats = open(WORK+"stats/Pi_E_estimate.txt",'w')
        outstats.write("taxa\tH\tE\n")
    else:
        outstats = open(WORK+"stats/Pi_E_estimate.txt",'a')
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz","")
        ft = WORK+"stats/."+end+".temp"
        line = open(ft).readlines()
        outstats.write(line[0])
        os.remove(ft)
        # n,h,e = line[0].strip().split("\t")
        # H.append(float(h))
        # E.append(float(e))
    #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
    #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
    outstats.close()
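
Note: each hidden stats/.<sample>.temp file read above is assumed to hold a single tab-separated line of sample name, H, and E, matching the commented-out parsing. A hypothetical example line:

taxonA	0.01023	0.00075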
Example #15
def main(Parallel, WORK, FQs, CUT, pN, Q, strict, trimkeep, datatype):
    print >> sys.stderr, "\tstep 2: editing raw reads \n\t",

    " create output directories "
    if not os.path.exists(WORK + 'stats'):
        os.makedirs(WORK + 'stats')
    if not os.path.exists(WORK + 'edits'):
        os.makedirs(WORK + 'edits')

    " load up work queue "
    submitted = 0
    work_queue = multiprocessing.Queue()
    if len(glob.glob(FQs)) > 1:
        FS = glob.glob(FQs)

        " order files by size "
        for i in range(len(FS)):
            statinfo = os.stat(FS[i])
            FS[i] = FS[i], statinfo.st_size
        FS.sort(key=operator.itemgetter(1))
        FS = [i[0] for i in FS][::-1]

        " submit jobs to queue "
        for handle in FS:
            finder = WORK + 'edits/' + handle.split("/")[-1]
            while finder.split(".")[-1] in [
                    "fastq", "fastQ", "gz", "fq", "FastQ"
            ]:
                finder = finder.replace('.' + finder.split(".")[-1],
                                        "").replace("_R1", "")
            if finder + ".edit" not in glob.glob(WORK + "edits/*"):
                if os.stat(handle).st_size > 0:  ## exclude empty files
                    args = [
                        WORK, handle, CUT,
                        float(pN), trimkeep, strict, Q, datatype
                    ]
                    work_queue.put(args)
                    submitted += 1
                else:
                    print "skipping", handle, ", file is empty"
            else:
                print "\t" + finder + " already in edits/"

    elif len(glob.glob(FQs)) == 1:
        " if only one file "
        work_queue.put([
            WORK,
            glob.glob(FQs)[0], CUT,
            float(pN), trimkeep, strict, Q, datatype
        ])
        submitted += 1

    else:
        print "\tNo demultiplexed files found. Check path."
        sys.exit()

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()

    " spawn workers, give function "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, rawedit)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " collect the results off the queue "
    outstats = open(WORK + "stats/s2.rawedit.txt", 'a')
    print >> outstats, "\t".join(
        ["sample ", "Nreads", "passed", "passed.w.trim", "passed.total"])
    STATS = []
    for i in range(submitted):
        STATS.append(result_queue.get())

    STATS.sort(key=lambda x: x[0])
    for i in range(submitted):
        a, b, c, d = STATS[i]
        print >> outstats, "\t".join([a, b, c, d, str(int(c) + int(d))])

    print >> outstats, """
    Nreads = total number of reads for a sample
    passed = retained reads that passed quality filtering at full length
    passed.w.trim = retained reads that were trimmed due to detection of adapters
    passed.total = total kept reads of sufficient length
    note: the trimkeep option in the params file can be set to keep trimmed reads above a minimum length. """
    outstats.close()
Example #16
def multiproc_it(subtests, alignfile, outfile, nboots, nproc, namelen,
                 makesort, makeboots):
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    submitted = 0
    Notes = []
    for rep in subtests:
        notes = ""
        if len(rep) == 2:
            rep, notes = rep
        p1, p2, p3a, p3b, o = rep
        if all(["[" in i for i in rep[1:]]):
            p1 = p1[1:-1].split(",")
            p2 = p2[1:-1].split(",")
            p3a = p3a[1:-1].split(",")
            p3b = p3b[1:-1].split(",")
            o = o[1:-1].split(",")
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3a, p3b, o], nboots, 1, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        else:
            if checktaxa([p1, p2, p3a, p3b, o], alignfile):
                work_queue.put(
                    [alignfile, [p1, p2, p3a, p3b, o], nboots, 0, submitted])
                submitted += 1
            else:
                print 'a taxon name was found that is not in the sequence file'
        Notes.append(notes)

    jobs = []
    for i in range(min(submitted, nproc)):
        worker = Worker(work_queue, result_queue, runtest)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    " read results back in "
    #Results = [result_queue.get() for i in range(submitted)]
    Results = [
        pickle.load(open(".save." + str(i), 'rb')) for i in range(submitted)
    ]
    Results.sort(key=lambda x: x[15])

    " setup results file "
    outs = open(outfile + ".partD.txt", 'w')
    header = "\t".join([
        'p1' + " " * (namelen[0] - 2), 'p2' + " " * (namelen[1] - 2),
        'p3_1' + " " * (namelen[2] - 4), 'p3_2' + " " * (namelen[3] - 4),
        'O' + " " * (namelen[4] - 1), 'D_12', 'D_1', 'D_2', 'Z_12', 'Z_1',
        'Z_2', 'BABBA', 'ABBBA', 'BABAA', 'ABBAA', 'BAABA', 'ABABA', 'nloci',
        'pdisc', 'notes'
    ])

    print >> outs, header

    for i in range(len(Results)):
        (L, D12, Z12, D1, Z1, D2, Z2, nloc, ABBBA, BABBA, ABBAA, BABAA,
         ABABA, BAABA, pdisc, sub, ABBBAloci, BABBAloci, ABBAAloci,
         BABAAloci, ABABAloci, BAABAloci, BB12, BB1, BB2) = Results[i]
        L = [
            str(x).replace("['", "[").replace("']", "]").replace("', '", ",")
            for x in L
        ]

        resin = tuple([
            str(L[0]) + " " * (namelen[0] - len(str(L[0]))),
            str(L[1]) + " " * (namelen[1] - len(str(L[1]))),
            str(L[2]) + " " * (namelen[2] - len(str(L[2]))),
            str(L[3]) + " " * (namelen[3] - len(str(L[3]))),
            str(L[4]) + " " * (namelen[4] - len(str(L[4]))), D12, D1, D2, Z12,
            Z1, Z2, BABBA, ABBBA, BABAA, ABBAA, BAABA, ABABA, nloc, pdisc,
            Notes[i]
        ])

        print >> outs, "%s\t%s\t%s\t%s\t%s\t%.3f\t%.3f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\t%.2f\t%s" % resin

        loci = open(alignfile).read().strip().split("|")[:-1]
        if makesort:
            makesortfiles("ABBBA", ABBBAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BABBA", BABBAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("ABBAA", ABBAAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BABAA", BABAAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("ABABA", ABABAloci, 5, loci, outfile, makesort, sub,
                          L)
            makesortfiles("BAABA", BAABAloci, 5, loci, outfile, makesort, sub,
                          L)

        if makeboots:
            with open(outfile + "_" + str(sub + 1) + ".boots_D12", 'w') as out:
                out.write(",".join(map(str, BB12)))
            with open(outfile + "_" + str(sub + 1) + ".boots_D1", 'w') as out:
                out.write(",".join(map(str, BB1)))
            with open(outfile + "_" + str(sub + 1) + ".boots_D2", 'w') as out:
                out.write(",".join(map(str, BB2)))
Example #17
def main(Parallel, ID, minsamp, subset, haplos, WORK, CUT, datatype):
    sys.stderr.write(
        "\n\tstep 4: estimating error rate and heterozygosity\n\t")

    " find clust.xx directory "
    if not os.path.exists(WORK + 'clust' + ID):
        print  "\terror: could not find "+WORK+"clust"+str(ID)+"/ directory,"+ \
                "\n\t\tif you changed the clustering threshold you must transfer *.clustS"+ \
                "\n\t\tfiles to a new directory named clust.xx with xx replaced by new clustering threshold"
        sys.exit()

    # warning message for low minsamp
    if minsamp < 5:
        sys.stderr.write(
            """\n\t warning: Mindepth < 5 is not recommended for this step.\n
                            If you intend to make low coverage base calls use a high mindepth in
                            step 4 to accurately infer H & E parameters, and then use a low mindepth
                            in conjunction with the line 31 params file option to make low coverage
                            base calls""")

    # if haploid data
    if haplos == 1:
        sys.stderr.write(
            "\n\tapplying haploid-based test (infer E while H is fixed to 0)\n\t"
        )

    # if double digest use first cut site
    if "," in CUT:
        CUT1, CUT2 = CUT.strip().split(",")
    else:
        CUT1 = CUT2 = CUT

    # load up work queue
    work_queue = multiprocessing.Queue()

    # iterate over files
    HH = glob.glob(WORK + "clust" + ID + "/" + subset + "*.clustS*")
    submitted = 0
    FS = []
    if len(HH) > 1:
        ## sort files by size
        for i in range(len(HH)):
            statinfo = os.stat(HH[i])
            if statinfo.st_size > 1000:
                FS.append((HH[i], statinfo.st_size))
            else:
                print "excluding ", HH[i], "file is too small\n"
        FS.sort(key=lambda x: x[1])
        FS = [i[0] for i in FS]
    else:
        FS = HH
    REMOVE = glob.glob(WORK + 'clust' + ID + "/cat.*")
    FS = [f for f in FS if f not in REMOVE]
    for handle in FS:
        work_queue.put([WORK, handle, minsamp, CUT1, CUT2, datatype, haplos])
        submitted += 1

    " remove temp files if previous run "
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ff = WORK + "stats/." + end + ".temp"
        if os.path.exists(ff):
            os.remove(ff)

    " create a queue to pass to workers to store the results "
    result_queue = multiprocessing.Queue()
    results = []

    " spawn workers "
    jobs = []
    for i in range(min(Parallel, submitted)):
        worker = Worker(work_queue, result_queue, optim)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    " write results to stats file "
    if not os.path.exists(WORK + "stats/Pi_E_estimate.txt"):
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'w')
        outstats.write("taxa\tH\tE\n")
    else:
        outstats = open(WORK + "stats/Pi_E_estimate.txt", 'a')
    for ff in FS:
        end = ff.split("/")[-1].replace(".clustS.gz", "")
        ft = WORK + "stats/." + end + ".temp"
        line = open(ft).readlines()
        outstats.write(line[0])
        os.remove(ft)
        # n,h,e = line[0].strip().split("\t")
        # H.append(float(h))
        # E.append(float(e))
    #outstats.write(" ".join(["mean E =",str(numpy.mean(E))])+"\n")
    #outstats.write(" ".join(["mean H =",str(numpy.mean(H))]))
    outstats.close()
Example #18
def main():
    parser = OptionParser(prog="pyRAD",
                          usage="%prog [options]",
                          version="%prog 3.0.61")
    parser.add_option(
        '-p',
        action="store",
        type="string",
        dest="params",
        help="input file for within sample filtering and clustering\n")
    parser.add_option('-s',
                      action="store",
                      dest="steps",
                      help="""perform step-wise parts of within analysis\n
                      1 = barcode sorting                        \
                      2 = filter/edit raw sequences              \
                      3 = within-sample clustering               \
                      4 = estimate pi and e                      \
                      5 = consensus calling                      \
                      6 = cluster consensus                      \
                      7 = align & create output files """)
    parser.add_option('-d',
                      action="store",
                      type="string",
                      dest="dtest",
                      help="""input file for D-test of introgression,
                              can iterate over multiple samples """)
    parser.add_option('-n',
                      action="store_true",
                      dest="newparamsfile",
                      help="""creates a new empty input params.txt file """)
    parser.add_option('-D',
                      action="store_true",
                      dest="newDtestfile",
                      help="""creates a new empty Dtest input file """)

    (options, args) = parser.parse_args()

    if not any([
            options.params, options.dtest, options.newparamsfile,
            options.newDtestfile
    ]):
        print "\n\tmust include option of -p, -d, -D or -n\n"
        sys.exit()

    if options.params:
        sys.stderr.write('\n\n'+' '*5+'---'*20+'\n'+\
                         ' '*6+'pyRAD : RADseq for phylogenetics & introgression analyses\n'+\
                         ' '*5+'---'*20+'\n\n')

        readin = [
            line.strip().split('##')[0].strip()
            for line in open(options.params).readlines()
        ]
        if "==** " not in str(readin[0]):
            print "\n\twarning: update params input file format to latest version\n"
            sys.exit()

        WORK = str(readin[1])
        GLOB = str(readin[2])
        Bcode = str(readin[3])
        vsearch = str(readin[4])
        muscle = str(readin[5])
        CUT = str(readin[6])
        parallel = int(readin[7])
        mindepth = int(readin[8])
        pN = str(readin[9])
        wclust = str(readin[10])
        datatype = str(readin[11])
        minsamp = int(readin[12])
        maxpoly = str(readin[13])
        outname = str(readin[14])
        ###########################
        ## 15 is separator line
        ###########################
        subset = str(readin[16])
        outgroup = str(readin[17])
        exclude = str(readin[18])
        Floc = str(readin[19])
        try:
            maxmismatch = int(readin[20])
        except (ValueError, IndexError):
            maxmismatch = 1
        try:
            Q = int(readin[21])
        except (ValueError, IndexError):
            Q = 33
        try:
            strict = int(readin[22])
        except (ValueError, IndexError):
            strict = 0
        try:
            E, H = str(readin[23]).strip().split(",")
        except ValueError:
            E = ""
            H = ""
        try:
            maxN = int(readin[24])
        except ValueError:
            maxN = 5
        try:
            maxH = int(readin[25])
        except ValueError:
            maxH = 5
        try:
            haplos = int(readin[26])
        except ValueError:
            haplos = 2
        maxSNP = str(readin[27])
        if maxSNP == "": maxSNP = "99"
        max_inserts = str(readin[28])
        if max_inserts == "": max_inserts = "3"
        try:
            seed = int(readin[29])
        except ValueError:
            seed = 112233
        try:
            overhang = [int(i) for i in str(readin[30]).strip().split(',')]
        except (ValueError, IndexError):
            overhang = [0, 0]
        try:
            outform = str(readin[31])
        except (ValueError, IndexError):
            outform = ""
        try:
            lowcounts = int(readin[32])
        except (ValueError, IndexError):
            lowcounts = mindepth
        ##mergepairs = str(readin[31])
        ##if mergepairs in [0,""]: mergepairs = 0
        try:
            trimkeep = int(readin[33])
        except ValueError:
            trimkeep = 0
        try:
            maxstack = int(readin[34])
        except ValueError:
            maxstack = "2SD"
        try:
            minuniq = int(readin[35])
        except ValueError:
            minuniq = 0
        try:
            hierarch = int(readin[36])
        except ValueError:
            hierarch = 0
        try:
            MASK = int(readin[37])
        except ValueError:
            MASK = 1   ## a blank field defaults to dust masking
        if MASK == 1: MASK = 'dust'
        else: MASK = 'none'
        try:
            threads = int(readin[38])
        except ValueError:
            threads = 6
        ###############################
        ## 39 is separator line
        ###############################
        clustprefix = readin[40:]   ## empty list if the params file is short
        clustprefix = [i for i in clustprefix if i]
        """ expand ./ ~ and ../ designators in location names """
        def expander(namepath):
            if "~" in namepath:
                namepath = namepath.replace("~", os.path.expanduser("~"))
            if "../" in namepath:
                a, b = namepath.split("../")
                namepath = os.path.abspath(
                    os.path.join(os.path.dirname(""), '..', b))
            elif "./" in namepath:
                a, b = namepath.split("./")
                namepath = os.path.abspath("") + "/" + b
            return namepath
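        ## e.g. expander("~/rad/") expands "~" via os.path.expanduser, and
        ## expander("./fastq") resolves to os.path.abspath("")+"/fastq"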

        if WORK == "":
            WORK = os.path.abspath("") + "/"
        else:
            WORK = expander(WORK)
        if WORK[-1] != "/":
            WORK = WORK + "/"
        stripped = 0
        if Floc:
            if Floc[0] == "@":
                stripped = 1
                Floc = expander(Floc[1:])
            else:
                Floc = expander(Floc)
        if GLOB: GLOB = expander(GLOB)
        if Bcode: Bcode = expander(Bcode)
        if vsearch: vsearch = expander(vsearch)
        if options.dtest: options.dtest = expander(options.dtest)
        """ find location of vsearch (or usearch) and muscle """

        def cmd_exists(cmd):
            return subprocess.call("type " + cmd,
                                   shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE) == 0
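        ## e.g. cmd_exists("muscle") is True when the shell builtin
        ## `type muscle` exits 0, i.e. the binary is on PATH or was
        ## given as a valid path in the params file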

        # " check platform: mac v linux "
        # if 'linux' in sys.platform:
        #     vsearch = "vsearch-1.0.3-linux-x86_64"
        # else:
        #     vsearch = "vsearch-1.0.3-mac-x86_64"

        # " find vsearch and muscle in user's lib/"
        # PYRADPATH = os.path.dirname(os.path.realpath(__file__))
        # vsearch = PYRADPATH+"/lib/"+vsearch
        # muscle = PYRADPATH+"/lib/muscle"

        " threads = 1 for usearch"
        if 'vsearch' not in vsearch:
            threads = 1

        if not cmd_exists(vsearch):
            print "\tcannot find vsearch (or usearch), edit path in param file"
            sys.exit()
        if not cmd_exists(muscle):
            print "\tcannot find muscle, edit path in input file"
            sys.exit()
        """ expand clustprefix cluster groups """
        gids = []
        groups = []
        minhits = []
        "hierarchical clustering "
        for line in clustprefix:
            gid, hits, inds = line.strip().split()
            gids.append(gid)
            minhits.append(hits)
            if "," in inds:
                thisgroup = []
                ii = inds.split(",")
                for i in ii:
                    if "*" in i:
                        expanded = glob.glob(WORK + "clust" + wclust + "/" +
                                             i + ".consens*")
                        thisgroup += expanded
                    else:
                        thisgroup.append(WORK + "clust" + wclust + "/" + i +
                                         ".consens.gz")
                groups.append(thisgroup)
            else:
                if "*" in inds:
                    expanded = glob.glob(WORK + "clust" + wclust + "/" + inds +
                                         ".consens*")
                    groups.append(expanded)
                else:
                    inds = inds.split(",")
                    groups.append([
                        WORK + "clust" + wclust + "/" + i + ".consens.gz"
                        for i in inds
                    ])
            "TODO check for size=1 "
        if not gids:
            gids = ""

        " step of the analysis "
        k = tuple('1234567')
        if options.steps:
            k = tuple(str(options.steps))

        " check that the data type was entered correctly "
        datopts = [
            'rad', 'gbs', 'ddrad', 'pairgbs', 'pairddrad', 'merged', '2brad'
        ]
        if datatype not in datopts:
            print "\t datatype argument (line 11) not recognized "
            sys.exit()
        # if datatype == 'merged':
        #     print "specify mergetype in params file, ex: mergeddrad or mergegbs "
        #     sys.exit()

        " parse max_inserts argument "
        w1 = 3
        w2 = 6
        a1 = a2 = 99
        if 'pair' in datatype:
            if "," in max_inserts:
                wargs = max_inserts.strip().split(",")
                if len(wargs) == 2:
                    w1 = w2 = wargs[0]
                    a1 = a2 = wargs[1]
                elif len(wargs) == 4:
                    w1, w2, a1, a2 = wargs
                else:
                    print "\n\tmax_inserts parameter not recognized. see documentation"
                    sys.exit()
        else:
            if "," in max_inserts:
                w1, a1 = map(int, max_inserts.split(","))

        #########  Begin analysis  ###################################################
        if '1' in k:
            " expand Barcode file name if necessary "
            if "*" in Bcode:
                try:
                    Bcode = glob.glob(Bcode)[0]
                except IndexError:
                    print "\tcould not find barcodes file ", Bcode,
                    "\n\tcomment out line 3 of params file or edit path to barcodes file"
                    sys.exit()
            if Floc:
                print "\tskipping step 1: line 18 of input file shows seqs already sorted"
            else:
                " if directory as input select all inside"
                if GLOB:
                    if GLOB[-1] == "/":
                        GLOB = GLOB + "*"
                sortandcheck2.main(Bcode, GLOB, CUT, datatype, parallel,
                                   maxmismatch, WORK)

        ### step 2 ###################
        if '2' in k:
            if Floc:
                print >> sys.stderr, "\tsorted .fastq from %s being used" % Floc
                if len(glob.glob(Floc)) < 1:
                    sys.stderr.write(
                        "\t... no files found in line 18 location, check required file name formatting\n"
                    )
                    sys.exit()
                FQs = Floc
                if stripped:
                    print "\tbarcode & restriction site are already stripped off of sequences"
                    CUT = ""
                    if strict:
                        print "\tApplying step 2 filter (param 19) is not recommended for data that is stripped (w/ @) \n"
            else:
                " default location "
                FQs = WORK + "fastq/" + subset + "*.fq.gz"

            " if directory as input select all inside"
            if FQs[-1] == "/":
                FQs = FQs + "*"

            " if not paired filter only read 1 "
            if 'pair' not in datatype:  # in ['rad','ddrad','gbs','merged','2brad']:
                editraw_rads.main(parallel, WORK, FQs, CUT, pN, Q, strict,
                                  trimkeep, datatype)

            else:  #elif datatype in ['pairddrad','pairgbs']:
                " check for both CUT sites in pairddrad"
                if datatype == 'pairddrad':
                    if "," not in CUT:
                        print "\n\tyou must enter two restriction sites for pair ddRAD data"
                        sys.exit()
                editraw_pairs.main(parallel, WORK, FQs, CUT, pN, Q, strict,
                                   trimkeep, datatype)

            #elif "merge" in datatype:
            #    editraw_merges.main(parallel, WORK, FQs, CUT,
            #                       pN, Q, strict, trimkeep)

        ### step 3  ####################
        if '3' in k:
            cluster7dp.main(WORK,
                            parallel,
                            wclust,
                            mindepth,
                            subset,
                            datatype,
                            w1,
                            w2,
                            minuniq,
                            MASK,
                            muscle,
                            vsearch,
                            threads,
                            remake=0)

        ### step 4  ####################
        if '4' in k:
            " if using low depth option still use a reasonable limit for parameter estimates"
            if mindepth < 5:
                tempmindepth = 5
            else:
                tempmindepth = mindepth
            H_err_dp.main(parallel, wclust, tempmindepth, subset, haplos, WORK,
                          CUT, datatype)

        ### step 5  ####################
        if '5' in k:
            if not E:
                try:
                    Pi = open(WORK + "stats/Pi_E_estimate.txt").readlines()
                except IOError:
                    Pi = ""
                if Pi:
                    El = []
                    Hl = []
                    for line in Pi[1:]:
                        try:
                            _, h, e = line.strip().split("\t")
                        except ValueError:
                            continue
                        Hl.append(float(h))
                        El.append(float(e))
                    if len(Hl) == 0:
                        print "\n\terror in step 4, no estimates in file stats/Pi_E_estimate.txt"
                        sys.exit()
                    H = sum(Hl) / len(Hl)
                    E = sum(El) / len(El)
                else:
                    E = 0.001
                    H = 0.01
                    print "\n\tstep 4 values not detected, using E=0.001, H=0.01"
            if 'pair' in datatype:
                " call consensus on each pair separately "
                consens_pairs.main(parallel, float(E), float(H), wclust,
                                   mindepth, subset + "*", maxN, maxH, haplos,
                                   CUT, datatype, lowcounts, strict, WORK,
                                   maxstack)
            else:
                " call consensus on single end clusters "
                consensdp.main(parallel, float(E), float(H), wclust, mindepth,
                               subset + "*", maxN, maxH, haplos, CUT, datatype,
                               lowcounts, strict, WORK, maxstack)

        ### step 6  ####################
        if '6' in k:
            if not hierarch:
                gids = ""
                if "," in subset:
                    inlist = [
                        WORK + "clust" + wclust + "/" + i + ".consens*"
                        for i in subset.strip().split(",")
                    ]
                else:
                    inlist = glob.glob(WORK + "clust" + wclust + "/" + subset +
                                       "*.consens*")
                cluster_cons7_shuf.main(vsearch, wclust, datatype, outgroup,
                                        seed, gids, minhits, inlist, WORK,
                                        MASK, 0)
                print "\n\tfinished clustering"
            else:
                """ re-expand clustprefix cluster groups in case no -s """
                Hgids = []
                Hgroups = {}
                Hminhits = []
                "hierarchical clustering "
                for line in clustprefix:
                    Hgid, Hhits, Hinds = line.strip().split()
                    Hgids.append(Hgid)
                    Hminhits.append(Hhits)
                    Hgroups[Hgid] = []
                    if "," in Hinds:
                        Hinds = Hinds.split(",")
                        for Hind in Hinds:
                            if "*" in Hind:
                                expanded = glob.glob(WORK + "clust" + wclust +
                                                     "/" + Hind + ".consens*")
                                Hgroups[Hgid] += expanded
                            else:
                                Hgroups[Hgid].append(WORK + "clust" + wclust +
                                                     "/" + Hind +
                                                     ".consens.gz")
                    else:
                        if "*" in Hinds:
                            expanded = glob.glob(WORK + "clust" + wclust +
                                                 "/" + Hinds + ".consens*")
                            Hgroups[Hgid] += expanded
                        else:
                            Hgroups[Hgid].append(WORK + "clust" + wclust +
                                                 "/" + Hinds + ".consens.gz")

                for i, j in zip(Hgids, Hminhits):
                    for cons in Hgroups[i]:
                        if cons not in glob.glob(WORK + "clust" + wclust +
                                                 "/*.consens.gz"):
                            print "\n\tsample name", cons, "in group", i, "does not match any filenames"
                            sys.exit()

                preclusts = []
                for i in Hgroups.values():
                    preclusts += i

                for cons in glob.glob(WORK + "clust" + wclust +
                                      "/*.consens.gz"):
                    if cons not in preclusts:
                        print "\n\twarning: sample", cons, "not assigned to a cluster group"

                #if not gids:
                #    gids = ""

                " make prefix directory "
                if not os.path.exists(WORK + 'prefix/'):
                    os.makedirs(WORK + 'prefix')

                ########### TODO ####################################
                # if os.path.exists(WORK+"prefix/cat.clust_.gz"):
                #     print "\tRemaking clusters from existing clustprefix files "+\
                #           "using minmatches: ",minmatch
                #     print "\t(To completely re-start hierarchical clustering delete the prefix/ directory)\n"
                #
                #     for (gid,minhit,inlist) in zip(gids,minhits,groups):
                #         handle = WORK+"clust"+wclust+"/cat.haplos_"+gid
                #         #cluster_cons7_shuf.makeclust(handle, datatype, pre, pre, minm, WORK, 1)
                #     #tier2clust.makeclust(wclust, datatype, WORK)
                #######################################################

                " queue up jobs "
                work_queue = multiprocessing.Queue()
                result_queue = multiprocessing.Queue()

                " submit jobs "
                for (Hgid, Hminhit) in zip(Hgids, Hminhits):
                    inlist = Hgroups[Hgid]
                    work_queue.put([
                        vsearch, wclust, datatype, outgroup, seed, Hgid,
                        Hminhit, inlist, WORK, MASK, 1
                    ])

                " execute first tier jobs "
                jobs = []
                for i in range(parallel):
                    worker = Worker(work_queue, result_queue,
                                    cluster_cons7_shuf.main)
                    jobs.append(worker)
                    worker.start()
                for j in jobs:
                    j.join()

                " cluster second tier "
                tier2clust.main(vsearch, wclust, datatype, Hgids, seed, WORK,
                                MASK)

                print "\n\tfinished clustering\n"

            " cleanup "
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.consens_*.gz"):
            #    os.remove(ff)
            #for ff in glob.glob(WORK+"clust"+wclust+"/cat.u*"):
            #    os.remove(ff)

        if '7' in k:
            if minsamp < 2:
                print "\n\tminimum minCov setting is <2: changing to 2"
                minsamp = 2

            if gids:
                inclustfile = WORK + "prefix/cat.clust_.gz"
            else:
                inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"

            if not os.path.exists(inclustfile):
                #sys.stderr.write("\n\t didn't find hierarchically clustered subset: \n\t"+inclustfile)
                #sys.stderr.write("\n\t looking for default full cluster file")
                if os.path.exists(WORK + 'clust' + wclust + "/cat.clust_.gz"):
                    inclustfile = WORK + 'clust' + wclust + "/cat.clust_.gz"
                    sys.stderr.write("\n\tCluster input file: using \n\t" +
                                     inclustfile + "\n\n")
                else:
                    print "\tnot found"
                    #print "\tcat.clust_ file is selected based on line 15 subset argument "
                    #print "\n\t if you wish to exclude samples from an existing cat.clust file "+\
                    #      "\n\t in your output alignments list exclude names on line 17 of the params file.\n "
                    sys.exit()
            #if any([i in outform for i in ['t','m']]):
            #    if gids:
            #        print "\tgroups for 't' or 'm' outputs:", gids
            taxadict = OrderedDict(zip(gids, groups))
            alignable.main(outgroup, minsamp, outname, inclustfile, maxpoly,
                           parallel, maxSNP, muscle, exclude, overhang,
                           outform, WORK, gids, CUT, a1, a2, datatype, subset,
                           parser.version.split(" ")[1], mindepth, taxadict,
                           minhits, seed, haplos)

        if '8' in k:
            cluster7dp.main(WORK,
                            parallel,
                            wclust,
                            mindepth,
                            subset,
                            datatype,
                            w1,
                            w2,
                            minuniq,
                            MASK,
                            muscle,
                            vsearch,
                            threads,
                            remake=1)

    if options.dtest:
        readin = [line.strip() for line in open(options.dtest).readlines()]

        nboots = int(readin[0].split("##")[0].strip())
        alignfile = str(readin[1].split("##")[0].strip())
        outfile = str(readin[2].split("##")[0].strip())
        ntax = str(readin[3].split("##")[0].strip())
        nproc = int(readin[4].split("##")[0].strip())
        makesort = int(readin[5].split("##")[0].strip())
        makeboots = int(readin[6].split("##")[0].strip())

        tests = []
        for line in readin[8:]:
            if line:
                notes = ""
                if "##" in line:
                    tax, notes = line.strip().split(
                        "##")[0], line.strip().split("##")[-1],
                    if tax:
                        tests.append([tax.strip().split(),
                                      notes.strip()
                                      ])  #.split("\t"),notes.strip()])
                else:
                    tests.append(line.strip().split())  # "\t"))
        if ntax == '4':
            Dtest.main(tests, alignfile, outfile, nboots, nproc, makesort,
                       makeboots)
        elif ntax == 'part':
            Dtest_5.main(tests, alignfile, outfile, nboots, nproc, makesort,
                         makeboots)
        elif ntax == 'foil':
            Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort,
                            makeboots, 0)
        elif ntax == 'foilalt':
            Dtest_foil.main(tests, alignfile, outfile, nboots, nproc, makesort,
                            makeboots, 1)
        else:
            print "error in input file"

    if options.newparamsfile:
        if os.path.exists("./params.txt"):
            print "\tfile params.txt already exists"
            sys.exit()
        else:
            createfile.main(parser.version.split(" ")[1])

    if options.newDtestfile:
        outstring = """200                          ## N bootstrap replicates
test.loci                    ## loc/path to input .loci file
dstats/test1_res             ## output file path/name (no suffix)
4                            ## which test: 4,part,foil,foilalt
2                            ## N cores (execute jobs [lines below] in parallel)
0                            ## output ABBA/BABA loci to files (0=no, 1=yes, 2=verbose)
0                            ## output bootstrap Ds to files (0=no,1=yes)
-----------------------------------------------------------\n"""
        sys.stdout.write(outstring)
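
Note: the taxon test lines that follow the "-----" separator in this template are parsed by the dtest branch above as whitespace-separated taxa with an optional "##" note. A hypothetical 4-taxon line:

sample1  sample2  sample3  outg   ## optional note carried through to output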
Example #19
def writefunc(GLOB,Parallel,Bcode,CUT,datatype,maxmismatch,WORK):
    "create barcode dictionary"
    codetable = open(Bcode, 'r')
    codes = [line.strip().split() for line in codetable.readlines()]
    C = {}
    for line in codes:
        if line:
            C[line[1].strip().upper()] = line[0]

    " find longest barcode "
    keylens = map(len,C.keys())
    if len(set(keylens)) == 1:
        longB = (keylens[0],'same')
    else:
        longB = (max(keylens),'diff')

    " check for CUT in barcodes "
    CCC = unambig(CUT)
    if len(CCC)>1:
        for cut in CCC:
            if any([cut in i for i in C.keys()]):
                print "\n\twarning: CUT site matches within one of the barcodes, "+\
                "I suggest double \n\tchecking the file to make sure it properly demultiplexes"
    else:
        if any([CUT in i for i in C.keys()]):
            print "\n\twarning: CUT site matches within one of the barcodes, "+\
            "I suggest double \n\tchecking the file to make sure it properly demultiplexes"

    " read in sequence files "
    if len(glob.glob(GLOB)) > 1:
        FS = [f for f in glob.glob(GLOB)]
    else:
        FS = glob.glob(GLOB)
    if 'pair' in datatype:
        Raws = combinefiles(GLOB)
    else:
        Raws = FS

    "send jobs to multiprocess queue"
    num = 0
    work_queue = multiprocessing.Queue()
    submitted = 0
    for fs in Raws:
        if 'pair' in datatype:
            work_queue.put([C, [fs[0],fs[1]], CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        else:
            work_queue.put([C, fs, CUT, datatype, num, maxmismatch, WORK, longB])
            submitted += 1
        num += 1

    result_queue = multiprocessing.Queue()

    "spawn workers, give function"
    jobs = []
    for i in range( min(Parallel,submitted) ):
        worker = Worker(work_queue, result_queue, barmatch)
        worker.start()
        jobs.append(worker)
    for job in jobs:
        job.join()

    Ms = {}

    if len(glob.glob(WORK+"fastq/.*.pickle")) > 1:
        for pick in glob.glob(WORK+"fastq/.*.pickle"):
            pickin = open(pick, "rb")
            M = pickle.load( pickin )
            pickin.close()
            for key in M:
                if key not in Ms:
                    Ms[key] = M[key]
                else:
                    Ms[key] += M[key]
            os.remove(pick)
    elif len(glob.glob(WORK+"fastq/.*.pickle")) == 1:
        pick = glob.glob(WORK+"fastq/.*.pickle")[0]
        pickin = open(pick, 'rb')
        Ms = pickle.load( pickin )
        pickin.close()
        os.remove(pick)
    else:
        print "\nno stats file generated"

    Mkeys = Ms.keys()
    Mkeys.sort(key=lambda x: Ms[x], reverse=True)

    statout = open(WORK+"stats/s1.sorting.txt",'a')
    statout.write("\n\n")
    statout.write("sample\ttrue_bar\tobs_bars\tN_obs\n")

    Cnames = C.keys()
    Cnames.sort()
    try: maxl = max(map(len,map(str,Ms.values())))
    except ValueError: maxl = 5

    hits = []
    for bar in Cnames:
        for barcode in Mkeys:
            if matching(bar, barcode, maxmismatch):
                print >>statout, "%s    \t%s    \t%s\t%s" % (C[bar], bar, barcode,
                                                             str(Ms[barcode])+" "*(maxl+3-len(str(Ms[barcode]))))
                hits.append(barcode)

    statout.write("\n")
    maxl = max(map(len,Mkeys))
    for barcode in Mkeys:
        if barcode not in hits:
            print >>statout, "nomatch  \t%s    \t%i" % (barcode+" "*(maxl+3-len(barcode)), Ms[barcode])
    statout.close()
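
Note: matching, used above to group observed barcodes under their true barcode, is not defined in this excerpt. A minimal sketch consistent with its use, assuming a Hamming comparison that tolerates up to maxmismatch differing bases (hypothetical reconstruction):

def matching(bar, barcode, maxmismatch):
    " hypothetical: True if barcode differs from bar at <= maxmismatch sites "
    if len(bar) != len(barcode):
        return False
    mismatches = sum(a != b for a, b in zip(bar, barcode))
    return mismatches <= maxmismatch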