Example #1
def makehaplos(WORK, outname, longname):
    """TODO print gbs warning that haplos may not be
    phased on non-overlapping segments"""
    outfile = open(WORK+"outfiles/"+outname+".alleles", 'w')
    lines = open(WORK+"outfiles/"+outname+".loci").readlines()
    writing = []
    loc = 0
    for line in lines:
        if ">" in line:
            a,b = line.split(" ")[0],line.split(" ")[-1]
            a1,a2 = breakalleles(b.strip())
            writing.append(a+"_0"+" "*(longname-len(a)+3)+a1)
            writing.append(a+"_1"+" "*(longname-len(a)+3)+a2)
        else:
            writing.append(line.strip())
        loc += 1

        " print every 10K loci "
        if not loc % 10000:
            outfile.write("\n".join(writing)+"\n")
            writing = []
            
    outfile.write("\n".join(writing))
    outfile.close()
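
Every example on this page calls breakalleles(), which is imported from elsewhere in the pyrad codebase and not shown here. Purely as a hypothetical sketch (breakalleles_sketch and AMBIG are placeholder names, and the real function presumably also tracks phase information), one way to split an ambiguity-coded consensus into two haplotype strings could look like this:

AMBIG = {"R": "AG", "Y": "CT", "S": "GC",
         "W": "AT", "K": "GT", "M": "AC"}

def breakalleles_sketch(consensus):
    """Resolve IUPAC ambiguity codes into two haplotype strings."""
    hap0, hap1 = [], []
    for base in consensus:
        pair = AMBIG.get(base.upper())
        if pair:
            hap0.append(pair[0])
            hap1.append(pair[1])
        else:
            # unambiguous base, gap, or N: identical in both haplotypes
            hap0.append(base)
            hap1.append(base)
    return "".join(hap0), "".join(hap1)

print(breakalleles_sketch("ACRTGY"))   # ('ACATGC', 'ACGTGT')
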
Example #2
def makecons(vsearch, ID, datatype, 
             outg, seed, gid, minmatch, inlist,
             WORK, quiet, outhandle):

    " find usearch"
    if not cmd_exists(vsearch):
        print "\tcannot find usearch (or vsearch), edit path in param file"
        sys.exit()

    " make list of consens files "
    FS = [i for i in inlist if "/cat.cons" not in i]
    FS = [i for i in FS if "/cat.group" not in i]
    if not FS:
        print "no consens files found"
        sys.exit()

    " and a list including outgroups "
    fs = copy.copy(inlist)
    
    " are files gzipped ? "
    if any(['.gz' in i[-4:] for i in FS]):
        gz = ".gz"
    else:
        gz = ""

    " remove previous files if present "
    if os.path.exists(WORK+'clust'+ID+'/cat.consens_'+gid+gz):
        os.remove(WORK+'clust'+ID+'/cat.consens_'+gid+gz)
    if os.path.exists(WORK+'clust'+ID+'/cat.group_'+gid+gz):
        os.remove(WORK+'clust'+ID+'/cat.group_'+gid+gz)


    " remove outgroup sequences, add back in later to bottom after shuffling "
    if outg:
        outgroup = outg.strip().split(",")
        if len(outgroup) > 1:
            for s in outgroup:
                if WORK+"clust"+ID+"/"+s+".consens"+gz in FS:
                    FS.remove(WORK+"clust"+ID+"/"+s+".consens"+gz)
        else:
            outgroup = WORK+"clust"+ID+"/"+outg+".consens"+gz
            if outgroup in FS:
                FS.remove(outgroup)
                
    " create file with consens seqs from all taxa in list "
    out = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+gz,'w')

    for qhandle in FS:
        if gz:
            f = gzip.open(qhandle)
        else:
            f = open(qhandle)
        k = itertools.izip(*[iter(f)]*2)
        while 1:
            try: a = k.next()
            except StopIteration: break
            print >>out, a[0].strip()+"    "+a[1].strip()
        f.close()
    out.close()

    " message to shell "
    if gid:
        sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+\
                         ' similarity \n\tfor group ('+str(gid)+') retaining seeds w/ minimum of '+str(minmatch)+' hits\n\n')
    else:
        sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+' similarity \n\n')

    " make list of random number and data "
    if seed:
        random.seed(seed)
    source = gzip.open(WORK+'clust'+ID+'/cat.group_'+gid+".gz",'r')
    data = [ (random.random(), line) for line in source ]
    source.close()
    " sort by random number "
    data.sort()

    " order by size while retaining randomization within size classes "
    D = [line.split('    ') for _, line in data]
    DD = ["".join([i[0]+" "*(100-len(i[0])),i[1]]) for i in D]
    DD.sort(key=len, reverse=True)
    k = iter(["**".join([i.split(" ")[0],i.split(" ")[-1]]) for i in DD])

    " write output to .consens_.gz file "
    out = gzip.open(WORK+'clust'+ID+'/cat.consens_'+gid+".gz",'w')
    while 1:
        try: a,b = k.next().split("**")
        except StopIteration: break
        print >>out, a+'\n'+b.strip()

    
    """ add outgroup taxa back onto end of file."""
    if outg:
        " append to existing consens_ file "
        outgroup = outg.strip().split(',')
        if len(outgroup) > 1:
            for s in outgroup:
                xoutg = WORK+"clust"+ID+"/"+s+".consens.gz"
                if xoutg in fs:
                    f = gzip.open(xoutg)
                    k = itertools.izip(*[iter(f)]*2)
                    while 1:
                        try: a = k.next()
                        except StopIteration: break
                        print >>out, a[0].strip()+"\n"+a[1].strip()
                    f.close()
        elif len(outgroup) == 1:
            xoutg = WORK+"clust"+ID+"/"+outgroup[0]+".consens.gz"
            if xoutg in fs:
                f = gzip.open(xoutg)
                k = itertools.izip(*[iter(f)]*2)
                while 1:
                    try: a = k.next()
                    except StopIteration: break
                    print >>out, a[0].strip()+"\n"+a[1].strip()
                f.close()
        else:
            pass
    out.close()        


    """ convert ambiguity codes into a sampled haplotype for any sample
    to use for clustering, but save ambiguities for later """

    " output file"
    outhaplos = open(outhandle,'w')

    " input file "
    infile = gzip.open(WORK+"clust"+ID+"/cat.consens_"+gid+".gz")
    lines = iter(infile.readlines())
    infile.close()
    
    " write to haplo files in fasta format "
    writinghaplos = []

    for line in lines:
        if ">" in line:
            writinghaplos.append(line.strip())
        else:
            allele = breakalleles(line)[0]
            writinghaplos.append(allele.strip())
    outhaplos.write("\n".join(writinghaplos))
    outhaplos.close()
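
makecons() begins by checking for the clustering binary with cmd_exists(), a helper defined elsewhere in the codebase. A plausible minimal sketch (cmd_exists_sketch is a placeholder, not the actual pyrad helper) that reports whether a command is available on the PATH:

import subprocess

def cmd_exists_sketch(cmd):
    """Return True if `cmd` resolves to an executable in the shell."""
    proc = subprocess.Popen("type " + cmd,
                            shell=True,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    proc.communicate()                    # wait for the shell to finish
    return proc.returncode == 0

print(cmd_exists_sketch("ls"))            # True on any POSIX system
print(cmd_exists_sketch("no-such-cmd"))   # False
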
Example #3
def makecons(vsearch, ID, datatype, outg, seed, gid, minmatch, inlist, WORK,
             quiet, outhandle):

    " find usearch"
    if not cmd_exists(vsearch):
        print "\tcannot find usearch (or vsearch), edit path in param file"
        sys.exit()

    " make list of consens files "
    FS = [i for i in inlist if "/cat.cons" not in i]
    FS = [i for i in FS if "/cat.group" not in i]
    if not FS:
        print "no consens files found"
        sys.exit()

    " and a list including outgroups "
    fs = copy.copy(inlist)

    " are files gzipped ? "
    if any([i.endswith(".gz") for i in FS]):
        gz = ".gz"
    else:
        gz = ""

    " remove previous files if present "
    if os.path.exists(WORK + 'clust' + ID + '/cat.consens_' + gid + gz):
        os.remove(WORK + 'clust' + ID + '/cat.consens_' + gid + gz)
    if os.path.exists(WORK + 'clust' + ID + '/cat.group_' + gid + gz):
        os.remove(WORK + 'clust' + ID + '/cat.group_' + gid + gz)

    " remove outgroup sequences, add back in later to bottom after shuffling "
    if outg:
        outgroup = outg.strip().split(",")
        if len(outgroup) > 1:
            for s in outgroup:
                if WORK + "clust" + ID + "/" + s + ".consens" + gz in FS:
                    FS.remove(WORK + "clust" + ID + "/" + s + ".consens" + gz)
        else:
            outgroup = WORK + "clust" + ID + "/" + outg + ".consens" + gz
            if outgroup in FS:
                FS.remove(outgroup)

    " create file with consens seqs from all taxa in list "
    out = gzip.open(WORK + 'clust' + ID + '/cat.group_' + gid + gz, 'w')

    for qhandle in FS:
        if gz:
            f = gzip.open(qhandle)
        else:
            f = open(qhandle)
        k = itertools.izip(*[iter(f)] * 2)
        while 1:
            try:
                a = k.next()
            except StopIteration:
                break
            print >> out, a[0].strip() + "    " + a[1].strip()
        f.close()
    out.close()

    " message to shell "
    if gid:
        sys.stderr.write('\n\tstep 6: clustering across '+str(len(FS))+' samples at '+`ID`+\
                         ' similarity \n\tfor group ('+str(gid)+') retaining seeds w/ minimum of '+str(minmatch)+' hits\n\n')
    else:
        sys.stderr.write('\n\tstep 6: clustering across ' + str(len(FS)) +
                         ' samples at ' + `ID` + ' similarity \n\n')

    " make list of random number and data "
    if seed:
        random.seed(seed)

    " open file for reading consensus reads grouped together in one file "
    source = gzip.open(WORK + 'clust' + ID + '/cat.group_' + gid + ".gz", 'r')
    " generator to add a random number next to every sequence "
    data = ((random.random(), line) for line in source)
    " sort by the random number into a list (now stored in memory)"
    randomized_data = sorted(data)
    source.close()

    " order by size while retaining randomization within size classes "
    splitlines = (line.split('    ') for _, line in randomized_data)
    equalspacers = iter("".join([i[0] + " " * (100 - len(i[0])), i[1]])
                        for i in splitlines)
    orderedseqs = sorted(equalspacers, key=len, reverse=True)
    k = iter(
        ["**".join([i.split(" ")[0], i.split(" ")[-1]]) for i in orderedseqs])

    " write output to .consens_.gz file "
    ## NB: could probably speed this up
    out = gzip.open(WORK + 'clust' + ID + '/cat.consens_' + gid + ".gz", 'wb')
    while 1:
        try:
            a, b = k.next().split("**")
        except StopIteration:
            break
        print >> out, a + '\n' + b.strip()
    """ add outgroup taxa back onto end of file."""
    if outg:
        " append to existing consens_ file "
        outgroup = outg.strip().split(',')
        if len(outgroup) > 1:
            for s in outgroup:
                xoutg = WORK + "clust" + ID + "/" + s + ".consens.gz"
                if xoutg in fs:
                    f = gzip.open(xoutg)
                    k = itertools.izip(*[iter(f)] * 2)
                    while 1:
                        try:
                            a = k.next()
                        except StopIteration:
                            break
                        print >> out, a[0].strip() + "\n" + a[1].strip()
                    f.close()
        elif len(outgroup) == 1:
            xoutg = WORK + "clust" + ID + "/" + outgroup[0] + ".consens.gz"
            if xoutg in fs:
                f = gzip.open(xoutg)
                k = itertools.izip(*[iter(f)] * 2)
                while 1:
                    try:
                        a = k.next()
                    except StopIteration:
                        break
                    print >> out, a[0].strip() + "\n" + a[1].strip()
                f.close()
        else:
            pass
    out.close()
    """ convert ambiguity codes into a sampled haplotype for any sample
    to use for clustering, but save ambiguities for later """

    " output file"
    outhaplos = open(outhandle, 'w')

    " input file "
    infile = gzip.open(WORK + "clust" + ID + "/cat.consens_" + gid + ".gz")
    lines = iter(infile.readlines())
    infile.close()

    " write to haplo files in fasta format "
    writinghaplos = []

    for line in lines:
        if ">" in line:
            writinghaplos.append(line.strip())
        else:
            allele = breakalleles(line)[0]
            writinghaplos.append(allele.strip())
    outhaplos.write("\n".join(writinghaplos))
    outhaplos.close()
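
The "order by size while retaining randomization within size classes" block in the makecons() examples relies on two properties: padding every name to 100 characters makes line length depend only on the sequence, and Python's stable sort keeps the previously shuffled order for lines of equal length. A small standalone illustration with made-up records (not pyrad code):

import random

records = [("sampleA_1", "ACGTACGT"),
           ("sampleB_12", "ACGT"),
           ("sampleC_3", "ACGTACGTAA"),
           ("sampleD_7", "ACGT")]

# shuffle by sorting on a random key, as in the examples above
shuffled = sorted((random.random(), rec) for rec in records)

# pad names to a fixed width so line length reflects only sequence length
lines = [name + " " * (100 - len(name)) + seq for _, (name, seq) in shuffled]

# stable sort: longest sequences first, equal lengths keep shuffled order
lines.sort(key=len, reverse=True)

for line in lines:
    print(line.split()[0] + " " + str(len(line.split()[-1])))
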
Example #4
def splitandalign(ingroup, minspecies, outname, infile,
              MAXpoly, parallel, s1, s2, muscle,
              exclude, overhang, WORK, CUT,
              a1, a2, datatype, longname, nloci, formats):

    """ split cluster file into smaller files depending on the number
    of processors and align each file separately using alignfunc function."""

    ## double check that old chunk and aligns are removed
    for i in glob.glob(WORK+".align*"):
        os.remove(i)
    for i in glob.glob(WORK+".chunk*"):
        os.remove(i)

    ## read infile, split into chunks for aligning, nchuncks
    ## depends on number of available processors
    data = gzip.open(infile, 'rb').read().strip().split("//\n")
    minpar = max(3, parallel)  ## pp
    chunks = [0+(len(data)/minpar)*i for i in range(minpar)]

    for i in range(len(chunks)-1):
        with open(WORK+".chunk_"+str(i), 'w') as dat:
            dat.write("//\n//\n".join(data[chunks[i]:chunks[i+1]])+"//\n//\n")

    ## write the last chunk
    with open(WORK+".chunk_"+str(i+1), 'w') as dat:
        dat.write("//\n//\n".join(data[chunks[i+1]:])+"//\n//\n")

    ## make alleles file
    makealleles = bool("a" in formats)

    ## set up parallel
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    for handle in glob.glob(WORK+".chunk*"):
        #work_queue.put([params, handle, ingroup, 
        #                exclude, longname, quiet])
        work_queue.put([handle, minspecies, ingroup, MAXpoly,
                        outname, s1, s2, muscle, 
                        exclude, overhang, WORK, CUT,
                        a1, a2, datatype, longname, makealleles])
    ## spawn workers
    jobs = []
    for i in range(minpar):
        worker = Worker(work_queue, result_queue, alignFUNC)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    locus = 0
    for handle in glob.glob(WORK+".chunk*"):
        locus += int(result_queue.get())

    " output loci and excluded loci and delete temp files... "
    locicounter = 1
    aligns = glob.glob(WORK+".align*")
    aligns.sort(key=lambda x: int(x.split("_")[-1]))

    ## write loci output
    locifile = open(WORK+"outfiles/"+outname+".loci", "w")    
    for chunkfile in aligns:
        chunkdata = open(chunkfile, "r")
        for line in chunkdata:
            if line.startswith("//"):
                #line = line.replace("|\n", str(locicounter)+"|\n", 1)
                #lines = lines.replace("\n", str(locicounter)+"\n", 1)
                locifile.write("{}{}|\n".format(line.strip(), locicounter))
                locicounter += 1
            else:
                nam_, seq = line.rsplit(" ", 1)
                locifile.write("{} {}".format(nam_, seq.upper()))
        chunkdata.close()
    locifile.close()


    if makealleles:    
        locicounter = 1    
        ## write alleles output
        allelesfile = open(WORK+"outfiles/"+outname+".alleles", "w")        
        for chunkfile in aligns:
            chunkdata = open(chunkfile, "r")
            for line in chunkdata:
                if line.startswith("//"):
                    allelesfile.write("{}{}|\n".format(line.strip(), locicounter))
                    locicounter += 1
                else:
                    bits = line.split(" ")
                    hap0, hap1 = breakalleles(bits[-1])
                    allelesfile.write("{}_0{}{}".format(bits[0], " ".join(bits[1:-1]), hap0))
                    allelesfile.write("{}_1{}{}".format(bits[0], " ".join(bits[1:-1]), hap1))
            chunkdata.close()
        allelesfile.close()

    ## clean up chunks and aligns
    chunks = glob.glob(WORK+".chunk*") + glob.glob(WORK+".align*")
    for handle in chunks:
        os.remove(handle)
    
    unaligns = glob.glob(WORK+".not*")
    excluded_loci_file = open(WORK+"outfiles/"+outname+".excluded_loci", "w")
    
    for excludechunk in unaligns:
        excludedata = open(excludechunk, "r")
        for lines in excludedata:
            excluded_loci_file.write(lines)
        excludedata.close()
        os.remove(excludechunk)
    
    excluded_loci_file.close()
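
splitandalign() farms its chunks out through Worker objects and an alignFUNC callable, both defined elsewhere in the codebase. The usage pattern above (put one argument list per chunk on work_queue, start and join the workers, then read one integer result per chunk from result_queue) suggests something like the following sketch; WorkerSketch and its queue handling are assumptions, not the actual pyrad class:

import multiprocessing
import Queue   # the module is named `queue` in Python 3

class WorkerSketch(multiprocessing.Process):
    """Run `func` on every argument list found on work_queue."""

    def __init__(self, work_queue, result_queue, func):
        multiprocessing.Process.__init__(self)
        self.work_queue = work_queue
        self.result_queue = result_queue
        self.func = func

    def run(self):
        while True:
            try:
                args = self.work_queue.get_nowait()
            except Queue.Empty:
                break
            # one result per work item, e.g. the number of loci aligned
            self.result_queue.put(self.func(*args))
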
Example #5
def splitandalign(ingroup, minspecies, outname, infile, MAXpoly, parallel, s1,
                  s2, muscle, exclude, overhang, WORK, CUT, a1, a2, datatype,
                  longname, nloci, formats):
    """ split cluster file into smaller files depending on the number
    of processors and align each file separately using alignfunc function."""

    ## double check that old chunk and aligns are removed
    for i in glob.glob(WORK + ".align*"):
        os.remove(i)
    for i in glob.glob(WORK + ".chunk*"):
        os.remove(i)

    ## read infile, split into chunks for aligning, nchuncks
    ## depends on number of available processors
    data = gzip.open(infile, 'rb').read().strip().split("//\n")
    minpar = max(3, parallel)  ## pp
    chunks = [0 + (len(data) / minpar) * i for i in range(minpar)]

    for i in range(len(chunks) - 1):
        with open(WORK + ".chunk_" + str(i), 'w') as dat:
            dat.write("//\n//\n".join(data[chunks[i]:chunks[i + 1]]) +
                      "//\n//\n")

    ## write the last chunk
    with open(WORK + ".chunk_" + str(i + 1), 'w') as dat:
        dat.write("//\n//\n".join(data[chunks[i + 1]:]) + "//\n//\n")

    ## make alleles file
    makealleles = bool("a" in formats)

    ## set up parallel
    work_queue = multiprocessing.Queue()
    result_queue = multiprocessing.Queue()
    for handle in glob.glob(WORK + ".chunk*"):
        #work_queue.put([params, handle, ingroup,
        #                exclude, longname, quiet])
        work_queue.put([
            handle, minspecies, ingroup, MAXpoly, outname, s1, s2, muscle,
            exclude, overhang, WORK, CUT, a1, a2, datatype, longname,
            makealleles
        ])
    ## spawn workers
    jobs = []
    for i in range(minpar):
        worker = Worker(work_queue, result_queue, alignFUNC)
        jobs.append(worker)
        worker.start()
    for j in jobs:
        j.join()

    locus = 0
    for handle in glob.glob(WORK + ".chunk*"):
        locus += int(result_queue.get())

    " output loci and excluded loci and delete temp files... "
    locicounter = 1
    aligns = glob.glob(WORK + ".align*")
    aligns.sort(key=lambda x: int(x.split("_")[-1]))

    ## write loci output
    locifile = open(WORK + "outfiles/" + outname + ".loci", "w")
    for chunkfile in aligns:
        chunkdata = open(chunkfile, "r")
        for line in chunkdata:
            if line.startswith("//"):
                #line = line.replace("|\n", str(locicounter)+"|\n", 1)
                #lines = lines.replace("\n", str(locicounter)+"\n", 1)
                locifile.write("{}{}|\n".format(line.strip(), locicounter))
                locicounter += 1
            else:
                nam_, seq = line.rsplit(" ", 1)
                locifile.write("{} {}".format(nam_, seq.upper()))
        chunkdata.close()
    locifile.close()

    if makealleles:
        locicounter = 1
        ## write alleles output
        allelesfile = open(WORK + "outfiles/" + outname + ".alleles", "w")
        for chunkfile in aligns:
            chunkdata = open(chunkfile, "r")
            for line in chunkdata:
                if line.startswith("//"):
                    allelesfile.write("{}{}|\n".format(line.strip(),
                                                       locicounter))
                    locicounter += 1
                else:
                    bits = line.split(" ")
                    hap0, hap1 = breakalleles(bits[-1])
                    allelesfile.write("{}_0{}{}".format(
                        bits[0], " ".join(bits[1:-1]), hap0))
                    allelesfile.write("{}_1{}{}".format(
                        bits[0], " ".join(bits[1:-1]), hap1))
            chunkdata.close()
        allelesfile.close()

    ## clean up chunks and aligns
    chunks = glob.glob(WORK + ".chunk*") + glob.glob(WORK + ".align*")
    for handle in chunks:
        os.remove(handle)

    unaligns = glob.glob(WORK + ".not*")
    excluded_loci_file = open(WORK + "outfiles/" + outname + ".excluded_loci",
                              "w")

    for excludechunk in unaligns:
        excludedata = open(excludechunk, "r")
        for lines in excludedata:
            excluded_loci_file.write(lines)
        excludedata.close()
        os.remove(excludechunk)

    excluded_loci_file.close()
Example #6
def makecons(vsearch, ID, datatype, outg, seed, gid, minmatch, inlist, WORK, quiet, outhandle):

    " find usearch"
    if not cmd_exists(vsearch):
        print "\tcannot find usearch (or vsearch), edit path in param file"
        sys.exit()

    " make list of consens files "
    FS = [i for i in inlist if "/cat.cons" not in i]
    FS = [i for i in FS if "/cat.group" not in i]
    if not FS:
        print "no consens files found"
        sys.exit()

    " and a list including outgroups "
    fs = copy.copy(inlist)

    " are files gzipped ? "
    if any([i.endswith(".gz") for i in FS]):
        gz = ".gz"
    else:
        gz = ""

    " remove previous files if present "
    if os.path.exists(WORK + "clust" + ID + "/cat.consens_" + gid + gz):
        os.remove(WORK + "clust" + ID + "/cat.consens_" + gid + gz)
    if os.path.exists(WORK + "clust" + ID + "/cat.group_" + gid + gz):
        os.remove(WORK + "clust" + ID + "/cat.group_" + gid + gz)

    " remove outgroup sequences, add back in later to bottom after shuffling "
    if outg:
        outgroup = outg.strip().split(",")
        if len(outgroup) > 1:
            for s in outgroup:
                if WORK + "clust" + ID + "/" + s + ".consens" + gz in FS:
                    FS.remove(WORK + "clust" + ID + "/" + s + ".consens" + gz)
        else:
            outgroup = WORK + "clust" + ID + "/" + outg + ".consens" + gz
            if outgroup in FS:
                FS.remove(outgroup)

    " create file with consens seqs from all taxa in list "
    out = gzip.open(WORK + "clust" + ID + "/cat.group_" + gid + gz, "w")

    for qhandle in FS:
        if gz:
            f = gzip.open(qhandle)
        else:
            f = open(qhandle)
        k = itertools.izip(*[iter(f)] * 2)
        while 1:
            try:
                a = k.next()
            except StopIteration:
                break
            print >> out, a[0].strip() + "    " + a[1].strip()
        f.close()
    out.close()

    " message to shell "
    if gid:
        sys.stderr.write(
            "\n\tstep 6: clustering across "
            + str(len(FS))
            + " samples at "
            + `ID`
            + " similarity \n\tfor group ("
            + str(gid)
            + ") retaining seeds w/ minimum of "
            + str(minmatch)
            + " hits\n\n"
        )
    else:
        sys.stderr.write("\n\tstep 6: clustering across " + str(len(FS)) + " samples at " + ` ID ` + " similarity \n\n")

    " make list of random number and data "
    if seed:
        random.seed(seed)

    " open file for reading consensus reads grouped together in one file "
    source = gzip.open(WORK + "clust" + ID + "/cat.group_" + gid + ".gz", "r")
    " generator to add a random number next to every sequence "
    data = ((random.random(), line) for line in source)
    " sort by the random number into a list (now stored in memory)"
    randomized_data = sorted(data)
    source.close()

    " order by size while retaining randomization within size classes "
    splitlines = (line.split("    ") for _, line in randomized_data)
    equalspacers = iter("".join([i[0] + " " * (100 - len(i[0])), i[1]]) for i in splitlines)
    orderedseqs = sorted(equalspacers, key=len, reverse=True)
    k = iter(["**".join([i.split(" ")[0], i.split(" ")[-1]]) for i in orderedseqs])

    " write output to .consens_.gz file "
    ## NB: could probably speed this up
    out = gzip.open(WORK + "clust" + ID + "/cat.consens_" + gid + ".gz", "wb")
    while 1:
        try:
            a, b = k.next().split("**")
        except StopIteration:
            break
        print >> out, a + "\n" + b.strip()

    """ add outgroup taxa back onto end of file."""
    if outg:
        " append to existing consens_ file "
        outgroup = outg.strip().split(",")
        if len(outgroup) > 1:
            for s in outgroup:
                xoutg = WORK + "clust" + ID + "/" + s + ".consens.gz"
                if xoutg in fs:
                    f = gzip.open(xoutg)
                    k = itertools.izip(*[iter(f)] * 2)
                    while 1:
                        try:
                            a = k.next()
                        except StopIteration:
                            break
                        print >> out, a[0].strip() + "\n" + a[1].strip()
                    f.close()
        elif len(outgroup) == 1:
            xoutg = WORK + "clust" + ID + "/" + outgroup[0] + ".consens.gz"
            if xoutg in fs:
                f = gzip.open(xoutg)
                k = itertools.izip(*[iter(f)] * 2)
                while 1:
                    try:
                        a = k.next()
                    except StopIteration:
                        break
                    print >> out, a[0].strip() + "\n" + a[1].strip()
                f.close()
        else:
            pass
    out.close()

    """ convert ambiguity codes into a sampled haplotype for any sample
    to use for clustering, but save ambiguities for later """

    " output file"
    outhaplos = open(outhandle, "w")

    " input file "
    infile = gzip.open(WORK + "clust" + ID + "/cat.consens_" + gid + ".gz")
    lines = iter(infile.readlines())
    infile.close()

    " write to haplo files in fasta format "
    writinghaplos = []

    for line in lines:
        if ">" in line:
            writinghaplos.append(line.strip())
        else:
            allele = breakalleles(line)[0]
            writinghaplos.append(allele.strip())
    outhaplos.write("\n".join(writinghaplos))
    outhaplos.close()