def merge_alignments(outfile,tempdir="./"):
    """Merge the sub-alignments staged in ``tempdir`` into ``outfile`` with mafft.

    Runs ``mafft --merge`` on ``subMSAtable`` and ``temp.mergealn`` from
    ``tempdir``; mafft's stderr is redirected to ``mafft.out`` in the same
    directory. The process exits with status 1 when ``outfile`` does not
    exist afterwards. If ``check_unaligned(outfile)`` compares equal to
    ``False``, both inputs are copied to ``problem_*`` files in the current
    directory and ``temp.mergealn`` is realigned without ``--merge``.
    Finally every ``_R_`` occurrence is removed from ``outfile`` in place
    with sed (presumably the prefix ``--adjustdirection`` adds -- confirm).

    Relies on the module-level names ``nthread`` and ``mac``.
    """
    merge_cmd = "mafft --thread "+nthread+" --quiet --adjustdirection --merge "+tempdir+"subMSAtable "+tempdir+"temp.mergealn 2> "+tempdir+"mafft.out > "+outfile
    os.system(merge_cmd)
    if not os.path.exists(outfile):
        print(colored.red("ALIGNMENT DOESN'T EXIST"+" "+emoticons.get_ran_emot("sad")))
        sys.exit(1)

    # the merged result is sometimes left unaligned, so it is redone from scratch
    needs_redo = check_unaligned(outfile) == False
    if needs_redo:
        print(colored.red("PROBLEM REDOING ALIGNMENT ("+outfile+")"+" "+emoticons.get_ran_emot("sad")))
        # keep copies of the offending inputs in the working directory
        for src, dst in ((tempdir+"subMSAtable","problem_subMSAtable"),
                         (tempdir+"temp.mergealn","problem_temp.mergealn")):
            copyfile(src,dst)
        os.system("mafft --quiet --adjustdirection "+tempdir+"temp.mergealn > "+outfile)

    # the mac branch hands sed an explicit empty backup suffix after -i
    sed_cmd = "sed -i 's/_R_//g' "+outfile if mac == False else "sed -i '' 's/_R_//g' "+outfile
    os.system(sed_cmd)
Beispiel #2
0
def _removed_taxa_from(cmd):
    """Run a trimming script through the shell and return the taxa it reports.

    The script's stderr is split into lines and the second space-separated
    field of every line is taken as a taxon name. Returns an empty set when
    the script writes nothing to stderr. Its stdout is collected and ignored.
    """
    proc = subprocess.Popen(cmd,
                            shell=True,
                            stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    # communicate() drains both pipes together; reading stdout to EOF before
    # touching stderr can block forever once the child fills the stderr pipe.
    _, err = proc.communicate()
    err = err.strip()
    if len(err) == 0:
        return set()
    lines = err.decode("utf-8").split("\n")
    print("  removing", len(lines), "tips")
    return {line.split(" ")[1] for line in lines}


def make_trim_trees(alignments):
    """Build a FastTree tree per alignment and prune the taxa flagged on it.

    For each alignment path in ``alignments`` this:
      1. runs FastTree (``-nt -gtr``) into the matching ``.tre`` file,
      2. runs ``trim_tips.py`` and ``trim_internal_edges.py`` on that tree
         and gathers the taxa each one reports on stderr,
      3. if any taxa were reported, runs ``pxrms`` to write a copy of the
         alignment without them to ``<alignment>.ed``.

    Exits the process with status 1 when neither ``FastTree`` nor
    ``fasttree`` is found by ``check_for_programs.which_program``.

    Returns a dict mapping each alignment that had taxa removed to the path
    of its edited copy; alignments with nothing removed are absent.
    """
    fasttreename = "FastTree"
    if check_for_programs.which_program("FastTree") is None:
        if check_for_programs.which_program("fasttree") is not None:
            fasttreename = "fasttree"
        else:
            print(colored.red("FastTree NOT IN PATH"),
                  colored.red(emoticons.get_ran_emot("sad")))
            sys.exit(1)
    newalns = {}
    for i in alignments:
        print("making tree for", i)
        treefile = i.replace(".aln", ".tre")
        cmd = fasttreename + " -nt -gtr " + i + " > " + treefile + " 2> /dev/null"
        os.system(cmd)

        # taxa flagged by the tip trimmer, then by the internal-edge trimmer
        removetax = _removed_taxa_from(
            py + " " + DI + "trim_tips.py " + treefile + " " + str(relcut) +
            " " + str(abscut))
        removetax |= _removed_taxa_from(
            py + " " + DI + "trim_internal_edges.py " + treefile + " " +
            str(abscutint))

        if len(removetax) > 0:
            edited = i.replace(".aln", ".aln.ed")
            cmd = "pxrms -s " + i + " -n " + ",".join(
                list(removetax)) + " -o " + edited
            newalns[i] = edited
            os.system(cmd)

    return newalns
Beispiel #3
0
    # Mirror the tree read from argv[1] as nested directories under argv[2]:
    # every kept node gets a directory named by its slash-joined label path,
    # plus a "clusters" subdirectory inside it.
    tree = next(tree_reader.read_tree_file_iter(sys.argv[1]))
    dirl = sys.argv[2] + "/"

    # Nodes that were skipped; anything whose parent is in here is skipped too.
    didntmake = set()

    for i in tree.iternodes(order="PREORDER"):
        if ("unclassified" in i.label or "environmental" in i.label
                or i.parent in didntmake):
            didntmake.add(i)
            continue
        # Non-root labels are rewritten to the full path from the root, so
        # children visited later build on the already-prefixed parent label.
        if i != tree:
            i.label = i.parent.label + "/" + i.label
        for newdir in (dirl + i.label, dirl + i.label + "/clusters"):
            try:
                os.mkdir(newdir)
            except OSError:
                # Only filesystem failures are reported here; a bare except
                # would also swallow KeyboardInterrupt and SystemExit.
                print(colored.red("PROBLEM CREATING"), newdir,
                      colored.red(emoticons.get_ran_emot("sad")))
                sys.exit(1)
Beispiel #4
0
 # Merge the clusters of each child directory of d into d's own clusters/
 # directory by running helper scripts found under DI.
 # NOTE(review): this fragment uses Python 2 print statements and a hard-coded
 # "python" interpreter, while other variants in this file use print() and
 # `py` -- confirm which interpreter this copy is meant for.
 log = Logger(LOGFILE)
 outclu = d + "/clusters/"
 #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here
 #   1 make a tempfile with the seqs that aren't in children
 #   2 cluster these as a single cluster
 #   3 add the results to the cluster directory
 # Immediate subdirectories of d, as joined paths.
 dirs = [
     os.path.join(d, o) for o in os.listdir(d)
     if os.path.isdir(os.path.join(d, o))
 ]
 count = 0
 for c in dirs:
     # Skip any path containing "environmental" or "clusters".
     if "environmental" in c or "clusters" in c:
         continue
     print colored.green("  ADDING"), c, colored.green(
         emoticons.get_ran_emot("meh"))
     cur = c + "/clusters"
     cmd = "python " + DI + "add_clade_clusters.py " + cur + " " + outclu + " " + LOGFILE
     rc = subprocess.call(cmd, shell=True)
     # A non-zero exit status from the helper aborts the whole run.
     if rc != 0:
         print colored.red("  PROBLEM ADDING CLADE"), colored.red(
             emoticons.get_ran_emot("sad"))
         sys.exit(1)
     # Runs after every added clade; the exit status of this helper is not checked.
     if takeouttaxondups:
         cmd = "python " + DI + "choose_one_species_cluster_fa_aln_and_samp.py " + tablefile + " " + outclu + " .fa+.aln " + LOGFILE
         os.system(cmd)
     # NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES
 print colored.green("   ADDING INTERNAL SEQS"), d, colored.green(
     emoticons.get_ran_emot("meh"))
 # Final step, run once after all child directories have been processed.
 cmd = "python " + DI + "get_internal_seqs_unrepresented_in_tips.py " + d + " " + LOGFILE
 os.system(cmd)
Beispiel #5
0
                        help=("Where to write the logfile."))
    parser.add_argument("-f",
                        "--tlistf",
                        type=str,
                        nargs=1,
                        required=False,
                        help=("Taxon list file."))
    return parser


if __name__ == "__main__":
    # Script entry point: read the command-line options and announce the run.
    parser = generate_argparser()
    args = parser.parse_args(sys.argv[1:])

    print(colored.blue("STARTING PYPHLAWD " + emoticons.get_ran_emot("excited")))
    # Timestamp taken right after the banner.
    start = datetime.now()

    # Output directory, minus a single trailing slash if one was given.
    dirl = args.outdir[0]
    dirl = dirl[:-1] if dirl[-1] == "/" else dirl

    taxon, db = args.taxon[0], args.database[0]

    # Optional taxon list file used to limit the taxa (None when not given).
    taxalistf = None if args.tlistf is None else args.tlistf[0]
    if args.tlistf is not None:
        print(colored.yellow("LIMITING TO TAXA IN"), taxalistf)
import tree_reader
from clint.textui import colored
from conf import DI
from conf import py
import emoticons
from datetime import datetime

if __name__ == "__main__":
    # Five positional arguments are required; a sixth (taxalist) is optional.
    if len(sys.argv) not in (6, 7):
        print("python " + sys.argv[0] +
              " taxon baitdir db outdir logfile [taxalist]")
        sys.exit(0)

    print(colored.blue("STARTING PYPHLAWD (baited) " + emoticons.get_ran_emot("excited")))
    start = datetime.now()

    # Positional arguments in command-line order.
    taxon, baitdir, db, dirl, logfile = sys.argv[1:6]

    # Output directory, minus a single trailing slash if one was given.
    if dirl[-1] == "/":
        dirl = dirl[:-1]

    # This will be used to limit the taxa
    taxalistf = sys.argv[6] if len(sys.argv) == 7 else None
    if taxalistf is not None:
        print(colored.yellow("LIMITING TO TAXA IN"), sys.argv[6])
Beispiel #7
0
 # Create a scratch directory, build a BLAST database from the bait clusters,
 # then walk the directory tree bottom-up and run a bait script per directory.
 # get the random directory so you can run multiple things in the same directory
 # NOTE(review): the name is a random integer suffix; os.mkdir will raise if
 # that directory already exists.
 rantempdir = "TEMPDIR_" + str(random.randint(0, 100000)) + "/"
 print(colored.blue("CREATED"), rantempdir)
 os.mkdir(rantempdir)
 log.wac("CREATED " + rantempdir)
 #prepare bait
 baitdir = sys.argv[2]
 # could do samp
 make_blast_db_from_cluster(baitdir, rantempdir)
 count = 0
 # topdown=False: a directory is yielded after its subdirectories.
 # The loop variable reuses the name `root` that seeds the walk.
 for root, dirs, files in os.walk(root, topdown=False):
     # Skip any directory whose path contains "clusters".
     if "clusters" not in root:
         log.whac(root)
         # Exactly one subdirectory -> bait_single.py; otherwise bait_clade.py.
         if len(dirs) == 1:
             print(colored.yellow("BAIT SINGLE"), root,
                   colored.yellow(emoticons.get_ran_emot("meh")))
             log.wac("BAIT SINGLE " + root)
             # First file whose name contains ".table"; raises IndexError if
             # there is none. The value is not used in this branch.
             tablename = [x for x in files if ".table" in x][0]
             cmd = py + " " + DI + "bait_single.py " + root + " " + logfile + " " + rantempdir
             os.system(cmd)
         else:
             print(colored.blue("BAIT INTERNAL"), root,
                   colored.blue(emoticons.get_ran_emot("meh")))
             log.wac("BAIT " + root)
             tablename = [x for x in files if ".table" in x][0]
             # Make sure root ends in "/" so root + tablename is a valid path.
             if root[-1] != "/":
                 root = root + "/"
             cmd = py + " " + DI + "bait_clade.py " + root + " " + root + tablename + " " + logfile + " " + rantempdir
             rc = subprocess.call(cmd, shell=True)
             if rc != 0:
                 print(colored.red("PROBLEM WITH CLUSTERING INTERNAL"),
Beispiel #8
0
            # After this loop lf1/rt1 hold the first two columns of the last
            # row in l, as strings.
            # NOTE(review): if l is empty they keep whatever value they had
            # before (or are unbound) -- confirm the query above always
            # returns a row.
            for x in l:
                lf1 = str(x[0])
                rt1 = str(x[1])
            # Find the family-rank scientific-name row whose left_value /
            # right_value range strictly encloses lf1..rt1.
            # NOTE(review): the SQL here and below is built by string
            # concatenation; the origin of `gid` is not visible in this
            # fragment -- consider parameterized queries.
            sql = "SELECT ncbi_id from taxonomy where node_rank = 'family' and name_class = 'scientific name' and left_value < " + lf1 + " and right_value > " + rt1
            c.execute(sql)
            l = c.fetchall()
            # pn ends up as the ncbi_id of the last matching row.
            for x in l:
                pn = str(x[0])
            # Record that id as the custom parent of the row with custom_id gid.
            sql = "update taxonomy set custom_parent_id = '" + pn + "' where custom_id = '" + str(
                gid) + "'"
            c.execute(sql)
    fl.close()
    # Commit once, after all rows have been updated.
    conn.commit()


if __name__ == "__main__":
    # Script entry point: parse the options, open the SQLite database and run
    # the setup steps in order against that one connection.
    parser = generate_argparser()
    args = parser.parse_args(sys.argv[1:])

    print(
        colored.blue("STARTING PYPHLAWD " + emoticons.get_ran_emot("excited")))

    # Each option is read as the first element of its parsed value.
    dbloc = args.database[0]
    species = args.species[0]
    genus = args.genus[0]

    conn = sqlite3.connect(dbloc)
    # The helper returns a pair that is passed on to process_genera below.
    lf, rt = get_plant_left_right(conn)
    add_column(conn)
    # Whatever process_genera returns is handed to add_species as `unmatched`.
    unmatched = process_genera(conn, genus, lf, rt)
    add_species(conn, species, unmatched)
    # Destination for the merged clusters of this directory.
    outclu = d + "/clusters/"
    #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here
    #   1 make a tempfile with the seqs that aren't in children
    #   2 cluster these as a single cluster
    #   3 add the results to the cluster directory
    # Immediate subdirectories of d, as joined paths.
    dirs = [
        p for p in (os.path.join(d, o) for o in os.listdir(d))
        if os.path.isdir(p)
    ]
    count = 0
    for c in dirs:
        # Only paths containing neither "environmental" nor "clusters" are added.
        if "environmental" not in c and "clusters" not in c:
            print(colored.green("  ADDING"), c,
                  colored.green(emoticons.get_ran_emot("meh")))
            cur = c + "/clusters"
            cmd = py + " " + DI + "add_clade_clusters.py " + cur + " " + outclu + " " + LOGFILE + " " + TEMPDIR
            rc = subprocess.call(cmd, shell=True)
            # A failing helper stops the whole run.
            if rc != 0:
                print(colored.red("  PROBLEM ADDING CLADE"),
                      colored.red(emoticons.get_ran_emot("sad")))
                sys.exit(1)
            if takeouttaxondups:
                cmd = py + " " + DI + "choose_one_species_cluster_fa_aln_and_samp.py " + tablefile + " " + outclu + " .fa+.aln " + LOGFILE
                os.system(cmd)
            # NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES
    print(colored.green("   ADDING INTERNAL SEQS"), d,
          colored.green(emoticons.get_ran_emot("meh")))
    # Run once after every child directory has been handled.
    cmd = py + " " + DI + "get_internal_seqs_unrepresented_in_tips.py " + d + " " + LOGFILE
    os.system(cmd)
Beispiel #10
0
    else:
        TEMPDIR = sys.argv[4]
        if TEMPDIR[-1] != "/":
            TEMPDIR += "/"
    
    # Destination for the joined cluster files of this directory.
    outclu = d+"/clusters/"
    #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here
    #   1 make a tempfile with the seqs that aren't in children
    #   2 cluster these as a single cluster
    #   3 add the results to the cluster directory
    # Immediate subdirectories of d, as joined paths.
    dirs = [p for p in (os.path.join(d,o) for o in os.listdir(d)) if os.path.isdir(p)]
    # .fa file name -> paths of the files with that name in each child's clusters/
    joinseqs = {}
    for c in dirs:
        # Only paths containing neither "environmental" nor "clusters" are used.
        if "environmental" not in c and "clusters" not in c:
            print(colored.green("  ADDING"),c,colored.green(emoticons.get_ran_emot("meh")))
            cur =  c+"/clusters"
            for i in os.listdir(cur):
                if i.endswith(".fa"):
                    joinseqs.setdefault(i,[]).append(cur+"/"+i)
    # Per file name: write the joined fasta, then merge the matching .aln files.
    for i, members in joinseqs.items():
        print(colored.green("    MERGING"),i,colored.green(emoticons.get_ran_emot("meh")))
        write_fasta_file(members,outclu+i)
        wmtt([j.replace(".fa",".aln") for j in members],TEMPDIR)
        merge_alignments(outclu+i.replace(".fa",".aln"),TEMPDIR)

    """
    cmd = "python "+DI+"get_internal_seqs_unrepresented_in_tips.py "+d+" "+LOGFILE
    os.system(cmd)
Beispiel #11
0
     sys.exit(0)
 
 # Inputs: the directory tree to cluster and the log file to write to.
 root = sys.argv[1]
 logfile = sys.argv[2]
 log = Logger(logfile)
 # scratch directory with a random suffix so several runs can share one working directory
 rantempdir = "TEMPDIR_"+str(random.randint(0,100000))+"/"
 print(colored.blue("CREATED"),rantempdir)
 os.mkdir(rantempdir)
 log.wac("CREATED "+rantempdir)
 count = 0
 # topdown=False: a directory is yielded after its subdirectories
 for root, dirs, files in os.walk(root,topdown=False):
     # directories with "clusters" in their path are not processed
     if "clusters" in root:
         continue
     log.whac(root)
     if len(dirs) != 1:
         print(colored.blue("CLUSTERING INTERNAL"),root,colored.blue(emoticons.get_ran_emot("meh")))
         log.wac("CLUSTERING INTERNAL "+root)
         tablename = [x for x in files if ".table" in x][0]
         # root must end in "/" before the table name is appended to it
         if root[-1] != "/":
             root = root+"/"
         cmd = py+" "+DI+"cluster_internal_wc.py "+root+ " "+root+tablename+" "+logfile+" "+rantempdir
         rc = subprocess.call(cmd,shell=True)
         if rc != 0:
             print(colored.red("PROBLEM WITH CLUSTERING INTERNAL"),colored.red(emoticons.get_ran_emot("sad")))
             sys.exit(1)
     else:
         # exactly one subdirectory: hand the directory to the single-clade script
         print(colored.yellow("CLUSTERING SINGLE"),root,colored.yellow(emoticons.get_ran_emot("meh")))
         log.wac("CLUSTERING SINGLE "+root)
         tablename = [x for x in files if ".table" in x][0]
         cmd = py+" "+DI+"cluster_single_wc.py "+root+" "+logfile
         os.system(cmd)
Beispiel #12
0
    # Optional fourth argument: temp directory, given a trailing "/" if missing.
    if len(sys.argv) == 5:
        TEMPDIR = sys.argv[4] if sys.argv[4][-1] == "/" else sys.argv[4]+"/"

    # Destination for the merged clusters of this directory.
    outclu = d+"/clusters/"
    #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here
    #   1 make a tempfile with the seqs that aren't in children
    #   2 cluster these as a single cluster
    #   3 add the results to the cluster directory
    # Immediate subdirectories of d, as joined paths.
    dirs = [p for p in (os.path.join(d,o) for o in os.listdir(d)) if os.path.isdir(p)]
    count = 0
    for c in dirs:
        # Only paths containing neither "environmental" nor "clusters" are added.
        if "environmental" not in c and "clusters" not in c:
            print(colored.green("  ADDING"),c,colored.green(emoticons.get_ran_emot("meh")))
            cur =  c+"/clusters"
            cmd = py+" "+DI+"add_clade_clusters.py "+cur+" "+outclu+" "+LOGFILE+" "+TEMPDIR
            rc = subprocess.call(cmd, shell=True)
            # A failing helper stops the whole run.
            if rc != 0:
                print(colored.red("  PROBLEM ADDING CLADE"),colored.red(emoticons.get_ran_emot("sad")))
                sys.exit(1)
            if takeouttaxondups:
                cmd = py+" "+DI+"choose_one_species_cluster_fa_aln_and_samp_wc.py "+tablefile+" "+outclu+" .fa+.aln "+LOGFILE
                os.system(cmd)
            # NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES
    print(colored.green("   ADDING INTERNAL SEQS"),d,colored.green(emoticons.get_ran_emot("meh")))
    # Two follow-up helpers, run in this order once all children are merged.
    for cmd in (py+" "+DI+"get_internal_seqs_unrepresented_in_tips.py "+d+" "+LOGFILE,
                py+" "+DI+"add_internal_seqs_to_clusters.py "+d+" "+outclu+" "+LOGFILE+" "+TEMPDIR):
        os.system(cmd)
Beispiel #13
0
import os
import sys
from clint.textui import colored
from datetime import datetime

from conf import DI
from conf import py
import emoticons
import tree_reader

if __name__ == "__main__":
    # Four positional arguments are required; a fifth (taxalist) is optional.
    if len(sys.argv) not in (5, 6):
        print("python "+sys.argv[0]+" taxon db outdir logfile [taxalist]")
        sys.exit(0)

    print(colored.blue("STARTING PYPHLAWD "+emoticons.get_ran_emot("excited")))
    start = datetime.now()

    # Positional arguments in command-line order (logfile included).
    taxon, db, dirl, logfile = sys.argv[1:5]

    # Output directory, minus a single trailing slash if one was given.
    if dirl[-1] == "/":
        dirl = dirl[:-1]

    # This will be used to limit the taxa
    taxalistf = sys.argv[5] if len(sys.argv) == 6 else None
    if taxalistf is not None:
        print(colored.yellow("LIMITING TO TAXA IN"),sys.argv[5])

    # Log file
    if logfile[-len(".md.gz"):] != ".md.gz":