def merge_alignments(outfile, tempdir="./"):
    """Merge the sub-alignments staged in *tempdir* into *outfile* with MAFFT.

    Runs ``mafft --merge`` on ``<tempdir>subMSAtable`` and
    ``<tempdir>temp.mergealn``; MAFFT's stderr goes to ``<tempdir>mafft.out``
    and its stdout becomes *outfile*.  When ``check_unaligned`` returns False
    for the result, both inputs are copied into the current working directory
    as ``problem_subMSAtable`` / ``problem_temp.mergealn`` and
    ``temp.mergealn`` is realigned without ``--merge``.  Finally every
    ``_R_`` substring is deleted from *outfile* in place with ``sed``
    (presumably the prefix ``--adjustdirection`` puts on reversed
    sequences -- confirm against the MAFFT manual).

    Args:
        outfile: Path of the alignment file to write.
        tempdir: Directory prefix holding ``subMSAtable`` and
            ``temp.mergealn``.  It is concatenated directly with the file
            names, so it must end with a path separator.

    Exits the process with status 1 if *outfile* does not exist after the
    merge command has run.
    """
    # Local import so the block stays self-contained; quoting keeps paths
    # that contain spaces or shell metacharacters intact.
    import shlex

    quoted_out = shlex.quote(outfile)
    quoted_table = shlex.quote(tempdir + "subMSAtable")
    quoted_merge = shlex.quote(tempdir + "temp.mergealn")
    quoted_log = shlex.quote(tempdir + "mafft.out")

    # str() lets nthread be configured either as an int or as a string.
    cmd = ("mafft --thread " + str(nthread)
           + " --quiet --adjustdirection --merge "
           + quoted_table + " " + quoted_merge
           + " 2> " + quoted_log + " > " + quoted_out)
    os.system(cmd)
    if not os.path.exists(outfile):
        print(colored.red("ALIGNMENT DOESN'T EXIST" + " " + emoticons.get_ran_emot("sad")))
        sys.exit(1)

    # for some buggy reason these can be unaligned, so realigning here
    # (kept as "== False": check_unaligned's return contract is not visible
    # from this function, and "not" would also fire on None)
    if check_unaligned(outfile) == False:
        print(colored.red("PROBLEM REDOING ALIGNMENT (" + outfile + ")" + " " + emoticons.get_ran_emot("sad")))
        # Keep the offending inputs around for later inspection.
        copyfile(tempdir + "subMSAtable", "problem_subMSAtable")
        copyfile(tempdir + "temp.mergealn", "problem_temp.mergealn")
        cmd = "mafft --quiet --adjustdirection " + quoted_merge + " > " + quoted_out
        os.system(cmd)

    # The in-place flag of sed takes an explicit (here empty) backup suffix
    # when the module-level "mac" flag is set.
    if mac == False:
        os.system("sed -i 's/_R_//g' " + quoted_out)
    else:
        os.system("sed -i '' 's/_R_//g' " + quoted_out)
def _taxa_reported_on_stderr(cmd):
    """Run *cmd* through the shell and return the taxa named on its stderr.

    Each non-empty stderr output is split into lines; every line is split on
    single spaces and its second field is taken as a taxon name.  The number
    of lines is printed as the number of removed tips.
    """
    proc = subprocess.Popen(cmd, shell=True, stderr=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    # communicate() drains both pipes together and waits for the child.
    # Reading stdout to EOF before touching stderr can block forever once
    # the child fills the stderr pipe.
    _, raw_err = proc.communicate()
    raw_err = raw_err.strip()
    taxa = set()
    if len(raw_err) > 0:
        lines = raw_err.decode("utf-8").split("\n")
        print(" removing", len(lines), "tips")
        for line in lines:
            taxa.add(line.split(" ")[1])
    return taxa


def make_trim_trees(alignments):
    """Build a tree per alignment and prune the taxa the trim scripts flag.

    For every path in *alignments* a tree is written next to it (``.aln``
    replaced by ``.tre``) with FastTree, then ``trim_tips.py`` and
    ``trim_internal_edges.py`` are run on that tree.  The taxa both scripts
    report on stderr are removed from the alignment with ``pxrms``, which
    writes the result to the ``.aln.ed`` counterpart of the input.

    The thresholds ``relcut``, ``abscut`` and ``abscutint`` as well as ``py``
    and ``DI`` are read from the enclosing module.

    Args:
        alignments: Iterable of alignment file paths.

    Returns:
        dict mapping each alignment that had taxa removed to the path of its
        edited ``.aln.ed`` file; untouched alignments are not included.

    Exits the process with status 1 if neither ``FastTree`` nor ``fasttree``
    is found by ``check_for_programs.which_program``.
    """
    fasttreename = "FastTree"
    if check_for_programs.which_program("FastTree") is None:
        if check_for_programs.which_program("fasttree") is not None:
            fasttreename = "fasttree"
        else:
            print(colored.red("FastTree NOT IN PATH"),
                  colored.red(emoticons.get_ran_emot("sad")))
            sys.exit(1)

    newalns = {}
    for aln in alignments:
        tree_file = aln.replace(".aln", ".tre")
        print("making tree for", aln)
        os.system(fasttreename + " -nt -gtr " + aln + " > " + tree_file
                  + " 2> /dev/null")

        removetax = set()
        removetax |= _taxa_reported_on_stderr(
            py + " " + DI + "trim_tips.py " + tree_file + " "
            + str(relcut) + " " + str(abscut))
        removetax |= _taxa_reported_on_stderr(
            py + " " + DI + "trim_internal_edges.py " + tree_file + " "
            + str(abscutint))

        if removetax:
            edited = aln.replace(".aln", ".aln.ed")
            # sorted() only makes the generated command reproducible.
            cmd = ("pxrms -s " + aln + " -n " + ",".join(sorted(removetax))
                   + " -o " + edited)
            newalns[aln] = edited
            os.system(cmd)
    return newalns
# Create one directory, each with a "clusters" subdirectory, for every kept
# node of the tree read from argv[1], rooted at the output directory argv[2].
tree = next(tree_reader.read_tree_file_iter(sys.argv[1]))
dirl = sys.argv[2] + "/"
didntmake = set()
for i in tree.iternodes(order="PREORDER"):
    # Nodes labelled unclassified/environmental are skipped, and so is every
    # node whose parent was skipped.
    if ("unclassified" in i.label or "environmental" in i.label
            or i.parent in didntmake):
        didntmake.add(i)
        continue
    # Every node except the root gets its parent's label prepended, so the
    # label doubles as the relative directory path.
    if i != tree:
        i.label = i.parent.label + "/" + i.label
    for newdir in (dirl + i.label, dirl + i.label + "/clusters"):
        try:
            os.mkdir(newdir)
        except OSError:
            # Only filesystem failures are handled here; a bare except would
            # also intercept KeyboardInterrupt and SystemExit.
            print(colored.red("PROBLEM CREATING"), newdir,
                  colored.red(emoticons.get_ran_emot("sad")))
            sys.exit(1)
log = Logger(LOGFILE) outclu = d + "/clusters/" #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here # 1 make a tempfile with the seqs that aren't in children # 2 cluster these as a single cluster # 3 add the results to the cluster directory dirs = [ os.path.join(d, o) for o in os.listdir(d) if os.path.isdir(os.path.join(d, o)) ] count = 0 for c in dirs: if "environmental" in c or "clusters" in c: continue print colored.green(" ADDING"), c, colored.green( emoticons.get_ran_emot("meh")) cur = c + "/clusters" cmd = "python " + DI + "add_clade_clusters.py " + cur + " " + outclu + " " + LOGFILE rc = subprocess.call(cmd, shell=True) if rc != 0: print colored.red(" PROBLEM ADDING CLADE"), colored.red( emoticons.get_ran_emot("sad")) sys.exit(1) if takeouttaxondups: cmd = "python " + DI + "choose_one_species_cluster_fa_aln_and_samp.py " + tablefile + " " + outclu + " .fa+.aln " + LOGFILE os.system(cmd) # NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES print colored.green(" ADDING INTERNAL SEQS"), d, colored.green( emoticons.get_ran_emot("meh")) cmd = "python " + DI + "get_internal_seqs_unrepresented_in_tips.py " + d + " " + LOGFILE os.system(cmd)
help=("Where to write the logfile.")) parser.add_argument("-f", "--tlistf", type=str, nargs=1, required=False, help=("Taxon list file.")) return parser if __name__ == "__main__": parser = generate_argparser() args = parser.parse_args(sys.argv[1:]) print( colored.blue("STARTING PYPHLAWD " + emoticons.get_ran_emot("excited"))) start = datetime.now() dirl = args.outdir[0] if dirl[-1] == "/": dirl = dirl[:-1] taxon = args.taxon[0] db = args.database[0] # This will be used to limit the taxa taxalistf = None if args.tlistf is not None: taxalistf = args.tlistf[0] print(colored.yellow("LIMITING TO TAXA IN"), taxalistf)
import tree_reader from clint.textui import colored from conf import DI from conf import py import emoticons from datetime import datetime if __name__ == "__main__": if len(sys.argv) != 6 and len(sys.argv) != 7: print("python " + sys.argv[0] + " taxon baitdir db outdir logfile [taxalist]") sys.exit(0) print( colored.blue("STARTING PYPHLAWD (baited) " + emoticons.get_ran_emot("excited"))) start = datetime.now() dirl = sys.argv[4] if dirl[-1] == "/": dirl = dirl[:-1] taxon = sys.argv[1] baitdir = sys.argv[2] db = sys.argv[3] # This will be used to limit the taxa taxalistf = None if len(sys.argv) == 7: taxalistf = sys.argv[6] print(colored.yellow("LIMITING TO TAXA IN"), sys.argv[6]) # Log file logfile = sys.argv[5]
# get the random directory so you can run multiple things in the same directory rantempdir = "TEMPDIR_" + str(random.randint(0, 100000)) + "/" print(colored.blue("CREATED"), rantempdir) os.mkdir(rantempdir) log.wac("CREATED " + rantempdir) #prepare bait baitdir = sys.argv[2] # could do samp make_blast_db_from_cluster(baitdir, rantempdir) count = 0 for root, dirs, files in os.walk(root, topdown=False): if "clusters" not in root: log.whac(root) if len(dirs) == 1: print(colored.yellow("BAIT SINGLE"), root, colored.yellow(emoticons.get_ran_emot("meh"))) log.wac("BAIT SINGLE " + root) tablename = [x for x in files if ".table" in x][0] cmd = py + " " + DI + "bait_single.py " + root + " " + logfile + " " + rantempdir os.system(cmd) else: print(colored.blue("BAIT INTERNAL"), root, colored.blue(emoticons.get_ran_emot("meh"))) log.wac("BAIT " + root) tablename = [x for x in files if ".table" in x][0] if root[-1] != "/": root = root + "/" cmd = py + " " + DI + "bait_clade.py " + root + " " + root + tablename + " " + logfile + " " + rantempdir rc = subprocess.call(cmd, shell=True) if rc != 0: print(colored.red("PROBLEM WITH CLUSTERING INTERNAL"),
for x in l: lf1 = str(x[0]) rt1 = str(x[1]) sql = "SELECT ncbi_id from taxonomy where node_rank = 'family' and name_class = 'scientific name' and left_value < " + lf1 + " and right_value > " + rt1 c.execute(sql) l = c.fetchall() for x in l: pn = str(x[0]) sql = "update taxonomy set custom_parent_id = '" + pn + "' where custom_id = '" + str( gid) + "'" c.execute(sql) fl.close() conn.commit() if __name__ == "__main__": parser = generate_argparser() args = parser.parse_args(sys.argv[1:]) print( colored.blue("STARTING PYPHLAWD " + emoticons.get_ran_emot("excited"))) dbloc = args.database[0] species = args.species[0] genus = args.genus[0] conn = sqlite3.connect(dbloc) lf, rt = get_plant_left_right(conn) add_column(conn) unmatched = process_genera(conn, genus, lf, rt) add_species(conn, species, unmatched)
outclu = d + "/clusters/"

# TODO: sequences that sit directly in this directory but in none of the
# child directories still have to be clustered and included here:
#   1. write those sequences to a temporary file
#   2. cluster them as a single cluster
#   3. add the result to the cluster directory

# Full paths of all subdirectories of d.
dirs = [
    path
    for path in (os.path.join(d, o) for o in os.listdir(d))
    if os.path.isdir(path)
]

count = 0
for c in dirs:
    # Paths mentioning "environmental" or "clusters" are not child clades.
    if not ("environmental" in c or "clusters" in c):
        print(colored.green(" ADDING"), c,
              colored.green(emoticons.get_ran_emot("meh")))
        cur = c + "/clusters"
        cmd = "".join([py, " ", DI, "add_clade_clusters.py ", cur, " ",
                       outclu, " ", LOGFILE, " ", TEMPDIR])
        rc = subprocess.call(cmd, shell=True)
        # A failing child clade aborts the whole run.
        if rc != 0:
            print(colored.red(" PROBLEM ADDING CLADE"),
                  colored.red(emoticons.get_ran_emot("sad")))
            sys.exit(1)

if takeouttaxondups:
    cmd = "".join([py, " ", DI,
                   "choose_one_species_cluster_fa_aln_and_samp.py ",
                   tablefile, " ", outclu, " .fa+.aln ", LOGFILE])
    os.system(cmd)

# NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES
print(colored.green(" ADDING INTERNAL SEQS"), d,
      colored.green(emoticons.get_ran_emot("meh")))
cmd = "".join([py, " ", DI, "get_internal_seqs_unrepresented_in_tips.py ",
               d, " ", LOGFILE])
os.system(cmd)
else: TEMPDIR = sys.argv[4] if TEMPDIR[-1] != "/": TEMPDIR += "/" outclu = d+"/clusters/" #TODO: need to make sure that the seqs that are in the DIR that aren't in the children get clustered and included here # 1 make a tempfile with the seqs that aren't in children # 2 cluster these as a single cluster # 3 add the results to the cluster directory dirs = [os.path.join(d,o) for o in os.listdir(d) if os.path.isdir(os.path.join(d,o))] joinseqs = {} for c in dirs: if "environmental" in c or "clusters" in c: continue print(colored.green(" ADDING"),c,colored.green(emoticons.get_ran_emot("meh"))) cur = c+"/clusters" for i in os.listdir(cur): if i[-3:] == ".fa": if i not in joinseqs: joinseqs[i] = [] joinseqs[i].append(cur+"/"+i) for i in joinseqs: print(colored.green(" MERGING"),i,colored.green(emoticons.get_ran_emot("meh"))) write_fasta_file(joinseqs[i],outclu+i) wmtt([j.replace(".fa",".aln") for j in joinseqs[i]],TEMPDIR) merge_alignments(outclu+i.replace(".fa",".aln"),TEMPDIR) """ cmd = "python "+DI+"get_internal_seqs_unrepresented_in_tips.py "+d+" "+LOGFILE os.system(cmd)
sys.exit(0) root = sys.argv[1] logfile = sys.argv[2] log = Logger(logfile) # get the random directory so you can run multiple things in the same directory rantempdir = "TEMPDIR_"+str(random.randint(0,100000))+"/" print(colored.blue("CREATED"),rantempdir) os.mkdir(rantempdir) log.wac("CREATED "+rantempdir) count = 0 for root, dirs, files in os.walk(root,topdown=False): if "clusters" not in root: log.whac(root) if len(dirs) == 1: print(colored.yellow("CLUSTERING SINGLE"),root,colored.yellow(emoticons.get_ran_emot("meh"))) log.wac("CLUSTERING SINGLE "+root) tablename = [x for x in files if ".table" in x][0] cmd = py+" "+DI+"cluster_single_wc.py "+root+" "+logfile os.system(cmd) else: print(colored.blue("CLUSTERING INTERNAL"),root,colored.blue(emoticons.get_ran_emot("meh"))) log.wac("CLUSTERING INTERNAL "+root) tablename = [x for x in files if ".table" in x][0] if root[-1] != "/": root = root+"/" cmd = py+" "+DI+"cluster_internal_wc.py "+root+ " "+root+tablename+" "+logfile+" "+rantempdir rc = subprocess.call(cmd,shell=True) if rc != 0: print(colored.red("PROBLEM WITH CLUSTERING INTERNAL"),colored.red(emoticons.get_ran_emot("sad"))) sys.exit(1)
# An optional fifth argument names the temporary directory; it is stored
# with a trailing slash because file names are appended to it directly.
if len(sys.argv) == 5:
    TEMPDIR = sys.argv[4]
    if TEMPDIR[-1] != "/":
        TEMPDIR = TEMPDIR + "/"

outclu = d + "/clusters/"

# TODO: sequences that sit directly in this directory but in none of the
# child directories still have to be clustered and included here:
#   1. write those sequences to a temporary file
#   2. cluster them as a single cluster
#   3. add the result to the cluster directory

# Full paths of all subdirectories of d.
dirs = [
    path
    for path in (os.path.join(d, o) for o in os.listdir(d))
    if os.path.isdir(path)
]

count = 0
for c in dirs:
    # Paths mentioning "environmental" or "clusters" are not child clades.
    if not ("environmental" in c or "clusters" in c):
        print(colored.green(" ADDING"), c,
              colored.green(emoticons.get_ran_emot("meh")))
        cur = c + "/clusters"
        cmd = "".join([py, " ", DI, "add_clade_clusters.py ", cur, " ",
                       outclu, " ", LOGFILE, " ", TEMPDIR])
        rc = subprocess.call(cmd, shell=True)
        # A failing child clade aborts the whole run.
        if rc != 0:
            print(colored.red(" PROBLEM ADDING CLADE"),
                  colored.red(emoticons.get_ran_emot("sad")))
            sys.exit(1)

if takeouttaxondups:
    cmd = "".join([py, " ", DI,
                   "choose_one_species_cluster_fa_aln_and_samp_wc.py ",
                   tablefile, " ", outclu, " .fa+.aln ", LOGFILE])
    os.system(cmd)

# NEED TO DO SOMETHING ABOUT THE ALIGNMENT FILES
print(colored.green(" ADDING INTERNAL SEQS"), d,
      colored.green(emoticons.get_ran_emot("meh")))
cmd = "".join([py, " ", DI, "get_internal_seqs_unrepresented_in_tips.py ",
               d, " ", LOGFILE])
os.system(cmd)

# Fold the internal sequences gathered above into the merged clusters.
cmd = "".join([py, " ", DI, "add_internal_seqs_to_clusters.py ", d, " ",
               outclu, " ", LOGFILE, " ", TEMPDIR])
os.system(cmd)
import os import sys from clint.textui import colored from datetime import datetime from conf import DI from conf import py import emoticons import tree_reader if __name__ == "__main__": if len(sys.argv) != 5 and len(sys.argv) != 6: print("python "+sys.argv[0]+" taxon db outdir logfile [taxalist]") sys.exit(0) print(colored.blue("STARTING PYPHLAWD "+emoticons.get_ran_emot("excited"))) start = datetime.now() dirl = sys.argv[3] if dirl[-1] == "/": dirl = dirl[:-1] taxon = sys.argv[1] db = sys.argv[2] # This will be used to limit the taxa taxalistf = None if len(sys.argv) == 6: taxalistf = sys.argv[5] print(colored.yellow("LIMITING TO TAXA IN"),sys.argv[5]) # Log file logfile = sys.argv[4] if logfile[-len(".md.gz"):] != ".md.gz":