def construct_db_of_sister(taxon, DB, outprefix):
    """Write the sequences for ``taxon`` to temp files and index them.

    ``mfid`` is called with ``remove_genomes=True`` to produce
    ``<outprefix>.tempfa`` and ``<outprefix>.temptable``. The table is then
    read back to build two lookups, and the FASTA file is loaded with
    ``seq.read_fasta_file_return_dict``.

    Args:
        taxon: Taxon identifier, passed through to ``mfid``.
        DB: Database handle/path, passed through to ``mfid``.
        outprefix: Path prefix for the two temporary output files.

    Returns:
        A 5-tuple ``(outfile, outfiletbl, seq_taxon_dict, taxon_seqs_dict,
        seqs_dict)`` where ``seq_taxon_dict`` maps table field 3 to table
        field 1, ``taxon_seqs_dict`` maps table field 1 to the set of
        field-3 values seen with it, and ``seqs_dict`` is whatever
        ``seq.read_fasta_file_return_dict`` returns for the FASTA file.

    Raises:
        IndexError: If a table line has fewer than four tab-separated
            fields.
    """
    outfile = outprefix + ".tempfa"
    outfiletbl = outprefix + ".temptable"
    mfid(taxon, DB, outfile, outfiletbl, remove_genomes=True)

    seq_taxon_dict = {}
    taxon_seqs_dict = {}
    # The context manager closes the table even if a line fails to parse.
    with open(outfiletbl, "r") as table:
        for line in table:
            fields = line.strip().split("\t")
            # NOTE(review): field 1 is presumably the taxon id and field 3
            # the sequence id, judging by the dict names -- confirm against
            # the table writer.
            taxon_key, seq_key = fields[1], fields[3]
            seq_taxon_dict[seq_key] = taxon_key
            taxon_seqs_dict.setdefault(taxon_key, set()).add(seq_key)

    seqs_dict = seq.read_fasta_file_return_dict(outfile)
    return outfile, outfiletbl, seq_taxon_dict, taxon_seqs_dict, seqs_dict
def add_updated_seqs_to_dir(dirl, tid, DB, outfilehead, internal=False):
    """Write sequences that are in the database but not yet in ``dirl``.

    Every file in ``dirl`` whose name contains ``.table`` is scanned and the
    third tab-separated field of each line is collected as an already-known
    id. The current records for ``tid`` are then fetched (``mfid`` or, when
    ``internal`` is truthy, ``mfid_internal``). Records whose id is not yet
    known are written to two new files next to the main table:
    ``<main>.table.<outfilehead>`` and ``<main>.fas.<outfilehead>``.
    If nothing is new, ``no update`` is printed to stderr and no file is
    written.

    Args:
        dirl: Directory holding the existing ``.table`` files.
        tid: Taxon identifier, passed through to the fetch function.
        DB: Database handle/path, passed through to the fetch function.
        outfilehead: Suffix appended to the names of the new files.
        internal: Use ``mfid_internal`` instead of ``mfid`` when truthy.

    Raises:
        FileNotFoundError: If there are new records but ``dirl`` contains no
            file whose name ends in ``.table`` to derive output names from.
        IndexError: If a table line has fewer than three tab-separated
            fields.
    """
    print(dirl)
    oldids = set()
    maintablename = None
    for fname in os.listdir(dirl):
        # The main table is the file that *ends* in ".table"; its name is
        # the stem for the update files written below.
        if fname.endswith(".table"):
            maintablename = fname
        # Substring match on purpose: earlier update files are named
        # "<main>.table.<suffix>", and their ids count as known too.
        if ".table" in fname:
            print("reading:", fname, file=sys.stderr)
            with open(os.path.join(dirl, fname), "r") as tablef:
                for line in tablef:
                    oldids.add(line.strip().split("\t")[2])

    fetch = mfid_internal if internal else mfid
    seqs, tbls = fetch(tid, DB, None, None)

    newid_info = {}  # key is id, value is the full table row
    newid_seq = {}  # key is id, value is the sequence text
    for seqtext, row in zip(seqs, tbls):
        seqid = row.split("\t")[2]
        newid_info[seqid] = row
        newid_seq[seqid] = seqtext

    diff = newid_info.keys() - oldids
    if not diff:
        print("no update", file=sys.stderr)
        return

    if maintablename is None:
        raise FileNotFoundError(
            "no file ending in '.table' found in " + repr(dirl)
            + "; cannot name the update files"
        )

    newtable = os.path.join(dirl, maintablename + "." + outfilehead)
    newseqfn = os.path.join(
        dirl, maintablename.replace(".table", ".fas") + "." + outfilehead
    )
    print("writing:", newtable, file=sys.stderr)
    print("writing:", newseqfn, file=sys.stderr)
    with open(newtable, "w") as newtablef, open(newseqfn, "w") as newseqf:
        # Sorted so the output is reproducible from run to run.
        for seqid in sorted(diff):
            newtablef.write(newid_info[seqid] + "\n")
            newseqf.write(newid_seq[seqid] + "\n")
# Walk the tree and write a .fas/.table pair for every node that is kept.
# Nodes labelled "unclassified" or "environmental", and any node whose parent
# was skipped, are recorded in ``didntmake`` and produce no files.
didntmake = set()
for i in tree.iternodes():
    excluded = (
        "unclassified" in i.label
        or "environmental" in i.label
        or i.parent in didntmake
    )
    if excluded:
        didntmake.add(i)
        continue

    orig = i.label
    # Every node except the root gets its parent's label prepended, so the
    # label doubles as a relative directory path.
    # NOTE(review): this relies on iternodes() yielding parents before
    # children -- confirm.
    if i != tree:
        i.label = i.parent.label + "/" + i.label
    # The id is whatever follows the last "_" in the original label.
    tid = orig.split("_")[-1]
    dirr = i.label

    outbase = dirl + dirr + "/" + orig
    # Childless nodes go through mfid, nodes with children through mfid_in.
    writer = mfid if len(i.children) == 0 else mfid_in
    writer(tid, DB, outbase + ".fas", outbase + ".table", True,
           limitlist=taxalist)