def get_hmm_refalignment(self):
    """Trim the reference alignment (self.refalign) to the columns mapped by
    the match states of the HMMER profile (self.refprofile), writing the
    result to self.trimed.

    Returns a tuple (self.trimed, number_of_kept_columns).
    """
    sites = []
    hmp = open(self.refprofile)
    l = hmp.readline()
    start = False
    # Scan the profile line by line; 'start' flips to True once the
    # "HMM " section header has been seen.
    while l != "":
        if l.startswith("//"):
            # end-of-profile marker
            break
        if start:
            # Match-state line: field 6 is taken as the original alignment
            # column this state was built from (the HMMER MAP annotation —
            # assumes the profile carries map information; TODO confirm).
            l = l.strip()
            ll = l.split()
            usedsite = int(ll[5])
            sites.append(usedsite)
            # Skip the two companion lines of this state block
            # (insert emissions and state transitions).
            l = hmp.readline()
            l = hmp.readline()
        else:
            if l.startswith("HMM "):
                start = True
                # Skip the remaining header lines before the first state block.
                l = hmp.readline()
                l = hmp.readline()
                l = hmp.readline()
                l = hmp.readline()
        l = hmp.readline()
    hmp.close()
    align = SeqGroup(self.refalign)
    fout = open(self.trimed, "w")
    # Write each sequence keeping only the profile-mapped columns.
    # MAP indices are 1-based, hence pos - 1.
    for entr in align.get_entries():
        fout.write(">" + entr[0] + "\n")
        for pos in sites:
            fout.write(entr[1][pos - 1])
        fout.write("\n")
    fout.close()
    return self.trimed, len(sites)
def trim_refalign_hmm(refaln, hmmprofile):
    """Trim the reference alignment `refaln` to the columns mapped by the
    match states of the HMMER profile `hmmprofile`.

    The trimmed alignment is written to `<refaln>.trimed.afa` in FASTA
    format.  Returns a tuple (output_path, number_of_kept_columns).
    """
    sites = []
    # 'with' guarantees the profile handle is closed even if parsing raises
    # (e.g. a malformed line makes int(ll[5]) fail).
    with open(hmmprofile) as hmp:
        l = hmp.readline()
        start = False
        # 'start' flips to True once the "HMM " section header is seen.
        while l != "":
            if l.startswith("//"):
                # end-of-profile marker
                break
            if start:
                # Match-state line: field 6 is taken as the original
                # alignment column of this state (HMMER MAP annotation —
                # assumes the profile carries map info; TODO confirm).
                l = l.strip()
                ll = l.split()
                usedsite = int(ll[5])
                sites.append(usedsite)
                # Skip the two companion lines of this state block
                # (insert emissions and state transitions).
                l = hmp.readline()
                l = hmp.readline()
            else:
                if l.startswith("HMM "):
                    start = True
                    # Skip the remaining header lines before the first
                    # state block.
                    l = hmp.readline()
                    l = hmp.readline()
                    l = hmp.readline()
                    l = hmp.readline()
            l = hmp.readline()
    align = SeqGroup(refaln)
    # Hoist the output name instead of rebuilding the expression twice.
    outname = refaln + ".trimed.afa"
    with open(outname, "w") as fout:
        # MAP indices are 1-based, hence pos - 1.
        for entr in align.get_entries():
            fout.write(">" + entr[0] + "\n")
            for pos in sites:
                fout.write(entr[1][pos - 1])
            fout.write("\n")
    return outname, len(sites)
def count_reads(nfolder, pref = "me_leaf_"):
    """Sum the read counts encoded in sequence names across all per-leaf
    alignment files matching `nfolder + pref + "*"`.

    Sequence names are expected to end in "*<numread>"; the entries named
    "root_ref" or starting with "*R*" are reference sequences and are not
    counted.  Prints the total and returns it.
    """
    cnt = 0
    naligns = glob.glob(nfolder + pref + "*")
    for aln in naligns:
        a = SeqGroup(sequences = aln)
        for ent in a.get_entries():
            name = ent[0]
            # Skip the injected reference/outgroup sequences.
            if name == "root_ref" or name.startswith("*R*"):
                continue
            numread = int(name.split("*")[-1])
            cnt = cnt + numread
    # BUG FIX: was the Python-2-only statement `print cnt`, a syntax error
    # under Python 3; print(cnt) behaves the same on both.
    print(cnt)
    # Also return the total so callers can use it programmatically
    # (previously the function returned None).
    return cnt
def random_remove_taxa(falign, num_remove, num_repeat = 1):
    """Create `num_repeat` down-sampled copies of the alignment `falign`,
    each with `num_remove` randomly chosen sequences removed.

    Each copy is written to `<falign>_<num_remove>_<i>.afa` (i starting
    at 1).  Returns the list of output file names.
    """
    align = SeqGroup(sequences = falign)
    entrs = align.get_entries()
    numseq = len(entrs)
    # BUG FIX: was `index = range(numseq)`; on Python 3 a range object is
    # immutable and random.shuffle() raises — materialize a list.
    index = list(range(numseq))
    namel = []
    for i in range(num_repeat):
        newalign = SeqGroup()
        random.shuffle(index)
        # Keep everything after the first num_remove shuffled positions.
        idxs = index[num_remove:]
        for idx in idxs:
            newalign.set_seq(entrs[idx][0], entrs[idx][1])
        # Build the output name once instead of twice.
        outname = falign + "_" + repr(num_remove) + "_" + repr(i + 1) + ".afa"
        newalign.write(outfile = outname)
        namel.append(outname)
    return namel
def raxml_g_after_epa(nfolder, nref_align, suf = "ifa", T = "2"):
    """For every per-edge alignment `*.<suf>` in nfolder, build a RAxML tree
    constrained by the matching .mttree file, then cut out and save the
    subtree spanning the placed query reads as `<prefix>.subtree`.

    T is the thread-count string passed through to build_constrain_tree.
    Alignments that already have a .subtree file are skipped.
    """
    align_orgin = SeqGroup(sequences = nref_align)
    # Names of all reference sequences; everything else in the result tree
    # is a query read.
    ref_taxa = []
    for entr in align_orgin.get_entries():
        ref_taxa.append(entr[0])
    naligns = glob.glob(nfolder + "*." + suf)
    cnt = 0
    for aln in naligns:
        # progress indicator
        print(repr(cnt))
        cnt = cnt + 1
        if os.path.exists(aln.split(".")[0] + ".subtree"):
            # already processed in a previous run
            pass
        else:
            mttree = aln.split(".")[0] + ".mttree"
            #raxml constrait search
            trename = build_constrain_tree(nsfin = aln, ntfin = mttree, nfout = "i"+repr(cnt), nfolder = nfolder, num_thread = T)
            #read in the fully resolved tree
            full_tree = Tree(trename, format=1)
            all_taxa = full_tree.get_leaf_names()
            # Query reads = leaves that are not reference taxa.
            target_taxa = []
            for taxa in all_taxa:
                if taxa in ref_taxa:
                    pass
                else:
                    target_taxa.append(taxa)
            #the place where the tree can be safely rooted
            ref_node = full_tree.get_leaves_by_name(ref_taxa[0])[0]
            #reroot
            full_tree.set_outgroup(ref_node)
            #find the common ancestor of the target taxa
            leafA = full_tree.get_leaves_by_name(target_taxa[0])[0]
            leaflist = []
            for n in target_taxa[1:]:
                leaflist.append(full_tree.get_leaves_by_name(n)[0])
            common = leafA.get_common_ancestor(leaflist)
            # Detach the ancestor so it is written as its own rooted subtree.
            common.up = None
            common.write(outfile= aln.split(".")[0] + ".subtree", format=5)
            # Clean up the intermediate RAxML tree and constraint files.
            os.remove(trename)
            os.remove(mttree)
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split an EPA placement result (.jplace JSON, nfin_place) into per-edge
    alignment files, using the alignment nfin_aln for the sequences.

    Reads whose best placement edge has likelihood weight >= min_lw are
    grouped by edge number; each group becomes either a `<nfout>_leaf_N.lfa`
    alignment (placement on a leaf branch, with the reference leaf included
    as "*R*<name>") or a `<nfout>_inode_N.ifa` alignment (internal branch).
    Reads below min_lw are written to `<nfout>.discard.placement.txt`.
    Singleton internal placements are counted via count_and_pick_reads and
    logged via sp_log instead of producing an .ifa file.
    """
    # Start from a clean log and picked-OTU file.
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists(nfout + "_inode_picked_otus.fasta"):
        os.remove(nfout + "_inode_picked_otus.fasta")
    jsondata = open (nfin_place)
    align_orgin = SeqGroup(sequences = nfin_aln)
    data = json.load(jsondata)
    placements = data["placements"]
    tree = data["tree"]
    # Turn jplace edge labels {N} into NHX features (node.B) so the ete
    # parser keeps the edge numbers on the tree nodes.
    ete_tree = tree.replace("{", "[&&NHX:B=")
    ete_tree = ete_tree.replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)
    """get refseq"""
    refseqset = []
    for leaf in leaves:
        refseqset.append(leaf.name)
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
    placemap = {}
    """find how many edges are used for placement"""
    # Only the best (first) placement edge of each read is considered.
    for placement in placements:
        edges = placement["p"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            placemap[curredge] = placemap.get(curredge, [])
    """placement quality control"""
    discard_file = open(nfout+".discard.placement.txt", "w")
    """group taxa to edges"""
    for placement in placements:
        edges = placement["p"]
        taxa_names = placement["n"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            a = placemap[curredge]
            a.extend(taxa_names)
            placemap[curredge] = a
        else:
            # Low-confidence placement: record and drop.
            discard_file.write(repr(taxa_names) + "\n")
    discard_file.close()
    groups = placemap.items()
    cnt_leaf = 0
    cnt_inode = 0
    """check each edge"""
    for i,item in enumerate(groups):
        seqset_name = item[0]
        seqset = item[1]
        """check if placed on leaf node and find the node being placed on"""
        flag = False
        place_node = None
        for node in allnodes:
            # node.B holds the jplace edge number parsed above.
            if str(node.B) == str(seqset_name):
                place_node = node
                if node.is_leaf():
                    flag = True
                break
        """generate aligment"""
        if flag:
            """process leaf node placement"""
            cnt_leaf = cnt_leaf + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            place_seq = align_orgin.get_seq(place_node.name)
            newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
            newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
        else:
            # Internal-branch placement.
            cnt_inode = cnt_inode + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            if len(newalign.get_entries()) < 2:
                # A single read on an internal branch: treat as a new species.
                count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
                sp_log(sfout = logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                #for entr in refali.get_entries():
                #    sname = entr[0]
                #    seqe = entr[1]
                #    newalign.set_seq(sname, seq)
                newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split an EPA placement result (.jplace JSON, nfin_place) into per-edge
    alignment files, using the alignment nfin_aln for the sequences.

    Reads whose best placement edge has likelihood weight >= min_lw are
    grouped by edge number.  Leaf-branch groups become `<nfout>_leaf_N.lfa`
    alignments that also carry the placed reference leaf ("*R*<name>"), an
    outgroup ("root_ref") and — for singletons — a sister sequence so at
    least 4 taxa are present.  Internal-branch groups become
    `<nfout>_inode_N.ifa` alignments (reads plus the whole reference
    alignment) together with a `<nfout>_inode_N.mttree` multifurcating
    constraint tree; singleton internal groups are instead counted via
    count_and_pick_reads and logged via sp_log.  Reads below min_lw go to
    `<nfout>.discard.placement.txt`.
    """
    # Start from a clean log and picked-OTU file.
    if os.path.exists(logfile):
        os.remove(logfile)
    if os.path.exists(nfout + "_inode_picked_otus.fasta"):
        os.remove(nfout + "_inode_picked_otus.fasta")
    jsondata = open(nfin_place)
    align_orgin = SeqGroup(sequences = nfin_aln)
    data = json.load(jsondata)
    jsondata.close()  # fix: close the jplace handle once parsed
    placements = data["placements"]
    tree = data["tree"]
    # Turn jplace edge labels {N} into NHX features (node.B) so the ete
    # parser keeps the edge numbers on the tree nodes.
    ete_tree = tree.replace("{", "[&&NHX:B=")
    ete_tree = ete_tree.replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)
    # Reference sequence names = leaves of the reference tree.
    refseqset = []
    for leaf in leaves:
        refseqset.append(leaf.name)
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
    placemap = {}
    # Find how many edges are used for placement; only the best (first)
    # placement edge of each read is considered.
    for placement in placements:
        edges = placement["p"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            placemap[curredge] = placemap.get(curredge, [])
    # Placement quality control: low-confidence reads are recorded, dropped.
    discard_file = open(nfout + ".discard.placement.txt", "w")
    # Group taxa to edges.
    for placement in placements:
        edges = placement["p"]
        taxa_names = placement["n"]
        curredge = edges[0][0]
        lw = edges[0][2]
        if lw >= min_lw:
            a = placemap[curredge]
            a.extend(taxa_names)
            placemap[curredge] = a
        else:
            discard_file.write(repr(taxa_names) + "\n")
    discard_file.close()
    groups = placemap.items()
    cnt_leaf = 0
    cnt_inode = 0
    # Check each edge.
    for i, item in enumerate(groups):
        seqset_name = item[0]
        seqset = item[1]
        # Check if placed on a leaf branch and find the node placed on;
        # node.B holds the jplace edge number parsed above.
        flag = False
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                if node.is_leaf():
                    flag = True
                break
        # The furthest leaf from the placement node serves as outgroup.
        fnode = place_node.get_farthest_node()[0]
        outgroup_name = fnode.name
        # Find a sister leaf (descend to the closest leaf if the sister
        # node is internal).
        snode = place_node.get_sisters()[0]
        if not snode.is_leaf():
            snode = snode.get_closest_leaf()[0]
        sister_name = snode.name
        # Generate the alignment for this edge.
        if flag:
            # Leaf-branch placement.
            cnt_leaf = cnt_leaf + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            if len(newalign.get_entries()) < 2:
                og_seq = align_orgin.get_seq(outgroup_name)
                sis_seq = align_orgin.get_seq(sister_name)
                newalign.set_seq("sister", sis_seq) #set the sister seqeunce to make 4 taxa
                newalign.set_seq("root_ref", og_seq) #set the outgroup name
                place_seq = align_orgin.get_seq(place_node.name)
                newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
                newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
            else:
                og_seq = align_orgin.get_seq(outgroup_name)
                newalign.set_seq("root_ref", og_seq) #set the outgroup name
                place_seq = align_orgin.get_seq(place_node.name)
                newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
                newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            # Internal-branch placement: generate the newick string to be
            # inserted into the reference tree as a multifurcation.
            rep = re.compile(r"\{[0-9]*\}")
            multi_fcating = "("
            for seqname in seqset:
                multi_fcating = multi_fcating + seqname + ","
            multi_fcating = multi_fcating[:-1]
            multi_fcating = "{" + repr(seqset_name) + "}," + multi_fcating + ")"
            mtfc_tree = tree.replace("{" + repr(seqset_name) + "}", multi_fcating)
            # Strip all remaining jplace edge labels.
            mtfc_tree = rep.sub("", mtfc_tree)
            cnt_inode = cnt_inode + 1
            newalign = SeqGroup()
            for taxa in seqset:
                seq = align_orgin.get_seq(taxa)
                newalign.set_seq(taxa, seq)
            if len(newalign.get_entries()) < 2:
                # Single read on an internal branch: treat as a new species.
                count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
                sp_log(sfout = logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                # Add the full reference alignment alongside the reads.
                for entr in refali.get_entries():
                    sname = entr[0]
                    seqe = entr[1]
                    # BUG FIX: was set_seq(sname, seq), which copied the last
                    # query read over every reference sequence instead of the
                    # reference sequence seqe unpacked just above.
                    newalign.set_seq(sname, seqe)
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")
                mtfc_out = open(nfout + "_inode_" + repr(cnt_inode) + ".mttree", "w")
                mtfc_out.write(mtfc_tree)
                mtfc_out.close()
class ground_truth:
    """Evaluate predicted species clusters against the ground truth encoded
    in the sequence names: the token before the first '.' in each name is
    the true species id.

    NOTE(review): this class is re-defined later in the file by an identical
    second `class ground_truth` — this first definition is shadowed.
    """
    def __init__(self, refaln, type = ""):
        # 'type' shadows the builtin but is kept for interface compatibility;
        # anything other than "fasta" is read as relaxed phylip.
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        self.true_spe = {}
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        """Group sequence names by their species-id prefix into self.true_spe."""
        # First pass: create an empty group per species id.
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            self.true_spe[gid] = []
        # Second pass: fill each group with its member names.
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            group = self.true_spe[gid]
            group.append(name)
            self.true_spe[gid] = group

    def _get_cluster_label(self):
        """Record the taxa order and numeric true-cluster labels (self.C0)."""
        self.seq_list = []
        self.seq_cid_list = []
        for entr in self.aln.get_entries():
            seq_name = entr[0]
            # Species id must be numeric here (used as the cluster label).
            cid = int(seq_name.split(".")[0])
            self.seq_list.append(seq_name)
            self.seq_cid_list.append(cid)
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        """Assign newid at the positions of seq_list members; an empty
        new_cid_list is first initialised to -1 everywhere."""
        if len(new_cid_list) == 0:
            for i in range(len(self.seq_list)):
                new_cid_list.append(-1)
        for i in range(len(self.seq_list)):
            name = self.seq_list[i]
            if name in seq_list:
                new_cid_list[i] = newid
        return new_cid_list

    #Mutual information
    def mutual_info(self,x,y):
        # eps keeps log2 defined when an intersection is empty.
        N=float(len(x))
        I=0.0
        eps = numpy.finfo(float).eps
        for l1 in numpy.unique(x):
            for l2 in numpy.unique(y):
                #Find the intersections
                l1_ids=nonzero(x==l1)[0]
                l2_ids=nonzero(y==l2)[0]
                pxy=(double(intersect1d(l1_ids,l2_ids).size)/N)+eps
                I+=pxy*log2(pxy/((l1_ids.size/N)*(l2_ids.size/N)))
        return I

    #Normalized mutual information
    def nmi(self,x,y):
        N=x.size
        I=self.mutual_info(x,y)
        # Entropy of x
        Hx=0
        for l1 in unique(x):
            l1_count=nonzero(x==l1)[0].size
            Hx+=-(double(l1_count)/N)*log2(double(l1_count)/N)
        # Entropy of y
        Hy=0
        for l2 in unique(y):
            l2_count=nonzero(y==l2)[0].size
            Hy+=-(double(l2_count)/N)*log2(double(l2_count)/N)
        # Both labelings constant: define NMI as 1.
        if (Hx+Hy) == 0:
            return 1.0
        else:
            return I/((Hx+Hy)/2)

    def get_seq_list(self):
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self,names):
        #*R* — reference sequences are excluded before comparing.
        newnames = []
        for name in names:
            if name.startswith("*R*"):
                pass
            else:
                newnames.append(name)
        names_set = set(newnames)
        # Correct iff the set exactly matches one true species group.
        for key in self.true_spe.keys():
            sps = self.true_spe[key]
            sps_set = set(sps)
            if names_set == sps_set:
                return True
        return False

    def get_num_species(self):
        return len(self.true_spe.keys())
class ground_truth:
    """Ground-truth species clustering derived from sequence names.

    The token before the first '.' in each sequence name is its true
    species id; sequences sharing an id form one true species.  Provides
    exact-match checking of predicted groups and NMI scoring of predicted
    cluster labelings against the true labeling.
    """

    def __init__(self, refaln, type=""):
        # 'type' selects the input format (name kept for compatibility);
        # anything other than "fasta" is read as relaxed phylip.
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        self.true_spe = {}
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        """Group sequence names by their species-id prefix."""
        for entr in self.aln.get_entries():
            self.true_spe[entr[0].split(".")[0]] = []
        for entr in self.aln.get_entries():
            self.true_spe[entr[0].split(".")[0]].append(entr[0])

    def _get_cluster_label(self):
        """Record the taxa order and the numeric true-cluster labels."""
        self.seq_list = [entr[0] for entr in self.aln.get_entries()]
        self.seq_cid_list = [int(nm.split(".")[0]) for nm in self.seq_list]
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        """Taxa names in the order used for cluster-label vectors."""
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        """Assign newid at the positions of seq_list members; an empty
        new_cid_list is first initialised to -1 everywhere (in place)."""
        if not new_cid_list:
            new_cid_list.extend([-1] * len(self.seq_list))
        for pos, name in enumerate(self.seq_list):
            if name in seq_list:
                new_cid_list[pos] = newid
        return new_cid_list

    #Mutual information
    def mutual_info(self, x, y):
        """Mutual information (bits) between two label vectors."""
        total = float(len(x))
        eps = numpy.finfo(float).eps  # keeps log2 defined for empty overlap
        info = 0.0
        for lab_x in numpy.unique(x):
            ids_x = nonzero(x == lab_x)[0]
            for lab_y in numpy.unique(y):
                ids_y = nonzero(y == lab_y)[0]
                pxy = (double(intersect1d(ids_x, ids_y).size) / total) + eps
                info += pxy * log2(pxy / ((ids_x.size / total) * (ids_y.size / total)))
        return info

    #Normalized mutual information
    def nmi(self, x, y):
        """NMI of two labelings; defined as 1.0 when both are constant."""
        total = x.size
        info = self.mutual_info(x, y)

        def _entropy(labels):
            # Shannon entropy (bits) of a label vector.
            h = 0
            for lab in unique(labels):
                cnt = nonzero(labels == lab)[0].size
                h += -(double(cnt) / total) * log2(double(cnt) / total)
            return h

        hx = _entropy(x)
        hy = _entropy(y)
        if (hx + hy) == 0:
            return 1.0
        return info / ((hx + hy) / 2)

    def get_seq_list(self):
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        """NMI of a predicted labeling against the true labeling."""
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self, names):
        """True iff names (ignoring '*R*' reference entries) exactly match
        one true species group."""
        names_set = set(nm for nm in names if not nm.startswith("*R*"))
        for sps in self.true_spe.values():
            if names_set == set(sps):
                return True
        return False

    def get_num_species(self):
        return len(self.true_spe)