Example #1
0
File: msa.py Project: epruesse/ARB
 def get_hmm_refalignment(self):
     """Trim the reference alignment to the columns used by the HMM profile.

     Parses the match states of the HMMER profile ``self.refprofile`` to
     collect the 1-based alignment positions the HMM keeps, then writes
     ``self.refalign`` restricted to those columns into ``self.trimed``.

     Returns:
         tuple: (path of the trimmed alignment file, number of sites kept).
     """
     sites = []
     # Parse the profile: after the "HMM " header (plus 4 skipped lines)
     # each match state spans 3 lines, and token 6 of its first line holds
     # the original alignment column (presumably the MAP annotation --
     # TODO confirm against the profile format in use).
     # 'with' ensures the handle is closed even if parsing raises.
     with open(self.refprofile) as hmp:
         l = hmp.readline()
         start = False
         while l != "":
             if l.startswith("//"):
                 break
             if start:
                 ll = l.strip().split()
                 sites.append(int(ll[5]))
                 # skip the remaining two lines of this match state
                 l = hmp.readline()
                 l = hmp.readline()
             else:
                 if l.startswith("HMM "):
                     start = True
                     # skip the header continuation lines before the
                     # first match state
                     l = hmp.readline()
                     l = hmp.readline()
                     l = hmp.readline()
                     l = hmp.readline()
             l = hmp.readline()
     align = SeqGroup(self.refalign)
     # write every reference sequence restricted to the kept columns
     with open(self.trimed, "w") as fout:
         for entr in align.get_entries():
             fout.write(">" + entr[0] + "\n")
             for pos in sites:
                 fout.write(entr[1][pos - 1])
             fout.write("\n")
     return self.trimed, len(sites)
Example #2
0
def trim_refalign_hmm(refaln, hmmprofile):
	"""Trim a reference alignment to the sites used by an HMM profile.

	Args:
		refaln: path to the reference alignment (readable by SeqGroup).
		hmmprofile: path to the HMMER profile built from that alignment.

	Returns:
		tuple: (path of the trimmed FASTA file, number of sites kept).
	"""
	sites = []
	# Collect the 1-based alignment positions the HMM keeps: after the
	# "HMM " header (plus 4 skipped lines) each match state spans 3 lines
	# and token 6 of its first line carries the original column
	# (presumably the MAP annotation -- TODO confirm).
	# 'with' ensures the handle is closed even if parsing raises.
	with open(hmmprofile) as hmp:
		l = hmp.readline()
		start = False
		while l != "":
			if l.startswith("//"):
				break
			if start:
				ll = l.strip().split()
				sites.append(int(ll[5]))
				# skip the remaining two lines of this match state
				l = hmp.readline()
				l = hmp.readline()
			else:
				if l.startswith("HMM "):
					start = True
					l = hmp.readline()
					l = hmp.readline()
					l = hmp.readline()
					l = hmp.readline()
			l = hmp.readline()
	align = SeqGroup(refaln)
	outname = refaln + ".trimed.afa"
	# write every sequence restricted to the kept columns
	with open(outname, "w") as fout:
		for entr in align.get_entries():
			fout.write(">" + entr[0] + "\n")
			for pos in sites:
				fout.write(entr[1][pos - 1])
			fout.write("\n")
	return outname, len(sites)
Example #3
0
def count_reads(nfolder, pref = "me_leaf_"):
	"""Count the total number of reads recorded in the leaf alignments.

	Scans every alignment in `nfolder` whose filename starts with `pref`.
	Sequence names are expected to end in "*<read count>"; the "root_ref"
	outgroup and "*R*"-prefixed reference sequences are skipped.

	Prints the total and returns it (the return value is new and
	backward-compatible; the original only printed).
	"""
	cnt = 0
	naligns = glob.glob(nfolder + pref + "*")
	for aln in naligns:
		a = SeqGroup(sequences = aln)
		for ent in a.get_entries():
			name = ent[0]
			# skip the outgroup and reference sequences
			if name == "root_ref" or name.startswith("*R*"):
				continue
			cnt += int(name.split("*")[-1])
	# parenthesized form prints identically under Python 2 and 3
	print(cnt)
	return cnt
Example #4
0
def random_remove_taxa(falign, num_remove, num_repeat = 1):
	"""Generate alignments with `num_remove` randomly chosen taxa deleted.

	Args:
		falign: path to the source alignment (read with SeqGroup).
		num_remove: number of sequences to drop in each replicate.
		num_repeat: how many independent replicates to produce.

	Returns:
		list: paths of the written ``.afa`` replicate files.
	"""
	align = SeqGroup(sequences = falign)
	entrs = align.get_entries()
	numseq = len(entrs)
	# list() so random.shuffle works under Python 3, where range() is
	# lazy; under Python 2 range() already returned a list, so behavior
	# is unchanged
	index = list(range(numseq))
	namel = []

	for i in range(num_repeat):
		newalign = SeqGroup()
		random.shuffle(index)
		# keep everything after the first num_remove shuffled entries
		idxs = index[num_remove:]
		for idx in idxs:
			newalign.set_seq(entrs[idx][0], entrs[idx][1])
		outname = falign + "_" + repr(num_remove) + "_" + repr(i + 1) + ".afa"
		newalign.write(outfile = outname)
		namel.append(outname)
	return namel
Example #5
0
def raxml_g_after_epa(nfolder, nref_align, suf = "ifa", T = "2"):
	"""For every *.suf alignment in nfolder, build a constrained RAxML
	tree and extract the subtree spanning the non-reference (query) taxa.

	Args:
		nfolder: folder with per-edge alignments and matching .mttree files.
		nref_align: reference alignment; its sequence names define the
			reference taxa excluded from the extracted subtree.
		suf: filename suffix of the alignments to process.
		T: number of threads for the constrained search (passed as string).

	Side effects: writes one <prefix>.subtree file per alignment and
	removes the intermediate RAxML result tree and .mttree file.
	"""
	align_orgin = SeqGroup(sequences = nref_align)
	ref_taxa = []
	for entr in align_orgin.get_entries():
		ref_taxa.append(entr[0])
	
	naligns = glob.glob(nfolder + "*." + suf)
	cnt = 0
	for aln in naligns:
		print(repr(cnt))
		cnt = cnt + 1
		# skip alignments whose subtree was already computed (resumable)
		if os.path.exists(aln.split(".")[0] + ".subtree"):
			pass
		else:
			mttree = aln.split(".")[0] + ".mttree"
			#raxml constrait search
			trename = build_constrain_tree(nsfin = aln, ntfin = mttree, nfout = "i"+repr(cnt), nfolder = nfolder, num_thread = T)
			#read in the fully resolved tree
			full_tree = Tree(trename, format=1)
			all_taxa = full_tree.get_leaf_names()
			# target taxa = leaves that are not in the reference set
			target_taxa = []
			for taxa in all_taxa:
				if taxa in ref_taxa:
					pass
				else:
					target_taxa.append(taxa)
			#the place where the tree can be safely rooted
			ref_node = full_tree.get_leaves_by_name(ref_taxa[0])[0]
			#reroot 
			full_tree.set_outgroup(ref_node)
			#find the common ancestor of the target taxa
			leafA = full_tree.get_leaves_by_name(target_taxa[0])[0]
			leaflist = []
			for n in target_taxa[1:]:
				leaflist.append(full_tree.get_leaves_by_name(n)[0])
			common = leafA.get_common_ancestor(leaflist)
			# detach the ancestor so it serializes as a standalone subtree
			common.up = None
			common.write(outfile= aln.split(".")[0] + ".subtree", format=5)
			# clean up intermediates
			os.remove(trename)
			os.remove(mttree)
Example #6
0
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
	"""Split EPA/jplace placement results into per-edge alignment files.

	Reads a jplace-style JSON file (nfin_place) and the query alignment
	(nfin_aln); keeps only placements whose best-edge likelihood weight is
	>= min_lw, groups the placed reads by edge, and writes one alignment
	per edge: <nfout>_leaf_<n>.lfa for leaf-branch placements (the
	reference sequence is added as "*R*<name>") and <nfout>_inode_<n>.ifa
	for internal-branch placements.  Single-read internal placements are
	appended to <nfout>_inode_picked_otus.fasta and logged to `logfile`.
	Low-weight placements are written to <nfout>.discard.placement.txt.
	"""
	# start from a clean log / picked-otus file
	if os.path.exists(logfile):
		os.remove(logfile)
	
	if os.path.exists(nfout + "_inode_picked_otus.fasta"):
		os.remove(nfout + "_inode_picked_otus.fasta")
	
	jsondata = open (nfin_place)
	align_orgin = SeqGroup(sequences = nfin_aln)
	data = json.load(jsondata)
	placements = data["placements"]
	tree = data["tree"]
	
	# turn jplace edge labels {N} into NHX comments so ete parses them
	# as a node attribute named B
	ete_tree = tree.replace("{", "[&&NHX:B=")
	ete_tree = ete_tree.replace("}", "]")
	root = Tree(ete_tree, format=1)
	leaves = root.get_leaves()
	allnodes = root.get_descendants()
	allnodes.append(root)
	
	"""get refseq"""
	refseqset = []
	for leaf in leaves:
		refseqset.append(leaf.name)
	refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
	
	placemap = {}
	"""find how many edges are used for placement"""
	# only each read's best placement (edges[0]) is considered;
	# edges[0][0] is the edge number, edges[0][2] its likelihood weight
	for placement in placements:
		edges = placement["p"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			placemap[curredge] = placemap.get(curredge, [])
	
	"""placement quality control"""
	discard_file = open(nfout+".discard.placement.txt", "w")
	"""group taxa to edges"""
	for placement in placements:
		edges = placement["p"]
		taxa_names = placement["n"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			a = placemap[curredge] 
			a.extend(taxa_names)
			placemap[curredge]  = a
		else:
			discard_file.write(repr(taxa_names) + "\n")
	discard_file.close()
	
	groups = placemap.items()
	cnt_leaf = 0
	cnt_inode = 0
	
	"""check each edge""" 
	for i,item in enumerate(groups):
		seqset_name = item[0]
		seqset = item[1]
		
		"""check if placed on leaf node and find the node being placed on"""
		flag = False
		place_node = None
		for node in allnodes:
			# B holds the jplace edge number injected above
			if str(node.B) == str(seqset_name):
				place_node = node
				if node.is_leaf():
					flag = True 
				break
		
		"""generate aligment"""
		if flag:
			"""process leaf node placement"""
			cnt_leaf = cnt_leaf + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			place_seq = align_orgin.get_seq(place_node.name)
			newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
			newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
		else:
			cnt_inode = cnt_inode + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			
			# a single read on an internal edge is treated as a new
			# species: counted/logged only, no alignment file written
			if len(newalign.get_entries()) < 2:
				count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
				sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
			else:
				#for entr in refali.get_entries():
				#	sname = entr[0]
				#	seqe = entr[1]
				#	newalign.set_seq(sname, seq)
				newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
Example #7
0
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
	"""Split EPA/jplace placement results into per-edge alignments/trees.

	Reads a jplace-style JSON file (nfin_place) and the query alignment
	(nfin_aln); keeps only placements whose best-edge likelihood weight is
	>= min_lw and groups the placed reads by edge.  Leaf-branch groups are
	written to <nfout>_leaf_<n>.lfa (with outgroup "root_ref", optional
	"sister", and the "*R*<name>" reference sequence added).  Multi-read
	internal-branch groups are written to <nfout>_inode_<n>.ifa together
	with all reference sequences, plus a multifurcating constraint tree
	<nfout>_inode_<n>.mttree; single-read internal groups are only counted
	and logged.  Low-weight placements go to <nfout>.discard.placement.txt.
	"""
	# start from a clean log / picked-otus file
	if os.path.exists(logfile):
		os.remove(logfile)
	
	if os.path.exists(nfout + "_inode_picked_otus.fasta"):
		os.remove(nfout + "_inode_picked_otus.fasta")
	
	jsondata = open (nfin_place)
	align_orgin = SeqGroup(sequences = nfin_aln)
	data = json.load(jsondata)
	placements = data["placements"]
	tree = data["tree"]
	
	# turn jplace edge labels {N} into NHX comments so ete parses them
	# as a node attribute named B
	ete_tree = tree.replace("{", "[&&NHX:B=")
	ete_tree = ete_tree.replace("}", "]")
	root = Tree(ete_tree, format=1)
	leaves = root.get_leaves()
	allnodes = root.get_descendants()
	allnodes.append(root)
	
	"""get refseq"""
	refseqset = []
	for leaf in leaves:
		refseqset.append(leaf.name)
	refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
	
	placemap = {}
	"""find how many edges are used for placement"""
	# only each read's best placement (edges[0]) is considered;
	# edges[0][0] is the edge number, edges[0][2] its likelihood weight
	for placement in placements:
		edges = placement["p"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			placemap[curredge] = placemap.get(curredge, [])
	
	"""placement quality control"""
	discard_file = open(nfout+".discard.placement.txt", "w")
	"""group taxa to edges"""
	for placement in placements:
		edges = placement["p"]
		taxa_names = placement["n"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			a = placemap[curredge] 
			a.extend(taxa_names)
			placemap[curredge]  = a
		else:
			discard_file.write(repr(taxa_names) + "\n")
	discard_file.close()
	
	groups = placemap.items()
	cnt_leaf = 0
	cnt_inode = 0
	
	"""check each edge""" 
	for i,item in enumerate(groups):
		seqset_name = item[0]
		seqset = item[1]
		
		"""check if placed on leaf node and find the node being placed on"""
		flag = False
		place_node = None
		for node in allnodes:
			# B holds the jplace edge number injected above
			if str(node.B) == str(seqset_name):
				place_node = node
				if node.is_leaf():
					flag = True 
				break
				
		"""find the furthest leaf of the placement node"""
		fnode = place_node.get_farthest_node()[0]
		outgroup_name = fnode.name
		
		"""find sister node"""
		snode = place_node.get_sisters()[0]
		if not snode.is_leaf():
			snode = snode.get_closest_leaf()[0]
		sister_name = snode.name
		
		"""generate aligment"""
		if flag:
			"""process leaf node placement"""
			cnt_leaf = cnt_leaf + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			if len(newalign.get_entries()) < 2:
				# single read: also add the sister so the alignment has
				# the 4 taxa needed downstream
				#count_and_pick_reads(align = newalign, outputfile = nfout + "_leaf_picked_otus.fasta")
				og_seq = align_orgin.get_seq(outgroup_name)
				sis_seq = align_orgin.get_seq(sister_name)
				newalign.set_seq("sister", sis_seq) #set the sister seqeunce to make 4 taxa
				newalign.set_seq("root_ref", og_seq) #set the outgroup name
				place_seq = align_orgin.get_seq(place_node.name)
				newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
				newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
			else:
				og_seq = align_orgin.get_seq(outgroup_name)
				newalign.set_seq("root_ref", og_seq) #set the outgroup name
				place_seq = align_orgin.get_seq(place_node.name)
				newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
				newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
		else:
			"""genrate the newwick string to be inserted into the ref tree"""
			# replace this edge's {N} label with a multifurcation holding
			# all reads placed on it, then strip the remaining {N} labels
			rep = re.compile(r"\{[0-9]*\}")
			multi_fcating = "("
			for seqname in seqset:
				multi_fcating = multi_fcating + seqname + ","
			multi_fcating = multi_fcating[:-1] 
			multi_fcating = "{" + repr(seqset_name) + "}," + multi_fcating + ")"
			mtfc_tree = tree.replace("{" + repr(seqset_name) + "}", multi_fcating)
			mtfc_tree = rep.sub("", mtfc_tree)
			
			cnt_inode = cnt_inode + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			if len(newalign.get_entries()) < 2:
				count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
				sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
			else:
				#og_seq = align_orgin.get_seq(outgroup_name)
				#newalign.set_seq("root_ref", og_seq)
				for entr in refali.get_entries():
					sname = entr[0]
					seqe = entr[1]
					# BUG FIX: use this entry's own sequence (seqe); the
					# original passed the stale loop variable `seq`, so
					# every reference taxon got the same query sequence
					newalign.set_seq(sname, seqe)
				newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
				mtfc_out = open(nfout + "_inode_"+repr(cnt_inode) +  ".mttree", "w")
				mtfc_out.write(mtfc_tree)
				mtfc_out.close()
Example #8
0
class ground_truth:
	"""Ground-truth species partition read from a reference alignment.

	Sequence names are expected to look like "<speciesid>.<rest>"; the
	integer prefix before the first "." is the true cluster id.
	"""

	def __init__(self, refaln, type = ""):
		# type: "fasta" for FASTA input; anything else is parsed as
		# relaxed PHYLIP
		if type == "fasta":
			self.aln = SeqGroup(sequences=refaln)
		else:
			self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
		# maps species id (string) -> list of member sequence names
		self.true_spe = {}
		self._get_truth()
		self._get_cluster_label()

	def _get_truth(self):
		"""Fill self.true_spe by grouping sequence names on their id prefix."""
		for entr in self.aln.get_entries():
			name = entr[0]
			gid = name.split(".")[0]
			self.true_spe[gid] = []
		
		for entr in self.aln.get_entries():
			name = entr[0]
			gid = name.split(".")[0]
			group = self.true_spe[gid]
			group.append(name)
			self.true_spe[gid] = group
	
	def _get_cluster_label(self):
		"""Build the reference label vector C0 (one int label per sequence)."""
		self.seq_list = []
		self.seq_cid_list = [] 
		for entr in self.aln.get_entries():
			seq_name = entr[0]
			cid = int(seq_name.split(".")[0])
			self.seq_list.append(seq_name)
			self.seq_cid_list.append(cid)
		self.C0 = array(self.seq_cid_list) 

	def get_taxa_order(self):
		"""Return sequence names in alignment order."""
		return self.seq_list

	def set_new_cluster_label(self, new_cid_list, seq_list, newid):
		"""Assign label `newid` to every taxon present in `seq_list`.

		An empty `new_cid_list` is first initialised to -1 for all taxa;
		the (mutated) list is also returned.
		"""
		if len(new_cid_list) == 0:
			for i in range(len(self.seq_list)):
				new_cid_list.append(-1)
		
		for i in range(len(self.seq_list)):
			name = self.seq_list[i]
			if name in seq_list:
				new_cid_list[i] = newid
		return new_cid_list
	
	#Mutual information
	def mutual_info(self,x,y):
		"""Mutual information (in bits) between label vectors x and y."""
		N=float(len(x))
		I=0.0
		eps = numpy.finfo(float).eps
		for l1 in numpy.unique(x):
			for l2 in numpy.unique(y):
				#Find the intersections
				l1_ids=nonzero(x==l1)[0]
				l2_ids=nonzero(y==l2)[0]
				# eps keeps log2 finite when the joint probability is 0
				pxy=(double(intersect1d(l1_ids,l2_ids).size)/N)+eps
				I+=pxy*log2(pxy/((l1_ids.size/N)*(l2_ids.size/N)))
		return I

	#Normalized mutual information
	def nmi(self,x,y):
		"""NMI = I(x,y) / ((H(x)+H(y))/2); defined as 1.0 when both entropies are 0."""
		N=x.size
		I=self.mutual_info(x,y)
		Hx=0
		for l1 in unique(x):
			l1_count=nonzero(x==l1)[0].size
			Hx+=-(double(l1_count)/N)*log2(double(l1_count)/N)
		Hy=0
		for l2 in unique(y):
			l2_count=nonzero(y==l2)[0].size
			Hy+=-(double(l2_count)/N)*log2(double(l2_count)/N)
		if (Hx+Hy) == 0:
			return 1.0
		else: 
			return I/((Hx+Hy)/2)
	
	def get_seq_list(self):
		"""Return sequence names in alignment order."""
		return self.seq_list
	
	def get_nmi(self, new_cluster_labels):
		"""NMI of a candidate labelling against the ground truth C0."""
		return self.nmi(self.C0, new_cluster_labels)
	
	def is_correct(self,names):
		"""True if `names` (ignoring "*R*" reference seqs) equals one true species."""
		#*R*
		newnames = []
		for name in names:
			if name.startswith("*R*"):
				pass
			else:
				newnames.append(name)
			
		names_set = set(newnames)
		for key in self.true_spe.keys():
			sps = self.true_spe[key]
			sps_set = set(sps)
			if names_set == sps_set:
				return True
		return False
	
	def get_num_species(self):
		"""Number of ground-truth species."""
		return len(self.true_spe.keys())
Example #9
0
class ground_truth:
    """Ground-truth species partition derived from a reference alignment.

    Sequence names are expected to look like "<speciesid>.<rest>": the
    integer prefix before the first "." is the true cluster id.
    """

    def __init__(self, refaln, type=""):
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        # species id (string) -> list of member sequence names
        self.true_spe = {}
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        """Group sequence names by the species id before the first dot."""
        for entry in self.aln.get_entries():
            name = entry[0]
            self.true_spe.setdefault(name.split(".")[0], []).append(name)

    def _get_cluster_label(self):
        """Build the reference label vector C0, one integer per sequence."""
        self.seq_list = [entry[0] for entry in self.aln.get_entries()]
        self.seq_cid_list = [int(name.split(".")[0]) for name in self.seq_list]
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        """Sequence names in alignment order."""
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        """Assign `newid` to every taxon in `seq_list`; -1-fill an empty list first."""
        if len(new_cid_list) == 0:
            new_cid_list.extend([-1] * len(self.seq_list))
        for pos, name in enumerate(self.seq_list):
            if name in seq_list:
                new_cid_list[pos] = newid
        return new_cid_list

    #Mutual information
    def mutual_info(self, x, y):
        """Mutual information (in bits) between label vectors x and y."""
        n = float(len(x))
        eps = numpy.finfo(float).eps
        total = 0.0
        for lx in numpy.unique(x):
            ids_x = nonzero(x == lx)[0]
            for ly in numpy.unique(y):
                ids_y = nonzero(y == ly)[0]
                # eps keeps log2 finite when the joint probability is 0
                pxy = (double(intersect1d(ids_x, ids_y).size) / n) + eps
                total += pxy * log2(pxy / ((ids_x.size / n) * (ids_y.size / n)))
        return total

    #Normalized mutual information
    def nmi(self, x, y):
        """NMI = I(x,y) / ((H(x)+H(y))/2); 1.0 when both entropies are zero."""
        n = x.size
        mi = self.mutual_info(x, y)
        hx = 0
        for label in unique(x):
            count = nonzero(x == label)[0].size
            hx += -(double(count) / n) * log2(double(count) / n)
        hy = 0
        for label in unique(y):
            count = nonzero(y == label)[0].size
            hy += -(double(count) / n) * log2(double(count) / n)
        if (hx + hy) == 0:
            return 1.0
        return mi / ((hx + hy) / 2)

    def get_seq_list(self):
        """Sequence names in alignment order."""
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        """NMI of a candidate labelling against the ground truth C0."""
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self, names):
        """True if `names` (ignoring "*R*" reference seqs) equals one true species."""
        names_set = set(name for name in names if not name.startswith("*R*"))
        for members in self.true_spe.values():
            if names_set == set(members):
                return True
        return False

    def get_num_species(self):
        """Number of ground-truth species."""
        return len(self.true_spe)