Example #1
File: msa.py Project: epruesse/ARB
 def get_hmm_refalignment(self):
     sites = []
     hmp = open(self.refprofile)
     l = hmp.readline()
     start = False
     while l != "":
         if l.startswith("//"):
             break
         if start:
             l = l.strip()
             ll = l.split()
             usedsite = int(ll[5])
             sites.append(usedsite)
             l = hmp.readline()
             l = hmp.readline()
         else:
             if l.startswith("HMM "):
                 start = True
                 l = hmp.readline()
                 l = hmp.readline()
                 l = hmp.readline()
                 l = hmp.readline()
         l = hmp.readline()
     hmp.close()
     align = SeqGroup(self.refalign)
     fout = open(self.trimed, "w")
     for entr in align.get_entries():
         fout.write(">" + entr[0] + "\n")
         for pos in sites:
             fout.write(entr[1][pos - 1])
         fout.write("\n")
     fout.close()
     return self.trimed, len(sites)
Example #2
def trim_refalign_hmm(refaln, hmmprofile):
	sites = []
	hmp = open(hmmprofile)
	l = hmp.readline()
	start = False
	while l!="":
		if l.startswith("//"):
			break
		if start:
			l = l.strip()
			ll = l.split()
			usedsite = int(ll[5])
			sites.append(usedsite)
			l = hmp.readline()
			l = hmp.readline()
		else:
			if l.startswith("HMM "):
				start = True
				l = hmp.readline()
				l = hmp.readline()
				l = hmp.readline()
				l = hmp.readline()
		l = hmp.readline()
	hmp.close()
	align = SeqGroup(refaln)
	fout = open(refaln+".trimed.afa", "w")
	for entr in align.get_entries():
		fout.write(">" + entr[0] + "\n")
		for pos in sites:
			fout.write(entr[1][pos-1])
		fout.write("\n")
	fout.close()
	return refaln+".trimed.afa", len(sites)
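A minimal usage sketch for trim_refalign_hmm (the file names are hypothetical, and the module is assumed to import SeqGroup from ete2 as in Example #26):

# Hypothetical call: write a column-trimmed copy of the reference alignment and
# report how many sites were kept.
trimmed_path, num_sites = trim_refalign_hmm("ref.afa", "ref.hmm")
print("kept %d columns, wrote %s" % (num_sites, trimmed_path))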
Example #3
def gentesting(ftaxa, fseq, fout, fold = 10):
    ftax = open(ftaxa)
    lines = ftax.readlines()
    ftax.close()
    
    #seqs = SeqGroup(fseq, format='phylip_relaxed')
    seqs = SeqGroup(fseq)
    
    idx = list(range(len(lines)))  # list() so random.shuffle() also works on Python 3
    random.seed(12345)
    random.shuffle(idx)
    
    numtaxa = len(lines)
    onefold = int(math.ceil(float(numtaxa) / fold))
    
    idx_list = []
    for i in range(fold):
        start = i * onefold
        end = (i + 1) * onefold
        if end > numtaxa:
            end = numtaxa
        if i == fold -1 :
            end = numtaxa
        idx_list.append(idx[start:end])
    
    for i in range(len(idx_list)):
        idxi = idx_list[i]
        f1 = open(fout + repr(i+1) + "testing.tax", "w")
        f2 = open(fout + repr(i+1) + "testing.fa", "w")
        for index in idxi:
             tax = lines[index]
             seqid = tax.split()[0]
             seq = seqs.get_seq(seqid)
             seqnogap = seq.replace("-","")
             f1.write(tax)
             f2.write(">" + seqid + "\n")
             f2.write(seqnogap + "\n")
        f1.close()
        f2.close()
        
        f1 = open(fout + repr(i+1) + "training.tax", "w")
        f2 = open(fout + repr(i+1) + "training.afa", "w")
        f3 = open(fout + repr(i+1) + "training.fa", "w")
        
        for j in range(len(idx_list)):
            if not i==j:
                idxj = idx_list[j]
                for index in idxj:
                    tax = lines[index]
                    seqid = tax.split()[0]
                    seq = seqs.get_seq(seqid)
                    seqnogap = seq.replace("-","")
                    f1.write(tax)
                    f2.write(">" + seqid + "\n")
                    f2.write(seq + "\n")
                    f3.write(">" + seqid + "\n")
                    f3.write(seqnogap + "\n")
        f1.close()
        f2.close()
        f3.close()
Example #4
def gen_alignment3(seq_names = [], alignment = SeqGroup()):
	"""generate alignment from the input taxa name list - seq_name, and SeqGroup - alignment"""
	newalign = SeqGroup()
	for taxa in seq_names:
		seq = alignment.get_seq(taxa)
		newalign.set_seq(taxa, seq)
	#newalign.write(outfile = outputfile)
	return newalign
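A short, hedged sketch of calling the helper above; the alignment file and taxa names are made up, and only SeqGroup calls already shown elsewhere in these examples are used:

from ete2 import SeqGroup

full_aln = SeqGroup("ref.afa")  # hypothetical FASTA alignment
sub = gen_alignment3(seq_names=["taxonA", "taxonB"], alignment=full_aln)
sub.write(outfile="subset.afa", format="fasta")  # write the two-sequence sub-alignment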
Example #5
 def __init__(self, refaln, type=""):
     if type == "fasta":
         self.aln = SeqGroup(sequences=refaln)
     else:
         self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
     self.true_spe = {}
     self._get_truth()
     self._get_cluster_label()
Example #6
	def __init__(self, refaln, type = ""):
		if type == "fasta":
			self.aln = SeqGroup(sequences=refaln)
		else:
			self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
		self.true_spe = {}
		self._get_truth()
		self._get_cluster_label()
Example #7
 def __write_algn(self, fullpath):
     """
     to write algn in paml format
     """
     seq_group = SeqGroup()
     for n in self:
         seq_group.id2seq  [n.node_id] = n.nt_sequence
         seq_group.id2name [n.node_id] = n.name
         seq_group.name2id [n.name   ] = n.node_id
     seq_group.write(outfile=fullpath, format='paml')
Example #8
 def __write_algn(self, fullpath):
     """
     to write algn in paml format
     """
     seq_group = SeqGroup()
     for n in self:
         seq_group.id2seq[n.node_id] = n.nt_sequence
         seq_group.id2name[n.node_id] = n.name
         seq_group.name2id[n.name] = n.node_id
     seq_group.write(outfile=fullpath, format='paml')
Example #9
def gen_alignment2(seq_names = [], alignment = SeqGroup()):
	"""generate alignment from the input taxa name list - seq_name, and SeqGroup - alignment"""
	newalign = SeqGroup()
	for taxa in seq_names:
		if taxa.startswith("*R*"):
			seq = alignment.get_seq(taxa[3:])
		elif taxa == "sister":
			continue
		else:
			seq = alignment.get_seq(taxa)
		newalign.set_seq(taxa, seq)
	#newalign.write(outfile = outputfile)
	return newalign
Example #10
def pick_otu(spe_out, alignment):
    fin = open(spe_out)
    lines = fin.readlines()
    fin.close()
    fout = open(alignment + ".otu", "w")
    aln = SeqGroup(sequences=alignment)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("Species"):
            nline = lines[i+1].strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
    fout.close()
Example #11
def pick_otu(spe_out, alignment):
    fin = open(spe_out)
    lines = fin.readlines()
    fin.close()
    fout = open(alignment + ".otu", "w")
    aln = SeqGroup(sequences=alignment)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("Species"):
            nline = lines[i + 1].strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
    fout.close()
Example #12
def count_reads(nfolder, pref = "me_leaf_"):
	cnt = 0
	naligns = glob.glob(nfolder + pref + "*")
	for aln in naligns:
		a = SeqGroup(sequences = aln)
		for ent in a.get_entries():
			name = ent[0]
			if name == "root_ref":
				pass
			elif name.startswith("*R*"):
				pass
			else:
				numread = int(name.split("*")[-1])
				cnt = cnt + numread
	print cnt
Example #13
def curator(refseq, reftax, method, output, testingtax=""):
    seqs = SeqGroup(refseq)
    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ele = [ll[0], ll[1].split(";")]
            ranks.append(ele)
    testings = []
    if testingtax != "":
        with open(testingtax) as fo:
            for line in fo:
                ll = line.split()
                ele = [ll[0], ll[1].split(";")]
                testings.append(ele)
    else:
        testings = ranks

    for test in testings:
        #refseq, reftax, name, old_tax, method, foutput
        ru, result_string = findmis(refseq=seqs,
                                    reftax=ranks,
                                    name=test[0],
                                    method=method,
                                    foutput=output)
        print(result_string)
Example #14
def merge_alignment(aln1, aln2, fout, numsites):
    seqs1 = SeqGroup(aln1)
    seqs2 = SeqGroup(aln2)
    if len(seqs1) == 0 or len(seqs2) == 0:
        print("No sequences aligned! ")
        sys.exit()
    with open(fout, "w") as fo:
        for seq in seqs1.iter_entries():
            if len(seq[1].strip()) == numsites:
                fo.write(">" + seq[0] + "\n" + seq[1] + "\n")
            else:
                print("Error in alignment ....")
                sys.exit()
        for seq in seqs2.iter_entries():
            if len(seq[1].strip()) == numsites:
                fo.write(">" + seq[0] + "\n" + seq[1] + "\n")
            else:
                print("Error in alignment ....")
                sys.exit()
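As a sketch, merge_alignment might be driven like this (the paths and the expected column count are hypothetical; both inputs must already contain sequences of exactly numsites aligned columns):

# Hypothetical call: concatenate two alignments that share the same column count.
merge_alignment("query.afa", "reference.afa", "merged.afa", numsites=1287)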
Example #15
 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
     missing_leaves = []
     missing_internal = []
     if type(alignment) == SeqGroup:
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format, **kwargs)
     # set the sequence feature of every node that has a match in the alignment
     for n in self.traverse():
         try:
             n.add_feature("sequence", alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)
             else:
                 missing_internal.append(n.name)
     if len(missing_leaves) > 0:
         print >>sys.stderr, \
             "Warnning: [%d] terminal nodes could not be found in the alignment." %\
             len(missing_leaves)
 def link_to_alignment(self, alignment, alg_format="fasta"):
     missing_leaves = []
     missing_internal = []
     if type(alignment) == SeqGroup:
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format)
     # set the sequence feature of every node that has a match in the alignment
     for n in self.traverse():
         try:
             n.add_feature("sequence",alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)
             else:
                 missing_internal.append(n.name)
     if len(missing_leaves)>0:
         print >>sys.stderr, \
             "Warnning: [%d] terminal nodes could not be found in the alignment." %\
             len(missing_leaves)
Example #17
def chimera_removal(nuseach, nalign, nout, chimeraout):
	align = SeqGroup(nalign)
	newalign = open(nout, "w")
	chalign = open(chimeraout, "w")
	fus = open(nuseach)
	lines = fus.readlines()
	fus.close()
	for line in lines:
		its = line.split()
		c = its[-1]
		sname = its[1]
		if c == "Y" or c =="?":
			seq = align.get_seq(sname)
			chalign.write(">" + sname + "\n")
			chalign.write(seq + "\n")
		else:
			seq = align.get_seq(sname)
			newalign.write(">" + sname + "\n")
			newalign.write(seq + "\n")
	newalign.close()
	chalign.close()
Example #18
def raxml_g_after_epa(nfolder, nref_align, suf = "ifa", T = "2"):
	align_orgin = SeqGroup(sequences = nref_align)
	ref_taxa = []
	for entr in align_orgin.get_entries():
		ref_taxa.append(entr[0])
	
	naligns = glob.glob(nfolder + "*." + suf)
	cnt = 0
	for aln in naligns:
		print(repr(cnt))
		cnt = cnt + 1
		if os.path.exists(aln.split(".")[0] + ".subtree"):
			pass
		else:
			mttree = aln.split(".")[0] + ".mttree"
			#raxml constraint search
			trename = build_constrain_tree(nsfin = aln, ntfin = mttree, nfout = "i"+repr(cnt), nfolder = nfolder, num_thread = T)
			#read in the fully resolved tree
			full_tree = Tree(trename, format=1)
			all_taxa = full_tree.get_leaf_names()
			target_taxa = []
			for taxa in all_taxa:
				if taxa in ref_taxa:
					pass
				else:
					target_taxa.append(taxa)
			#the place where the tree can be safely rooted
			ref_node = full_tree.get_leaves_by_name(ref_taxa[0])[0]
			#reroot 
			full_tree.set_outgroup(ref_node)
			#find the common ancestor of the target taxa
			leafA = full_tree.get_leaves_by_name(target_taxa[0])[0]
			leaflist = []
			for n in target_taxa[1:]:
				leaflist.append(full_tree.get_leaves_by_name(n)[0])
			common = leafA.get_common_ancestor(leaflist)
			common.up = None
			common.write(outfile= aln.split(".")[0] + ".subtree", format=5)
			os.remove(trename)
			os.remove(mttree)
Example #19
def random_remove_taxa(falign, num_remove, num_repeat = 1):
	align = SeqGroup(sequences = falign)
	entrs = align.get_entries()
	numseq = len(entrs)
	index = list(range(numseq))  # list() so random.shuffle() also works on Python 3
	namel = []
	
	for i in range(num_repeat):
		newalign = SeqGroup()
		random.shuffle(index)
		idxs = index[num_remove:]
		for idx in idxs:
			newalign.set_seq(entrs[idx][0], entrs[idx][1])
		newalign.write(outfile = falign + "_" + repr(num_remove)+ "_" + repr(i + 1) + ".afa")
		namel.append(falign + "_" + repr(num_remove)+ "_" + repr(i + 1) + ".afa")
	return namel
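A hedged example call for random_remove_taxa (the alignment path and the counts are hypothetical):

# Writes three alignments, each missing 10 randomly chosen sequences,
# and returns the list of file names it created.
written = random_remove_taxa("ref.afa", num_remove=10, num_repeat=3)
print(written)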
Example #20
def ngssize(fin, start = 0, end = 2428):
    fout1 = open(fin+".trim.afa", "w")
    fout2 = open(fin+".trim.fa", "w")
    seqs = SeqGroup(fin)
    for seq in seqs:
        name = seq[0]
        sequence = seq[1]
        cut = len(sequence) // 2  # integer division keeps the slice index an int on Python 3
        sequence_trim = sequence[0:cut]
        sequence_trim_nogap = sequence_trim.replace("-","")
        fout1.write(">" + name + "\n")
        fout2.write(">" + name + "\n")
        fout1.write(sequence_trim + "\n")
        fout2.write(sequence_trim_nogap + "\n")
    
    fout1.close()
    fout2.close()
    return fin+".trim.fa"
Example #21
File: msa.py Project: epruesse/ARB
def merge_alignment(aln1, aln2, fout, numsites):
    seqs1 = SeqGroup(aln1)
    seqs2 = SeqGroup(aln2)
    if len(seqs1) == 0 or len(seqs2) == 0:
        print("No sequences aligned! ")
        sys.exit()
    with open(fout, "w") as fo:
        for seq in seqs1.iter_entries():
            if len(seq[1].strip()) == numsites:
                fo.write(">" + seq[0] + "\n" + seq[1] + "\n")
            else:
                print("Error in alignment ....")
                sys.exit()
        for seq in seqs2.iter_entries():
            if len(seq[1].strip()) == numsites:
                fo.write(">" + seq[0] + "\n" + seq[1] + "\n")
            else:
                print("Error in alignment ....")
                sys.exit()
Example #22
class ground_truth:
	def __init__(self, refaln, type = ""):
		if type == "fasta":
			self.aln = SeqGroup(sequences=refaln)
		else:
			self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
		self.true_spe = {}
		self._get_truth()
		self._get_cluster_label()
		
	
	def _get_truth(self):
		for entr in self.aln.get_entries():
			name = entr[0]
			gid = name.split(".")[0]
			self.true_spe[gid] = []
		
		for entr in self.aln.get_entries():
			name = entr[0]
			gid = name.split(".")[0]
			group = self.true_spe[gid]
			group.append(name)
			self.true_spe[gid] = group
	
	def _get_cluster_label(self):
		self.seq_list = []
		self.seq_cid_list = [] 
		for entr in self.aln.get_entries():
			seq_name = entr[0]
			cid = int(seq_name.split(".")[0])
			self.seq_list.append(seq_name)
			self.seq_cid_list.append(cid)
		self.C0 = array(self.seq_cid_list) 
		
	
	def get_taxa_order(self):
		return self.seq_list
	
		
	def set_new_cluster_label(self, new_cid_list, seq_list, newid):
		if len(new_cid_list) == 0:
			for i in range(len(self.seq_list)):
				new_cid_list.append(-1)
		
		for i in range(len(self.seq_list)):
			name = self.seq_list[i]
			if name in seq_list:
				new_cid_list[i] = newid
		return new_cid_list
	
	#Mutual information
	def mutual_info(self,x,y):
		N=float(len(x))
		I=0.0
		eps = numpy.finfo(float).eps
		for l1 in numpy.unique(x):
			for l2 in numpy.unique(y):
				#Find the intersections
				l1_ids=nonzero(x==l1)[0]
				l2_ids=nonzero(y==l2)[0]
				pxy=(double(intersect1d(l1_ids,l2_ids).size)/N)+eps
				I+=pxy*log2(pxy/((l1_ids.size/N)*(l2_ids.size/N)))
		return I

	#Normalized mutual information
	def nmi(self,x,y):
		N=x.size
		I=self.mutual_info(x,y)
		Hx=0
		for l1 in unique(x):
			l1_count=nonzero(x==l1)[0].size
			Hx+=-(double(l1_count)/N)*log2(double(l1_count)/N)
		Hy=0
		for l2 in unique(y):
			l2_count=nonzero(y==l2)[0].size
			Hy+=-(double(l2_count)/N)*log2(double(l2_count)/N)
		if (Hx+Hy) == 0:
			return 1.0
		else: 
			return I/((Hx+Hy)/2)
	
	def get_seq_list(self):
		return self.seq_list
	
	def get_nmi(self, new_cluster_labels):
		return self.nmi(self.C0, new_cluster_labels)
	
	def is_correct(self,names):
		#*R*
		newnames = []
		for name in names:
			if name.startswith("*R*"):
				pass
			else:
				newnames.append(name)
			
		names_set = set(newnames)
		for key in self.true_spe.keys():
			sps = self.true_spe[key]
			sps_set = set(sps)
			if names_set == sps_set:
				return True
		return False
	
	def get_num_species(self):
		return len(self.true_spe.keys())
Example #23
class ground_truth:
    def __init__(self, refaln, type=""):
        if type == "fasta":
            self.aln = SeqGroup(sequences=refaln)
        else:
            self.aln = SeqGroup(sequences=refaln, format='phylip_relaxed')
        self.true_spe = {}
        self._get_truth()
        self._get_cluster_label()

    def _get_truth(self):
        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            self.true_spe[gid] = []

        for entr in self.aln.get_entries():
            name = entr[0]
            gid = name.split(".")[0]
            group = self.true_spe[gid]
            group.append(name)
            self.true_spe[gid] = group

    def _get_cluster_label(self):
        self.seq_list = []
        self.seq_cid_list = []
        for entr in self.aln.get_entries():
            seq_name = entr[0]
            cid = int(seq_name.split(".")[0])
            self.seq_list.append(seq_name)
            self.seq_cid_list.append(cid)
        self.C0 = array(self.seq_cid_list)

    def get_taxa_order(self):
        return self.seq_list

    def set_new_cluster_label(self, new_cid_list, seq_list, newid):
        if len(new_cid_list) == 0:
            for i in range(len(self.seq_list)):
                new_cid_list.append(-1)

        for i in range(len(self.seq_list)):
            name = self.seq_list[i]
            if name in seq_list:
                new_cid_list[i] = newid
        return new_cid_list

    #Mutual information
    def mutual_info(self, x, y):
        N = float(len(x))
        I = 0.0
        eps = numpy.finfo(float).eps
        for l1 in numpy.unique(x):
            for l2 in numpy.unique(y):
                #Find the intersections
                l1_ids = nonzero(x == l1)[0]
                l2_ids = nonzero(y == l2)[0]
                pxy = (double(intersect1d(l1_ids, l2_ids).size) / N) + eps
                I += pxy * log2(pxy / ((l1_ids.size / N) * (l2_ids.size / N)))
        return I

    #Normalized mutual information
    def nmi(self, x, y):
        N = x.size
        I = self.mutual_info(x, y)
        Hx = 0
        for l1 in unique(x):
            l1_count = nonzero(x == l1)[0].size
            Hx += -(double(l1_count) / N) * log2(double(l1_count) / N)
        Hy = 0
        for l2 in unique(y):
            l2_count = nonzero(y == l2)[0].size
            Hy += -(double(l2_count) / N) * log2(double(l2_count) / N)
        if (Hx + Hy) == 0:
            return 1.0
        else:
            return I / ((Hx + Hy) / 2)

    def get_seq_list(self):
        return self.seq_list

    def get_nmi(self, new_cluster_labels):
        return self.nmi(self.C0, new_cluster_labels)

    def is_correct(self, names):
        #*R*
        newnames = []
        for name in names:
            if name.startswith("*R*"):
                pass
            else:
                newnames.append(name)

        names_set = set(newnames)
        for key in self.true_spe.keys():
            sps = self.true_spe[key]
            sps_set = set(sps)
            if names_set == sps_set:
                return True
        return False

    def get_num_species(self):
        return len(self.true_spe.keys())
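mutual_info and nmi above implement the standard definitions I(X;Y) = sum over x,y of p(x,y) * log2(p(x,y) / (p(x) * p(y))) and NMI = I(X;Y) / ((H(X) + H(Y)) / 2). A hedged usage sketch follows; the phylip file name and sequence names are hypothetical, and sequence names must start with an integer species id before the first ".", as the class expects:

from numpy import array

gt = ground_truth("species.phy")                                  # hypothetical phylip alignment
labels = gt.set_new_cluster_label([], ["1.seq_a", "1.seq_b"], 1)  # first predicted cluster
labels = gt.set_new_cluster_label(labels, ["2.seq_c"], 2)         # second predicted cluster
print(gt.get_nmi(array(labels)))                                  # 1.0 means perfect agreement with the truth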
Example #24
def curator(refseq, reftax, method, output, testingtax=""):

    start_time = time.time()

    seqs = SeqGroup(refseq)
    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ele = [ll[0], ll[1].split(";")]
            ranks.append(ele)
    testings = []
    if len(testingtax) > 1:
        with open(testingtax) as fo:
            for line in fo:
                ll = line.split()
                ele = [ll[0], ll[1].split(";")]
                testings.append(ele)
    else:
        if testingtax:
            partno = int(testingtax)
            partsize = int(len(ranks) / 10)
            p_start = partno * partsize
            if partno == 9:
                p_end = len(ranks)
            else:
                p_end = p_start + partsize
            testings = ranks[p_start:p_end]
        else:
            testings = ranks


#    basepath = os.path.dirname(os.path.abspath(__file__))
    basepath = "/home/kozlovay"
    tmpfolder = basepath + "/tmp/"
    tmpprefix = tmpfolder + str(time.time())

    #    prelim_output = output + ".prelim"

    assf = output + ".ass"

    old_tax = {}
    if os.path.isfile(assf):
        with open(assf, "r") as fi:
            for line in fi:
                seqid, rest = line.strip().split("\t", 1)
                old_tax[seqid] = rest
        print "Found old file with %d assignments; will continue from there..." % len(
            old_tax)

    if True:  #not os.path.isfile(output):
        with open(assf, "a") as fo:
            i = 0
            for test in testings:
                #refseq, reftax, name, old_tax, method, foutput
                i += 1
                if test[0] in old_tax:
                    continue
                tmp_fname = "%s.%d" % (tmpprefix, i)
                ru, result_string = findmis(refseq=seqs,
                                            reftax=ranks,
                                            name=[test[0]],
                                            method=method,
                                            foutput=output,
                                            tmpname=tmp_fname,
                                            refseq_fname=refseq)
                #           print(result_string)
                fo.write(ru)
                fo.flush()

    mis_sid = []
    with open(output, "r") as fmis:
        for line in fmis:
            toks = line.split("\t")
            sid = toks[0]
            lvl = toks[1]
            conf = float(toks[4])
            if lvl != "Species":
                mis_sid += [sid]

    print "Leave-one-out test found %d suspicious sequences; running final test to check them..." % len(
        mis_sid)

    final_output = output + ".final"
    tmp_fname = "%s.%s" % (tmpprefix, "fin")
    findmis(refseq=seqs,
            reftax=ranks,
            name=mis_sid,
            method=method,
            foutput=final_output,
            tmpname=tmp_fname,
            refseq_fname=refseq)

    elapsed_time = time.time() - start_time
    print "\nProcessed %d sequences in %.0f seconds.\n" % (len(testings),
                                                           elapsed_time)
Example #25
        regions = []
        for reg in args.target_regions:
            try:
                contig, raw_pos = reg.split(':')
                if not raw_pos:
                    start, end = None, None
                else:
                    start, end = map(int, raw_pos.split('-'))
                regions.append([contig.strip(), start, end])
            except Exception:
                print >>sys.stderr, 'ERROR: Invalid contig region. Use contigid:start-end syntax\n'
                raise
        args.target_regions = regions
        
    if args.target_genes:
        for g in gene_db.find({"sp":int(taxid), "n":{"$in": args.target_genes}}, {"c":1, "s":1, "e":1}):
            print g
            
    if not args.target_regions:
        # If no regions were requested, scan all contigs completely
        args.target_regions = [ [None, None, None] ]

    if args.refseqs:
        from ete2 import SeqGroup
        args.refseqs = SeqGroup(args.refseqs)
               
    main(args)



Example #26
from ete2 import PhyloTree, PhylomeDBConnector, SeqGroup

p = PhylomeDBConnector()
w,x, t =  p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
for s in A.id2seq:
    A.id2seq[s]=A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
t.set_outgroup(t&"Ddi0002240")

sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin), (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()
Example #27
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
	if os.path.exists(logfile):
		os.remove(logfile)
	
	if os.path.exists(nfout + "_inode_picked_otus.fasta"):
		os.remove(nfout + "_inode_picked_otus.fasta")
	
	jsondata = open (nfin_place)
	align_orgin = SeqGroup(sequences = nfin_aln)
	data = json.load(jsondata)
	placements = data["placements"]
	tree = data["tree"]
	
	ete_tree = tree.replace("{", "[&&NHX:B=")
	ete_tree = ete_tree.replace("}", "]")
	root = Tree(ete_tree, format=1)
	leaves = root.get_leaves()
	allnodes = root.get_descendants()
	allnodes.append(root)
	
	"""get refseq"""
	refseqset = []
	for leaf in leaves:
		refseqset.append(leaf.name)
	refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
	
	placemap = {}
	"""find how many edges are used for placement"""
	for placement in placements:
		edges = placement["p"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			placemap[curredge] = placemap.get(curredge, [])
	
	"""placement quality control"""
	discard_file = open(nfout+".discard.placement.txt", "w")
	"""group taxa to edges"""
	for placement in placements:
		edges = placement["p"]
		taxa_names = placement["n"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			a = placemap[curredge] 
			a.extend(taxa_names)
			placemap[curredge]  = a
		else:
			discard_file.write(repr(taxa_names) + "\n")
	discard_file.close()
	
	groups = placemap.items()
	cnt_leaf = 0
	cnt_inode = 0
	
	"""check each edge""" 
	for i,item in enumerate(groups):
		seqset_name = item[0]
		seqset = item[1]
		
		"""check if placed on leaf node and find the node being placed on"""
		flag = False
		place_node = None
		for node in allnodes:
			if str(node.B) == str(seqset_name):
				place_node = node
				if node.is_leaf():
					flag = True 
				break
		
		"""generate aligment"""
		if flag:
			"""process leaf node placement"""
			cnt_leaf = cnt_leaf + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			place_seq = align_orgin.get_seq(place_node.name)
			newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
			newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
		else:
			cnt_inode = cnt_inode + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			
			if len(newalign.get_entries()) < 2:
				count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
				sp_log(sfout = logfile, logs="I	the placement is on an internal node \nD	find new species\nK	reads number: 1 \n")
			else:
				#for entr in refali.get_entries():
				#	sname = entr[0]
				#	seqe = entr[1]
				#	newalign.set_seq(sname, seq)
				newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
Example #28
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
	if os.path.exists(logfile):
		os.remove(logfile)
	
	if os.path.exists(nfout + "_inode_picked_otus.fasta"):
		os.remove(nfout + "_inode_picked_otus.fasta")
	
	jsondata = open (nfin_place)
	align_orgin = SeqGroup(sequences = nfin_aln)
	data = json.load(jsondata)
	placements = data["placements"]
	tree = data["tree"]
	
	ete_tree = tree.replace("{", "[&&NHX:B=")
	ete_tree = ete_tree.replace("}", "]")
	root = Tree(ete_tree, format=1)
	leaves = root.get_leaves()
	allnodes = root.get_descendants()
	allnodes.append(root)
	
	"""get refseq"""
	refseqset = []
	for leaf in leaves:
		refseqset.append(leaf.name)
	refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)
	
	placemap = {}
	"""find how many edges are used for placement"""
	for placement in placements:
		edges = placement["p"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			placemap[curredge] = placemap.get(curredge, [])
	
	"""placement quality control"""
	discard_file = open(nfout+".discard.placement.txt", "w")
	"""group taxa to edges"""
	for placement in placements:
		edges = placement["p"]
		taxa_names = placement["n"]
		curredge = edges[0][0]
		lw = edges[0][2] 
		if lw >= min_lw:
			a = placemap[curredge] 
			a.extend(taxa_names)
			placemap[curredge]  = a
		else:
			discard_file.write(repr(taxa_names) + "\n")
	discard_file.close()
	
	groups = placemap.items()
	cnt_leaf = 0
	cnt_inode = 0
	
	"""check each edge""" 
	for i,item in enumerate(groups):
		seqset_name = item[0]
		seqset = item[1]
		
		"""check if placed on leaf node and find the node being placed on"""
		flag = False
		place_node = None
		for node in allnodes:
			if str(node.B) == str(seqset_name):
				place_node = node
				if node.is_leaf():
					flag = True 
				break
				
		"""find the furthest leaf of the placement node"""
		fnode = place_node.get_farthest_node()[0]
		outgroup_name = fnode.name
		
		"""find sister node"""
		snode = place_node.get_sisters()[0]
		if not snode.is_leaf():
			snode = snode.get_closest_leaf()[0]
		sister_name = snode.name
		
		"""generate aligment"""
		if flag:
			"""process leaf node placement"""
			cnt_leaf = cnt_leaf + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			if len(newalign.get_entries()) < 2:
				#count_and_pick_reads(align = newalign, outputfile = nfout + "_leaf_picked_otus.fasta")
				og_seq = align_orgin.get_seq(outgroup_name)
				sis_seq = align_orgin.get_seq(sister_name)
				newalign.set_seq("sister", sis_seq) #set the sister seqeunce to make 4 taxa
				newalign.set_seq("root_ref", og_seq) #set the outgroup name
				place_seq = align_orgin.get_seq(place_node.name)
				newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
				newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
			else:
				og_seq = align_orgin.get_seq(outgroup_name)
				newalign.set_seq("root_ref", og_seq) #set the outgroup name
				place_seq = align_orgin.get_seq(place_node.name)
				newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
				newalign.write(outfile = nfout + "_leaf_"+repr(cnt_leaf) + ".lfa")
		else:
			"""genrate the newwick string to be inserted into the ref tree"""
			rep = re.compile(r"\{[0-9]*\}")
			multi_fcating = "("
			for seqname in seqset:
				multi_fcating = multi_fcating + seqname + ","
			multi_fcating = multi_fcating[:-1] 
			multi_fcating = "{" + repr(seqset_name) + "}," + multi_fcating + ")"
			mtfc_tree = tree.replace("{" + repr(seqset_name) + "}", multi_fcating)
			mtfc_tree = rep.sub("", mtfc_tree)
			
			cnt_inode = cnt_inode + 1
			newalign = SeqGroup()
			for taxa in seqset:
				seq = align_orgin.get_seq(taxa)
				newalign.set_seq(taxa, seq)
			if len(newalign.get_entries()) < 2:
				count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
				sp_log(sfout = logfile, logs="I	the placement is on an internal node \nD	find new species\nK	reads number: 1 \n")
			else:
				#og_seq = align_orgin.get_seq(outgroup_name)
				#newalign.set_seq("root_ref", og_seq)
				for entr in refali.get_entries():
					sname = entr[0]
					seqe = entr[1]
					newalign.set_seq(sname, seqe)  # use this entry's own sequence (seqe)
				newalign.write(outfile = nfout + "_inode_"+repr(cnt_inode) + ".ifa")
				mtfc_out = open(nfout + "_inode_"+repr(cnt_inode) +  ".mttree", "w")
				mtfc_out.write(mtfc_tree)
				mtfc_out.close()
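A sketch of how extract_placement might be invoked; the file names are hypothetical, the first argument is the JSON placement file whose "placements" and "tree" fields the code reads (e.g. an EPA result), and the module's other helpers (gen_alignment2, count_and_pick_reads, sp_log) are assumed to be in scope:

# Hypothetical call: group query reads by the edge they were placed on, keeping only
# placements with likelihood weight >= 0.5, and write one alignment per used edge.
extract_placement("epa_result.jplace", "ref_plus_query.afa", "out_run1", min_lw=0.5)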
Example #29
def autotest(refseq, reftax, testingtax, tf = "/home/zhangje/GIT/tax_benchmark/script/tmp/"):
    testings = []
    with open(testingtax) as fo:
        for line in fo:
            ll = line.split()
            ele = [ll[0], ll[1].split(";")]
            testings.append(ele)
    
    seqs = SeqGroup(refseq)
    ranks = []
    with open(reftax) as fo:
        for line in fo:
            ll = line.split()
            ele = [ll[0], ll[1].split(";")]
            ranks.append(ele)
    
    num_corrected_uclust = 0
    num_unchanged_uclust = 0
    
    num_corrected_rdp = 0
    num_unchanged_rdp = 0
    
    num_corrected_blast = 0
    num_unchanged_blast = 0
    
    f_uclust = open(testingtax+".uclust", "w")
    f_rdp = open(testingtax+".rdp", "w")
    f_blast = open(testingtax+".blast", "w")
    f_mis = open(testingtax+".misb", "w")
    f_umis = open(testingtax+".umisb", "w")
    
    for test in testings:
        ru, result_uclust = findmis(refseq = seqs, reftax = ranks, name = test[0], method = "uclust", temfolder = tf)
        f_uclust.write(ru)
        rr, result_rdp = findmis(refseq = seqs, reftax = ranks, name = test[0], method = "rdp", temfolder = tf)
        f_rdp.write(rr)
        rb, result_blast = findmis(refseq = seqs, reftax = ranks, name = test[0], method = "blast", temfolder = tf)
        f_blast.write(rb)
        truth = test[1]
        if len(truth) == 8:
            f_mis.write(test[0] + "	" + rank2string(truth[0:-1]) + "\n")
            rank_nr = int(truth[7])
            if len(result_uclust) > rank_nr and result_uclust[rank_nr] == truth[rank_nr]:
                num_corrected_uclust = num_corrected_uclust + 1
            if len(result_rdp) > rank_nr and result_rdp[rank_nr] == truth[rank_nr]:
                num_corrected_rdp = num_corrected_rdp + 1
            if len(result_blast) > rank_nr and result_blast[rank_nr] == truth[rank_nr]:
                num_corrected_blast = num_corrected_blast + 1
        else:
            f_umis.write(test[0] + "	" + rank2string(truth) + "\n")
            if result_uclust == truth:
                num_unchanged_uclust = num_unchanged_uclust + 1
            if result_rdp == truth:
                num_unchanged_rdp = num_unchanged_rdp + 1
            if result_blast == truth:
                num_unchanged_blast = num_unchanged_blast + 1
        print("truth:" + repr(truth))
        print("uclust:" + repr(result_uclust))
        print("rdp:"+ repr(result_rdp))
        print("blast:" +repr(result_blast))
    
    f_uclust.close()
    f_rdp.close()
    f_blast.close()
    f_mis.close()
    f_umis.close()
        
    print("method   corrected   unchanged")
    print("uclust"+ "   " +repr(num_corrected_uclust) + " " + repr(num_unchanged_uclust))
    print("rdp"+ "   " +repr(num_corrected_rdp) + " " + repr(num_unchanged_rdp))
    print("blast"+ "   " +repr(num_corrected_blast) + " " + repr(num_unchanged_blast))
    
    with open(testingtax+".results", "w") as fo:
        fo.write("method   corrected   unchanged \n")
        fo.write("uclust"+ "   " +repr(num_corrected_uclust) + " " + repr(num_unchanged_uclust) + "\n")
        fo.write("rdp"+ "   " +repr(num_corrected_rdp) + " " + repr(num_unchanged_rdp) + "\n")
        fo.write("blast"+ "   " +repr(num_corrected_blast) + " " + repr(num_unchanged_blast) + "\n")
Example #30
 def get_ref_alignment(self):
     entries = self.jdata["sequences"]
     alignment = SeqGroup()
     for entr in entries:
         alignment.set_seq(entr[0], entr[1])
     return alignment
Example #31
 def get_ref_alignment(self):
     entries = self.jdata["sequences"]
     alignment = SeqGroup()
     for entr in entries:
         alignment.set_seq(entr[0], entr[1])
     return alignment
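For completeness, a small hedged sketch that builds a SeqGroup in memory the same way and writes it out; the (name, sequence) pairs are made up and mirror the jdata["sequences"] entries used above:

from ete2 import SeqGroup

pairs = [("seq1", "ATG---CGT"), ("seq2", "ATGAAACGT")]  # hypothetical entries
aln = SeqGroup()
for name, seq in pairs:
    aln.set_seq(name, seq)
aln.write(outfile="ref_alignment.fasta", format="fasta")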