Ejemplo n.º 1
0
def _write_fold_records(indices, lines, seqs, tax_out, fasta_out, aligned_out=None):
    """Write the taxonomy line and sequence of every record in *indices*.

    The ungapped sequence goes to *fasta_out*; when *aligned_out* is given,
    the sequence as stored in *seqs* (gaps included) is written there too.
    """
    for index in indices:
        tax = lines[index]
        seqid = tax.split()[0]
        seq = seqs.get_seq(seqid)
        tax_out.write(tax)
        if aligned_out is not None:
            aligned_out.write(">" + seqid + "\n")
            aligned_out.write(seq + "\n")
        fasta_out.write(">" + seqid + "\n")
        fasta_out.write(seq.replace("-", "") + "\n")


def gentesting(ftaxa, fseq, fout, fold=10):
    """Split a taxonomy/sequence data set into cross-validation folds.

    The records are shuffled with a fixed seed (12345) and cut into *fold*
    consecutive slices. For every fold number k (1-based) these files are
    written, each name being ``fout + str(k) + suffix``:

    * ``testing.tax``  / ``testing.fa``  -- the records of slice k
      (sequences with "-" characters removed);
    * ``training.tax`` / ``training.afa`` / ``training.fa`` -- the records
      of all other slices (``.afa`` keeps the gaps, ``.fa`` removes them).

    Args:
        ftaxa: path of the taxonomy file; the first whitespace-separated
            token of each line is used as the sequence name.
        fseq: sequence source handed to ``SeqGroup``.
        fout: prefix for all output file names.
        fold: number of folds.

    Note:
        ``random.seed(12345)`` reseeds the module-level generator, exactly
        as the original implementation did.
    """
    with open(ftaxa) as ftax:
        # Drop blank lines (they have no sequence name) and make sure every
        # record ends with a newline; otherwise an unterminated last line
        # would be glued to the following record after shuffling.
        lines = [line.rstrip("\n") + "\n" for line in ftax if line.strip()]

    #seqs = SeqGroup(fseq, format='phylip_relaxed')
    seqs = SeqGroup(fseq)

    numtaxa = len(lines)
    # list(...) is required on Python 3, where range() cannot be shuffled.
    idx = list(range(numtaxa))
    random.seed(12345)
    random.shuffle(idx)

    # Slices past the end of idx are simply shorter (possibly empty), so no
    # explicit clamping of the upper bound is needed.
    onefold = int(math.ceil(float(numtaxa) / fold))
    idx_list = [idx[i * onefold:(i + 1) * onefold] for i in range(fold)]

    for i, test_idx in enumerate(idx_list):
        prefix = fout + repr(i + 1)

        with open(prefix + "testing.tax", "w") as tax_out, \
                open(prefix + "testing.fa", "w") as fa_out:
            _write_fold_records(test_idx, lines, seqs, tax_out, fa_out)

        with open(prefix + "training.tax", "w") as tax_out, \
                open(prefix + "training.afa", "w") as afa_out, \
                open(prefix + "training.fa", "w") as fa_out:
            for j, train_idx in enumerate(idx_list):
                if j != i:
                    _write_fold_records(train_idx, lines, seqs,
                                        tax_out, fa_out, afa_out)
Ejemplo n.º 2
0
def chimera_removal(nuseach, nalign, nout, chimeraout):
    """Split an alignment into chimeric and non-chimeric sequences.

    Each non-blank line of the report *nuseach* is split on whitespace: the
    second field is taken as the sequence name and the last field as the
    chimera flag. Sequences flagged "Y" or "?" are written to *chimeraout*,
    all others to *nout*, both in FASTA layout.
    NOTE(review): the column layout looks like a UCHIME report -- confirm.

    Args:
        nuseach: path of the chimera-detection report.
        nalign: alignment source handed to ``SeqGroup``.
        nout: output path for sequences not flagged as chimeras.
        chimeraout: output path for sequences flagged "Y" or "?".
    """
    align = SeqGroup(nalign)
    with open(nuseach) as fus:
        lines = fus.readlines()

    # Context managers close both outputs even if a lookup fails midway.
    with open(nout, "w") as newalign, open(chimeraout, "w") as chalign:
        for line in lines:
            its = line.split()
            if not its:
                # Blank line (e.g. trailing newline at end of the report).
                continue
            sname = its[1]
            target = chalign if its[-1] in ("Y", "?") else newalign
            seq = align.get_seq(sname)
            target.write(">" + sname + "\n")
            target.write(seq + "\n")
Ejemplo n.º 3
0
def pick_otu(spe_out, alignment):
    """Write one representative sequence per "Species" record to a FASTA file.

    For every line of *spe_out* that starts with "Species", the line that
    follows it (stripped) is used as a sequence name; that sequence is
    looked up in *alignment* and written to ``alignment + ".otu"``.

    Args:
        spe_out: path of the text file containing the "Species" records.
        alignment: alignment source handed to ``SeqGroup``; also the prefix
            of the output file name.
    """
    with open(spe_out) as fin:
        lines = fin.readlines()
    aln = SeqGroup(sequences=alignment)

    with open(alignment + ".otu", "w") as fout:
        # Pair each line with its successor; a "Species" header on the very
        # last line has no successor and is skipped instead of raising
        # IndexError.
        for line, following in zip(lines, lines[1:]):
            if line.startswith("Species"):
                nline = following.strip()
                seq = aln.get_seq(nline)
                fout.write(">" + nline + "\n")
                fout.write(seq + "\n")
Ejemplo n.º 4
0
def pick_otu(spe_out, alignment):
    """Write one representative sequence per "Species" record to a FASTA file.

    For every line of *spe_out* that starts with "Species", the line that
    follows it (stripped) is used as a sequence name; that sequence is
    looked up in *alignment* and written to ``alignment + ".otu"``.

    Args:
        spe_out: path of the text file containing the "Species" records.
        alignment: alignment source handed to ``SeqGroup``; also the prefix
            of the output file name.
    """
    with open(spe_out) as fin:
        lines = fin.readlines()
    aln = SeqGroup(sequences=alignment)

    with open(alignment + ".otu", "w") as fout:
        # Pair each line with its successor; a "Species" header on the very
        # last line has no successor and is skipped instead of raising
        # IndexError.
        for line, following in zip(lines, lines[1:]):
            if line.startswith("Species"):
                nline = following.strip()
                seq = aln.get_seq(nline)
                fout.write(">" + nline + "\n")
                fout.write(seq + "\n")
Ejemplo n.º 5
0
 def link_to_alignment(self, alignment, alg_format="fasta"):
     """Attach alignment sequences to the nodes of this tree.

     Every node returned by ``self.traverse()`` whose name is found in the
     alignment gets a "sequence" feature holding that entry. Leaves that
     cannot be found are counted and reported in a single warning on
     stderr; internal nodes without an entry are ignored silently.

     Args:
         alignment: a ``SeqGroup`` instance, or anything ``SeqGroup``
             accepts as its first argument.
         alg_format: format passed to ``SeqGroup`` when *alignment* is not
             already a ``SeqGroup``.
     """
     # isinstance (not type() ==) so SeqGroup subclasses are used directly
     # instead of being passed to the SeqGroup constructor as a source.
     if isinstance(alignment, SeqGroup):
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format)

     missing_leaves = []
     for n in self.traverse():
         try:
             n.add_feature("sequence", alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)

     if missing_leaves:
         # sys.stderr.write works on Python 2 and 3 alike, unlike the
         # `print >>sys.stderr` statement.
         sys.stderr.write(
             "Warnning: [%d] terminal nodes could not be found in the alignment." %
             len(missing_leaves) + "\n")
Ejemplo n.º 6
0
 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
     """Attach alignment sequences to the nodes of this tree.

     Every node returned by ``self.traverse()`` whose name is found in the
     alignment gets a "sequence" feature holding that entry. Leaves that
     cannot be found are counted and reported in a single warning on
     stderr; internal nodes without an entry are ignored silently.

     Args:
         alignment: a ``SeqGroup`` instance, or anything ``SeqGroup``
             accepts as its first argument.
         alg_format: format passed to ``SeqGroup`` when *alignment* is not
             already a ``SeqGroup``.
         **kwargs: extra keyword arguments forwarded to ``SeqGroup`` in
             that same case (ignored when a ``SeqGroup`` is supplied).
     """
     # isinstance (not type() ==) so SeqGroup subclasses are used directly
     # instead of being passed to the SeqGroup constructor as a source.
     if isinstance(alignment, SeqGroup):
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format, **kwargs)

     missing_leaves = []
     for n in self.traverse():
         try:
             n.add_feature("sequence", alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)

     if missing_leaves:
         # sys.stderr.write works on Python 2 and 3 alike, unlike the
         # `print >>sys.stderr` statement.
         sys.stderr.write(
             "Warnning: [%d] terminal nodes could not be found in the alignment." %
             len(missing_leaves) + "\n")
Ejemplo n.º 7
0
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Group placed sequences by placement edge and write one alignment per edge.

    Reads the placement JSON *nfin_place* (keys "placements" and "tree") and
    the alignment *nfin_aln*. For each placement, the first entry of its "p"
    list is used: element 0 as the edge identifier and element 2 as the
    weight compared against *min_lw*.
    NOTE(review): this assumes the jplace "fields" order puts the edge number
    first and the likelihood weight ratio third, and that "p" is sorted best
    first -- confirm against the producer of the file.

    Outputs (all prefixed with *nfout*):
        * ``.discard.placement.txt`` -- names of placements below *min_lw*;
        * ``_leaf_<k>.lfa`` -- sequences placed on a leaf edge plus the leaf's
          own sequence under the name ``"*R*" + leaf name``;
        * ``_inode_<k>.ifa`` -- sequences placed on an internal edge (two or
          more entries);
        * ``_inode_picked_otus.fasta`` / *logfile* -- passed to
          ``count_and_pick_reads`` / ``sp_log`` for internal edges holding
          fewer than two entries.

    Args:
        nfin_place: path of the placement JSON file.
        nfin_aln: alignment source handed to ``SeqGroup``.
        nfout: prefix of every output file.
        min_lw: minimum weight for a placement to be kept.
        logfile: log path; an existing file is removed first.
    """
    picked_otus = nfout + "_inode_picked_otus.fasta"
    # Remove leftovers from a previous run before anything is written.
    for stale in (logfile, picked_otus):
        if os.path.exists(stale):
            os.remove(stale)

    # The context manager closes the JSON file, which was previously leaked.
    with open(nfin_place) as jsondata:
        data = json.load(jsondata)
    align_orgin = SeqGroup(sequences = nfin_aln)
    placements = data["placements"]
    tree = data["tree"]

    # Turn the "{n}" edge labels into NHX comments so that every labelled
    # node carries the edge identifier as attribute B.
    ete_tree = tree.replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    allnodes = root.get_descendants()
    allnodes.append(root)

    # Reference alignment over the tree's leaf names. The result is not used
    # in this variant; the call is kept because gen_alignment2 is defined
    # elsewhere and may fail on names missing from the alignment.
    # NOTE(review): drop the call if gen_alignment2 is side-effect free.
    refseqset = [leaf.name for leaf in root.get_leaves()]
    gen_alignment2(seq_names = refseqset, alignment = align_orgin)

    # Single pass: collect accepted names per edge, log rejected ones.
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            curredge = best[0]
            lw = best[2]
            taxa_names = placement["n"]
            if lw >= min_lw:
                placemap.setdefault(curredge, []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    cnt_leaf = 0
    cnt_inode = 0

    for seqset_name, seqset in placemap.items():
        # Find the node whose edge label matches this group.
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                break
        on_leaf = place_node is not None and place_node.is_leaf()

        # Alignment holding the sequences placed on this edge.
        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if on_leaf:
            cnt_leaf += 1
            place_seq = align_orgin.get_seq(place_node.name)
            # "*R*" marks the reference (leaf) sequence in the output.
            newalign.set_seq("*R*" + place_node.name, place_seq)
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode += 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = picked_otus)
                sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
            else:
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")
Ejemplo n.º 8
0
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Group placed sequences by placement edge and write per-edge data sets.

    Reads the placement JSON *nfin_place* (keys "placements" and "tree") and
    the alignment *nfin_aln*. For each placement, the first entry of its "p"
    list is used: element 0 as the edge identifier and element 2 as the
    weight compared against *min_lw*.
    NOTE(review): this assumes the jplace "fields" order puts the edge number
    first and the likelihood weight ratio third, and that "p" is sorted best
    first -- confirm against the producer of the file.

    Outputs (all prefixed with *nfout*):
        * ``.discard.placement.txt`` -- names of placements below *min_lw*;
        * ``_leaf_<k>.lfa`` -- sequences placed on a leaf edge, plus
          "root_ref" (the leaf farthest from the placement node), the leaf's
          own sequence as ``"*R*" + leaf name`` and, when fewer than two
          sequences were placed, a "sister" sequence;
        * ``_inode_<k>.ifa`` and ``_inode_<k>.mttree`` -- for internal edges
          with two or more entries: the placed sequences together with all
          reference sequences, and the reference tree with the placed names
          inserted as a multifurcation at that edge;
        * ``_inode_picked_otus.fasta`` / *logfile* -- passed to
          ``count_and_pick_reads`` / ``sp_log`` for internal edges holding
          fewer than two entries.

    Args:
        nfin_place: path of the placement JSON file.
        nfin_aln: alignment source handed to ``SeqGroup``.
        nfout: prefix of every output file.
        min_lw: minimum weight for a placement to be kept.
        logfile: log path; an existing file is removed first.
    """
    picked_otus = nfout + "_inode_picked_otus.fasta"
    # Remove leftovers from a previous run before anything is written.
    for stale in (logfile, picked_otus):
        if os.path.exists(stale):
            os.remove(stale)

    # The context manager closes the JSON file, which was previously leaked.
    with open(nfin_place) as jsondata:
        data = json.load(jsondata)
    align_orgin = SeqGroup(sequences = nfin_aln)
    placements = data["placements"]
    tree = data["tree"]

    # Turn the "{n}" edge labels into NHX comments so that every labelled
    # node carries the edge identifier as attribute B.
    ete_tree = tree.replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    allnodes = root.get_descendants()
    allnodes.append(root)

    # Reference alignment restricted to the leaf names of the tree.
    refseqset = [leaf.name for leaf in root.get_leaves()]
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)

    # Single pass: collect accepted names per edge, log rejected ones.
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            curredge = best[0]
            lw = best[2]
            taxa_names = placement["n"]
            if lw >= min_lw:
                placemap.setdefault(curredge, []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    # Matches the "{n}" edge labels of the original tree string; compiled
    # once instead of once per edge.
    edge_label = re.compile(r"\{[0-9]*\}")
    cnt_leaf = 0
    cnt_inode = 0

    for seqset_name, seqset in placemap.items():
        # Find the node whose edge label matches this group.
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                break
        on_leaf = place_node is not None and place_node.is_leaf()

        # Farthest node from the placement node, used as outgroup.
        outgroup_name = place_node.get_farthest_node()[0].name

        # A leaf standing in for the sister clade of the placement node.
        snode = place_node.get_sisters()[0]
        if not snode.is_leaf():
            snode = snode.get_closest_leaf()[0]
        sister_name = snode.name

        # Alignment holding the sequences placed on this edge.
        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if on_leaf:
            cnt_leaf += 1
            if len(newalign.get_entries()) < 2:
                # Add the sister sequence so the data set reaches four taxa.
                newalign.set_seq("sister", align_orgin.get_seq(sister_name))
            newalign.set_seq("root_ref", align_orgin.get_seq(outgroup_name))
            # "*R*" marks the reference (leaf) sequence in the output.
            newalign.set_seq("*R*" + place_node.name,
                             align_orgin.get_seq(place_node.name))
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode += 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = picked_otus)
                sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
            else:
                # Add every reference sequence under its own name. The
                # previous code stored the last query sequence (`seq`) for
                # all of them instead of the reference sequence itself.
                for entr in refali.get_entries():
                    newalign.set_seq(entr[0], entr[1])
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")

                # Newick string of the reference tree with the placed names
                # inserted as a multifurcation next to this edge's label;
                # all "{n}" labels are stripped afterwards.
                label = "{" + repr(seqset_name) + "}"
                multi_fcating = label + ",(" + ",".join(seqset) + ")"
                mtfc_tree = edge_label.sub("", tree.replace(label, multi_fcating))
                with open(nfout + "_inode_" + repr(cnt_inode) + ".mttree", "w") as mtfc_out:
                    mtfc_out.write(mtfc_tree)