def gentesting(ftaxa, fseq, fout, fold = 10):
    """Split a taxonomy file and its alignment into cross-validation folds.

    The records of ``ftaxa`` are shuffled with a fixed seed (12345) and cut
    into ``fold`` consecutive chunks of ``ceil(n / fold)`` records (trailing
    chunks may be shorter or empty).  For every chunk ``i`` (numbered from 1)
    five files are written, all prefixed with ``fout + repr(i)``:

    * ``testing.tax``  -- taxonomy lines of the chunk
    * ``testing.fa``   -- FASTA of the chunk's sequences with "-" removed
    * ``training.tax`` -- taxonomy lines of all *other* chunks
    * ``training.afa`` -- FASTA of the other chunks, sequences as stored
    * ``training.fa``  -- FASTA of the other chunks with "-" removed

    Args:
        ftaxa: path of the taxonomy file; the first whitespace-separated
            token of every line is used as the sequence id.
        fseq: alignment passed to ``SeqGroup``.
        fout: prefix of every output file.
        fold: number of chunks.

    Raises:
        ZeroDivisionError: if ``fold`` is 0.
        Whatever ``SeqGroup.get_seq`` raises for an id missing in ``fseq``.
    """
    with open(ftaxa) as ftax:
        # Blank lines carry no sequence id (they used to crash on split()[0]),
        # and a final line without "\n" would be glued to the following record
        # once the order is shuffled, so both are normalised here.
        lines = [ln if ln.endswith("\n") else ln + "\n"
                 for ln in ftax if ln.strip()]

    #seqs = SeqGroup(fseq, format='phylip_relaxed')
    seqs = SeqGroup(fseq)

    numtaxa = len(lines)
    # list(...) so that the in-place shuffle also works where range() is lazy.
    idx = list(range(numtaxa))
    # Fixed seed: the same input always yields the same partition.
    random.seed(12345)
    random.shuffle(idx)

    onefold = int(math.ceil(float(numtaxa) / fold))
    # Slicing clamps at the end of the list, so the last chunks are simply
    # shorter (or empty) when numtaxa is not a multiple of fold.
    idx_list = [idx[i * onefold:(i + 1) * onefold] for i in range(fold)]

    for i, test_idx in enumerate(idx_list):
        prefix = fout + repr(i + 1)

        with open(prefix + "testing.tax", "w") as tax_out, \
                open(prefix + "testing.fa", "w") as fa_out:
            for index in test_idx:
                tax = lines[index]
                seqid = tax.split()[0]
                seq = seqs.get_seq(seqid)
                tax_out.write(tax)
                fa_out.write(">" + seqid + "\n")
                fa_out.write(seq.replace("-", "") + "\n")

        with open(prefix + "training.tax", "w") as tax_out, \
                open(prefix + "training.afa", "w") as afa_out, \
                open(prefix + "training.fa", "w") as fa_out:
            for j, train_idx in enumerate(idx_list):
                if j == i:
                    # the held-out chunk is not part of its own training set
                    continue
                for index in train_idx:
                    tax = lines[index]
                    seqid = tax.split()[0]
                    seq = seqs.get_seq(seqid)
                    tax_out.write(tax)
                    afa_out.write(">" + seqid + "\n")
                    afa_out.write(seq + "\n")
                    fa_out.write(">" + seqid + "\n")
                    fa_out.write(seq.replace("-", "") + "\n")
def chimera_removal(nuseach, nalign, nout, chimeraout):
    """Split an alignment into chimeric and non-chimeric sequences.

    Every non-blank line of the report ``nuseach`` is split on whitespace:
    the second field is taken as the sequence name and the last field as the
    chimera flag.  Sequences flagged ``"Y"`` or ``"?"`` are written to
    ``chimeraout``; all others are written to ``nout``.  Both outputs are
    FASTA files.

    Args:
        nuseach: path of the chimera report (presumably produced by
            usearch/UCHIME -- only the field positions above are relied on).
        nalign: alignment passed to ``SeqGroup``.
        nout: output path for sequences not flagged as chimeras.
        chimeraout: output path for flagged sequences.

    Raises:
        IndexError: if a non-blank report line has fewer than two fields.
        Whatever ``SeqGroup.get_seq`` raises for a name missing in ``nalign``.
    """
    align = SeqGroup(nalign)

    # Read the report before touching the outputs so that a missing report
    # does not leave truncated output files behind.
    with open(nuseach) as fus:
        lines = fus.readlines()

    with open(nout, "w") as newalign, open(chimeraout, "w") as chalign:
        for line in lines:
            its = line.split()
            if not its:
                # blank line (e.g. trailing newline at end of the report)
                continue
            sname = its[1]
            target = chalign if its[-1] in ("Y", "?") else newalign
            # Look the sequence up first: on failure no orphan header is written.
            seq = align.get_seq(sname)
            target.write(">" + sname + "\n")
            target.write(seq + "\n")
def pick_otu(spe_out, alignment):
    """Write one representative sequence per "Species" entry to a FASTA file.

    ``spe_out`` is scanned line by line.  For every line that starts with
    ``"Species"``, the next line (stripped of surrounding whitespace) is used
    as a sequence name; that sequence is fetched from ``alignment`` and
    appended to ``alignment + ".otu"``.

    Args:
        spe_out: path of the text file containing the "Species" entries.
        alignment: alignment passed to ``SeqGroup``; also the stem of the
            output file name.

    Raises:
        Whatever ``SeqGroup.get_seq`` raises for a name missing in the
        alignment.
    """
    with open(spe_out) as fin:
        lines = fin.readlines()

    # Load the alignment before creating the output so that a bad alignment
    # does not leave an empty .otu file behind.
    aln = SeqGroup(sequences=alignment)

    with open(alignment + ".otu", "w") as fout:
        # Pair every line with its successor; a "Species" header on the very
        # last line has no successor and is skipped instead of raising
        # IndexError.
        for line, following in zip(lines, lines[1:]):
            if not line.startswith("Species"):
                continue
            nline = following.strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
def pick_otu(spe_out, alignment):
    """Extract the sequence named after each "Species" line into a FASTA file.

    For every line of ``spe_out`` that starts with ``"Species"``, the line
    directly after it is stripped and used as a sequence name.  The matching
    sequence from ``alignment`` is written, in FASTA format, to the file
    ``alignment + ".otu"``.

    Args:
        spe_out: path of the text file to scan.
        alignment: alignment passed to ``SeqGroup``; ``".otu"`` is appended
            to it to form the output path.

    Raises:
        Whatever ``SeqGroup.get_seq`` raises when a name is not present in
        the alignment.
    """
    with open(spe_out) as fin:
        lines = fin.readlines()

    # The alignment is loaded first: if that fails, no output file is created.
    aln = SeqGroup(sequences=alignment)

    last = len(lines) - 1
    with open(alignment + ".otu", "w") as fout:
        for i, line in enumerate(lines):
            # A header on the final line has nothing after it to read a name
            # from; previously this raised IndexError.
            if i == last or not line.startswith("Species"):
                continue
            nline = lines[i + 1].strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
def link_to_alignment(self, alignment, alg_format="fasta"):
    """Attach alignment sequences to the nodes of this tree.

    Every node returned by ``self.traverse()`` whose ``name`` is found in the
    alignment receives a ``sequence`` feature holding that sequence.  Nodes
    without a matching entry are left untouched; if any of them are leaves,
    a single warning with their count is written to ``sys.stderr``.

    Args:
        alignment: a ``SeqGroup`` (or subclass) instance, or anything
            ``SeqGroup(alignment, format=alg_format)`` accepts.
        alg_format: format passed to ``SeqGroup`` when ``alignment`` is not
            already a ``SeqGroup``.
    """
    # isinstance() also accepts SeqGroup subclasses, which the former
    # ``type(...) ==`` test wrongly sent through the SeqGroup constructor.
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format)

    missing_leaves = 0
    for n in self.traverse():
        try:
            seq = alg.get_seq(n.name)
        except KeyError:
            # Internal nodes usually have no sequence; only leaves are reported.
            if n.is_leaf():
                missing_leaves += 1
        else:
            n.add_feature("sequence", seq)

    if missing_leaves > 0:
        # sys.stderr.write emits the same text as the old ``print >>``
        # statement but also works where print is a function.
        sys.stderr.write(
            "Warnning: [%d] terminal nodes could not be found in the alignment."
            % missing_leaves + "\n")
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Attach alignment sequences to the nodes of this tree.

    Each node yielded by ``self.traverse()`` is looked up by ``name`` in the
    alignment; on success the sequence is stored on the node as the
    ``sequence`` feature.  Nodes that are not found are skipped, and when at
    least one of them is a leaf a single warning with the number of such
    leaves is written to ``sys.stderr``.

    Args:
        alignment: a ``SeqGroup`` (or subclass) instance, or anything
            ``SeqGroup(alignment, format=alg_format, **kwargs)`` accepts.
        alg_format: format passed to ``SeqGroup`` when ``alignment`` has to
            be loaded.
        **kwargs: extra keyword arguments forwarded to ``SeqGroup`` when
            ``alignment`` has to be loaded; ignored otherwise.
    """
    # isinstance() rather than ``type(...) ==`` so SeqGroup subclasses are
    # used as-is instead of being passed to the SeqGroup constructor.
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)

    missing_leaves = 0
    for n in self.traverse():
        try:
            seq = alg.get_seq(n.name)
        except KeyError:
            # Only unmatched leaves are worth a warning.
            if n.is_leaf():
                missing_leaves += 1
        else:
            n.add_feature("sequence", seq)

    if missing_leaves > 0:
        # Same output as the former ``print >>sys.stderr`` statement, written
        # in a form that is valid on Python 2 and Python 3 alike.
        sys.stderr.write(
            "Warnning: [%d] terminal nodes could not be found in the alignment."
            % missing_leaves + "\n")
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Group placed reads by placement edge and write one alignment per edge.

    The jplace file ``nfin_place`` is parsed; for every placement only its
    first ``"p"`` row is considered.  Placements whose likelihood weight is
    below ``min_lw`` are listed in ``nfout + ".discard.placement.txt"``; the
    names (``"n"``) of the others are grouped by edge number.  For each edge:

    * edge leads to a leaf of the reference tree: the reads plus the leaf's
      own sequence (renamed ``"*R*" + leaf name``) are written to
      ``nfout + "_leaf_<k>.lfa"``;
    * otherwise, with two or more reads: the reads are written to
      ``nfout + "_inode_<k>.ifa"``;
    * otherwise, with fewer than two reads: the alignment is handed to
      ``count_and_pick_reads`` (output ``nfout + "_inode_picked_otus.fasta"``)
      and a record is passed to ``sp_log``.

    ``<k>`` is a separate running counter for leaf and non-leaf edges.

    Args:
        nfin_place: path of the jplace (JSON) placement file.
        nfin_aln: alignment passed to ``SeqGroup``; must contain both the
            reference sequences and the placed reads.
        nfout: prefix of every output file.
        min_lw: minimum likelihood weight for a placement to be kept.
        logfile: path handed to ``sp_log``; removed first if it exists.
    """
    # Start from a clean slate: both files are removed if left over from a
    # previous run (presumably because the helpers append to them -- confirm).
    if os.path.exists(logfile):
        os.remove(logfile)
    picked_otus = nfout + "_inode_picked_otus.fasta"
    if os.path.exists(picked_otus):
        os.remove(picked_otus)

    with open(nfin_place) as jsondata:
        align_orgin = SeqGroup(sequences = nfin_aln)
        data = json.load(jsondata)
    placements = data["placements"]
    tree = data["tree"]

    # Column positions inside each "p" row are declared by the jplace
    # "fields" header; fall back to the historical positions 0 and 2 when
    # the header is missing or does not name them.
    fields = data.get("fields") or []
    edge_col = fields.index("edge_num") if "edge_num" in fields else 0
    lw_col = fields.index("like_weight_ratio") if "like_weight_ratio" in fields else 2

    # Turn the jplace edge labels "{n}" into NHX annotations so that every
    # node exposes its edge number as the feature ``B``.
    ete_tree = tree.replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)

    # get refseq
    refseqset = [leaf.name for leaf in leaves]
    # NOTE(review): the result is not used in this variant; the call is kept
    # because gen_alignment2 is defined elsewhere and may validate its input.
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)

    # group taxa to edges / placement quality control
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            taxa_names = placement["n"]
            if best[lw_col] >= min_lw:
                placemap.setdefault(best[edge_col], []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    cnt_leaf = 0
    cnt_inode = 0
    # check each edge
    for seqset_name, seqset in placemap.items():
        # find the node being placed on and whether it is a leaf
        flag = False
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                flag = node.is_leaf()
                break

        # generate alignment
        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if flag:
            # process leaf node placement
            cnt_leaf = cnt_leaf + 1
            place_seq = align_orgin.get_seq(place_node.name)
            newalign.set_seq("*R*" + place_node.name, place_seq) #set the reference sequence name
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode = cnt_inode + 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
                sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
            else:
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Group placed reads by placement edge and write per-edge alignments.

    The jplace file ``nfin_place`` is parsed; for every placement only its
    first ``"p"`` row is considered.  Placements whose likelihood weight is
    below ``min_lw`` are listed in ``nfout + ".discard.placement.txt"``; the
    names (``"n"``) of the others are grouped by edge number.  For each edge:

    * edge leads to a leaf of the reference tree: the reads, the sequence of
      the leaf farthest from the placement node (as ``"root_ref"``) and the
      leaf's own sequence (as ``"*R*" + leaf name``) are written to
      ``nfout + "_leaf_<k>.lfa"``.  With fewer than two reads a ``"sister"``
      sequence (the sister node, or its closest leaf) is added as well.
    * otherwise, with two or more reads: the reads plus every entry of the
      reference alignment are written to ``nfout + "_inode_<k>.ifa"``, and
      the reference tree with the reads inserted as a multifurcation next to
      that edge is written to ``nfout + "_inode_<k>.mttree"``.
    * otherwise, with fewer than two reads: the alignment is handed to
      ``count_and_pick_reads`` (output ``nfout + "_inode_picked_otus.fasta"``)
      and a record is passed to ``sp_log``.

    ``<k>`` is a separate running counter for leaf and non-leaf edges.

    Args:
        nfin_place: path of the jplace (JSON) placement file.
        nfin_aln: alignment passed to ``SeqGroup``; must contain both the
            reference sequences and the placed reads.
        nfout: prefix of every output file.
        min_lw: minimum likelihood weight for a placement to be kept.
        logfile: path handed to ``sp_log``; removed first if it exists.
    """
    # Start from a clean slate: both files are removed if left over from a
    # previous run (presumably because the helpers append to them -- confirm).
    if os.path.exists(logfile):
        os.remove(logfile)
    picked_otus = nfout + "_inode_picked_otus.fasta"
    if os.path.exists(picked_otus):
        os.remove(picked_otus)

    with open(nfin_place) as jsondata:
        align_orgin = SeqGroup(sequences = nfin_aln)
        data = json.load(jsondata)
    placements = data["placements"]
    tree = data["tree"]

    # Column positions inside each "p" row are declared by the jplace
    # "fields" header; fall back to the historical positions 0 and 2 when
    # the header is missing or does not name them.
    fields = data.get("fields") or []
    edge_col = fields.index("edge_num") if "edge_num" in fields else 0
    lw_col = fields.index("like_weight_ratio") if "like_weight_ratio" in fields else 2

    # Turn the jplace edge labels "{n}" into NHX annotations so that every
    # node exposes its edge number as the feature ``B``.
    ete_tree = tree.replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    leaves = root.get_leaves()
    allnodes = root.get_descendants()
    allnodes.append(root)

    # get refseq
    refseqset = [leaf.name for leaf in leaves]
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)

    # group taxa to edges / placement quality control
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            taxa_names = placement["n"]
            if best[lw_col] >= min_lw:
                placemap.setdefault(best[edge_col], []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    # Matches the "{n}" edge labels of the jplace tree (compiled once).
    rep = re.compile(r"\{[0-9]*\}")

    cnt_leaf = 0
    cnt_inode = 0
    # check each edge
    for seqset_name, seqset in placemap.items():
        # find the node being placed on and whether it is a leaf
        flag = False
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                flag = node.is_leaf()
                break

        # find the furthest leaf of the placement node
        fnode = place_node.get_farthest_node()[0]
        outgroup_name = fnode.name

        # find sister node
        snode = place_node.get_sisters()[0]
        if not snode.is_leaf():
            snode = snode.get_closest_leaf()[0]
        sister_name = snode.name

        # generate alignment
        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if flag:
            # process leaf node placement
            cnt_leaf = cnt_leaf + 1
            if len(newalign.get_entries()) < 2:
                #set the sister seqeunce to make 4 taxa
                newalign.set_seq("sister", align_orgin.get_seq(sister_name))
            #set the outgroup name
            newalign.set_seq("root_ref", align_orgin.get_seq(outgroup_name))
            #set the reference sequence name
            newalign.set_seq("*R*" + place_node.name, align_orgin.get_seq(place_node.name))
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode = cnt_inode + 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = nfout + "_inode_picked_otus.fasta")
                sp_log(sfout = logfile, logs="I	the palcement is on an internal node \nD	find new species\nK	reads number: 1 \n")
            else:
                # Add the reference sequences.  The previous code stored the
                # last read's sequence (``seq``) under every reference name
                # instead of the reference's own sequence.
                for entr in refali.get_entries():
                    sname = entr[0]
                    seqe = entr[1]
                    newalign.set_seq(sname, seqe)
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")

                # generate the newick string to be inserted into the ref tree:
                # the reads become a multifurcating clade placed right after
                # the edge label, then all edge labels are stripped.
                edge_label = "{" + repr(seqset_name) + "}"
                multi_fcating = edge_label + ",(" + ",".join(seqset) + ")"
                mtfc_tree = rep.sub("", tree.replace(edge_label, multi_fcating))
                with open(nfout + "_inode_" + repr(cnt_inode) + ".mttree", "w") as mtfc_out:
                    mtfc_out.write(mtfc_tree)