def gen_alignment3(seq_names = [], alignment = SeqGroup()):
    """Return a new SeqGroup holding only the entries named in seq_names.

    seq_names -- iterable of entry names to copy.
    alignment -- SeqGroup the sequences are looked up in.

    Both defaults are created once, when the function is defined; they are
    only read here, never modified.
    """
    subset = SeqGroup()
    for name in seq_names:
        # Copy the entry across under the same name.
        subset.set_seq(name, alignment.get_seq(name))
    return subset
def gen_alignment2(seq_names = [], alignment = SeqGroup()):
    """Return a new SeqGroup holding the entries named in seq_names.

    seq_names -- iterable of entry names to copy.
    alignment -- SeqGroup the sequences are looked up in.

    Two kinds of name get special treatment:
      * a name equal to "sister" is skipped entirely;
      * a name starting with "*R*" is looked up in alignment without that
        three-character prefix, but stored in the result under the full,
        prefixed name.
    """
    subset = SeqGroup()
    for name in seq_names:
        if name == "sister":
            continue
        # Strip the "*R*" marker for the lookup only.
        lookup_name = name[3:] if name.startswith("*R*") else name
        subset.set_seq(name, alignment.get_seq(lookup_name))
    return subset
def random_remove_taxa(falign, num_remove, num_repeat = 1):
    """Write randomly thinned copies of an alignment and return their names.

    falign     -- path of the input alignment; also the prefix of every
                  output file name.
    num_remove -- number of entries dropped from each copy.
    num_repeat -- number of thinned copies to write.

    For repeat i (counted from 1) the surviving entries are written to
    falign + "_" + repr(num_remove) + "_" + repr(i) + ".afa".

    Returns the list of output file names, in the order they were written.
    """
    align = SeqGroup(sequences = falign)
    entrs = align.get_entries()
    # random.shuffle() reorders in place, so it needs a real list: on
    # Python 3 a bare range() object is immutable and would raise TypeError.
    index = list(range(len(entrs)))
    namel = []
    for i in range(num_repeat):
        newalign = SeqGroup()
        random.shuffle(index)
        # After shuffling, the first num_remove positions are the dropped ones.
        for idx in index[num_remove:]:
            newalign.set_seq(entrs[idx][0], entrs[idx][1])
        outname = falign + "_" + repr(num_remove) + "_" + repr(i + 1) + ".afa"
        newalign.write(outfile = outname)
        namel.append(outname)
    return namel
def extract_placement_crop(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split placed sequences into one alignment file per placement edge.

    nfin_place -- path of a JSON file with a "tree" string (edges labelled
                  "{N}") and a "placements" list; each placement has "p"
                  (list of candidate placements) and "n" (list of names).
    nfin_aln   -- path of the alignment containing the placed sequences and
                  the reference sequences.
    nfout      -- prefix for every output file.
    min_lw     -- threshold applied to field 2 of the first entry of "p";
                  placements below it are discarded.
    logfile    -- log file handed to sp_log(); removed first if it exists.

    Only the first entry of each placement's "p" list is considered; its
    field 0 is the edge the names are grouped under.

    Files written:
      * nfout + ".discard.placement.txt": repr() of the name list of every
        discarded placement, one per line;
      * nfout + "_leaf_<k>.lfa": sequences grouped on an edge whose node is a
        leaf, plus that leaf's sequence stored as "*R*" + leaf name;
      * nfout + "_inode_<k>.ifa": sequences grouped on any other edge, when
        the group has at least two entries;
      * groups on such an edge with fewer than two entries are passed to
        count_and_pick_reads() (output nfout + "_inode_picked_otus.fasta")
        and reported through sp_log() instead.

    Returns None.
    """
    picked_otus = nfout + "_inode_picked_otus.fasta"
    # Remove leftovers of a previous run before anything is produced.
    for stale in (logfile, picked_otus):
        if os.path.exists(stale):
            os.remove(stale)

    # Read the placement file through a context manager so the handle is
    # always released.
    with open(nfin_place) as jsondata:
        data = json.load(jsondata)
    align_orgin = SeqGroup(sequences = nfin_aln)
    placements = data["placements"]

    # Turn the "{N}" edge labels into NHX comments so that every node of the
    # parsed tree carries its edge number as feature B.
    ete_tree = data["tree"].replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    allnodes = root.get_descendants()
    allnodes.append(root)

    # Group the names by edge; placements under the threshold are only
    # recorded in the discard file.
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            taxa_names = placement["n"]
            if best[2] >= min_lw:
                placemap.setdefault(best[0], []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    cnt_leaf = 0
    cnt_inode = 0
    for seqset_name, seqset in placemap.items():
        # First node whose edge number matches the group's edge.
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                break

        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if place_node is not None and place_node.is_leaf():
            cnt_leaf = cnt_leaf + 1
            # Add the reference leaf itself, marked with the "*R*" prefix.
            place_seq = align_orgin.get_seq(place_node.name)
            newalign.set_seq("*R*" + place_node.name, place_seq)
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode = cnt_inode + 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = picked_otus)
                sp_log(sfout = logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")
def extract_placement(nfin_place, nfin_aln, nfout, min_lw = 0.5, logfile = "spcount.log"):
    """Split placed sequences into one alignment file per placement edge.

    nfin_place -- path of a JSON file with a "tree" string (edges labelled
                  "{N}") and a "placements" list; each placement has "p"
                  (list of candidate placements) and "n" (list of names).
    nfin_aln   -- path of the alignment containing the placed sequences and
                  the reference sequences.
    nfout      -- prefix for every output file.
    min_lw     -- threshold applied to field 2 of the first entry of "p";
                  placements below it are discarded.
    logfile    -- log file handed to sp_log(); removed first if it exists.

    Only the first entry of each placement's "p" list is considered; its
    field 0 is the edge the names are grouped under.

    Files written:
      * nfout + ".discard.placement.txt": repr() of the name list of every
        discarded placement, one per line;
      * nfout + "_leaf_<k>.lfa": sequences grouped on an edge whose node is a
        leaf, plus "root_ref" (the leaf farthest from that node) and the
        leaf's own sequence as "*R*" + leaf name; when the group has fewer
        than two entries a "sister" sequence is added as well;
      * nfout + "_inode_<k>.ifa" and nfout + "_inode_<k>.mttree": for any
        other edge with at least two grouped entries, the grouped sequences
        together with every reference leaf sequence, and the tree string
        with the grouped names inserted next to that edge and all "{N}"
        labels removed;
      * groups on such an edge with fewer than two entries are passed to
        count_and_pick_reads() (output nfout + "_inode_picked_otus.fasta")
        and reported through sp_log() instead.

    An edge that matches no tree node is handled like a non-leaf edge.

    Returns None.
    """
    picked_otus = nfout + "_inode_picked_otus.fasta"
    # Remove leftovers of a previous run before anything is produced.
    for stale in (logfile, picked_otus):
        if os.path.exists(stale):
            os.remove(stale)

    # Read the placement file through a context manager so the handle is
    # always released.
    with open(nfin_place) as jsondata:
        data = json.load(jsondata)
    align_orgin = SeqGroup(sequences = nfin_aln)
    placements = data["placements"]
    tree = data["tree"]

    # Turn the "{N}" edge labels into NHX comments so that every node of the
    # parsed tree carries its edge number as feature B.
    ete_tree = tree.replace("{", "[&&NHX:B=").replace("}", "]")
    root = Tree(ete_tree, format=1)
    allnodes = root.get_descendants()
    allnodes.append(root)

    # Alignment restricted to the leaves of the tree (the reference set).
    refseqset = [leaf.name for leaf in root.get_leaves()]
    refali = gen_alignment2(seq_names = refseqset, alignment = align_orgin)

    # Group the names by edge; placements under the threshold are only
    # recorded in the discard file.
    placemap = {}
    with open(nfout + ".discard.placement.txt", "w") as discard_file:
        for placement in placements:
            best = placement["p"][0]
            taxa_names = placement["n"]
            if best[2] >= min_lw:
                placemap.setdefault(best[0], []).extend(taxa_names)
            else:
                discard_file.write(repr(taxa_names) + "\n")

    # Matches the "{N}" edge labels; compiled once for all edges.
    edge_label = re.compile(r"\{[0-9]*\}")
    cnt_leaf = 0
    cnt_inode = 0
    for seqset_name, seqset in placemap.items():
        # First node whose edge number matches the group's edge.
        place_node = None
        for node in allnodes:
            if str(node.B) == str(seqset_name):
                place_node = node
                break

        newalign = SeqGroup()
        for taxa in seqset:
            newalign.set_seq(taxa, align_orgin.get_seq(taxa))

        if place_node is not None and place_node.is_leaf():
            cnt_leaf = cnt_leaf + 1
            if len(newalign.get_entries()) < 2:
                # Too few grouped sequences: add the nearest leaf on the
                # sister side as an extra taxon named "sister".
                snode = place_node.get_sisters()[0]
                if not snode.is_leaf():
                    snode = snode.get_closest_leaf()[0]
                newalign.set_seq("sister", align_orgin.get_seq(snode.name))
            # The node farthest from the placement node serves as outgroup.
            outgroup_name = place_node.get_farthest_node()[0].name
            newalign.set_seq("root_ref", align_orgin.get_seq(outgroup_name))
            # Add the reference leaf itself, marked with the "*R*" prefix.
            place_seq = align_orgin.get_seq(place_node.name)
            newalign.set_seq("*R*" + place_node.name, place_seq)
            newalign.write(outfile = nfout + "_leaf_" + repr(cnt_leaf) + ".lfa")
        else:
            cnt_inode = cnt_inode + 1
            if len(newalign.get_entries()) < 2:
                count_and_pick_reads(align = newalign, outputfile = picked_otus)
                sp_log(sfout = logfile, logs="I the palcement is on an internal node \nD find new species\nK reads number: 1 \n")
            else:
                # Add every reference sequence under its own name. Each
                # entry's own sequence is used, not the last query sequence.
                for entr in refali.get_entries():
                    newalign.set_seq(entr[0], entr[1])
                newalign.write(outfile = nfout + "_inode_" + repr(cnt_inode) + ".ifa")

                # Insert the grouped names as "(a,b,...)" right after this
                # edge's label, then strip every "{N}" label from the tree.
                edge_tag = "{" + repr(seqset_name) + "}"
                multi_fcating = edge_tag + ",(" + ",".join(seqset) + ")"
                mtfc_tree = edge_label.sub("", tree.replace(edge_tag, multi_fcating))
                with open(nfout + "_inode_" + repr(cnt_inode) + ".mttree", "w") as mtfc_out:
                    mtfc_out.write(mtfc_tree)
def get_ref_alignment(self):
    """Return a SeqGroup built from the entries in self.jdata["sequences"].

    Element 0 of each entry is used as the name and element 1 as the
    sequence.
    """
    records = self.jdata["sequences"]
    ref_alignment = SeqGroup()
    for record in records:
        ref_alignment.set_seq(record[0], record[1])
    return ref_alignment