def __write_algn(self, fullpath):
    """
    Write the nucleotide sequences of the nodes obtained by iterating
    over ``self`` to ``fullpath``, using the 'paml' format of SeqGroup.
    """
    alignment = SeqGroup()
    for node in self:
        node_id, label = node.node_id, node.name
        # Keep the three lookup tables of the SeqGroup in sync by hand.
        alignment.id2seq[node_id] = node.nt_sequence
        alignment.id2name[node_id] = label
        alignment.name2id[label] = node_id
    alignment.write(outfile=fullpath, format='paml')
def __write_algn(self, fullpath):
    """
    Collect ``nt_sequence`` of every node yielded by ``self`` into a
    SeqGroup and save it to ``fullpath`` in 'paml' format.
    """
    collected = SeqGroup()
    # Local aliases for the three tables that have to be filled together.
    by_id, names_by_id, ids_by_name = (
        collected.id2seq, collected.id2name, collected.name2id)
    for member in self:
        by_id[member.node_id] = member.nt_sequence
        names_by_id[member.node_id] = member.name
        ids_by_name[member.name] = member.node_id
    collected.write(outfile=fullpath, format='paml')
def pick_otu(spe_out, alignment):
    """Write the sequences listed after "Species" lines to ``<alignment>.otu``.

    ``spe_out`` is scanned line by line. Whenever a line starts with
    "Species", the line right after it (stripped) is used as a sequence
    name, looked up in the alignment and written as a two-line FASTA
    record.

    Args:
        spe_out: path of the text file to scan.
        alignment: alignment source handed to ``SeqGroup``; also the prefix
            of the output file name (``alignment + ".otu"``).
    """
    with open(spe_out) as fin:
        lines = fin.readlines()

    # Load the alignment before creating the output, so that a failure here
    # does not leave an empty .otu file behind.
    aln = SeqGroup(sequences=alignment)

    with open(alignment + ".otu", "w") as fout:
        # Pair each line with its successor; a "Species" line that is the
        # very last line has no successor and is skipped instead of raising
        # IndexError.
        for line, following in zip(lines, lines[1:]):
            if not line.startswith("Species"):
                continue
            name = following.strip()
            # Look the sequence up first so a missing name does not leave a
            # dangling header in the output.
            seq = aln.get_seq(name)
            fout.write(">" + name + "\n")
            fout.write(seq + "\n")
def _create_tree(tree, fasta, out, color):
    """Render ``tree`` to ``out`` with every leaf's sequence as an aligned face.

    ``fasta`` is loaded as a FASTA SeqGroup and ``color`` is parsed with
    ``_parse_color_file``; leaves whose name appears in the parsed colours
    get that value as their background colour.
    """
    seqs = SeqGroup(fasta, format="fasta")
    t = Tree(tree)
    colors = _parse_color_file(color)

    for leaf_name in t.get_leaf_names():
        motif_face = SeqMotifFace(seqs.get_seq(leaf_name), seq_format="()")
        # Several leaves may share one name; each of them gets the face.
        for leaf in t.get_leaves_by_name(leaf_name):
            if leaf_name in colors:
                style = NodeStyle()
                style['bgcolor'] = colors[leaf_name]
                leaf.set_style(style)
            leaf.add_face(motif_face, 0, 'aligned')

    t.render(out)
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Attach to each node the aligned sequence stored under its name.

    Every node returned by ``self.traverse()`` whose name is found in the
    alignment receives it as the ``sequence`` feature. Nodes whose lookup
    raises ``KeyError`` are left untouched; if any of them is a leaf, a
    warning with their count is printed to stderr.

    Args:
        alignment: a ``SeqGroup`` instance (used as is), or any value
            accepted as the first argument of ``SeqGroup``.
        alg_format: format passed to ``SeqGroup`` when ``alignment`` is not
            already one.
        **kwargs: forwarded to the ``SeqGroup`` constructor.
    """
    missing_leaves = []
    # Collected for symmetry with the leaves; not reported in this block.
    missing_internal = []

    # isinstance instead of an exact type comparison, so that instances of
    # SeqGroup subclasses are used directly rather than being fed to the
    # SeqGroup constructor as if they were a file or a string.
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)

    for n in self.traverse():
        try:
            seq = alg.get_seq(n.name)
        except KeyError:
            if n.is_leaf():
                missing_leaves.append(n.name)
            else:
                missing_internal.append(n.name)
        else:
            n.add_feature("sequence", seq)

    if missing_leaves:
        print("Warnning: [%d] terminal nodes could not be found in the alignment." %
              len(missing_leaves), file=sys.stderr)
def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
    """Store the aligned sequence of each node as its ``sequence`` feature.

    All nodes from ``self.traverse()`` are looked up by name in the
    alignment. A node whose name raises ``KeyError`` gets no feature; the
    number of such leaves, if any, is reported on stderr.

    Args:
        alignment: either a ``SeqGroup`` (taken as is) or a value that the
            ``SeqGroup`` constructor accepts as its first argument.
        alg_format: format used when a ``SeqGroup`` has to be built.
        **kwargs: extra arguments for the ``SeqGroup`` constructor.
    """
    missing_leaves = []
    # Internal nodes without sequence are gathered but not reported here.
    missing_internal = []

    # An exact ``type(...) ==`` test would send SeqGroup subclasses through
    # the constructor; isinstance accepts them as ready-made alignments.
    if isinstance(alignment, SeqGroup):
        alg = alignment
    else:
        alg = SeqGroup(alignment, format=alg_format, **kwargs)

    for n in self.traverse():
        try:
            seq = alg.get_seq(n.name)
        except KeyError:
            bucket = missing_leaves if n.is_leaf() else missing_internal
            bucket.append(n.name)
        else:
            n.add_feature("sequence", seq)

    if missing_leaves:
        print("Warnning: [%d] terminal nodes could not be found in the alignment." %
              len(missing_leaves), file=sys.stderr)
def extract_ss(input_path, suffix, tree_file):
    """Create a new dataset holding only the sequences present in a tree.

    The FASTA alignment at ``input_path.alignment`` is filtered down to the
    entries whose label is a leaf name of ``tree_file`` and written to the
    alignment path of a new ``common.Paths`` object built from
    ``input_path._version`` and ``input_path._dataset + suffix``. The
    duplicates JSON and the outgroups file are copied over unchanged.

    Args:
        input_path: paths object of the source dataset.
        suffix: text appended to the source dataset name.
        tree_file: tree file read with ``Tree(..., format=1)``.
    """
    tree = Tree(tree_file, format=1)
    leaves_set = set(tree.get_leaf_names())
    msa = SeqGroup(input_path.alignment, "fasta")

    path_argv = [input_path._version, input_path._dataset + suffix]
    output_path = common.Paths(path_argv, 0)
    data_versioning.setup_new_dataset(output_path)

    new_msa = SeqGroup()
    for entry in msa.iter_entries():
        label = entry[0]
        sequence = entry[1]
        if label in leaves_set:
            new_msa.set_seq(label, sequence)

    # Context manager so the alignment is flushed and the handle closed
    # before the remaining files are copied.
    with open(output_path.alignment, "w") as writer:
        writer.write(new_msa.write(format="fasta"))

    shutil.copy(input_path.duplicates_json, output_path.duplicates_json)
    shutil.copy(input_path.outgroups_file, output_path.outgroups_file)
def extract_ss(input_path, suffix, tree_file):
    """Create a new dataset holding only the sequences present in a tree.

    Entries of the FASTA alignment at ``input_path.alignment`` are kept when
    their label is a leaf name of ``tree_file``. The result is written to
    the alignment path of a new ``common.Paths`` object built from
    ``input_path._version`` and ``input_path._dataset + suffix``; the
    duplicates JSON and the outgroups file are copied over unchanged.
    Progress is reported on stdout at the start and at the end.

    Args:
        input_path: paths object of the source dataset.
        suffix: text appended to the source dataset name.
        tree_file: tree file read with ``Tree(..., format=1)``.
    """
    print(
        "Extracting alignment generated with the support selection tree thinning technique..."
    )
    tree = Tree(tree_file, format=1)
    leaves_set = set(tree.get_leaf_names())
    msa = SeqGroup(input_path.alignment, "fasta")

    path_argv = [input_path._version, input_path._dataset + suffix]
    output_path = common.Paths(path_argv, 0)
    data_versioning.setup_new_dataset(output_path)

    new_msa = SeqGroup()
    for entry in msa.iter_entries():
        label = entry[0]
        sequence = entry[1]
        if label in leaves_set:
            new_msa.set_seq(label, sequence)

    # Context manager so the alignment is flushed and the handle closed
    # before the remaining files are copied.
    with open(output_path.alignment, "w") as writer:
        writer.write(new_msa.write(format="fasta"))

    shutil.copy(input_path.duplicates_json, output_path.duplicates_json)
    shutil.copy(input_path.outgroups_file, output_path.outgroups_file)
    print("New version of the snapshot: " + output_path.path)
from ete3 import PhyloTree, SeqGroup, SequenceFace, TreeStyle, AttrFace, NodeStyle, Tree
import sys

# Background colour used for each residue letter in the sequence faces.
_RESIDUE_BG_COLORS = {
    'G': 'Khaki', 'A': 'Khaki', 'S': 'Khaki', 'T': 'Khaki',
    'C': 'LightGreen', 'V': 'LightGreen', 'I': 'LightGreen',
    'L': 'LightGreen', 'P': 'LightGreen', 'F': 'LightGreen',
    'Y': 'LightGreen', 'M': 'YellowGreen', 'W': 'LightGreen',
    'N': 'Thistle', 'Q': 'Thistle', 'H': 'Thistle',
    'D': 'DarkSalmon', 'E': 'DarkSalmon',
    'K': 'SkyBlue', 'R': 'SkyBlue',
    'X': 'Black', '-': 'White',
}

# Command line: <alignment in fasta> <tree>
alignment_input = sys.argv[1]
tree_input = sys.argv[2]

alg = alignment_input
t = PhyloTree(tree_input, format=1, quoted_node_names=True)
seqs = SeqGroup(alg, format="fasta")

# One shared style: no node symbol, lines two units wide.
nodestyle1 = NodeStyle()
nodestyle1["size"] = 0
nodestyle1["vt_line_width"] = 2
nodestyle1["hz_line_width"] = 2
for node in t.traverse():
    node.set_style(nodestyle1)

# Add the sequence of every leaf as an aligned face in column 2.
for leaf in t.iter_leaves():
    item = seqs.get_seq(leaf.name)
    name_face = AttrFace(item, fsize=24)
    Bars = SequenceFace(
        item,
        seqtype='aa',
        fsize=24,
        # Each face receives its own copy of the colour table.
        bg_colors=dict(_RESIDUE_BG_COLORS),
        fg_colors=None,
        codon=None,
        col_w=1.5,
        alt_col_w=3,
        special_col=None,
        interactive=False,
    )
    leaf.add_face(Bars, 2, "aligned")
import os
import sys  # sys.argv is used below; "from sys import exit" alone does not bind "sys"
from sys import exit
from glob import glob
from ete3 import SeqGroup, parser

# Command line: <prefix of the "*clustalo" alignments> <cDNA fasta>
path = sys.argv[1] + "*clustalo"
infiles = glob(path)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%s infiles" % len(infiles))
F = parser.fasta.read_fasta(sys.argv[2])
for infile in infiles:
    print(infile)
    # Empty alignment files are skipped.
    if os.stat(infile).st_size == 0:
        continue
    alg_aa = SeqGroup(infile)
    alg_dna = SeqGroup()
    for name, seq, _ in alg_aa:
        try:
            cdna = F.id2seq[F.name2id[name]]
        except KeyError:
            print("cdna for %s not found" % name)
            continue
        # Thread the cDNA through the aligned sequence: every non-gap
        # column consumes the next three characters of the cDNA, every gap
        # column becomes "---". Whatever is left in ``cdna`` afterwards is
        # the part not consumed by the alignment.
        cdna_aln = ""
        for pos in seq:
            if pos != "-":
                cdna_aln += cdna[:3]
                cdna = cdna[3:]
            else:
                cdna_aln += "---"
(begin, end, ies, iesId, beginMSA, endMSA) = charMat[(geneFamily, homIES, geneId)] if ies == '?': cf = CircleFace(10, "silver", label = "?") elif ies == '1': cf = CircleFace(10, "black") elif ies == '0': cf = CircleFace(10, "LightGrey") else: sys.exit(1) column = hiesL[(geneFamily, homIES)] + 1 leaf.add_face(cf, column, "aligned") drawTree(outputFile) elif plotStyle == '3': # plot with MSA # load nucleotide sequences for all genes! nuclAlnFile = os.path.join(basePath, 'analysis', 'msas', 'filtered', 'cluster.' + geneFamily + '.nucl.fa') seqs = SeqGroup(sequences = nuclAlnFile, format = "fasta") for leaf in t: geneId = leaf.name seq = seqs.get_seq(geneId) seq = seq.translate(None, string.ascii_lowercase) # keep only CDS iesmotif = [[1, len(seq), "line", 2, 5, None, None, None]] for homIES in gfhomIES[geneFamily]: (begin, end, ies, iesId, beginMSA, endMSA) = charMat[(geneFamily, homIES, geneId)] if ies == '?': if beginMSA == 'NA': iesmotif.append([int(begin), int(end),"()", 10, 10, "red", "black", "arial|8|black|?"]) else: iesmotif.append([int(begin), int(end),"()", 10, 10, "red", "black", "arial|8|black|?"]) elif ies == '1': iesmotif.append([int(beginMSA), int(endMSA),"[]", 10, 10, "black", "red", "arial|8|black|" + iesId])
import cPickle
from pandas import DataFrame
from collections import Counter, defaultdict
import os
from ete3 import SeqGroup

# The alignment matrix is cached in alg.pkl (pickle protocol 2). Protocol 2
# is a binary format, so the cache is opened in binary mode for reading as
# well as writing, and both handles are closed deterministically.
if os.path.exists('alg.pkl'):
    print('loading from pkl file')
    with open('alg.pkl', 'rb') as pkl:
        a = cPickle.load(pkl)
else:
    alg = SeqGroup('formatted_MG_seqs.faa.final_tree.fa')
    #alg = SeqGroup('Burki_first10.aln.fa')
    #alg = SeqGroup('test_52.fa')
    alg_matrix = []
    labels = []
    for name, seq, _ in alg:
        # One row per sequence, one upper-cased character per column.
        aas = list(seq.upper())
        alg_matrix.append(aas)
        labels.append(name)
    a = DataFrame(alg_matrix, labels)
    with open('alg.pkl', 'wb') as pkl:
        cPickle.dump(a, pkl, 2)

nsp = float(len(a))
spvariants = defaultdict(Counter)
valid_cols = 0
refaas = []
for col in a:
    counter = Counter(a[col])
    # Most frequent character of the column and its number of occurrences.
    refaa, num = counter.most_common(1)[0]
import sys
import re
import random, string
#from seqgroup import SeqGroup
from ete3 import SeqGroup
from ete3 import NCBITaxa

ncbi = NCBITaxa()

# Command line: <input sequences> <output fasta> <output table> <proc type>
seq_file = SeqGroup(sys.argv[1])
OUT_fasta = open(sys.argv[2], 'w')
out_table = open(sys.argv[3], 'w')
proc_type = str(sys.argv[4])


def proces_prote_check_taxid(seq_file):
    """Rename every sequence to ``<taxid>.<random code>`` and clean it.

    The text before the first '.' of each name is taken as the taxid and
    combined with a random 5-character alphanumeric code. Sequences lose
    the characters ``*``, ``"`` and ``+``; ``Z``, ``B`` and ``J`` are
    replaced by ``X``. The cleaned record goes to ``OUT_fasta`` and a
    "new name<TAB>old name" line to ``out_table``.

    Any error while handling one entry is reported on stdout as
    ``ERROR in <name>`` and the entry is skipped.

    Args:
        seq_file: iterable yielding ``(name, sequence, _)`` triples.
    """
    for name, seq, _ in seq_file:
        try:
            taxid = name.split('.')[0]
            # Result unused. NOTE(review): presumably kept so that an
            # unusable taxid raises and lands in the handler below - confirm.
            taxid2name = ncbi.get_taxid_translator([taxid])
            code = ''.join(
                random.choices(string.ascii_letters + string.digits, k=5))
            code_name = '.'.join([taxid, code])

            # name = re.sub('\|', '_', name)
            # name = re.sub('\:|\;|\*|,|\"|\[|\]|\(|\)|\/', '_', name)
            # name = re.sub(r"\\", '_', name)
            # Raw strings: same patterns, without invalid-escape warnings.
            seq = re.sub(r'\*|"|\+', '', seq)
            seq = re.sub(r'Z|B|J', 'X', seq)
            print(">%s\n%s" % (code_name, seq), file=OUT_fasta)
            print("%s\t%s" % (code_name, name), file=out_table)
        except Exception:
            # Exception instead of a bare except, so that Ctrl-C and
            # sys.exit() are not swallowed by the per-entry error report.
            print('ERROR in', name)
import sys
import random, string
from ete3 import SeqGroup
from tempfile import NamedTemporaryFile

# Command line:
#   <input alignment> <recoded fasta> <phylip output> <translation table>
in_file = sys.argv[1]
transform_fasta = sys.argv[2]
out_file = sys.argv[3]

alg = SeqGroup(in_file)

# random.choices only exists on Python 3, where "print >> handle, ..." is a
# runtime TypeError, so the records are written with print(..., file=...).
used_codes = set()
with open(sys.argv[4], 'w') as translate_table:
    with open(transform_fasta, 'w') as translate:
        for name, seq, _ in alg:
            code = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
            # Draw again on the rare collision so that every sequence keeps
            # a distinct name in the recoded files.
            while code in used_codes:
                code = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
            used_codes.add(code)
            #code=format((num+1), '05')
            #nam_t=taxid+'.'+str(code)
            print('>%s\n%s' % (code, seq), file=translate)
            print('%s\t%s' % (name, code), file=translate_table)

# Both files are closed at this point; reload the recoded fasta and convert.
translate_alg = SeqGroup(transform_fasta)
translate_alg.write(format="phylip", outfile=out_file)
import re
from ete3 import SeqGroup, Tree
import sys

# Command line:
#   <tree> <input fasta> <output fasta> <first leaf to drop> <last leaf to drop>
tree_file = sys.argv[1]  # in newick format
original_fasta = SeqGroup(sys.argv[2])
star_target = str(sys.argv[4])
end_target = str(sys.argv[5])

tree = Tree(tree_file)
R = tree.get_midpoint_outgroup()
tree.set_outgroup(R)

# Record the leaf order of the rerooted tree and where both targets sit.
name_list = []
star_pos = None
end_pos = None
for num, leaf in enumerate(tree):
    name_list.append(leaf.name)
    if star_target == leaf.name:
        star_pos = num
    if end_target == leaf.name:
        end_pos = num

# Fail with a clear message instead of a NameError further down.
if star_pos is None or end_pos is None:
    sys.exit("target leaf not found in tree: %s"
             % (star_target if star_pos is None else end_target))

pruned_list = name_list[star_pos:(end_pos + 1)]
print(pruned_list)

#for ele in pruned_list:
#    print >>pruned_fasta,">%s\n%s"%(ele, original_fasta.get_seq(ele))

# Set for constant-time membership tests; the output is opened only once
# everything else has succeeded and is closed when the block ends.
pruned_names = set(pruned_list)
with open(sys.argv[3], 'w') as pruned_fasta:
    for ele in name_list:
        if ele not in pruned_names:
            pruned_fasta.write(">%s\n%s\n" % (ele, original_fasta.get_seq(ele)))
from ete3 import PhyloTree, PhylomeDBConnector, SeqGroup

# Fetch a tree and its clean alignment for the same seed from PhylomeDB.
p = PhylomeDBConnector()
w, x, t = p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)

# Keep only the first 30 columns of every sequence before linking.
A = SeqGroup(a, "iphylip")
for s, full_seq in list(A.id2seq.items()):
    A.id2seq[s] = full_seq[:30]
t.link_to_alignment(A)

print(t.get_species())
print(t)

t.set_outgroup(t & "Ddi0002240")

# Species tree used for the reconciliation.
sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print(reconciled)
reconciled.show()
# "os" and "sys" are both used below but neither is imported in the visible
# part of this script; importing them again is harmless if they already are.
import os
import sys
from sys import exit
from glob import glob
from ete3 import SeqGroup, parser

# Command line: <prefix of the "*clustalo" alignments> <cDNA fasta>
path = sys.argv[1] + "*clustalo"
infiles = glob(path)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%s infiles" % len(infiles))
F = parser.fasta.read_fasta(sys.argv[2])
for infile in infiles:
    print(infile)
    # Empty alignment files are skipped.
    if os.stat(infile).st_size == 0:
        continue
    alg_aa = SeqGroup(infile)
    alg_dna = SeqGroup()
    for name, seq, _ in alg_aa:
        try:
            cdna = F.id2seq[F.name2id[name]]
        except KeyError:
            print("cdna for %s not found" % name)
            continue
        # Every non-gap column consumes the next three characters of the
        # cDNA; every gap column becomes "---".
        cdna_aln = ""
        for pos in seq:
            if pos != "-":
                cdna_aln += cdna[:3]
                cdna = cdna[3:]
            else:
                cdna_aln += "---"
        # Last the stop codon
def main(args):
    """Group pairwise hits into motif blocks and report them on stdout.

    Hits are read with ``iter_queries(args.input)``, filtered with the
    thresholds found in ``args`` (``min_evalue``, ``min_alg_length``,
    ``min_score``, ``min_ident``), consolidated per sequence with
    ``consolidate_hsps_small`` (using ``args.min_overlap``) and clustered
    with ``connected_components``. One summary line is printed per block,
    followed by the block architectures shared by the sequences.

    When ``args.dump_motif_seqs`` is set it is loaded as a ``SeqGroup`` and
    one ``motif_<blockid>.faa`` file is written per block.
    """
    HSP = defaultdict(list)  # store High Scoring Pairs (HSPs)
    MIN_EVALUE = args.min_evalue
    MIN_ALG_LENGTH = args.min_alg_length
    MIN_OVERLAP = args.min_overlap
    MIN_SCORE = args.min_score
    MIN_PIDENT = args.min_ident
    HITS_FILE = args.input
    HSP_COUNTER = 0

    # First, stores every High Score Pair (HSP) observed for each sequence, reading the all-against-all
    # BLAST matrix. Each HSP is treated as a hit in a potential MOTIF.
    for _group_key, group in itertools.groupby(iter_queries(HITS_FILE), lambda x: x[0]):
        for query, hit, evalue, score, length, pident, qstart, qend, sstart, send in group:
            evalue = float(evalue)
            score = float(score)
            pident = float(pident)
            length, qstart, qend, sstart, send = map(
                int, [length, qstart, qend, sstart, send])

            if (query == hit or length < MIN_ALG_LENGTH or evalue > MIN_EVALUE
                    or score < MIN_SCORE or pident < MIN_PIDENT):
                continue

            # Both ends of a hit get their own id and point at each other.
            qid = HSP_COUNTER
            HSP_COUNTER += 1
            sid = HSP_COUNTER
            HSP_COUNTER += 1

            HSP[query].append([qstart, qend, qend - qstart, qid, sid])
            HSP[hit].append([sstart, send, send - sstart, sid, qid])

    #all_motifs = consolidate_hsps(HSP, MIN_OVERLAP)
    all_motifs = consolidate_hsps_small(HSP, MIN_OVERLAP)

    # At this point, "all_motifs" contain a list of motifs per sequence. Each
    # seq-motif entry provides also a set of matching regions (HSPs) in other
    # seqs, so we have a graph connecting consolidated per-sequence HSPs. The
    # connected components function returns the clusters of motifs that are
    # connected, so each cluster represents a disconnected independent group,
    # therefore a potentially independent alignable motif block in the original
    # sequences.
    blocks = connected_components(all_motifs)

    for blockid, o in enumerate(blocks):
        coords = [(x[1] - x[0], x[2], x[0], x[1]) for x in o[0]]
        coords.sort(reverse=True)
        lengths = [x[1] - x[0] for x in o[0]]
        seqnames = set(x[2] for x in o[0])
        print('MOTIF BLOCK:', blockid,
              "nseqs:", len(seqnames),
              "min.length:", np.min(lengths),
              "avg.length:", np.mean(lengths),
              "largest:", coords[0][1:4], coords[-1][1:4],
              list(seqnames)[:4], sep="\t")

    # We can now analyze the motif/block composition of each seq, and group
    # sequences based on similar block architecture
    seq2blocks = defaultdict(set)
    for blockid, bl in enumerate(blocks):
        for blinfo in bl[0]:
            seq2blocks[blinfo[2]].add(blockid)

    # Let's analyze motif architectures
    archs = [(key, tuple(sorted(values))) for key, values in seq2blocks.items()]
    #for name, arch in sorted(archs, key=lambda x: x[1]):
    #    print(name, arch, sep="\t")

    archcounter = defaultdict(set)
    for name, arch in sorted(archs, key=lambda x: x[1]):
        archcounter[arch].add(name)

    for arch, keys in sorted(archcounter.items(), key=lambda x: len(x[1])):
        print('Arch:', arch, len(keys), list(keys)[:5], sep="\t")

    print("nseqs", len(seq2blocks))

    if args.dump_motif_seqs:
        seqs = SeqGroup(args.dump_motif_seqs)
        for blockid, o in enumerate(blocks):
            coords = [(x[1] - x[0], x[2], x[0], x[1]) for x in o[0]]
            coords.sort(reverse=True)
            with open('motif_%s.faa' % blockid, "w") as FASTA:
                for m_len, seqname, m_start, m_end in coords:
                    # The format string used to be written without any
                    # arguments, so every record was the literal ">%s\n%s".
                    # NOTE(review): the slice treats m_start/m_end as
                    # 0-based, end-exclusive, matching the "end - start"
                    # lengths used above - confirm against the coordinate
                    # convention of the hits file.
                    FASTA.write(">%s\n%s\n" % (
                        seqname, seqs.get_seq(seqname)[m_start:m_end]))
ncbi = NCBITaxa()

# Command line: <prefix of the "*clustalo" alignments>
path = sys.argv[1] + "*clustalo"
infiles = glob(path)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print("%s infiles" % len(infiles))

valid_cols = 0
spvariants = defaultdict(Counter)
refaas = []
for infile in infiles:
    #for infile in glob("formatted_MG_seqs.faa.final_tree.fa"):
    print(infile)
    # Empty alignment files are skipped.
    if os.stat(infile).st_size == 0:
        continue
    alg = SeqGroup(infile)
    alg_matrix = []
    labels = []
    for name, seq, _ in alg:
        # Replace trailing gaps with # and * for stop
        # Count end positions
        # Counting via rstrip also covers sequences that are empty or made
        # only of gaps; the previous enumerate/break loop left ``n`` one
        # short (or stale from the previous sequence) in those cases.
        n = len(seq) - len(seq.rstrip("-"))

        # Seems that is not possible to know where the stop codon would align..
        # seq = seq[:len(seq)-n] + "*" + "#" * (n-1)

        # So just keep track of trailing gaps, as compared to internal
        seq = seq[:len(seq) - n] + "*" * n
from ete3 import PhyloTree, PhylomeDBConnector, SeqGroup

# Retrieve the tree and the clean alignment of one seed from PhylomeDB.
p = PhylomeDBConnector()
w, x, t = p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)

# Truncate every sequence to its first 30 columns, then link it to the tree.
A = SeqGroup(a, "iphylip")
for s in list(A.id2seq):
    trimmed = A.id2seq[s][:30]
    A.id2seq[s] = trimmed
t.link_to_alignment(A)

print(t.get_species())
print(t)

t.set_outgroup(t & "Ddi0002240")

# Species tree against which the gene tree is reconciled.
sp = PhyloTree(
    "(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);"
)
reconciled, evs = t.reconcile(sp)
print(reconciled)
reconciled.show()
import re
from ete3 import SeqGroup, Tree
import sys

# Command line: <alignment> <tree>
alg_file = sys.argv[1]  # in fasta format
tree_file = sys.argv[2]  # in newick format

alg = SeqGroup(alg_file)
# Iterate over a snapshot of the items: the loop inserts new keys, and on
# Python 3 growing a dict while iterating its live view raises RuntimeError.
for k, v in list(alg.name2id.items()):
    # converts illegal newick chars from alg names.
    # Comment this line if not necessary
    k = re.sub('[:,();]', '_', k)
    alg.name2id[k] = v

# Print the alignment as fasta, in the leaf order of the tree.
tree = Tree(tree_file)
for leaf in tree:
    print(">%s\n%s" % (leaf.name, alg.get_seq(leaf.name)))
def _load_pickle(pkl_path):
    """Load one cache file written by this script.

    Opened in binary mode (the caches are written with protocol 2) and
    closed as soon as the object is loaded. Only meant for files this
    script produced itself; never point it at untrusted pickles.
    """
    with open(pkl_path, 'rb') as handle:
        return cPickle.load(handle)


if os.path.exists(outpath + '.spcodons.pkl'):
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('loading counts from pkl file')
    cdna = _load_pickle(outpath + '.alg.pkl')
    spcodons = _load_pickle(outpath + '.spcodons.pkl')
    spbases = _load_pickle(outpath + '.spbases.pkl')
    spstarts = _load_pickle(outpath + '.spstarts.pkl')
    spstops = _load_pickle(outpath + '.spstops.pkl')
    print('loading counts done')
else:
    if os.path.exists(outpath + '.alg.pkl'):
        print('loading fasta from pkl file')
        cdna = _load_pickle(outpath + '.alg.pkl')
    else:
        print('loading from fasta file')
        cdna = SeqGroup(infile, fix_duplicates=False)
        with open(outpath + '.alg.pkl', 'wb') as handle:
            cPickle.dump(cdna, handle, 2)
    print('loading fasta done')

    print('start counting')
    spcodons = defaultdict(Counter)
    spbases = defaultdict(Counter)
    spstops = defaultdict(Counter)
    spstarts = defaultdict(Counter)
    for n, (seqid, seq, _) in enumerate(cdna):
        if n % 10000 == 0:
            print("%s sequences processed" % n)
        # The text before the first "." of the id is used as the taxid.
        taxid = seqid.split(".")[0]

        # base counts
        spbases[taxid].update(seq)

        # start and stop counts
from ete3 import SeqGroup

# Group the sequence names by the text before their first ".".
sp_mem = {}
in_fasta = SeqGroup('/home/plaza/research/dom_walk/raw/COG0484.faa')
for name, seq, _ in in_fasta:
    sp = name.split('.')[0]
    sp_mem.setdefault(sp, []).append(name)

print ('writing fastas per sp')
for k, val in sp_mem.items():
    # One fasta per group; the handle is closed even if a lookup fails.
    with open('/home/plaza/research/dom_walk/analysis/fasta_per_sp/'+k+'.faa', 'w') as out_fasta:
        for seq_name in val:
            print(">%s" % seq_name, file=out_fasta)
            print(in_fasta.get_seq(seq_name), file=out_fasta)