Example #1
 def __write_algn(self, fullpath):
     """
     to write algn in paml format
     """
     seq_group = SeqGroup()
     for n in self:
         seq_group.id2seq[n.node_id] = n.nt_sequence
         seq_group.id2name[n.node_id] = n.name
         seq_group.name2id[n.name] = n.node_id
     seq_group.write(outfile=fullpath, format='paml')
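
A minimal, self-contained sketch of the SeqGroup plumbing the method above relies on (toy names and sequences, not taken from the example): id2seq, id2name and name2id are plain dictionaries keyed by an integer id, and write() serializes whatever they contain.

from ete3 import SeqGroup

seq_group = SeqGroup()
for node_id, (name, seq) in enumerate([("seqA", "ATGGCC"), ("seqB", "ATGTCC")]):
    # fill the three internal lookup tables, exactly as the method above does
    seq_group.id2seq[node_id] = seq
    seq_group.id2name[node_id] = name
    seq_group.name2id[name] = node_id

# same call as in the example; 'paml' writes a PAML-compatible alignment
seq_group.write(outfile="example.paml", format="paml")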
Example #2
 def __write_algn(self, fullpath):
     """
     to write algn in paml format
     """
     seq_group = SeqGroup()
     for n in self:
         seq_group.id2seq[n.node_id] = n.nt_sequence
         seq_group.id2name[n.node_id] = n.name
         seq_group.name2id[n.name] = n.node_id
     seq_group.write(outfile=fullpath, format='paml')
Example #3
def pick_otu(spe_out, alignment):
    fin = open(spe_out)
    lines = fin.readlines()
    fin.close()
    fout = open(alignment + ".otu", "w")
    aln = SeqGroup(sequences=alignment)
    for i in range(len(lines)):
        line = lines[i]
        if line.startswith("Species"):
            nline = lines[i + 1].strip()
            seq = aln.get_seq(nline)
            fout.write(">" + nline + "\n")
            fout.write(seq + "\n")
    fout.close()
Example #4
def _create_tree(tree, fasta, out, color):
    seqs = SeqGroup(fasta, format="fasta")
    t = Tree(tree)
    colors = _parse_color_file(color)
    node_names = t.get_leaf_names()
    for name in node_names:
        seq = seqs.get_seq(name)
        seqFace = SeqMotifFace(seq, seq_format="()")
        node = t.get_leaves_by_name(name)
        for i in range(0,len(node)):
            if name in colors:
                ns = NodeStyle()
                ns['bgcolor'] = colors[name]
                node[i].set_style(ns)
            node[i].add_face(seqFace,0,'aligned')
    t.render(out)
Example #5
 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
     missing_leaves = []
     missing_internal = []
     if type(alignment) == SeqGroup:
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format, **kwargs)
     # set the sequence of every node found in the alignment
     for n in self.traverse():
         try:
             n.add_feature("sequence", alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)
             else:
                 missing_internal.append(n.name)
     if len(missing_leaves) > 0:
         print("Warnning: [%d] terminal nodes could not be found in the alignment." %\
             len(missing_leaves), file=sys.stderr)
Example #6
 def link_to_alignment(self, alignment, alg_format="fasta", **kwargs):
     missing_leaves = []
     missing_internal = []
     if type(alignment) == SeqGroup:
         alg = alignment
     else:
         alg = SeqGroup(alignment, format=alg_format, **kwargs)
     # set the sequence of every node found in the alignment
     for n in self.traverse():
         try:
             n.add_feature("sequence",alg.get_seq(n.name))
         except KeyError:
             if n.is_leaf():
                 missing_leaves.append(n.name)
             else:
                 missing_internal.append(n.name)
     if len(missing_leaves) > 0:
         print("Warning: [%d] terminal nodes could not be found in the alignment." %\
             len(missing_leaves), file=sys.stderr)
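
A hedged usage sketch (toy newick and FASTA strings, not from either project above) of how link_to_alignment is typically called on a PhyloTree; after linking, each matched node carries a "sequence" feature:

from ete3 import PhyloTree

nw = "((A,B),C);"
fasta = ">A\nMAEIP\n>B\nMAELP\n>C\nMAEIQ\n"

t = PhyloTree(nw)
t.link_to_alignment(alignment=fasta, alg_format="fasta")
for leaf in t:
    # the "sequence" feature was attached by link_to_alignment
    print(leaf.name, leaf.sequence)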
Example #7
def extract_ss(input_path, suffix, tree_file):
    tree = Tree(tree_file, format=1)
    leaves_set = set(tree.get_leaf_names())
    msa = SeqGroup(input_path.alignment, "fasta")
    path_argv = [input_path._version, input_path._dataset + suffix]
    output_path = common.Paths(path_argv, 0)
    data_versioning.setup_new_dataset(output_path)
    new_msa = SeqGroup()
    for entry in msa.iter_entries():
        label = entry[0]
        sequence = entry[1]
        if (label in leaves_set):
            new_msa.set_seq(label, sequence)
    open(output_path.alignment, "w").write(new_msa.write(format="fasta"))
    shutil.copy(input_path.duplicates_json, output_path.duplicates_json)
    shutil.copy(input_path.outgroups_file, output_path.outgroups_file)
Example #8
def extract_ss(input_path, suffix, tree_file):
    print(
        "Extracting alignment generated with the support selection tree thinning technique..."
    )
    tree = Tree(tree_file, format=1)
    leaves_set = set(tree.get_leaf_names())
    msa = SeqGroup(input_path.alignment, "fasta")
    path_argv = [input_path._version, input_path._dataset + suffix]
    output_path = common.Paths(path_argv, 0)
    data_versioning.setup_new_dataset(output_path)
    new_msa = SeqGroup()
    for entry in msa.iter_entries():
        label = entry[0]
        sequence = entry[1]
        if (label in leaves_set):
            new_msa.set_seq(label, sequence)
    open(output_path.alignment, "w").write(new_msa.write(format="fasta"))
    shutil.copy(input_path.duplicates_json, output_path.duplicates_json)
    shutil.copy(input_path.outgroups_file, output_path.outgroups_file)
    print("New version of the snapshot: " + output_path.path)
Example #9
from ete3 import PhyloTree, SeqGroup, SequenceFace, TreeStyle, AttrFace, NodeStyle, Tree
import sys


alignment_input = sys.argv[1]
tree_input = sys.argv[2]


alg = alignment_input



t = PhyloTree(tree_input, format=1, quoted_node_names=True)
seqs = SeqGroup(alg, format="fasta")


nodestyle1 = NodeStyle()
nodestyle1["size"] = 0
nodestyle1["vt_line_width"] = 2
nodestyle1["hz_line_width"] = 2

for node in t.traverse():
    node.set_style(nodestyle1)


for leaf in t.iter_leaves():
    item = seqs.get_seq(leaf.name)
    name_face = AttrFace("name", fsize=24)  # AttrFace expects a node attribute name, not the sequence itself
    Bars = SequenceFace(item, seqtype='aa', fsize=24, bg_colors={'G': 'Khaki', 'A': 'Khaki', 'S': 'Khaki', 'T': 'Khaki', 'C': 'LightGreen', 'V': 'LightGreen', 'I': 'LightGreen', 'L': 'LightGreen', 'P': 'LightGreen', 'F': 'LightGreen', 'Y': 'LightGreen', 'M': 'YellowGreen', 'W': 'LightGreen', 'N': 'Thistle', 'Q': 'Thistle', 'H': 'Thistle', 'D': 'DarkSalmon', 'E': 'DarkSalmon', 'K': 'SkyBlue', 'R': 'SkyBlue', 'X':'Black', '-':'White' }, fg_colors=None, codon=None, col_w=1.5, alt_col_w=3, special_col=None, interactive=False)
    leaf.add_face(Bars, 2, "aligned")    
    
Example #10
import os
import sys  # needed for sys.argv below
from sys import exit
from glob import glob
from ete3 import SeqGroup, parser

path = sys.argv[1] + "*clustalo"
infiles = glob(path)
print "%s infiles" % len(infiles)

F = parser.fasta.read_fasta(sys.argv[2])

for infile in infiles:
    print infile
    if os.stat(infile).st_size == 0:
        continue
    alg_aa = SeqGroup(infile)
    alg_dna = SeqGroup()

    for name, seq, _ in alg_aa:
        try:
            cdna = F.id2seq[F.name2id[name]]
        except KeyError:
            print "cdna for %s not found" % name
            continue
        cdna_aln = ""
        for pos in seq:
            if pos != "-":
                cdna_aln += cdna[:3]
                cdna = cdna[3:]
            else:
                cdna_aln += "---"
Example #11
                (begin, end, ies, iesId, beginMSA, endMSA) = charMat[(geneFamily, homIES, geneId)]
                if ies == '?':
                    cf = CircleFace(10, "silver", label = "?")
                elif ies == '1':
                    cf = CircleFace(10, "black")
                elif ies == '0':
                    cf = CircleFace(10, "LightGrey")
                else:
                    sys.exit(1)
                column = hiesL[(geneFamily, homIES)] + 1
                leaf.add_face(cf, column, "aligned")
        drawTree(outputFile)
    elif plotStyle == '3': # plot with MSA
        # load nucleotide sequences for all genes!
        nuclAlnFile = os.path.join(basePath, 'analysis', 'msas', 'filtered', 'cluster.' + geneFamily + '.nucl.fa')
        seqs = SeqGroup(sequences = nuclAlnFile, format = "fasta")

        for leaf in t:
            geneId = leaf.name
            seq = seqs.get_seq(geneId)
            seq = seq.translate(None, string.ascii_lowercase) # keep only CDS
            iesmotif = [[1, len(seq), "line", 2, 5, None, None, None]]
            for homIES in gfhomIES[geneFamily]:
                (begin, end, ies, iesId, beginMSA, endMSA) = charMat[(geneFamily, homIES, geneId)]
                if ies == '?':
                    if beginMSA == 'NA':
                        iesmotif.append([int(begin), int(end),"()", 10, 10, "red", "black", "arial|8|black|?"])
                    else:
                        iesmotif.append([int(begin), int(end),"()", 10, 10, "red", "black", "arial|8|black|?"])
                elif ies == '1':
                    iesmotif.append([int(beginMSA), int(endMSA),"[]", 10, 10, "black", "red", "arial|8|black|" + iesId])
Example #12
import cPickle
from pandas import DataFrame
from collections import Counter, defaultdict
import os

from ete3 import SeqGroup

if os.path.exists('alg.pkl'):
    print 'loading from pkl file'
    a = cPickle.load(open('alg.pkl'))
else: 
    alg = SeqGroup('formatted_MG_seqs.faa.final_tree.fa')
    #alg = SeqGroup('Burki_first10.aln.fa')
    #alg = SeqGroup('test_52.fa')
    alg_matrix = []
    labels = []
    for name, seq, _ in alg:
        aas = [aa for aa in seq.upper()]
        alg_matrix.append(aas)
        labels.append(name)

    a = DataFrame(alg_matrix, labels)
    cPickle.dump(a, open('alg.pkl', 'wb'), 2)

nsp = float(len(a))
spvariants = defaultdict(Counter)
valid_cols = 0
refaas = []
for col in a:
    counter = Counter(a[col])
    refaa, num = counter.most_common(1)[0]
Example #13
import sys
import re
import random, string
#from seqgroup import SeqGroup
from ete3 import SeqGroup
from ete3 import NCBITaxa

ncbi = NCBITaxa()
seq_file = SeqGroup(sys.argv[1])
OUT_fasta = open(sys.argv[2], 'w')
out_table = open(sys.argv[3], 'w')
proc_type = str(sys.argv[4])


def proces_prote_check_taxid(seq_file):
    for seq_num, (name, seq, _) in enumerate(seq_file):
        try:
            taxid = name.split('.')[0]
            taxid2name = ncbi.get_taxid_translator([taxid])
            code = ''.join(
                random.choices(string.ascii_letters + string.digits, k=5))
            code_name = '.'.join([taxid, code])
            #	name = re.sub('\|', '_', name)
            #	name = re.sub('\:|\;|\*|,|\"|\[|\]|\(|\)|\/', '_', name)
            #	name = re.sub(r"\\", '_', name)
            seq = re.sub('\*|"|\+', '', seq)
            seq = re.sub('Z|B|J', 'X', seq)
            print(">%s\n%s" % (code_name, seq), file=OUT_fasta)
            print("%s\t%s" % (code_name, name), file=out_table)
        except:
            print('ERROR in', name)
Example #14
import sys
import random, string
from ete3 import SeqGroup
from tempfile import NamedTemporaryFile

in_file = sys.argv[1]
transform_fasta = sys.argv[2]
out_file = sys.argv[3]
translate_table = open(sys.argv[4], 'w')

alg = SeqGroup(in_file)
translate = open(transform_fasta, 'w')

for num, (name, seq, _) in enumerate(alg):
    taxid = name.split('.')[0]
    code = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
    #code=format((num+1), '05')
    #nam_t=taxid+'.'+str(code)
    print('>%s\n%s' % (code, seq), file=translate)
    print('%s\t%s' % (name, code), file=translate_table)

translate_table.close()
translate.close()
translate_alg = SeqGroup(transform_fasta)

translate_alg.write(format="phylip", outfile=out_file)
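
A rough restatement of the renaming idea above (toy sequences and a hypothetical output file name): long labels are re-keyed to short random codes before writing PHYLIP, and a code-to-name table is kept so results can be translated back.

import random, string
from ete3 import SeqGroup

alg = SeqGroup(">some_very_long_label_1\nMKV\n>another_long_label_2\nMKL\n", format="fasta")

renamed = SeqGroup()
code2name = {}
for name, seq, _ in alg:
    code = ''.join(random.choices(string.ascii_letters + string.digits, k=5))
    renamed.set_seq(code, seq)  # short label, safe for strict PHYLIP headers
    code2name[code] = name      # keep the mapping for translating results back

renamed.write(format="phylip", outfile="renamed.phy")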
Example #15
import re
from ete3 import SeqGroup, Tree
import sys

tree_file = sys.argv[1]  # in newick format
original_fasta = SeqGroup(sys.argv[2])
pruned_fasta = open(sys.argv[3], 'w')
star_target = str(sys.argv[4])
end_target = str(sys.argv[5])

tree = Tree(tree_file)
R = tree.get_midpoint_outgroup()
tree.set_outgroup(R)

name_list = []
for num, leaf in enumerate(tree):
    name_list.append(leaf.name)
    if star_target == leaf.name:
        star_pos = num
    if end_target == leaf.name:
        end_pos = num

pruned_list = name_list[star_pos:(end_pos + 1)]
print pruned_list

#for ele in pruned_list:
#    print >>pruned_fasta,">%s\n%s"%(ele, original_fasta.get_seq(ele))

for ele in name_list:
    if ele not in pruned_list:
        print >> pruned_fasta, ">%s\n%s" % (ele, original_fasta.get_seq(ele))
Example #16
from ete3 import PhyloTree, PhylomeDBConnector, SeqGroup

p = PhylomeDBConnector()
w,x, t =  p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
for s in A.id2seq:
    A.id2seq[s]=A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
t.set_outgroup(t&"Ddi0002240")

sp = PhyloTree("(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);")
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()
Example #17
import os   # needed for os.stat below
import sys  # needed for sys.argv below
from sys import exit
from glob import glob
from ete3 import SeqGroup, parser

path = sys.argv[1] + "*clustalo"
infiles = glob(path)
print "%s infiles" % len(infiles)

F = parser.fasta.read_fasta(sys.argv[2])

for infile in infiles:
    print infile
    if os.stat(infile).st_size == 0:
        continue
    alg_aa = SeqGroup(infile)
    alg_dna = SeqGroup()

    for name, seq, _ in alg_aa:
        try:
            cdna = F.id2seq[F.name2id[name]]
        except KeyError:
            print "cdna for %s not found" % name
            continue
        cdna_aln = ""
        for pos in seq:
            if pos != "-":
                cdna_aln += cdna[:3]
                cdna = cdna[3:]
            else:
                cdna_aln += "---"
        # Last the stop codon
Example #18
def main(args):
    HSP = defaultdict(list)  # store High Scoring Pairs (HSPs)
    MIN_EVALUE = args.min_evalue
    MIN_ALG_LENGTH = args.min_alg_length
    MIN_OVERLAP = args.min_overlap
    MIN_SCORE = args.min_score
    MIN_PIDENT = args.min_ident
    HITS_FILE = args.input

    HSP_COUNTER = 0
    # First, stores every High Score Pair (HSP) observed for each sequence, reading the all-against-all
    # BLAST matrix. Each HSP is treated as a hit in a potential MOTIF.
    for query, group in itertools.groupby(iter_queries(HITS_FILE),
                                          lambda x: x[0]):
        for query, hit, evalue, score, length, pident, qstart, qend, sstart, send in group:
            evalue = float(evalue)
            score = float(score)
            pident = float(pident)
            length, qstart, qend, sstart, send = map(
                int, [length, qstart, qend, sstart, send])

            if query == hit or length < MIN_ALG_LENGTH or evalue > MIN_EVALUE or score < MIN_SCORE or pident < MIN_PIDENT:
                continue
            qid = HSP_COUNTER
            HSP_COUNTER += 1
            sid = HSP_COUNTER
            HSP_COUNTER += 1

            HSP[query].append([qstart, qend, qend - qstart, qid, sid])
            HSP[hit].append([sstart, send, send - sstart, sid, qid])

    #all_motifs = consolidate_hsps(HSP, MIN_OVERLAP)
    all_motifs = consolidate_hsps_small(HSP, MIN_OVERLAP)

    # At this point, "all_motifs" contains a list of motifs per sequence. Each
    # seq-motif entry also provides a set of matching regions (HSPs) in other
    # seqs, so we have a graph connecting consolidated per-sequence HSPs. The
    # connected components function returns the clusters of motifs that are
    # connected, so each cluster represents a disconnected independent group,
    # therefore a potentially independent alignable motif block in the original
    # sequences.
    blocks = connected_components(all_motifs)
    for blockid, o in enumerate(blocks):
        coords = [(x[1] - x[0], x[2], x[0], x[1]) for x in o[0]]
        coords.sort(reverse=True)
        print('MOTIF BLOCK:',
              blockid,
              "nseqs:",
              len(set([x[2] for x in o[0]])),
              "min.length:",
              np.min([x[1] - x[0] for x in o[0]]),
              "avg.length:",
              np.mean([x[1] - x[0] for x in o[0]]),
              "largest:",
              coords[0][1:4],
              coords[-1][1:4],
              list(set([x[2] for x in o[0]]))[:4],
              sep="\t")

    # We can now analyze the motif/block composition of each seq, and group
    # sequences based on similar block architecture
    seq2blocks = defaultdict(set)
    for blockid, bl in enumerate(blocks):
        for blinfo in bl[0]:
            seq2blocks[blinfo[2]].add(blockid)

    # Let's analyze motif architectures
    archs = [(key, tuple(sorted(values)))
             for key, values in seq2blocks.items()]
    #for name, arch in sorted(archs, key=lambda x: x[1]):
    #    print(name, arch, sep="\t")

    archcounter = defaultdict(set)
    for name, arch in sorted(archs, key=lambda x: x[1]):
        archcounter[arch].add(name)
    for arch, keys in sorted(archcounter.items(), key=lambda x: len(x[1])):
        print('Arch:', arch, len(keys), list(keys)[:5], sep="\t")

    print("nseqs", len(seq2blocks))

    if args.dump_motif_seqs:
        seqs = SeqGroup(args.dump_motif_seqs)
        for blockid, o in enumerate(blocks):
            coords = [(x[1] - x[0], x[2], x[0], x[1]) for x in o[0]]
            coords.sort(reverse=True)
            with open('motif_%s.faa' % blockid, "w") as FASTA:
                for m_len, seqname, m_start, m_end in coords:
                    FASTA.write(">%s\n%s")
Example #19
ncbi = NCBITaxa()

path = sys.argv[1] + "*clustalo"
infiles = glob(path)
print "%s infiles" % len(infiles)

valid_cols = 0
spvariants = defaultdict(Counter)
refaas = []

for infile in infiles:
    #for infile in glob("formatted_MG_seqs.faa.final_tree.fa"):
    print infile
    if os.stat(infile).st_size == 0:
        continue
    alg = SeqGroup(infile)
    alg_matrix = []
    labels = []

    for name, seq, _ in alg:
        # Replace trailing gaps with # and * for stop
        # Count end positions
        for n, aa in enumerate(seq[::-1]):
            if aa != "-":
                break

        # It seems it is not possible to know where the stop codon would align:
        # seq = seq[:len(seq)-n] + "*" + "#" * (n-1)
        # so just keep track of trailing gaps, as opposed to internal ones
        seq = seq[:len(seq) - n] + "*" * n
Example #20
from ete3 import PhyloTree, PhylomeDBConnector, SeqGroup

p = PhylomeDBConnector()
w, x, t = p.get_best_tree("Hsa0000001", 1)
a, l = p.get_clean_alg("Hsa0000001", 1)
A = SeqGroup(a, "iphylip")
for s in A.id2seq:
    A.id2seq[s] = A.id2seq[s][:30]
t.link_to_alignment(A)
print t.get_species()
print t
t.set_outgroup(t & "Ddi0002240")

sp = PhyloTree(
    "(((((((((((Hsa, Ptr), Mmu), ((Mms, Rno), (Bta, Cfa))), Mdo), Gga), Xtr), (Dre, Fru))),Cin) (Dme, Aga)), Ddi);"
)
reconciled, evs = t.reconcile(sp)
print reconciled
reconciled.show()
Example #21
import re
from ete3 import SeqGroup, Tree
import sys

alg_file = sys.argv[1]  # in fasta format
tree_file = sys.argv[2]  # in newick format

alg = SeqGroup(alg_file)
for k, v in alg.name2id.items():
    # converts illegal newick chars from alg names.
    # Comment this line if not necessary
    k = re.sub('[:,();]', '_', k)
    alg.name2id[k] = v

tree = Tree(tree_file)
for leaf in tree:
    print(">%s\n%s" % (leaf.name, alg.get_seq(leaf.name)))
Example #22
if os.path.exists(outpath + '.spcodons.pkl'):
    print 'loading counts from pkl file'
    cdna = cPickle.load(open(outpath + '.alg.pkl'))
    spcodons = cPickle.load(open(outpath + '.spcodons.pkl'))
    spbases = cPickle.load(open(outpath + '.spbases.pkl'))
    spstarts = cPickle.load(open(outpath + '.spstarts.pkl'))
    spstops = cPickle.load(open(outpath + '.spstops.pkl'))
    print 'loading counts done'
else:
    if os.path.exists(outpath + '.alg.pkl'):
        print 'loading fasta from pkl file'
        cdna = cPickle.load(open(outpath + '.alg.pkl'))
    else:
        print 'loading from fasta file'
        cdna = SeqGroup(infile, fix_duplicates=False)
        cPickle.dump(cdna, open(outpath + '.alg.pkl', 'wb'), 2)
        print 'loading fasta done'
    print 'start counting'
    spcodons = defaultdict(Counter)
    spbases = defaultdict(Counter)
    spstops = defaultdict(Counter)
    spstarts = defaultdict(Counter)

    for n, (seqid, seq, _) in enumerate(cdna):
        if n % 10000 == 0:
            print "%s sequences processed" % n
        taxid = seqid.split(".")[0]
        # base counts
        spbases[taxid].update(seq)
        # start and stop counts
Example #23
from ete3 import SeqGroup

sp_mem = {}
in_fasta = SeqGroup('/home/plaza/research/dom_walk/raw/COG0484.faa')

for num, (name, seq, _) in enumerate(in_fasta):
    sp = name.split('.')[0]
    if sp not in sp_mem:
        sp_mem[sp] = []
    sp_mem[sp].append(name)
    
print ('writing fastas per sp')
for k, val in sp_mem.items():
    out_fasta = open('/home/plaza/research/dom_walk/analysis/fasta_per_sp/'+k+'.faa', 'w')
    for seq_name in val:
        print (">%s" %(seq_name), file = out_fasta)
        print (in_fasta.get_seq(seq_name), file =out_fasta)
    out_fasta.close()