Exemple #1
0
def annotate_node(t, final_task):
    """Annotate the tree `t` with the results of the tasks behind `final_task`.

    Every leaf of `t` receives a ``realname`` feature obtained through
    ``db.get_seq_name``. The node whose ``cladeid`` equals ``t.cladeid`` is
    then tagged with ``final_task.size`` and, for each task listed under
    ``final_task.nodeid`` in ``GLOBALS``, with a set of features that depends
    on the task type (``task.ttype``). Tasks of any other type are ignored.
    """
    # Index every node carrying a cladeid, naming the leaves along the way.
    nodes_by_clade = {}
    for node in t.traverse():
        if node.is_leaf():
            node.add_feature("realname", db.get_seq_name(node.name))
        if hasattr(node, "cladeid"):
            nodes_by_clade[node.cladeid] = node

    node_info = GLOBALS[final_task.configid]["_nodeinfo"][final_task.nodeid]
    alltasks = node_info["tasks"]
    npr_iter = get_iternumber(final_task.threadid)
    target = nodes_by_clade[t.cladeid]
    target.add_features(size=final_task.size)

    for task in alltasks:
        # Flatten the task arguments into one string; keys starting with
        # an underscore are left out.
        cmd = " ".join("%s %s" % (key, value)
                       for key, value in task.args.iteritems()
                       if not key.startswith("_"))
        kind = task.ttype

        if kind == "msf":
            features = {
                "msf_outseqs": task.out_seqs,
                "msf_file": task.multiseq_file,
            }
        elif kind == "acleaner":
            features = {
                "clean_alg_mean_ident": task.mean_ident,
                "clean_alg_std_ident": task.std_ident,
                "clean_alg_max_ident": task.max_ident,
                "clean_alg_min_ident": task.min_ident,
                "clean_alg_type": task.tname,
                "clean_alg_cmd": cmd,
                "clean_alg_path": task.clean_alg_fasta_file,
            }
        elif kind == "alg":
            features = {
                "alg_mean_ident": task.mean_ident,
                "alg_std_ident": task.std_ident,
                "alg_max_ident": task.max_ident,
                "alg_min_ident": task.min_ident,
                "alg_type": task.tname,
                "alg_cmd": cmd,
                "alg_path": task.alg_fasta_file,
            }
        elif kind == "tree":
            features = {
                "tree_model": task.model,
                "tree_seqtype": task.seqtype,
                "tree_type": task.tname,
                "tree_cmd": cmd,
                "tree_path": task.tree_file,
                "tree_constrain": task.constrain_tree,
                "npr_iter": npr_iter,
            }
        elif kind == "mchooser":
            features = {
                "modeltester_models": task.models,
                "modeltester_type": task.tname,
                "modeltester_params": cmd,
                "modeltester_bestmodel": task.best_model,
            }
        elif kind == "treemerger":
            features = {
                "treemerger_type": task.tname,
                "treemerger_rf": "RF=%s [%s]" % (task.rf[0], task.rf[1]),
                "treemerger_out_match_dist": task.outgroup_match_dist,
                "treemerger_out_match": task.outgroup_match,
            }
        else:
            # Unknown task types contribute no annotation.
            continue

        target.add_features(**features)
Exemple #2
0
def annotate_node(t, final_task):
    """Annotate the tree `t` with the results of the tasks behind `final_task`.

    Each leaf of `t` gets a ``realname`` feature from ``db.get_seq_name``.
    The node whose ``cladeid`` matches ``t.cladeid`` is tagged with
    ``final_task.size`` and then with features taken from every "tree",
    "treemerger" and "concat_alg" task stored for ``final_task.nodeid`` in
    ``GLOBALS``. Tasks of any other type are ignored.
    """
    # Index every node carrying a cladeid, naming the leaves along the way.
    nodes_by_clade = {}
    for node in t.traverse():
        if node.is_leaf():
            node.add_feature("realname", db.get_seq_name(node.name))
        if hasattr(node, "cladeid"):
            nodes_by_clade[node.cladeid] = node

    node_info = GLOBALS[final_task.configid]["_nodeinfo"][final_task.nodeid]
    alltasks = node_info["tasks"]
    npr_iter = get_iternumber(final_task.threadid)
    target = nodes_by_clade[t.cladeid]
    target.add_features(size=final_task.size)

    for task in alltasks:
        # Flatten the task arguments into one string; keys starting with
        # an underscore are left out.
        cmd = " ".join("%s %s" % (key, value)
                       for key, value in task.args.iteritems()
                       if not key.startswith("_"))
        kind = task.ttype

        if kind == "tree":
            features = {
                "tree_model": task.model,
                "tree_seqtype": task.seqtype,
                "tree_type": task.tname,
                "tree_cmd": cmd,
                "tree_file": rpath(task.tree_file),
                "tree_constrain": task.constrain_tree,
                "npr_iter": npr_iter,
            }
        elif kind == "treemerger":
            features = {
                "treemerger_type": task.tname,
                "treemerger_rf": "RF=%s [%s]" % (task.rf[0], task.rf[1]),
                "treemerger_out_match_dist": task.outgroup_match_dist,
                "treemerger_out_match": task.outgroup_match,
            }
        elif kind == "concat_alg":
            features = {
                "concatalg_cogs": "%d" % task.used_cogs,
                "alg_path": task.alg_fasta_file,
            }
        else:
            # Unknown task types contribute no annotation.
            continue

        target.add_features(**features)
Exemple #3
0
def get_concatenated_alg(alg_filenames, models=None,
                        sp_field=0, sp_delimiter="_",
                        kill_thr=0.0,
                        keep_species=frozenset()):
    """Concatenate several alignments into a single per-species alignment.

    Each file in `alg_filenames` is loaded with
    ``SeqGroup(algfile, "iphylip_relaxed")``. Sequence names are resolved
    through ``db.get_seq_name`` and reduced to a species code with
    ``get_species_code``. Alignments are ordered by model name (ties broken
    by their sorted sequence names) and, for every retained species, their
    sequences are joined in that order. A species missing from an alignment
    is padded with ``"-"`` for that alignment's length.

    Args:
        alg_filenames: sequence of alignment files to concatenate.
        models: optional sequence with one model name per alignment. When
            empty or None, the string ``"None"`` is used for every alignment.
        sp_field: passed to ``get_species_code`` as ``field``.
        sp_delimiter: passed to ``get_species_code`` as ``splitter``.
        kill_thr: a species is kept only if the fraction of alignments in
            which it appears is strictly greater than this value.
        keep_species: container of species codes that are kept regardless
            of `kill_thr`. It is only read, never modified.

    Returns:
        A tuple ``(concat, partitions, sp2alg, valid_species,
        concat_alg_lengths)`` where `concat` is the concatenated ``SeqGroup``,
        `partitions` is a list of ``"model, model_genes = start-end"``
        strings (1-based, inclusive), `sp2alg` maps each species code to the
        alignments containing it, `valid_species` lists the retained species
        and `concat_alg_lengths` holds the length of each alignment in
        concatenation order.

    Raises:
        ValueError: if `models` and `alg_filenames` differ in length, or if
            an alignment contains no sequences.
        Exception: if an alignment has sequences of unequal length or a
            repeated species, if no species is left to concatenate, or if
            the concatenated alignment fails the final size checks.
    """
    # Concat alg container
    concat = SeqGroup()
    # Used to store different model partitions
    concat.id2partition = {}

    if not models:
        models = ["None"] * len(alg_filenames)
    elif len(models) != len(alg_filenames):
        raise ValueError("Different number of algs and model names was found!")

    expected_total_length = 0
    # Check algs and gets the whole set of species
    alg_objects = []
    sp2alg = defaultdict(list)

    for algfile, matrix in zip(alg_filenames, models):
        alg = SeqGroup(algfile, "iphylip_relaxed")
        alg_objects.append(alg)
        lenseq = None
        browsed_species = set()
        alg.sp2seq = {}
        # Set best matrix for this alignment
        alg.matrix = matrix
        # Change seq names to contain only species names
        for i, seq in alg.id2seq.items():
            name = db.get_seq_name(alg.id2name[i])
            taxid = get_species_code(name, splitter=sp_delimiter, field=sp_field)
            if lenseq is None:
                # The first sequence fixes the expected alignment length.
                lenseq = len(seq)
                alg.seqlength = lenseq
                expected_total_length += lenseq
            elif len(seq) != lenseq:
                raise Exception("Inconsistent alignment when concatenating: Unequal length")
            if taxid in browsed_species:
                raise Exception("Inconsistent alignment when concatenating: Repeated species")
            browsed_species.add(taxid)  # Check no duplicated species in the same alg
            sp2alg[taxid].append(alg)  # Records all species seen in all algs.
            alg.sp2seq[taxid] = seq

        if lenseq is None:
            # Without this check the missing alg.seqlength would only surface
            # later as an obscure AttributeError.
            raise ValueError("Cannot concatenate an alignment without sequences: %s" % algfile)

    # sp2alg is only populated when at least one alignment was read, so the
    # division below never sees a zero denominator.
    total_algs = float(len(alg_objects))
    valid_species = [sp for sp, algs in sp2alg.items()
                     if sp in keep_species or len(algs) / total_algs > kill_thr]

    log.info("%d out of %d will be kept (missing factor threshold=%g, %d species forced to kept)" %\
                 (len(valid_species), len(sp2alg), kill_thr, len(keep_species)))

    # Order by model first so that alignments sharing a model are contiguous
    # (the partition ranges below rely on it); the sorted sequence names make
    # the order within a model deterministic.
    sorted_algs = sorted(alg_objects,
                         key=lambda alg: (alg.matrix, sorted(alg.id2name.values())))
    concat_alg_lengths = [alg.seqlength for alg in sorted_algs]
    model2size = {}
    for alg in sorted_algs:
        model2size[alg.matrix] = model2size.get(alg.matrix, 0) + alg.seqlength

    # Create concat alg. A plain dict is used so that looking up a species
    # that was not concatenated fails instead of silently returning a default.
    concat.id2seq = {}
    for sp in sorted(valid_species):
        log.log(20, "Concatenating sequences of [%s]" %sp)
        concat.id2seq[sp] = "".join(alg.sp2seq.get(sp, "-" * alg.seqlength)
                                    for alg in sorted_algs)
        concat.id2name[sp] = sp
        concat.name2id[sp] = sp
        concat.id2comment[sp] = [""]

    current_pos = 0
    partitions = []
    for model in sorted(model2size):
        size = model2size[model]
        part = "%s, %s = %d-%d" % (model, model + "_genes",
                                   current_pos + 1,
                                   current_pos + size)
        current_pos += size
        partitions.append(part)

    # Basic Checks
    seq_sizes = set(len(seq) for seq in concat.id2seq.values())
    if not seq_sizes:
        raise Exception("Concatenated alignment is empty: no species left to concatenate")
    if len(seq_sizes) != 1:
        raise Exception("Concatenated alignment is not consistent: unequal seq length ")
    if seq_sizes.pop() != expected_total_length:
        raise Exception("The size of concatenated alg is not what expected")
    return concat, partitions, sp2alg, valid_species, concat_alg_lengths
Exemple #4
0
                    log.log(28, "Writing final tree for @@13:%s@@1:\n   %s\n   %s",
                            threadname, final_tree_file+".nw",
                            final_tree_file+".nwx (newick extended)")
                    main_tree.write(outfile=final_tree_file+".nw")
                    main_tree.write(outfile=final_tree_file+ ".nwx", features=[],
                                    format_root_node=True)

                    if hasattr(main_tree, "alg_path"):
                        log.log(28, "Writing root node alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".fa")

                        alg = SeqGroup(get_stored_data(main_tree.alg_path))
                        OUT = open(final_tree_file+".fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >>OUT, ">%s\n%s" %(realname, seq)
                        OUT.close()

                    if hasattr(main_tree, "clean_alg_path"):
                        log.log(28, "Writing root node trimmed alignment @@13:%s@@1:\n   %s",
                                threadname, final_tree_file+".trimmed.fa")

                        alg = SeqGroup(get_stored_data(main_tree.clean_alg_path))
                        OUT = open(final_tree_file+".trimmed.fa", "w")
                        for name, seq, comments in alg:
                            realname = db.get_seq_name(name)
                            print >>OUT, ">%s\n%s" %(realname, seq)
                        OUT.close()

                    if norender == False: