Example #1
0
def readExtracts(options, map_component2input_id):
    """read extract information from filename supplied in the options.
    """
    if not options.filename_extract_regions:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        map_id2component = {}
        for component in map_component2input_id.keys():
            map_id2component[component] = (component,)

    map_component2extracts = collections.defaultdict(list)
    if not os.path.exists(options.filename_extract_regions):
        options.stdlog.write(
            "# could not find %s - ignored \n" % options.filename_extract_regions)
    else:
        for line in open(options.filename_extract_regions, "r"):
            if line[0] == "#":
                continue
            id, start, end = line[:-1].split("\t")
            start, end = int(start), int(end)
            if id not in map_id2component:
                continue
            for x in map_id2component[id]:
                map_component2extracts[x].append((start, end))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read extracts for %i malis.\n" % len(map_component2extracts))
            options.stdlog.flush()

    return map_component2extracts
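
Judging from the parsing above, the file named by options.filename_extract_regions is a plain tab-separated listing of id, start and end, with "#" lines treated as comments. A minimal, self-contained sketch of that format and of the mapping the function builds (file content and identifiers are invented for illustration):

import collections

def parse_extract_regions(lines):
    # Minimal sketch of the parsing done by readExtracts: tab-separated
    # id/start/end records, "#" comment lines skipped, coordinates as ints.
    regions = collections.defaultdict(list)
    for line in lines:
        if line.startswith("#"):
            continue
        id, start, end = line[:-1].split("\t")
        regions[id].append((int(start), int(end)))
    return regions

example = ["# id\tstart\tend\n", "mali1\t0\t120\n", "mali1\t300\t450\n"]
assert parse_extract_regions(example) == {"mali1": [(0, 120), (300, 450)]}
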
Example #2
0
def readExtracts(options, map_component2input_id):
    """read extract information from filename supplied in the options.
    """
    if not options.filename_extract_regions: return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        map_id2component = {}
        for component in map_component2input_id.keys():
            map_id2component[component] = (component, )

    map_component2extracts = collections.defaultdict(list)
    if not os.path.exists(options.filename_extract_regions):
        options.stdlog.write("# could not find %s - ignored \n" %
                             options.filename_extract_regions)
    else:
        for line in open(options.filename_extract_regions, "r"):
            if line[0] == "#": continue
            id, start, end = line[:-1].split("\t")
            start, end = int(start), int(end)
            if id not in map_id2component: continue
            for x in map_id2component[id]:
                map_component2extracts[x].append((start, end))
        if options.loglevel >= 1:
            options.stdlog.write("# read extracts for %i malis.\n" %
                                 len(map_component2extracts))
            options.stdlog.flush()

    return map_component2extracts
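
Both versions above differ only in formatting. The interesting helper is IOTools.getInvertedDictionary, which the code treats as turning a key -> value mapping into value -> keys (and, where make_unique=True is passed elsewhere, apparently keeping a single key per value). A rough stand-in with those assumed semantics, not the real IOTools API:

def inverted_dictionary(map_a2b, make_unique=False):
    # Assumed behaviour only: invert key -> value so each value points back
    # at the keys that produced it; make_unique keeps one key per value.
    inverted = {}
    for key, value in map_a2b.items():
        if make_unique:
            inverted.setdefault(value, key)
        else:
            inverted.setdefault(value, []).append(key)
    return inverted

# component -> input id becomes input id -> components:
assert sorted(inverted_dictionary({"c1": "idA", "c2": "idA"})["idA"]) == ["c1", "c2"]
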
Example #3
0
def trainMali( mali, options ):
    """train a grammar on a multiple alignment."""

    ## remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps( minimum_gaps = 1, frame=1 )
    
    length = mali.getNumColumns()

    input_model = prepareGrammar( options )

    for id in mali.getIdentifiers():
        if options.separator in id:
            species = id.split(options.separator)[0]
            mali.rename( id, species )

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary( map_new2old, make_unique = True )
    
    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        nexus = TreeTools.Newick2Nexus( open(options.input_filename_tree,"r") )
        tree = nexus.trees[0]
        try:
            tree.relabel( map_old2new, warn = True )
        except KeyError, msg:
            raise KeyError( "names in mali and tree are not congruent: %s" % msg )
Example #4
0
def readAnnotations( options, map_component2input_id ):
    """read annotation information from filename supplied in the options.
    """
    if not options.filename_annotate_regions: return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary( map_component2input_id )
    else:
        map_id2component = {}
        for component in map_component2input_id.keys():
            map_id2component[component] = (component,)

    map_component2annotations = collections.defaultdict( list )
    if not os.path.exists( options.filename_annotate_regions ):
        options.stdlog.write("# could not find %s - ignored \n" % options.filename_annotate_regions )
    else:
        for line in open(options.filename_annotate_regions, "r" ):
            if line[0] == "#": continue
            try:
                id, start, end, label = line[:-1].split("\t")
            except ValueError:
                raise ValueError("parsing error in line %s\n" % (line[:-1]))

            start, end = int(start), int(end)
            if id not in map_id2component: continue
            for x in map_id2component[id]:
                map_component2annotations[x].append( (start, end, label ) )

        if options.loglevel >= 1:
            options.stdlog.write("# read annotations for %i malis.\n" % len(map_component2annotations))
            options.stdlog.flush()

    return map_component2annotations
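
The annotation file evidently has the same layout as the extract-regions file plus a trailing label column, so each component ends up mapped to (start, end, label) tuples. A one-record sketch (identifiers invented):

line = "mali1\t300\t450\texon\n"
id, start, end, label = line[:-1].split("\t")
assert (int(start), int(end), label) == (300, 450, "exon")
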
Example #5
0
def readAnnotations(options, map_component2input_id):
    """read annotation information from filename supplied in the options.
    """
    if not options.filename_annotate_regions:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        map_id2component = {}
        for component in map_component2input_id.keys():
            map_id2component[component] = (component, )

    map_component2annotations = collections.defaultdict(list)
    if not os.path.exists(options.filename_annotate_regions):
        options.stdlog.write("# could not find %s - ignored \n" %
                             options.filename_annotate_regions)
    else:
        for line in open(options.filename_annotate_regions, "r"):
            if line[0] == "#":
                continue
            try:
                id, start, end, label = line[:-1].split("\t")
            except ValueError:
                raise ValueError("parsing error in line %s\n" % (line[:-1]))

            start, end = int(start), int(end)
            if id not in map_id2component:
                continue
            for x in map_id2component[id]:
                map_component2annotations[x].append((start, end, label))

        if options.loglevel >= 1:
            options.stdlog.write("# read annotations for %i malis.\n" %
                                 len(map_component2annotations))
            options.stdlog.flush()

    return map_component2annotations
Example #6
0
def processMali(mali, options):

    ncols = mali.getNumColumns()

    if ncols == 0:
        raise "refusing to process empty alignment."

    ## add annotation of states
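    # block_size semantics, as implied by the arithmetic below: a value below 1
    # is taken as a fraction of the alignment (in codons), a value >= 1 as an
    # absolute codon count; both are rounded down to a codon boundary and
    # capped at the alignment length. E.g. ncols = 300, block_size = 0.5 gives
    # int(100 * 0.5) * 3 = 150 columns annotated "N", the remaining 150 "C".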
    if options.block_size != None:
        if options.block_size < 1:
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    ## remove gene ids
    for id in mali.getIdentifiers():
        if options.separator in id:
            species = id.split(options.separator)[0]
            mali.rename(id, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()
    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    ninput, noutput, nskipped = 0, 0, 0

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        nexus = TreeTools.Newick2Nexus(open(options.input_filename_tree, "r"))
        tree = nexus.trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(list(annotation))
    for c in chars:
        assert c in (
            "N", "C"), "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        blocks = (("B0_", list(chars)[0]), )
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(xgram, mali, tree, map_old2new, blocks,
                                       options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    annotation = mali.getAnnotation("STATE")

    for block, code in blocks:

        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)

        pi = pis[terminals]

        if options.shared_rates == "all":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "kappa":
            rate_prefix_rs = block
            rate_prefix_rn = block
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "kappa-ds":
            rate_prefix_rs = ""
            rate_prefix_rn = block
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "omega":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = block
            rate_prefix_rv = block
        elif options.shared_rates == "omega-ds":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = block
            rate_prefix_rv = ""
        elif options.shared_rates == "ds":
            rate_prefix_rs = ""
            rate_prefix_rn = block
            rate_prefix_ri = block
            rate_prefix_rv = block
        else:
            rate_prefix_rs = block
            rate_prefix_rn = block
            rate_prefix_ri = block
            rate_prefix_rv = block

        if options.shared_frequencies:
            frequency_prefix = ""
        else:
            frequency_prefix = block

        rs = trained_model.mGrammar.getParameter('%sRs' % rate_prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % rate_prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % rate_prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % rate_prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(pi,
                                             Rsi=rs * ri,
                                             Rsv=rs * rv,
                                             Rni=rn * ri,
                                             Rnv=rn * rv)
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(pi,
                                               Rsi=ri * avg_omega,
                                               Rsv=rv * avg_omega,
                                               Rni=ri * avg_omega,
                                               Rnv=rv * avg_omega)

            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(pi,
                                               Rsi=rs * avg_kappa,
                                               Rsv=rs * avg_kappa,
                                               Rni=rn * avg_kappa,
                                               Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:

            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            Q = None
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(
            map(str, (code, block, o_dn, o_ds, o_omega, "na", "na", "na", "na",
                      o_kappa, result.getLogLikelihood(), "na", nchars))))

        if options.with_rho:
            options.stdout.write(
                "\t" +
                "\t".join(map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
Example #7
0
def processMali(mali, options):

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False

    pairs = []

    if options.iteration == "all-vs-all":
        for x in range(len(ids)):
            for y in range(0, x):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len(
                ids)
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))
    elif options.iteration == "tree":
        pairs = []
    else:
        raise "unknown iteration mode: %s" % (options.iteration)

    if options.remove_stops:
        for id, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"

                fragments.append(codon)

            entry.mString = "".join(fragments)

    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(map_new2old,
                                                    make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)

    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
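
The iteration modes above differ only in which index pairs they generate; the tree mode leaves the pair list empty and defers pairing to the tree-based code path. A self-contained sketch of the three explicit modes for four sequences:

def make_pairs(n, mode):
    # Index pairs produced by the iteration modes in processMali (sketch).
    if mode == "all-vs-all":
        return [(x, y) for x in range(n) for y in range(x)]
    elif mode == "first-vs-all":
        return [(0, y) for y in range(1, n)]
    elif mode == "pairwise":
        if n % 2 != 0:
            raise ValueError("pairwise iteration needs an even number of sequences")
        return [(x, x + 1) for x in range(0, n, 2)]
    raise ValueError("unknown iteration mode: %s" % mode)

assert make_pairs(4, "all-vs-all") == [(1, 0), (2, 0), (2, 1), (3, 0), (3, 1), (3, 2)]
assert make_pairs(4, "first-vs-all") == [(0, 1), (0, 2), (0, 3)]
assert make_pairs(4, "pairwise") == [(0, 1), (2, 3)]
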
Example #8
0
def Process(lines, other_trees, options, map_old2new, ntree):

    nexus = TreeTools.Newick2Nexus(map(lambda x: x[:-1], lines))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    write_map = False

    phylip_executable = None
    phylip_options = None

    index = 0

    # default: do not output internal node names
    write_all_taxa = False

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" %
                                     (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                for n in tree.chain.keys():
                    tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print tree.display()
                    print other_tree.display()

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree))
                        continue

            elif method == "rename":
                if not map_old2new:

                    map_old2new = IOTools.ReadMap(open(options.parameters[0],
                                                       "r"),
                                                  columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) +
                                                             1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    del options.parameters[0]
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    skip = False
                    if species2remove.search(t):
                        continue
                    if not skip:
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                write_all_taxa = True
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
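
The rename method reads its mapping with IOTools.ReadMap(open(options.parameters[0], "r"), columns=(0, 1)); the code suggests a two-column old-name/new-name file, and any taxon without an entry is collected in unknown and pruned. A rough stand-in parser under that assumption (not the real IOTools API):

def read_rename_map(lines):
    # Assumed layout: tab-separated, first column old taxon name, second
    # column new name; used here only to illustrate the expected input.
    mapping = {}
    for line in lines:
        fields = line.rstrip("\n").split("\t")
        mapping[fields[0]] = fields[1]
    return mapping

assert read_rename_map(["gene_a\tseq1\n", "gene_b\tseq2\n"]) == {
    "gene_a": "seq1", "gene_b": "seq2"}
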
Example #9
0
def processMali(mali, options):

    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()

    invalid_chars = options.gap_chars + options.mask_chars

    has_non_overlaps = False

    pairs = []

    if options.iteration == "all-vs-all":
        for x in range(len(ids)):
            for y in range(0, x):
                pairs.append((x, y))
    elif options.iteration == "first-vs-all":
        for y in range(1, len(ids)):
            pairs.append((0, y))
    elif options.iteration == "pairwise":
        if len(ids) % 2 != 0:
            raise "uneven number of sequences (%i) not compatible with --iteration=pairwise" % len(
                ids)
        for x in range(0, len(ids), 2):
            pairs.append((x, x + 1))
    elif options.iteration == "tree":
        pairs = []
    else:
        raise "unknown iteration mode: %s" % (options.iteration)

    if options.remove_stops:
        for id, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"

                fragments.append(codon)

            entry.mString = "".join(fragments)

    for x, y in pairs:
        noverlap = 0
        for a, b in zip(mali[ids[x]], mali[ids[y]]):
            if a not in invalid_chars and b not in invalid_chars:
                noverlap += 1
                if noverlap >= options.min_overlap:
                    break
        else:
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(
            map_new2old, make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)

    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
Example #10
0
def getMergers(tree, map_strain2species, options):
    """merge strains to species.

    returns a list of proposed merges: for each merge the node id, the
    species, and the two (strain, gene) pairs that are joined.

    Currently, only binary merges are supported.
    """

    n = TreeTools.GetSize(tree) + 1
    all_strains = map_strain2species.keys()
    all_species = map_strain2species.values()

    genes = []
    for x in range(n):
        g = {}
        for s in all_strains:
            g[s] = set()
        genes.append(g)

    # build list of species pairs that can be joined.
    map_species2strain = IOTools.getInvertedDictionary(map_strain2species)
    pairs = []

    for species, strains in map_species2strain.items():
        for x in range(len(strains)):
            for y in range(0, x):
                pairs.append((strains[x], strains[y]))

    # map of genes to new genes
    # each entry in the list is a pair of genes of the same species
    # but different strains to be joined.
    map_genes2new_genes = []

    # dictionary of merged genes. This is to ensure that no gene
    # is merged twice
    merged_genes = {}

    def count_genes(node_id):
        """record number of genes per species for each node

        This is done separately for each strain. The counts are aggregated for each species
        over strains by taking the maximum gene count per strain. This ignores any finer
        tree structure below a species node.
        """
        node = tree.node(node_id)

        if node.succ:
            this_node_set = genes[node_id]
            # process non-leaf node
            for s in node.succ:

                # propagate: terminated nodes force upper nodes to terminate
                # (assigned to None).
                if not genes[s]:
                    this_node_set = None
                    break

                # check if node merges genes that are not part of the positive
                # set
                for strain in all_strains:
                    if strain in map_strain2species:
                        # merge genes from all children
                        this_node_set[strain] = this_node_set[
                            strain].union(genes[s][strain])

                        if len(this_node_set[strain]) > 1:
                            # more than one gene from this strain, so no
                            # join
                            this_node_set = None
                            break

                    elif strain not in map_strain2species and \
                            len(this_node_set[strain]) > 0:
                        this_node_set = None
                        break

            if this_node_set is None:
                genes[node_id] = None
                return

            for strain_x, strain_y in pairs:
                if len(this_node_set[strain_x]) == 1 and len(this_node_set[strain_y]) == 1:
                    species = map_strain2species[strain_x]
                    gene_x, gene_y = tuple(this_node_set[strain_x])[0], tuple(
                        this_node_set[strain_y])[0]

                    # check if these two genes have already been merged or are
                    # already merged with other partners
                    # The merged genes are assigned the same node_id, if they have
                    # been already merged.
                    key1 = strain_x + gene_x
                    key2 = strain_y + gene_y
                    if key1 > key2:
                        key1, key2 = key2, key1

                    merge = False
                    if key1 in merged_genes and key2 in merged_genes:
                        if merged_genes[key1] == merged_genes[key2]:
                            merge = True
                    elif key1 not in merged_genes and key2 not in merged_genes:
                        merge = True
                        merged_genes[key1] = node_id
                        merged_genes[key2] = node_id

                    if merge:
                        map_genes2new_genes.append(
                            (node_id, species, strain_x, gene_x, strain_y, gene_y))

                    # once two genes have been joined, they can not be remapped
                    # further
                    genes[node_id] = None
                    return
        else:
            # process leaf
            strain, t, g, q = parseIdentifier(node.data.taxon, options)

            if strain in map_strain2species:
                genes[node_id][strain].add(g)
            else:
                # do not process nodes that do not need to be mapped
                genes[node_id] = None

    tree.dfs(tree.root, post_function=count_genes)

    return map_genes2new_genes
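
The bookkeeping around merged_genes reduces to one question per candidate pair: may these two strain/gene combinations be joined at this node? A compact restatement of that decision, using the same concatenated strain+gene keys as count_genes above (a sketch, not a replacement):

def may_merge(merged_genes, node_id, strain_x, gene_x, strain_y, gene_y):
    # Two genes can be joined if neither has been merged yet (both keys are
    # then recorded against this node), or if both were already merged with
    # each other, i.e. carry the same node id.
    key1, key2 = strain_x + gene_x, strain_y + gene_y
    if key1 > key2:
        key1, key2 = key2, key1
    if key1 in merged_genes and key2 in merged_genes:
        return merged_genes[key1] == merged_genes[key2]
    if key1 not in merged_genes and key2 not in merged_genes:
        merged_genes[key1] = node_id
        merged_genes[key2] = node_id
        return True
    return False

merged = {}
assert may_merge(merged, 7, "strainA", "g1", "strainB", "g2")
assert not may_merge(merged, 9, "strainA", "g1", "strainC", "g3")
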
Example #11
0
def Process(lines, other_trees, options, map_old2new, ntree):

    nexus = TreeTools.Newick2Nexus(map(lambda x: x[:-1], lines))

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    write_map = False

    phylip_executable = None
    phylip_options = None

    index = 0

    # default: do not output internal node names
    write_all_taxa = False

    for tree in nexus.trees:

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write("# applying method %s to tree %i.\n" % (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                    phylip = WrapperPhylip.Phylip()
                    phylip.setProgram(phylip_executable)
                    phylip.setOptions(phylip_options)

                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                for n in tree.chain.keys():
                    tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    print tree.display()
                    print other_tree.display()

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node numbering might not be
                # the same. Thus build a map of node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n"
                            % (n, map_a2b[n], ntree)
                        )
                        continue

            elif method == "rename":
                if not map_old2new:

                    map_old2new = IOTools.ReadMap(open(options.parameters[0], "r"), columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(map_old2new, make_unique=True)

                    del options.parameters[0]

                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    del options.parameters[0]
                taxa = []
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    skip = False
                    if species2remove.search(t):
                        continue
                    if not skip:
                        taxa.append(t)
                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":

                inode = 0
                write_all_taxa = True
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        index += 1
        ntree += 1

    if options.output_format == "nh":
        options.stdout.write(
            TreeTools.Nexus2Newick(nexus, write_all_taxa=True, with_branchlengths=options.with_branchlengths) + "\n"
        )
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
Example #12
0
def getMergers(tree, map_strain2species, options):
    """merge strains to species.

    returns a list of proposed merges: for each merge the node id, the
    species, and the two (strain, gene) pairs that are joined.

    Currently, only binary merges are supported.
    """

    n = TreeTools.GetSize(tree) + 1
    all_strains = map_strain2species.keys()
    all_species = map_strain2species.values()

    genes = []
    for x in range(n):
        g = {}
        for s in all_strains:
            g[s] = set()
        genes.append(g)

    # build list of species pairs that can be joined.
    map_species2strain = IOTools.getInvertedDictionary(map_strain2species)
    pairs = []

    for species, strains in map_species2strain.items():
        for x in range(len(strains)):
            for y in range(0, x):
                pairs.append((strains[x], strains[y]))

    # map of genes to new genes
    # each entry in the list is a pair of genes of the same species
    # but different strains to be joined.
    map_genes2new_genes = []

    # dictionary of merged genes. This is to ensure that no gene
    # is merged twice
    merged_genes = {}

    def count_genes(node_id):
        """record number of genes per species for each node

        This is done separately for each strain. The counts are aggregated for each species
        over strains by taking the maximum gene count per strain. This ignores any finer
        tree structure below a species node.
        """
        node = tree.node(node_id)

        if node.succ:
            this_node_set = genes[node_id]
            # process non-leaf node
            for s in node.succ:

                # propagate: terminated nodes force upper nodes to terminate
                # (assigned to None).
                if not genes[s]:
                    this_node_set = None
                    break

                # check if node merges genes that are not part of the positive
                # set
                for strain in all_strains:
                    if strain in map_strain2species:
                        # merge genes from all children
                        this_node_set[strain] = this_node_set[
                            strain].union(genes[s][strain])

                        if len(this_node_set[strain]) > 1:
                            # more than one gene from this strain, so no
                            # join
                            this_node_set = None
                            break

                    elif strain not in map_strain2species and \
                            len(this_node_set[strain]) > 0:
                        this_node_set = None
                        break

            if this_node_set is None:
                genes[node_id] = None
                return

            for strain_x, strain_y in pairs:
                if len(this_node_set[strain_x]) == 1 and len(this_node_set[strain_y]) == 1:
                    species = map_strain2species[strain_x]
                    gene_x, gene_y = tuple(this_node_set[strain_x])[0], tuple(
                        this_node_set[strain_y])[0]

                    # check if these two genes have already been merged or are
                    # already merged with other partners
                    # The merged genes are assigned the same node_id, if they have
                    # been already merged.
                    key1 = strain_x + gene_x
                    key2 = strain_y + gene_y
                    if key1 > key2:
                        key1, key2 = key2, key1

                    merge = False
                    if key1 in merged_genes and key2 in merged_genes:
                        if merged_genes[key1] == merged_genes[key2]:
                            merge = True
                    elif key1 not in merged_genes and key2 not in merged_genes:
                        merge = True
                        merged_genes[key1] = node_id
                        merged_genes[key2] = node_id

                    if merge:
                        map_genes2new_genes.append(
                            (node_id, species, strain_x, gene_x, strain_y, gene_y))

                    # once two genes have been joined, they can not be remapped
                    # further
                    genes[node_id] = None
                    return
        else:
            # process leaf
            strain, t, g, q = parseIdentifier(node.data.taxon, options)

            if strain in map_strain2species:
                genes[node_id][strain].add(g)
            else:
                # do not process nodes that do not need to be mapped
                genes[node_id] = None

    tree.dfs(tree.root, post_function=count_genes)

    return map_genes2new_genes
Example #13
0
def processMali( mali, options ):

    ncols = mali.getNumColumns()

    if ncols == 0:
        raise "refusing to process empty alignment."

    ## add annotation of states
    if options.block_size != None:
        if options.block_size < 1:
            size = int( float( ncols ) / 3.0 * options.block_size) * 3
        else:
            size = int( options.block_size ) * 3
        
        size = min( size, ncols )
        mali.addAnnotation( "STATE", "N" * size + "C" * (ncols - size))
            
    ## remove gene ids
    for id in mali.getIdentifiers():
        if options.separator in id:
            species = id.split(options.separator)[0]
            mali.rename( id, species )

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary( map_new2old, make_unique = True )
    
    ids = mali.getIdentifiers()
    xgram = XGram.XGram()

    if options.xrate_min_increment:
        xgram.setMinIncrement( options.xrate_min_increment )

    ninput, noutput, nskipped = 0, 0, 0

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps( minimum_gaps = 1, frame=3 )

    if options.input_filename_tree:
        nexus = TreeTools.Newick2Nexus( open(options.input_filename_tree,"r") )
        tree = nexus.trees[0]
        tree.relabel( map_old2new )
    else:
        tree = None

    annotation = mali.getAnnotation( "STATE" )
    chars = set(list(annotation))
    for c in chars:
        assert c in ("N", "C"), "unknown annotation %s: only 'N' and 'C' are recognized" % c
    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n" )
        blocks = ( ("B0_", list(chars)[0]), )
    else:
        blocks = ( ("B0_", "N"), 
                   ("B1_", "C") )
    
    result, mali, ids = prepareGrammar( xgram, mali, tree, map_old2new, blocks, options )

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix( trained_model )

    annotation = mali.getAnnotation( "STATE" )

    for block, code in blocks :

        terminals = ( "%sCOD0" % block,
                      "%sCOD1" % block,
                      "%sCOD2" % block )
        
        pi = pis[terminals]

        if options.shared_rates == "all":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "kappa":
            rate_prefix_rs = block
            rate_prefix_rn = block
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "kappa-ds":
            rate_prefix_rs = ""
            rate_prefix_rn = block
            rate_prefix_ri = ""
            rate_prefix_rv = ""
        elif options.shared_rates == "omega":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = block
            rate_prefix_rv = block
        elif options.shared_rates == "omega-ds":
            rate_prefix_rs = ""
            rate_prefix_rn = ""
            rate_prefix_ri = block
            rate_prefix_rv = ""
        elif options.shared_rates == "ds":
            rate_prefix_rs = ""
            rate_prefix_rn = block
            rate_prefix_ri = block
            rate_prefix_rv = block
        else:
            rate_prefix_rs = block
            rate_prefix_rn = block
            rate_prefix_ri = block
            rate_prefix_rv = block
        
        if options.shared_frequencies:
            frequency_prefix = ""
        else:
            frequency_prefix = block

        rs = trained_model.mGrammar.getParameter( '%sRs' % rate_prefix_rs )
        rn = trained_model.mGrammar.getParameter( '%sRn' % rate_prefix_rn )
        ri = trained_model.mGrammar.getParameter( '%sRi' % rate_prefix_ri )
        rv = trained_model.mGrammar.getParameter( '%sRv' % rate_prefix_rv )    

        nchars = annotation.count( code )

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % ( result.getNumIterations(), rs, rn, ri, rv )
        
        try:
            Q, t = RateEstimation.getQMatrix( pi,
                                              Rsi=rs * ri,
                                              Rsv=rs * rv,
                                              Rni=rn * ri,
                                              Rnv=rn * rv )
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix( pi,
                                                Rsi = ri * avg_omega,
                                                Rsv = rv * avg_omega,
                                                Rni = ri * avg_omega,
                                                Rnv = rv * avg_omega )

            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix( pi,
                                                Rsi = rs * avg_kappa,
                                                Rsv = rs * avg_kappa,
                                                Rni = rn * avg_kappa,
                                                Rnv = rn * avg_kappa )

            rI, rV, rS, rN = RateEstimation.countSubstitutions( pi, Q )
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions( pi, Q0 )    
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions( pi, Q1 )    

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % ( rI / rI0 * rV0 / rV )
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:

            o_kappa = "na"
            o_omega = "na"
            o_dn = "na"
            o_ds = "na"
            o_rn = "na"
            o_rs = "na"
            o_rn0 = "na"
            o_rs0 = "na"
            o_t = "na"
            o_t0 = "na"
            Q = None
            msg = "insufficient data to estimate rate matrix."
        
        options.stdout.write( "\t".join( map(str, (
                        code, block,
                        o_dn, o_ds, o_omega,
                        "na", "na", "na", "na",
                        o_kappa, 
                        result.getLogLikelihood(),
                        "na",
                        nchars ))))

        if options.with_rho:
            options.stdout.write( "\t" + "\t".join( map(str, (o_rn, o_rs, o_t,
                                                              o_rn0, o_rs0, o_t0 ))))
            
        options.stdout.write( "\t%s\n" %  msg )