def readExtracts(options, map_component2input_id):
    """Read extract regions from the file named in the options.

    The file is expected to be tab-separated with exactly three columns
    per line: identifier, start and end. Lines starting with ``#`` are
    skipped. Records whose identifier is unknown are ignored.

    Arguments
    ---------
    options : object
        Must provide ``filename_extract_regions``, ``use_input_id``,
        ``stdlog`` and ``loglevel``.
    map_component2input_id : dict
        If ``options.use_input_id`` is set, this mapping is inverted with
        ``IOTools.getInvertedDictionary`` and identifiers in the file are
        resolved through the inverted mapping. Otherwise the keys of the
        mapping are the identifiers expected in the file.

    Returns
    -------
    None if ``options.filename_extract_regions`` is empty. Otherwise a
    ``collections.defaultdict(list)`` mapping each component to a list of
    ``(start, end)`` integer tuples; it is empty if the file is missing.

    Raises
    ------
    ValueError
        If a line does not have three columns or start/end are not
        integers.
    """
    filename = options.filename_extract_regions
    if not filename:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        # every component is addressed by its own name
        map_id2component = dict(
            (component, (component,)) for component in map_component2input_id)

    map_component2extracts = collections.defaultdict(list)

    if not os.path.exists(filename):
        options.stdlog.write("# could not find %s - ignored \n" % filename)
    else:
        # the context manager closes the file even if parsing fails
        with open(filename, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                # Strip only the line terminator. The last line of a file
                # may lack one, and blindly dropping the last character
                # would silently truncate the end coordinate.
                identifier, start, end = line.rstrip("\n").split("\t")
                start, end = int(start), int(end)
                if identifier not in map_id2component:
                    continue
                for component in map_id2component[identifier]:
                    map_component2extracts[component].append((start, end))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read extracts for %i malis.\n" % len(map_component2extracts))
        options.stdlog.flush()

    return map_component2extracts
def readExtracts(options, map_component2input_id):
    """Read extract regions from the file named in the options.

    The file is expected to be tab-separated with exactly three columns
    per line: identifier, start and end. Lines starting with ``#`` are
    skipped. Records whose identifier is unknown are ignored.

    Arguments
    ---------
    options : object
        Must provide ``filename_extract_regions``, ``use_input_id``,
        ``stdlog`` and ``loglevel``.
    map_component2input_id : dict
        If ``options.use_input_id`` is set, this mapping is inverted with
        ``IOTools.getInvertedDictionary`` and identifiers in the file are
        resolved through the inverted mapping. Otherwise the keys of the
        mapping are the identifiers expected in the file.

    Returns
    -------
    None if ``options.filename_extract_regions`` is empty. Otherwise a
    ``collections.defaultdict(list)`` mapping each component to a list of
    ``(start, end)`` integer tuples; it is empty if the file is missing.

    Raises
    ------
    ValueError
        If a line does not have three columns or start/end are not
        integers.
    """
    filename = options.filename_extract_regions
    if not filename:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        # every component is addressed by its own name
        map_id2component = dict(
            (component, (component,)) for component in map_component2input_id)

    map_component2extracts = collections.defaultdict(list)

    if not os.path.exists(filename):
        options.stdlog.write("# could not find %s - ignored \n" % filename)
    else:
        # the context manager closes the file even if parsing fails
        with open(filename, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                # Strip only the line terminator. The last line of a file
                # may lack one, and blindly dropping the last character
                # would silently truncate the end coordinate.
                identifier, start, end = line.rstrip("\n").split("\t")
                start, end = int(start), int(end)
                if identifier not in map_id2component:
                    continue
                for component in map_id2component[identifier]:
                    map_component2extracts[component].append((start, end))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read extracts for %i malis.\n" % len(map_component2extracts))
        options.stdlog.flush()

    return map_component2extracts
def trainMali(mali, options):
    """Prepare a multiple alignment (and optional tree) for grammar training.

    The visible body performs the preparation steps only:

    1. optionally removes gap columns, additionally treating ``n``/``N``
       as gap characters,
    2. builds the input grammar via ``prepareGrammar(options)``,
    3. renames sequences to the part of their identifier in front of
       ``options.separator``,
    4. maps the identifiers via ``mali.mapIdentifiers()`` and, if
       ``options.input_filename_tree`` is set, reads the first tree from
       that file and relabels it with the inverted identifier map.

    Arguments
    ---------
    mali : object
        Multiple alignment; modified in place.
    options : object
        Must provide ``clean_mali``, ``separator`` and
        ``input_filename_tree`` (plus whatever ``prepareGrammar`` reads).

    Raises
    ------
    KeyError
        If the taxa in the tree can not be relabelled with the identifiers
        of the alignment.
    """
    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=1)

    # NOTE(review): ``length``, ``input_model`` and ``ids`` are not used in
    # the code shown here; the calls are kept in case they have side effects.
    length = mali.getNumColumns()

    input_model = prepareGrammar(options)

    # reduce identifiers to the species part in front of the separator
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    ids = mali.getIdentifiers()

    if options.input_filename_tree:
        # Close the tree file once the first tree has been extracted; the
        # original left the handle open.
        with open(options.input_filename_tree, "r") as infile:
            tree = TreeTools.Newick2Nexus(infile).trees[0]
        try:
            tree.relabel(map_old2new, warn=True)
        except KeyError as msg:
            # ``except KeyError as msg`` is valid on Python 2.6+ and 3.x,
            # unlike the comma form used previously.
            raise KeyError(
                "names in mali and tree are not congruent: %s" % msg)
def readAnnotations(options, map_component2input_id):
    """Read annotated regions from the file named in the options.

    The file is expected to be tab-separated with exactly four columns
    per line: identifier, start, end and label. Lines starting with ``#``
    are skipped. Records whose identifier is unknown are ignored.

    Arguments
    ---------
    options : object
        Must provide ``filename_annotate_regions``, ``use_input_id``,
        ``stdlog`` and ``loglevel``.
    map_component2input_id : dict
        If ``options.use_input_id`` is set, this mapping is inverted with
        ``IOTools.getInvertedDictionary`` and identifiers in the file are
        resolved through the inverted mapping. Otherwise the keys of the
        mapping are the identifiers expected in the file.

    Returns
    -------
    None if ``options.filename_annotate_regions`` is empty. Otherwise a
    ``collections.defaultdict(list)`` mapping each component to a list of
    ``(start, end, label)`` tuples; it is empty if the file is missing.

    Raises
    ------
    ValueError
        If a line does not have four columns (reported together with the
        offending line) or start/end are not integers.
    """
    filename = options.filename_annotate_regions
    if not filename:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        # every component is addressed by its own name
        map_id2component = dict(
            (component, (component,)) for component in map_component2input_id)

    map_component2annotations = collections.defaultdict(list)

    if not os.path.exists(filename):
        options.stdlog.write("# could not find %s - ignored \n" % filename)
    else:
        # the context manager closes the file even if parsing fails
        with open(filename, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                # Strip only the line terminator. The last line of a file
                # may lack one, and blindly dropping the last character
                # would silently truncate the label.
                record = line.rstrip("\n")
                try:
                    identifier, start, end, label = record.split("\t")
                except ValueError:
                    raise ValueError("parsing error in line %s\n" % record)
                start, end = int(start), int(end)
                if identifier not in map_id2component:
                    continue
                for component in map_id2component[identifier]:
                    map_component2annotations[component].append(
                        (start, end, label))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read annotations for %i malis.\n" %
            len(map_component2annotations))
        options.stdlog.flush()

    return map_component2annotations
def readAnnotations(options, map_component2input_id):
    """Read annotated regions from the file named in the options.

    The file is expected to be tab-separated with exactly four columns
    per line: identifier, start, end and label. Lines starting with ``#``
    are skipped. Records whose identifier is unknown are ignored.

    Arguments
    ---------
    options : object
        Must provide ``filename_annotate_regions``, ``use_input_id``,
        ``stdlog`` and ``loglevel``.
    map_component2input_id : dict
        If ``options.use_input_id`` is set, this mapping is inverted with
        ``IOTools.getInvertedDictionary`` and identifiers in the file are
        resolved through the inverted mapping. Otherwise the keys of the
        mapping are the identifiers expected in the file.

    Returns
    -------
    None if ``options.filename_annotate_regions`` is empty. Otherwise a
    ``collections.defaultdict(list)`` mapping each component to a list of
    ``(start, end, label)`` tuples; it is empty if the file is missing.

    Raises
    ------
    ValueError
        If a line does not have four columns (reported together with the
        offending line) or start/end are not integers.
    """
    filename = options.filename_annotate_regions
    if not filename:
        return None

    if options.use_input_id:
        map_id2component = IOTools.getInvertedDictionary(
            map_component2input_id)
    else:
        # every component is addressed by its own name
        map_id2component = dict(
            (component, (component,)) for component in map_component2input_id)

    map_component2annotations = collections.defaultdict(list)

    if not os.path.exists(filename):
        options.stdlog.write("# could not find %s - ignored \n" % filename)
    else:
        # the context manager closes the file even if parsing fails
        with open(filename, "r") as infile:
            for line in infile:
                if line.startswith("#"):
                    continue
                # Strip only the line terminator. The last line of a file
                # may lack one, and blindly dropping the last character
                # would silently truncate the label.
                record = line.rstrip("\n")
                try:
                    identifier, start, end, label = record.split("\t")
                except ValueError:
                    raise ValueError("parsing error in line %s\n" % record)
                start, end = int(start), int(end)
                if identifier not in map_id2component:
                    continue
                for component in map_id2component[identifier]:
                    map_component2annotations[component].append(
                        (start, end, label))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# read annotations for %i malis.\n" %
            len(map_component2annotations))
        options.stdlog.flush()

    return map_component2annotations
def _getRatePrefixes(shared_rates, block):
    """Return the parameter name prefixes for (Rs, Rn, Ri, Rv) of a block."""
    # True: the parameter name carries the block prefix.
    # False: the parameter name has no prefix.
    # Any mode not listed here prefixes all four parameters.
    use_block_prefix = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }.get(shared_rates, (True, True, True, True))
    return tuple(block if flag else "" for flag in use_block_prefix)


def processMali(mali, options):
    """Train a grammar on an alignment and write rate estimates per block.

    The alignment columns are annotated with a ``STATE`` string of ``N``
    and ``C`` characters (if ``options.block_size`` is given), sequence
    identifiers are reduced to the part in front of ``options.separator``
    and the grammar is trained through ``prepareGrammar``. For every block
    one tab-separated line is written to ``options.stdout`` containing:
    state code, block prefix, dN, dS, omega, four ``na`` placeholders,
    kappa, the log-likelihood, one ``na`` placeholder and the number of
    columns annotated with the state code. If ``options.with_rho`` is set,
    rN, rS, t, rN0, rS0 and t0 follow. The line ends with a message.

    Arguments
    ---------
    mali : object
        Multiple alignment; modified in place.
    options : object
        Command line options (``block_size``, ``separator``,
        ``xrate_min_increment``, ``clean_mali``, ``input_filename_tree``,
        ``shared_rates``, ``value_format``, ``with_rho``, ``loglevel``,
        ``stdout``, ``stdlog``).

    Raises
    ------
    ValueError
        If the alignment has no columns.
    AssertionError
        If the ``STATE`` annotation contains characters other than ``N``
        and ``C``.
    """
    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid exceptions; raise a real one
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # block size given as a fraction of the alignment length
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file once the first tree has been extracted
        with open(options.input_filename_tree, "r") as infile:
            tree = TreeTools.Newick2Nexus(infile).trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set can not be indexed: take its only element
        blocks = (("B0_", next(iter(chars))),)
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    # re-read the annotation: prepareGrammar returns a (possibly new) mali
    annotation = mali.getAnnotation("STATE")

    for block, code in blocks:
        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = _getRatePrefixes(
            options.shared_rates, block)

        rs = trained_model.mGrammar.getParameter('%sRs' % prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(
                pi, Rsi=rs * ri, Rsv=rs * rv, Rni=rn * ri, Rnv=rn * rv)
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(
                pi,
                Rsi=ri * avg_omega, Rsv=rv * avg_omega,
                Rni=ri * avg_omega, Rnv=rv * avg_omega)

            # Q1 and its counts are not reported, but the calls are kept:
            # a ZeroDivisionError raised by them also yields "na" output.
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(
                pi,
                Rsi=rs * avg_kappa, Rsv=rs * avg_kappa,
                Rni=rn * avg_kappa, Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            o_kappa = o_omega = o_dn = o_ds = "na"
            o_rn = o_rs = o_rn0 = o_rs0 = o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(
                map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)
def _hasMinOverlap(sequence_a, sequence_b, invalid_chars, min_overlap):
    """Return True once min_overlap columns are valid in both sequences."""
    noverlap = 0
    for a, b in zip(sequence_a, sequence_b):
        if a not in invalid_chars and b not in invalid_chars:
            noverlap += 1
            if noverlap >= min_overlap:
                return True
    return False


def processMali(mali, options):
    """Build the list of sequence pairs to compare and run the rate method.

    Depending on ``options.iteration`` the pairs of sequence indices are
    all-vs-all, first-vs-all, consecutive pairs (``pairwise``) or empty
    (``tree``). Optionally stop codons are masked by ``NNN``. The pairs are
    then scanned for one with fewer than ``options.min_overlap`` columns
    that are valid in both sequences, and the result is handed together
    with the pairs to ``runCodeML`` (``options.method == "paml"``) or
    ``runXrate`` (``options.method == "xrate"``).

    Arguments
    ---------
    mali : object
        Multiple alignment; entries are modified in place if
        ``options.remove_stops`` is set.
    options : object
        Command line options (``gap_chars``, ``mask_chars``,
        ``iteration``, ``remove_stops``, ``min_overlap``, ``tree``,
        ``method``).

    Raises
    ------
    ValueError
        If the iteration mode is unknown or if ``pairwise`` iteration is
        requested for an odd number of sequences.
    """
    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()
    nids = len(ids)

    invalid_chars = options.gap_chars + options.mask_chars

    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(nids) for y in range(x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, nids)]
    elif options.iteration == "pairwise":
        if nids % 2 != 0:
            # string exceptions are not valid exceptions; raise a real one
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" % nids)
        pairs = [(x, x + 1) for x in range(0, nids, 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        # mask stop codons so that they are treated as missing data
        for identifier, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)
            entry.mString = "".join(fragments)

    # the scan stops at the first pair without sufficient overlap
    has_non_overlaps = False
    for x, y in pairs:
        if not _hasMinOverlap(mali[ids[x]], mali[ids[y]],
                              invalid_chars, options.min_overlap):
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(
            map_new2old, make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)
    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a set of trees and output them.

    The trees are parsed from ``lines`` and every method listed in
    ``options.methods`` is applied to every tree in turn. Methods that
    need arguments consume them from the front of ``options.parameters``
    the first time they are used. Afterwards all trees are written to
    ``options.stdout``.

    Arguments
    ---------
    lines : list
        Lines with trees in Newick format.
    other_trees : list
        Trees used by the ``divide-by-tree`` method; either one tree for
        all input trees or one tree per input tree (indexed by ``ntree``).
    options : object
        Command line options. ``options.parameters`` is modified in place
        and ``options.write_map`` is set by the ``build-map`` method.
    map_old2new : dict
        Mapping of taxon names used by ``rename`` and ``build-map``. It is
        read from the file in ``options.parameters[0]`` if empty and
        ``rename`` is requested. ``build-map`` extends it in place.
    ntree : int
        Number of trees processed before this call.

    Returns
    -------
    tuple
        ``(ntotal, nskipped, ntree)``: number of trees read, number of
        skipped ``divide-by-tree`` operations and the updated tree counter.
    """
    # Strip only the line terminator; dropping the last character of a
    # line without one would cut into the tree itself. A list is passed so
    # that the argument is a sequence on Python 3 as well.
    nexus = TreeTools.Newick2Nexus([line.rstrip("\n") for line in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    for index, tree in enumerate(nexus.trees):

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# applying method %s to tree %i.\n" % (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                phylip = WrapperPhylip.Phylip()
                phylip.setProgram(phylip_executable)
                phylip.setOptions(phylip_options)
                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    # a value of 0 selects the longest branch as divisor
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    options.stdlog.write(
                        "# Warning: tree %i: no positive branch length, "
                        "normalization skipped\n" % index)
                else:
                    # divide by ``v``; dividing by ``options.value`` always
                    # failed when the maximum had been requested.
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    # the parenthesised form works on Python 2 and 3
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node
                # numbering might not be the same. Thus build a map of
                # node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n" %
                            (n, map_a2b[n], ntree))

            elif method == "rename":
                if not map_old2new:
                    # close the map file after reading it
                    with open(options.parameters[0], "r") as infile:
                        map_old2new = IOTools.ReadMap(infile, columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without an entry in the map are pruned from the tree
                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (
                            len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # Consume only the pattern. ``del options.parameters``
                    # removed the whole attribute and broke every later
                    # access to the parameter list.
                    del options.parameters[0]

                # keep all taxa that do not match the pattern
                taxa = [taxon for taxon in
                        (tree.node(n).data.taxon
                         for n in tree.get_terminals())
                        if not species2remove.search(taxon)]

                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":
                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        ntree += 1

    if options.output_format == "nh":
        # NOTE(review): all taxa are always written. A local flag that
        # "add-node-names" used to set was never passed on here; the
        # established output is kept unchanged.
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
def _hasMinOverlap(sequence_a, sequence_b, invalid_chars, min_overlap):
    """Return True once min_overlap columns are valid in both sequences."""
    noverlap = 0
    for a, b in zip(sequence_a, sequence_b):
        if a not in invalid_chars and b not in invalid_chars:
            noverlap += 1
            if noverlap >= min_overlap:
                return True
    return False


def processMali(mali, options):
    """Build the list of sequence pairs to compare and run the rate method.

    Depending on ``options.iteration`` the pairs of sequence indices are
    all-vs-all, first-vs-all, consecutive pairs (``pairwise``) or empty
    (``tree``). Optionally stop codons are masked by ``NNN``. The pairs are
    then scanned for one with fewer than ``options.min_overlap`` columns
    that are valid in both sequences, and the result is handed together
    with the pairs to ``runCodeML`` (``options.method == "paml"``) or
    ``runXrate`` (``options.method == "xrate"``).

    Arguments
    ---------
    mali : object
        Multiple alignment; entries are modified in place if
        ``options.remove_stops`` is set.
    options : object
        Command line options (``gap_chars``, ``mask_chars``,
        ``iteration``, ``remove_stops``, ``min_overlap``, ``tree``,
        ``method``).

    Raises
    ------
    ValueError
        If the iteration mode is unknown or if ``pairwise`` iteration is
        requested for an odd number of sequences.
    """
    map_new2old = mali.mapIdentifiers()
    ids = mali.getIdentifiers()
    nids = len(ids)

    invalid_chars = options.gap_chars + options.mask_chars

    if options.iteration == "all-vs-all":
        pairs = [(x, y) for x in range(nids) for y in range(x)]
    elif options.iteration == "first-vs-all":
        pairs = [(0, y) for y in range(1, nids)]
    elif options.iteration == "pairwise":
        if nids % 2 != 0:
            # string exceptions are not valid exceptions; raise a real one
            raise ValueError(
                "uneven number of sequences (%i) not compatible with --iteration=pairwise" % nids)
        pairs = [(x, x + 1) for x in range(0, nids, 2)]
    elif options.iteration == "tree":
        pairs = []
    else:
        raise ValueError("unknown iteration mode: %s" % (options.iteration))

    if options.remove_stops:
        # mask stop codons so that they are treated as missing data
        for identifier, entry in mali.items():
            s = entry.mString.upper()
            fragments = []
            for x in range(0, len(s), 3):
                codon = s[x:x + 3]
                if Genomics.IsStopCodon(codon):
                    codon = "NNN"
                fragments.append(codon)
            entry.mString = "".join(fragments)

    # the scan stops at the first pair without sufficient overlap
    has_non_overlaps = False
    for x, y in pairs:
        if not _hasMinOverlap(mali[ids[x]], mali[ids[y]],
                              invalid_chars, options.min_overlap):
            has_non_overlaps = True
            break

    if options.tree:
        tree = TreeTools.Newick2Nexus(options.tree).trees[0]
        map_old2new = IOTools.getInvertedDictionary(
            map_new2old, make_unique=True)
        tree.relabel(map_old2new)
    else:
        tree = None

    if options.method == "paml":
        runCodeML(mali, tree, has_non_overlaps, pairs, map_new2old, options)
    elif options.method == "xrate":
        runXrate(mali, has_non_overlaps, pairs, map_new2old, options)
def getMergers(tree, map_strain2species, options):
    """Collect pairs of genes from different strains of a species to merge.

    The tree is traversed depth-first with ``count_genes`` as post
    function. Only binary merges (two genes at a time) are recorded.

    Arguments
    ---------
    tree : object
        Tree whose leaf taxa are parsed with ``parseIdentifier``.
    map_strain2species : dict
        Maps a strain to its species. Leaves of strains not in this
        mapping are not processed.
    options : object
        Passed on to ``parseIdentifier``.

    Returns
    -------
    list
        One tuple ``(node_id, species, strain_x, gene_x, strain_y,
        gene_y)`` per recorded merge.

    NOTE(review): the statement nesting of this function was reconstructed
    from whitespace-mangled source - confirm it against the original file.
    """

    n = TreeTools.GetSize(tree) + 1
    all_strains = map_strain2species.keys()
    # NOTE(review): all_species is not used below.
    all_species = map_strain2species.values()

    # genes[node_id] maps each strain to the set of its genes seen at or
    # below the node; it is set to None once a node is terminated.
    genes = []
    for x in range(n):
        g = {}
        for s in all_strains:
            g[s] = set()
        genes.append(g)

    # build list of species pairs that can be joined.
    map_species2strain = IOTools.getInvertedDictionary(map_strain2species)
    pairs = []

    for species, strains in map_species2strain.items():
        for x in range(len(strains)):
            for y in range(0, x):
                pairs.append((strains[x], strains[y]))

    # map of genes to new genes
    # each entry in the list is a pair of genes of the same species
    # but different strains to be joined.
    map_genes2new_genes = []

    # dictionary of merged genes. This is to ensure that no gene
    # is merged twice
    merged_genes = {}

    def count_genes(node_id):
        """Record the genes per strain below a node and register merges.

        For an internal node the gene sets of all children are united per
        strain. The node is terminated (``genes[node_id] = None``) if a
        child is terminated or if a strain ends up with more than one
        gene. For a leaf the gene is recorded for its strain, or the leaf
        is terminated if its strain is not in ``map_strain2species``.
        """
        node = tree.node(node_id)

        if node.succ:
            this_node_set = genes[node_id]
            # process non-leaf node
            for s in node.succ:

                # propagate: terminated nodes force upper nodes to terminate
                # (assigned to None).
                if not genes[s]:
                    this_node_set = None
                    break

                # check if node merges genes that are not part of the positive
                # set
                # NOTE(review): the breaks below only leave this inner
                # loop. If another child follows, this_node_set is None
                # when it is indexed again - confirm whether the outer
                # loop should stop as well.
                for strain in all_strains:
                    if strain in map_strain2species:
                        # merge genes from all children
                        this_node_set[strain] = this_node_set[
                            strain].union(genes[s][strain])

                        if len(this_node_set[strain]) > 1:
                            # more than two genes for a single species, so no
                            # join
                            this_node_set = None
                            break

                    # NOTE(review): all_strains are the keys of
                    # map_strain2species, so this branch looks unreachable;
                    # it also compares a set with an integer.
                    elif strain not in map_strain2species and \
                            this_node_set[strain] > 0:
                        this_node_set = None
                        break

            if this_node_set is None:
                genes[node_id] = None
                return

            for strain_x, strain_y in pairs:
                if len(this_node_set[strain_x]) == 1 and len(this_node_set[strain_y]) == 1:
                    species = map_strain2species[strain_x]
                    gene_x, gene_y = tuple(this_node_set[strain_x])[0], tuple(
                        this_node_set[strain_y])[0]

                    # check if these to genes have already been merged or are
                    # merged with other partners already
                    # The merged genes are assigned the same node_id, if they have
                    # been already merged.
                    key1 = strain_x + gene_x
                    key2 = strain_y + gene_y
                    # order the keys so that the pair is looked up the
                    # same way regardless of strain order
                    if key1 > key2:
                        key1, key2 = key2, key1

                    merge = False
                    if key1 in merged_genes and key2 in merged_genes:
                        if merged_genes[key1] == merged_genes[key2]:
                            merge = True
                    elif key1 not in merged_genes and key2 not in merged_genes:
                        merge = True
                        merged_genes[key1] = node_id
                        merged_genes[key2] = node_id

                    if merge:
                        map_genes2new_genes.append(
                            (node_id, species, strain_x, gene_x, strain_y, gene_y))

                    # once two genes have been joined, they can not be remapped
                    # further
                    genes[node_id] = None
                    return
        else:
            # process leaf
            strain, t, g, q = parseIdentifier(node.data.taxon, options)

            if strain in map_strain2species:
                genes[node_id][strain].add(g)
            else:
                # do not process nodes that do not need to be mapped
                genes[node_id] = None

    tree.dfs(tree.root, post_function=count_genes)

    return map_genes2new_genes
def Process(lines, other_trees, options, map_old2new, ntree):
    """Apply the methods in ``options.methods`` to a set of trees and output them.

    The trees are parsed from ``lines`` and every method listed in
    ``options.methods`` is applied to every tree in turn. Methods that
    need arguments consume them from the front of ``options.parameters``
    the first time they are used. Afterwards all trees are written to
    ``options.stdout``.

    Arguments
    ---------
    lines : list
        Lines with trees in Newick format.
    other_trees : list
        Trees used by the ``divide-by-tree`` method; either one tree for
        all input trees or one tree per input tree (indexed by ``ntree``).
    options : object
        Command line options. ``options.parameters`` is modified in place
        and ``options.write_map`` is set by the ``build-map`` method.
    map_old2new : dict
        Mapping of taxon names used by ``rename`` and ``build-map``. It is
        read from the file in ``options.parameters[0]`` if empty and
        ``rename`` is requested. ``build-map`` extends it in place.
    ntree : int
        Number of trees processed before this call.

    Returns
    -------
    tuple
        ``(ntotal, nskipped, ntree)``: number of trees read, number of
        skipped ``divide-by-tree`` operations and the updated tree counter.
    """
    # Strip only the line terminator; dropping the last character of a
    # line without one would cut into the tree itself. A list is passed so
    # that the argument is a sequence on Python 3 as well.
    nexus = TreeTools.Newick2Nexus([line.rstrip("\n") for line in lines])

    if options.loglevel >= 1:
        options.stdlog.write("# read %i trees.\n" % len(nexus.trees))

    nskipped = 0
    ntotal = len(nexus.trees)
    extract_pattern = None
    species2remove = None
    phylip_executable = None
    phylip_options = None

    for index, tree in enumerate(nexus.trees):

        if options.outgroup:
            tree.root_with_outgroup(options.outgroup)

        for method in options.methods:

            if options.loglevel >= 3:
                options.stdlog.write(
                    "# applying method %s to tree %i.\n" % (method, index))

            if method == "midpoint-root":
                tree.root_midpoint()

            elif method == "balanced-root":
                tree.root_balanced()

            elif method == "unroot":
                TreeTools.Unroot(tree)

            elif method == "phylip":
                if not phylip_executable:
                    phylip_executable = options.parameters[0]
                    del options.parameters[0]
                    phylip_options = re.split("@", options.parameters[0])
                    del options.parameters[0]

                phylip = WrapperPhylip.Phylip()
                phylip.setProgram(phylip_executable)
                phylip.setOptions(phylip_options)
                phylip.setTree(tree)

                result = phylip.run()

                nexus.trees[index] = result.mNexus.trees[0]

            elif method == "normalize":
                if options.value == 0:
                    # a value of 0 selects the longest branch as divisor
                    v = 0
                    for n in tree.chain.keys():
                        v = max(v, tree.node(n).data.branchlength)
                else:
                    v = options.value

                if v == 0:
                    options.stdlog.write(
                        "# Warning: tree %i: no positive branch length, "
                        "normalization skipped\n" % index)
                else:
                    # divide by ``v``; dividing by ``options.value`` always
                    # failed when the maximum had been requested.
                    for n in tree.chain.keys():
                        tree.node(n).data.branchlength /= float(v)

            elif method == "divide-by-tree":

                if len(other_trees) > 1:
                    other_tree = other_trees[ntree]
                else:
                    other_tree = other_trees[0]

                # the trees have to be exactly the same!!
                if options.loglevel >= 2:
                    # the parenthesised form works on Python 2 and 3
                    print(tree.display())
                    print(other_tree.display())

                if not tree.is_identical(other_tree):
                    nskipped += 1
                    continue

                # even if the trees are the same (in topology), the node
                # numbering might not be the same. Thus build a map of
                # node ids.
                map_a2b = TreeTools.GetNodeMap(tree, other_tree)

                for n in tree.chain.keys():
                    try:
                        tree.node(n).data.branchlength /= float(
                            other_tree.node(map_a2b[n]).data.branchlength)
                    except ZeroDivisionError:
                        options.stdlog.write(
                            "# Warning: branch for nodes %i and %i in tree-pair %i: divide by zero\n" %
                            (n, map_a2b[n], ntree))

            elif method == "rename":
                if not map_old2new:
                    # close the map file after reading it
                    with open(options.parameters[0], "r") as infile:
                        map_old2new = IOTools.ReadMap(infile, columns=(0, 1))

                    if options.invert_map:
                        map_old2new = IOTools.getInvertedDictionary(
                            map_old2new, make_unique=True)

                    del options.parameters[0]

                # taxa without an entry in the map are pruned from the tree
                unknown = []
                for n, node in tree.chain.items():
                    if node.data.taxon:
                        try:
                            node.data.taxon = map_old2new[node.data.taxon]
                        except KeyError:
                            unknown.append(node.data.taxon)

                for taxon in unknown:
                    tree.prune(taxon)

            # reformat terminals
            elif method == "extract-with-pattern":

                if not extract_pattern:
                    extract_pattern = re.compile(options.parameters[0])
                    del options.parameters[0]

                for n in tree.get_terminals():
                    node = tree.node(n)
                    node.data.taxon = extract_pattern.search(
                        node.data.taxon).groups()[0]

            elif method == "set-uniform-branchlength":
                for n in tree.chain.keys():
                    tree.node(n).data.branchlength = options.value

            elif method == "build-map":
                # build a map of identifiers
                options.write_map = True
                for n in tree.get_terminals():
                    node = tree.node(n)
                    if node.data.taxon not in map_old2new:
                        new = options.template_identifier % (
                            len(map_old2new) + 1)
                        map_old2new[node.data.taxon] = new
                    node.data.taxon = map_old2new[node.data.taxon]

            elif method == "remove-pattern":
                if species2remove is None:
                    species2remove = re.compile(options.parameters[0])
                    # Consume only the pattern. ``del options.parameters``
                    # removed the whole attribute and broke every later
                    # access to the parameter list.
                    del options.parameters[0]

                # keep all taxa that do not match the pattern
                taxa = [taxon for taxon in
                        (tree.node(n).data.taxon
                         for n in tree.get_terminals())
                        if not species2remove.search(taxon)]

                TreeTools.PruneTree(tree, taxa)

            elif method == "add-node-names":
                inode = 0
                for n, node in tree.chain.items():
                    if not node.data.taxon:
                        node.data.taxon = "inode%i" % inode
                        inode += 1

            elif method == "newick2nhx":
                # convert names to species names
                for n in tree.get_terminals():
                    t = tree.node(n).data.taxon
                    d = t.split("|")
                    if len(d) >= 2:
                        tree.node(n).data.species = d[0]

        ntree += 1

    if options.output_format == "nh":
        # NOTE(review): all taxa are always written. A local flag that
        # "add-node-names" used to set was never passed on here; the
        # established output is kept unchanged.
        options.stdout.write(
            TreeTools.Nexus2Newick(
                nexus,
                write_all_taxa=True,
                with_branchlengths=options.with_branchlengths) + "\n")
    else:
        for tree in nexus.trees:
            tree.writeToFile(options.stdout, format=options.output_format)

    return ntotal, nskipped, ntree
def _getRatePrefixes(shared_rates, block):
    """Return the parameter name prefixes for (Rs, Rn, Ri, Rv) of a block."""
    # True: the parameter name carries the block prefix.
    # False: the parameter name has no prefix.
    # Any mode not listed here prefixes all four parameters.
    use_block_prefix = {
        "all": (False, False, False, False),
        "kappa": (True, True, False, False),
        "kappa-ds": (False, True, False, False),
        "omega": (False, False, True, True),
        "omega-ds": (False, False, True, False),
        "ds": (False, True, True, True),
    }.get(shared_rates, (True, True, True, True))
    return tuple(block if flag else "" for flag in use_block_prefix)


def processMali(mali, options):
    """Train a grammar on an alignment and write rate estimates per block.

    The alignment columns are annotated with a ``STATE`` string of ``N``
    and ``C`` characters (if ``options.block_size`` is given), sequence
    identifiers are reduced to the part in front of ``options.separator``
    and the grammar is trained through ``prepareGrammar``. For every block
    one tab-separated line is written to ``options.stdout`` containing:
    state code, block prefix, dN, dS, omega, four ``na`` placeholders,
    kappa, the log-likelihood, one ``na`` placeholder and the number of
    columns annotated with the state code. If ``options.with_rho`` is set,
    rN, rS, t, rN0, rS0 and t0 follow. The line ends with a message.

    Arguments
    ---------
    mali : object
        Multiple alignment; modified in place.
    options : object
        Command line options (``block_size``, ``separator``,
        ``xrate_min_increment``, ``clean_mali``, ``input_filename_tree``,
        ``shared_rates``, ``value_format``, ``with_rho``, ``loglevel``,
        ``stdout``, ``stdlog``).

    Raises
    ------
    ValueError
        If the alignment has no columns.
    AssertionError
        If the ``STATE`` annotation contains characters other than ``N``
        and ``C``.
    """
    ncols = mali.getNumColumns()

    if ncols == 0:
        # string exceptions are not valid exceptions; raise a real one
        raise ValueError("refusing to process empty alignment.")

    # add annotation of states
    if options.block_size is not None:
        if options.block_size < 1:
            # block size given as a fraction of the alignment length
            size = int(float(ncols) / 3.0 * options.block_size) * 3
        else:
            size = int(options.block_size) * 3

        size = min(size, ncols)
        mali.addAnnotation("STATE", "N" * size + "C" * (ncols - size))

    # remove gene ids
    for identifier in mali.getIdentifiers():
        if options.separator in identifier:
            species = identifier.split(options.separator)[0]
            mali.rename(identifier, species)

    map_new2old = mali.mapIdentifiers()
    map_old2new = IOTools.getInvertedDictionary(map_new2old, make_unique=True)

    xgram = XGram.XGram()
    if options.xrate_min_increment:
        xgram.setMinIncrement(options.xrate_min_increment)

    # remove empty columns and masked columns
    if options.clean_mali:
        mali.mGapChars = mali.mGapChars + ("n", "N")
        mali.removeGaps(minimum_gaps=1, frame=3)

    if options.input_filename_tree:
        # close the tree file once the first tree has been extracted
        with open(options.input_filename_tree, "r") as infile:
            tree = TreeTools.Newick2Nexus(infile).trees[0]
        tree.relabel(map_old2new)
    else:
        tree = None

    annotation = mali.getAnnotation("STATE")
    chars = set(annotation)
    for c in chars:
        assert c in ("N", "C"), \
            "unknown annotation %s: only 'N' and 'C' are recognized" % c

    if len(chars) == 1:
        if options.loglevel >= 1:
            options.stdlog.write("# WARNING: only a single block\n")
        # a set can not be indexed: take its only element
        blocks = (("B0_", next(iter(chars))),)
    else:
        blocks = (("B0_", "N"), ("B1_", "C"))

    result, mali, ids = prepareGrammar(
        xgram, mali, tree, map_old2new, blocks, options)

    trained_model = result.getModel()

    pis, matrices = RateEstimation.getRateMatrix(trained_model)

    # re-read the annotation: prepareGrammar returns a (possibly new) mali
    annotation = mali.getAnnotation("STATE")

    for block, code in blocks:
        terminals = ("%sCOD0" % block, "%sCOD1" % block, "%sCOD2" % block)
        pi = pis[terminals]

        prefix_rs, prefix_rn, prefix_ri, prefix_rv = _getRatePrefixes(
            options.shared_rates, block)

        rs = trained_model.mGrammar.getParameter('%sRs' % prefix_rs)
        rn = trained_model.mGrammar.getParameter('%sRn' % prefix_rn)
        ri = trained_model.mGrammar.getParameter('%sRi' % prefix_ri)
        rv = trained_model.mGrammar.getParameter('%sRv' % prefix_rv)

        nchars = annotation.count(code)

        msg = "iter=%i Rs=%6.4f Rn=%6.4f Ri=%6.4f Rv=%6.4f" % (
            result.getNumIterations(), rs, rn, ri, rv)

        try:
            Q, t = RateEstimation.getQMatrix(
                pi, Rsi=rs * ri, Rsv=rs * rv, Rni=rn * ri, Rnv=rn * rv)
            avg_omega = (rs + rn) / 2.0
            Q0, t0 = RateEstimation.getQMatrix(
                pi,
                Rsi=ri * avg_omega, Rsv=rv * avg_omega,
                Rni=ri * avg_omega, Rnv=rv * avg_omega)

            # Q1 and its counts are not reported, but the calls are kept:
            # a ZeroDivisionError raised by them also yields "na" output.
            avg_kappa = (ri + rv) / 2.0
            Q1, t1 = RateEstimation.getQMatrix(
                pi,
                Rsi=rs * avg_kappa, Rsv=rs * avg_kappa,
                Rni=rn * avg_kappa, Rnv=rn * avg_kappa)

            rI, rV, rS, rN = RateEstimation.countSubstitutions(pi, Q)
            rI0, rV0, rS0, rN0 = RateEstimation.countSubstitutions(pi, Q0)
            rI1, rV1, rS1, rN1 = RateEstimation.countSubstitutions(pi, Q1)

            dS = rS / (3 * rS0) * t
            dN = rN / (3 * rN0) * t

            o_kappa = options.value_format % (rI / rI0 * rV0 / rV)
            o_omega = options.value_format % (dN / dS)

            o_dn = options.value_format % dN
            o_ds = options.value_format % dS
            o_rn = options.value_format % rN
            o_rs = options.value_format % rS
            o_rn0 = options.value_format % rN0
            o_rs0 = options.value_format % rS0
            o_t = options.value_format % t
            o_t0 = options.value_format % t0

        except ZeroDivisionError:
            o_kappa = o_omega = o_dn = o_ds = "na"
            o_rn = o_rs = o_rn0 = o_rs0 = o_t = o_t0 = "na"
            msg = "insufficient data to estimate rate matrix."

        options.stdout.write("\t".join(map(str, (
            code, block,
            o_dn, o_ds, o_omega,
            "na", "na", "na", "na",
            o_kappa,
            result.getLogLikelihood(),
            "na",
            nchars))))

        if options.with_rho:
            options.stdout.write("\t" + "\t".join(
                map(str, (o_rn, o_rs, o_t, o_rn0, o_rs0, o_t0))))

        options.stdout.write("\t%s\n" % msg)