def main(argv=None):
    """Script main: draw a duplication plot from clusters read on stdin.

    Parses command line options in sys.argv, unless *argv* is given.

    Every non-comment, non-blank input line must hold at least three
    tab-separated fields: a cluster id, a ``;``-separated list of gene
    locations (each ``gene_id:contig:strand:from:to``) and a Newick tree
    of the duplications.  Locations on contigs that are missing from the
    ``--contig-sizes`` map are ignored.  The finished plot is written to
    ``sys.stdout``.

    NOTE(review): *argv* is normalised below but is not forwarded to
    ``E.Start`` in this function -- confirm how ``E.Start`` obtains the
    command line.

    Raises:
        ValueError: if no contig sizes file was supplied; the plot cannot
            be laid out without contig sizes.
    """
    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e", "--headers", dest="headers",
                      action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t", "--title", dest="title", type="string",
                      help="page title.")
    parser.add_option("-f", "--footer", dest="footer", type="string",
                      help="page footer.")
    parser.add_option("-c", "--contig-sizes", dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r", "--radius", dest="radius", type="int",
                      help="radius.")
    parser.add_option("-i", "--increment", dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u", "--url", dest="url", type="string",
                      help="string to build url for annotation.")
    # NOTE(review): declared as a string although the default below is an
    # int; the value is not read anywhere in this function.
    parser.add_option("--min-contig", dest="min_contig_size", type="string",
                      help="minimum contig size to delineate.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum branch length.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={
            'CG': "circle",
            'PG': "circle",
            'SG': "circle"
        },
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF", "SF",
                      "UG", "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Everything below needs the contig -> size map; fail with a clear
    # message instead of a NameError on the first lookup.
    if not options.filename_contig_sizes:
        raise ValueError("please supply contig sizes with --contig-sizes")

    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    def iter_records(input_lines):
        """Yield [cluster_id, locations, tree] for each data line."""
        for line in input_lines:
            # skip comments and blank lines
            if line.startswith("#") or not line.strip():
                continue
            # only drop the newline, so that a final line without one
            # does not lose its last character
            yield line.rstrip("\n").split("\t")[:3]

    def iter_known_locations(in_locations):
        """Yield the split location fields for contigs with a known size."""
        for location in in_locations.split(";"):
            gene_id, contig, strand, sbjct_from, sbjct_to = \
                location.split(":")
            if contig in map_contig2size:
                yield gene_id, contig, strand, sbjct_from, sbjct_to

    lines = sys.stdin.readlines()

    # read data and get contigs that are used (i.e.: remove empty contigs)
    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in iter_records(lines):
            for fields in iter_known_locations(in_locations):
                used_contigs.add(fields[1])

        # iterate over a snapshot of the keys: the map is pruned in place
        for contig in list(map_contig2size):
            if contig not in used_contigs:
                del map_contig2size[contig]

    if not map_contig2size:
        E.Stop()
        sys.exit(0)

    # sort by name first; the stable sort by size then keeps name order
    # among contigs of equal size
    contigs = sorted(map_contig2size)
    if options.sort_by_size:
        contigs.sort(key=lambda contig: map_contig2size[contig])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    # presumably required before getPosition() can be used below; the plot
    # is initialised again once the number of entries is known -- TODO
    # confirm
    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":
        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications
        for cluster_id, in_locations, in_tree in iter_records(lines):
            # None marks "no position seen yet"; a position of 0 is a
            # valid minimum and must not be mistaken for "unset"
            mi, ma = None, 0
            n = 0
            contigs_hit = set()

            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    iter_known_locations(in_locations):
                contigs_hit.add(contig)
                sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                mi = xi if mi is None else min(mi, xi)
                ma = max(ma, xa)
                n += 1

            # no location on a known contig
            if n == 0:
                continue

            # all duplicates of the cluster lie on a single contig
            cis = len(contigs_hit) == 1

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n"
                    % (cluster_id, in_locations, in_tree))

            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

    data.sort()

    plot.mNumEntries = len(data)
    plot.initializePlot()

    last_ndups = 0

    for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

        # start a new ring whenever the number of duplicates changes
        if ndups != last_ndups:
            plot.pushRadius()
            plot.addSeparator()
            last_ndups = ndups

        map_gene2location = {}
        for gene_id, contig, strand, sbjct_from, sbjct_to in \
                iter_known_locations(in_locations):
            map_gene2location[gene_id] = (
                contig, strand, int(sbjct_from), int(sbjct_to))

        if not map_gene2location:
            continue

        tree = TreeTools.Newick2Tree(in_tree)

        # the last subset is all nodes again.
        subsets = TreeTools.GetSubsets(tree)

        is_first = True
        for children, height, branchlength in subsets[:-1]:
            if len(children) == 1:
                continue
            # a real list (not a lazy map object) so the plot can index it
            split_children = [child.split(options.separator)
                              for child in children]
            plot.addDuplication(split_children, map_gene2location, height,
                                url=options.url,
                                with_separator=is_first,
                                link_to_previous=not is_first,
                                quality2symbol=options.quality2symbol,
                                quality2mask=options.quality2mask)
            is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
def writeOrthologSets(outfile, nexus, extract_species, extract_gene, options,
                      reference_tree=None, method="strict", outgroups=None):
    """Output ortholog sets.

    A "strict" ortholog set contains exactly one gene for each species,
    while a "degenerate" ortholog set contains at least one gene for each
    species.

    Species sets are enumerated according to ``options.enumeration``
    ("monophyletic", "exhaustive", "pairwise", "full", "lineage" or
    "explicit"); any other value yields no sets.  For every tree in
    ``nexus.trees`` and every species set, each node returned by
    ``getOrthologNodes`` is written as one row to ``options.stdout``.
    A per-set summary table follows; it goes to
    ``options.filename_summary`` if that is set and to ``options.stdout``
    otherwise.

    NOTE(review): the *outfile* argument is never written to; it is kept
    only so the signature stays unchanged.

    Arguments:
        outfile: unused, see note above.
        nexus: object whose ``trees`` attribute is iterated.
        extract_species: callable mapping a taxon to its species.
        extract_gene: callable mapping a taxon to its gene.
        options: option object; reads ``column2org``, ``org2column``,
            ``enumeration``, ``species_set``, ``reroot``, ``loglevel``,
            ``stdout``, ``stdlog`` and ``filename_summary``.
        reference_tree: species tree, required for "monophyletic"
            enumeration.
        method: selector passed on to ``getOrthologNodes``; with
            "lineage", nodes overlapping an earlier cluster are dropped.
        outgroups: optional iterable of species used as outgroups.

    Raises:
        ValueError: if enumeration is "monophyletic" and no
            *reference_tree* is given.
    """

    ######################################################################
    # build species set to compare
    species = options.column2org
    nspecies = len(species)
    enumeration = options.enumeration

    sets = []
    if enumeration == "monophyletic":
        if not reference_tree:
            # string exceptions are not valid; raise a real exception
            raise ValueError(
                "please specify a species tree for monophyletic enumeration")
        for members, h1, h2 in TreeTools.GetSubsets(reference_tree):
            if len(members) > 1:
                sets.append(members)

    elif enumeration == "exhaustive":
        for size in range(2, len(species)):
            sets += list(SetTools.xuniqueCombinations(species, size))
        sets.append(species)

    elif enumeration == "pairwise":
        for x in range(len(species) - 1):
            for y in range(x + 1, len(species)):
                sets.append((species[x], species[y]))

    elif enumeration == "full":
        sets.append(species)

    elif enumeration == "lineage":
        for s in species:
            sets.append((s, ))

    elif enumeration == "explicit":
        for size in range(2, len(options.species_set)):
            sets += list(
                SetTools.xuniqueCombinations(options.species_set, size))
        sets.append(options.species_set)

    ######################################################################
    # build sets with positional information
    xsets = [frozenset(options.org2column[org] for org in members)
             for members in sets]

    ######################################################################
    # collect outgroups
    if outgroups:
        noutgroups = set(options.org2column[org] for org in outgroups)
    else:
        noutgroups = None

    ######################################################################
    # loop over each tree and set
    # I did not see a way to loop a tree once for all sets without doing
    # complicated counting. The problem is that counting has to be stopped
    # at different tree heights for different sets.
    ninput, noutput, nempty, nskipped = 0, 0, 0, 0

    counts = [0] * len(sets)

    options.stdout.write(
        "nspecies\tname\tid\tcluster\tpattern\t%s\tnode_id\tmembers\n"
        % "\t".join(species))

    cluster_id = 0
    nerrors = 0

    for tree in nexus.trees:

        ninput += 1
        ntotal_tree = 0

        if options.loglevel >= 3:
            options.stdlog.write("# processing tree %s\n" % tree.name)

        if options.reroot:
            rerootTree(tree, extract_species, options)

        # sn: numbered species set: 0,1,...
        # sl: literal species set: species1, species2, ...
        for c, (sn, sl) in enumerate(zip(xsets, sets)):

            ortholog_nodes = getOrthologNodes(tree, sn, options,
                                              selector=method,
                                              outgroups=noutgroups)
            ntotal_tree += len(ortholog_nodes)

            n = 0

            pattern = buildPattern(nspecies, sn)

            # check for inconsistent partitions (the same gene in different
            # ortholog clusters) within the current tree
            found_genes = set()
            ortho_sets = set()

            # reverse ortholog_node - work in top-down manner.
            ortholog_nodes.reverse()

            for node_id, members in ortholog_nodes:
                n += 1
                cluster_id += 1

                # a list, because the taxa are needed twice: once for the
                # gene set and once for the members column
                otus = [taxon for taxon in tree.get_taxa(node_id)
                        if extract_species(taxon) in sl]
                genes = set(extract_gene(otu) for otu in otus)

                overlap = found_genes.intersection(genes)
                if overlap:

                    # only take largest cluster for lineage specific
                    # duplications
                    if method == "lineage":
                        continue

                    # redundant and inconsistent nodes are counted and
                    # logged, but still written out below
                    if frozenset(genes) in ortho_sets:
                        nskipped += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: redundant node: %i - skipped because already present: %s\n"
                                % (tree.name, n, node_id, str(overlap)))
                    else:
                        nerrors += 1
                        if options.loglevel >= 1:
                            options.stdlog.write(
                                "# %s: cluster %i: inconsistent node: %i - the same gene in different clusters: %s\n"
                                % (tree.name, n, node_id, str(overlap)))

                found_genes |= genes
                ortho_sets.add(frozenset(genes))

                xpattern = buildPattern(nspecies, sn, members)

                options.stdout.write(
                    "%i\t%s\t%i\t%i\t%s\t%s\t%i\t%s\n"
                    % (len(sl), tree.name, n, cluster_id,
                       "".join(pattern), "\t".join(xpattern),
                       node_id, ";".join(otus)))

            counts[c] += n

        if ntotal_tree == 0:
            nempty += 1
        else:
            noutput += 1

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, nempty=%i, noutput=%i, nskipped=%i, nerrors=%i\n"
            % (ninput, nempty, noutput, nskipped, nerrors))

    # write summary information
    if options.filename_summary:
        summary = open(options.filename_summary, "w")
    else:
        summary = options.stdout

    try:
        summary.write("//\n")
        summary.write("cluster\tpattern\tcounts\t%s\n"
                      % ("\t".join(species)))

        for c, xset in enumerate(xsets):
            pattern = buildPattern(nspecies, xset)
            summary.write("%i\t%s\t%i\t%s\n"
                          % (c, "".join(pattern), counts[c],
                             "\t".join(pattern)))
    finally:
        # only close what was opened here, also when a write fails
        if summary is not options.stdout:
            summary.close()
map_gene2location = {} for l in in_locations.split(";"): gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":") if chr not in map_contig2size: continue sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to) map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to) if not map_gene2location: continue tree = TreeTools.Newick2Tree(in_tree) # the last subset is all nodes again. s = TreeTools.GetSubsets(tree) is_first = True for children, height, branchlength in s[:-1]: if len(children) == 1: continue data = [] for c in map(lambda x: x.split(options.separator), children): if len(c) == 2: data.append((c[0], c[1], c[1], "CG")) elif len(c) == 1: data.append(("unk", c[0], c[0], "CG")) elif len(c) == 3: data.append((c[0], c[2], c[3], "CG")) for species, transcript, gene, quality in data: if not gene in map_gene2location: