def _iter_known_locations(in_locations, map_contig2size):
    """Yield the locations of a cluster that lie on known contigs.

    *in_locations* is a ``;``-separated list of entries of the form
    ``gene_id:contig:strand:from:to``. Entries whose contig is not a key of
    *map_contig2size* are skipped. Yields tuples
    ``(gene_id, contig, strand, sbjct_from, sbjct_to)`` with both
    coordinates converted to int.
    """
    for location in in_locations.split(";"):
        gene_id, contig, strand, sbjct_from, sbjct_to = location.split(":")
        if contig not in map_contig2size:
            continue
        yield gene_id, contig, strand, int(sbjct_from), int(sbjct_to)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads contig sizes from the file given by ``--contig-sizes`` and
    tab-separated cluster records (cluster id, locations, tree) from stdin,
    feeds them into a ``DuplicationPlot`` and writes the plot to stdout.

    Raises ValueError if no contig size file was supplied.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start below, exactly as in
    # the original code -- confirm whether E.Start accepts an argv argument.

    parser = E.OptionParser(
        version="%prog version: $Id: optic/plot_duplications.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e", "--headers", dest="headers", action="store_true",
                      help="first row is a header [ignored].")
    parser.add_option("-t", "--title", dest="title", type="string",
                      help="page title.")
    parser.add_option("-f", "--footer", dest="footer", type="string",
                      help="page footer.")
    parser.add_option("-c", "--contig-sizes", dest="filename_contig_sizes",
                      type="string",
                      help="filname with contig sizes.")
    parser.add_option("-r", "--radius", dest="radius", type="int",
                      help="radius.")
    parser.add_option("-i", "--increment", dest="radius_increment",
                      type="int",
                      help="radius increment.")
    parser.add_option("-u", "--url", dest="url", type="string",
                      help="string to build url for annotation.")
    parser.add_option("--min-contig", dest="min_contig_size", type="string",
                      help="minimum contig size to delineate.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum branch length.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum branch length.")

    parser.set_defaults(
        filename_contig_sizes=None,
        headers=False,
        titles="",
        pattern_filename=None,
        title="",
        footer="",
        radius=3000,
        min_value=0.0,
        max_value=0.2,
        url=None,
        radius_increment=40,
        min_contig_size=10000,
        remove_empty_contigs=True,
        separator="|",
        quality2symbol={'CG': "circle", 'PG': "circle", 'SG': "circle"},
        quality2mask=("RG", "CP", "PP", "SP", "RP", "CF", "PF",
                      "SF", "UG", "UP", "UF", "BF", "UK"),
        sort_by_size=True,
        input_format="pairwise",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # Everything below needs the contig sizes; without them the original
    # code failed later with a NameError on map_contig2size.
    if not options.filename_contig_sizes:
        raise ValueError(
            "please supply a file with contig sizes (--contig-sizes).")

    with open(options.filename_contig_sizes, "r") as infile:
        map_contig2size = IOTools.ReadMap(infile, map_functions=(str, int))

    # Read all records once. Comment lines and blank lines are skipped;
    # only the line terminator is stripped, so an unterminated last line
    # keeps its final character.
    records = []
    for line in sys.stdin:
        if line.startswith("#") or not line.strip():
            continue
        records.append(line.rstrip("\n").split("\t")[:3])

    # get contigs that are used (i.e.: remove empty contigs)
    if options.remove_empty_contigs:
        used_contigs = set()
        for cluster_id, in_locations, in_tree in records:
            for location in _iter_known_locations(in_locations,
                                                  map_contig2size):
                used_contigs.add(location[1])

        # iterate over a snapshot of the keys: the mapping is modified
        # while looping.
        for contig in list(map_contig2size.keys()):
            if contig not in used_contigs:
                del map_contig2size[contig]

    contigs = sorted(map_contig2size.keys())

    if not contigs:
        E.Stop()
        sys.exit(0)

    if options.sort_by_size:
        # the sort is stable, so contigs of equal size stay in name order
        contigs.sort(key=lambda contig: map_contig2size[contig])

    plot = DuplicationPlot(contigs, map_contig2size, num_entries=0)

    plot.mRadiusIncrement = options.radius_increment
    plot.mRadius = options.radius
    plot.mMaxValue = options.max_value
    plot.mMinValue = options.min_value

    if options.title:
        plot.setTitle(options.title)
    if options.footer:
        plot.setFooter(options.footer)

    plot.initializePlot()

    data = []

    if options.input_format == "pairwise":

        # read data from pairwise analysis
        # format is: cluster_id, locations of duplications, tree of
        # duplications
        for cluster_id, in_locations, in_tree in records:

            # None marks "no position seen yet"; 0 is a valid position
            mi, ma = None, 0
            n = 0
            cluster_contigs = set()

            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    _iter_known_locations(in_locations, map_contig2size):
                cluster_contigs.add(contig)

                xi = plot.getPosition(contig, strand, sbjct_from)
                xa = plot.getPosition(contig, strand, sbjct_to)

                mi = xi if mi is None else min(mi, xi)
                ma = max(ma, xa)
                n += 1

            # no gene of this cluster lies on a known contig
            if n == 0:
                continue

            # cis: all duplicates of the cluster are on the same contig
            cis = len(cluster_contigs) == 1

            if options.loglevel >= 2:
                options.stdlog.write(
                    "# adding duplications in cluster %s: %s with tree %s\n" %
                    (cluster_id, in_locations, in_tree))

            data.append((cis, n, mi, ma, cluster_id, in_locations, in_tree))

        data.sort()

        # the plot is initialized a second time now that the number of
        # entries is known.
        plot.mNumEntries = len(data)
        plot.initializePlot()

        last_ndups = 0

        for cis, ndups, mi, ma, cluster_id, in_locations, in_tree in data:

            # start a new ring whenever the number of duplicates changes
            if ndups != last_ndups:
                plot.pushRadius()
                plot.addSeparator()
            last_ndups = ndups

            map_gene2location = {}
            for gene_id, contig, strand, sbjct_from, sbjct_to in \
                    _iter_known_locations(in_locations, map_contig2size):
                map_gene2location[gene_id] = (
                    contig, strand, sbjct_from, sbjct_to)

            if not map_gene2location:
                continue

            tree = TreeTools.Newick2Tree(in_tree)

            # the last subset is all nodes again.
            subsets = TreeTools.GetSubsets(tree)

            is_first = True
            for children, height, branchlength in subsets[:-1]:
                if len(children) == 1:
                    continue
                # a real list (not a one-shot iterator) is handed over
                c = [child.split(options.separator) for child in children]
                plot.addDuplication(c, map_gene2location, height,
                                    url=options.url,
                                    with_separator=is_first,
                                    link_to_previous=not is_first,
                                    quality2symbol=options.quality2symbol,
                                    quality2mask=options.quality2mask)
                is_first = False

    plot.writeToFile(sys.stdout)

    E.Stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Reads tab-separated duplication records from stdin and, depending on
    ``--method``, writes count/median matrices, gene lists, histograms or
    exponential fits, per-pair rows, or per-species-pair trees. Output goes
    to stdout, or to files named by the ``--filename-output`` pattern, which
    is filled with the keys ``f`` (function), ``l`` (location), ``s``
    (species) and ``o`` (other species) as applicable.

    Raises ValueError if no species are given.
    """

    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start below, exactly as in
    # the original code -- confirm whether E.Start accepts an argv argument.

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_duplications.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="species to use.")
    parser.add_option("-p", "--prefix", dest="prefix", type="string",
                      help="prefix to use for temporary files.")
    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [counts|lists|hists|links].")
    parser.add_option("-o", "--filename-output", dest="filename_output",
                      type="string",
                      help="output filename.")
    parser.add_option("-f", "--functions", dest="functions", type="string",
                      help="functions to grep [functional|pseudo|all].")
    parser.add_option("-l", "--locations", dest="locations", type="string",
                      help="locations to grep [local|nojunk|all|...].")
    # the bin size is numeric (default 1.0); it used to be parsed as a
    # string and handed to the histogram code unconverted.
    parser.add_option("-b", "--bin-size", dest="bin_size", type="float",
                      help="bin size.")
    parser.add_option("-i", "--fit", dest="fit", type="string",
                      help="fitting method [decay|power]")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for histogram.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for histogram.")
    parser.add_option("--use-relative-height", dest="use_relative_height",
                      action="store_true",
                      help="use relative height values.")
    parser.add_option("--reverse", dest="reverse", action="store_true",
                      help="reverse species. Histograms will show the age "
                      "of duplications for duplicates in other genomes.")

    parser.set_defaults(species="",
                        functions="functional,pseudo,all",
                        locations="local,nojunk,all",
                        filename_output=None,
                        bin_size=1.0,
                        min_value=None,
                        max_value=None,
                        nonnull=None,
                        use_relative_height=False,
                        header=True,
                        fit=None,
                        reverse=False,
                        method="counts")

    (options, args) = E.Start(parser, add_psql_options=True)

    # "".split(",") yields [""], so empty names are dropped before checking
    options.species = [s for s in options.species.split(",") if s]
    options.locations = options.locations.split(",")
    options.functions = options.functions.split(",")

    if not options.species:
        raise ValueError("please supply list of species.")

    # NOTE(review): the connection is not used anywhere in this function;
    # it is kept because connecting is a side effect of the original script.
    dbhandle = pgdb.connect(options.psql_connection)

    # Only the line terminator is stripped, so an unterminated last line
    # keeps its final character.
    input_data = [line.rstrip("\n").split("\t")
                  for line in sys.stdin
                  if not line.startswith("#")]

    # remove header
    if options.header and input_data:
        del input_data[0]

    # decide which columns to take
    # 1st column: species1: this is the species in which duplications have
    #             occurred.
    # 2nd column: species2: this is the species with respect to which
    #             duplications occurred.
    # 3rd column: clusterid
    # 4th column: chromosomes
    # 5th column: function
    # 6th column: height
    # 7th column: relative height
    # 8th column: locations
    # 9th column: tree
    if options.use_relative_height:
        take = (0, 1, 2, 3, 4, 6, 7, 8)
    else:
        take = (0, 1, 2, 3, 4, 5, 7, 8)

    input_data = [tuple([row[column] for column in take])
                  for row in input_data]

    nspecies = len(options.species)
    map_pos2species = list(options.species)
    map_species2pos = {}
    for pos, name in enumerate(options.species):
        map_species2pos[name] = pos

    outfile = None

    # NOTE(review): the original indentation of this function was lost. In
    # every branch below the section header line is written only when the
    # output goes to stdout -- confirm against the upstream script.

    if options.method in ("counts", "medians"):

        if options.method == "counts":
            func = len
            number_format = "%i"
        else:
            func = numpy.median
            number_format = "%6.4f"

        for location in options.locations:
            for function in options.functions:

                matrix = numpy.zeros((nspecies, nspecies), float)
                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                # collect the heights of each (species1, species2) pair
                heights = {}
                for species1, species2, cluster_id, l, f, height, \
                        locations, tree in data:
                    heights.setdefault(
                        (species1, species2), []).append(float(height))

                for (species1, species2), values in heights.items():
                    matrix[map_species2pos[species1],
                           map_species2pos[species2]] = func(values)

                if options.filename_output:
                    params = {"f": function, "l": location}
                    outfile = open(options.filename_output % params, "w")
                else:
                    outfile = sys.stdout
                    outfile.write(
                        "matrix for method %s: location: %s, function: %s\n" %
                        (options.method, location, function))

                MatlabTools.WriteMatrix(matrix,
                                        outfile=outfile,
                                        format=number_format,
                                        row_headers=options.species,
                                        col_headers=options.species)

                if options.filename_output:
                    outfile.close()

    elif options.method in ("lists", "lists-union"):
        # write lists of duplicated genes in species1 as compared to
        # species2 according to location/function
        # First field : gene name
        # Second field: cluster id
        # Third field : number of other genes in cluster
        # Fourth field: location of gene

        # genes already written to the current section/file
        written = {}

        for location in options.locations:
            for function in options.functions:

                data = GetSubset(input_data, location, function)

                # sort by species1 and species2
                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, \
                        locations, tree in data:

                    if last_species1 != species1 or \
                            last_species2 != species2:

                        # "lists" starts a new section per species pair,
                        # "lists-union" only when species1 changes.
                        if options.method == "lists":
                            new_section = True
                        else:
                            new_section = last_species1 != species1

                        if options.filename_output:
                            if new_section:
                                if outfile:
                                    outfile.close()
                                params = {"f": function,
                                          "l": location,
                                          "s": species1}
                                if options.method == "lists":
                                    params["o"] = species2
                                written = {}
                                outfile = open(
                                    options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            if new_section:
                                if options.method == "lists":
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s, species2: %s\n" %
                                        (location, function,
                                         species1, species2))
                                else:
                                    outfile.write(
                                        "location: %s, function: %s, species1: %s\n" %
                                        (location, function, species1))
                                written = {}

                        last_species1 = species1
                        last_species2 = species2

                    # get tree
                    tt = TreeTools.Newick2Tree(tree)
                    taxa = TreeTools.GetTaxa(tt)
                    for t in taxa:
                        if t in written:
                            continue
                        outfile.write("%s\t%s\t%i\n" %
                                      (t, cluster_id, len(taxa)))
                        written[t] = 1

    elif options.method in ("hists", "fit-decay"):

        for location in options.locations:
            for function in options.functions:

                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                data = GetSubset(input_data, location, function)
                data.sort()

                # convert to matrix of list
                # values[x][y] contains heights of duplications in species
                # x with reference to y
                for species1, species2, cluster_id, l, f, height, \
                        locations, tree in data:
                    try:
                        cell = values[map_species2pos[species1]][
                            map_species2pos[species2]]
                    except KeyError:
                        # rows for species that were not requested
                        continue
                    cell.append(float(height))

                # calculate histograms per species
                for s in options.species:
                    histograms = []
                    headers = []

                    if options.filename_output:
                        params = {"f": function, "l": location, "s": s}
                        outfile = open(options.filename_output % params, "w")
                    else:
                        outfile = sys.stdout
                        outfile.write("location: %s, function: %s\n" %
                                      (location, function))

                    for x, other in enumerate(options.species):

                        if options.reverse:
                            # duplications in species x
                            vv = values[x][map_species2pos[s]]
                        else:
                            # duplications in species s
                            vv = values[map_species2pos[s]][x]

                        if not vv:
                            continue

                        headers.append(other)
                        h = Histogram.Calculate(vv,
                                                increment=options.bin_size,
                                                min_value=options.min_value,
                                                max_value=options.max_value,
                                                no_empty_bins=True)

                        if options.method == "fit-decay":
                            result = fit(h, [2.0, -1.0])
                            if result:
                                outfile.write(
                                    "%s\t%s\t%s\t%i\t%f\t%f\ty = %f * exp ( %f * x )\n" % (
                                        "function",
                                        s,
                                        other,
                                        h[0][1],
                                        result[0],
                                        result[1],
                                        result[0],
                                        result[1],
                                    ))
                        elif options.method == "hists":
                            histograms.append(h)

                    if options.method == "hists":
                        combined_histogram = Histogram.Combine(
                            histograms, missing_value="-")
                        outfile.write("bin\t" + "\t".join(headers) + "\n")
                        Histogram.Write(outfile, combined_histogram)

                    if options.filename_output:
                        outfile.close()
                    else:
                        outfile.flush()

    elif options.method == "pairs":

        # get branches with 0 branchlength
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "with %i data points\n" % len(data))
                    options.stdlog.flush()

                data.sort()

                last_species1, last_species2 = None, None

                for species1, species2, cluster_id, l, f, height, \
                        locations, tree in data:

                    if last_species1 != species1 or \
                            last_species2 != species2:

                        # write trees per cluster
                        if options.filename_output:
                            if outfile:
                                outfile.close()
                            params = {"f": function,
                                      "l": location,
                                      "s": species1,
                                      "o": species2}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, species1, species2))

                        last_species1 = species1
                        last_species2 = species2

                    # NOTE(review): one line is written per input row; the
                    # original tracked the last cluster id here without
                    # acting on it -- confirm against the upstream script.
                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (cluster_id, height, locations, tree))

    elif options.method == "links":

        # write a tree for each species pair:
        # each node is a gene+location, the weight of the vertex is the
        # height
        # further info added: cluster_id for the duplication
        for location in options.locations:
            if options.loglevel >= 2:
                options.stdlog.write("# processing location %s\n" % location)

            for function in options.functions:
                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# processing function %s " % function)
                    options.stdlog.flush()

                data = GetSubset(input_data, location, function)

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "with %i data points\n" % len(data))
                    options.stdlog.flush()

                # stores duplications within first species as compared to
                # second species
                values = [[[] for y in range(nspecies)]
                          for x in range(nspecies)]

                for species1, species2, cluster_id, l, f, height, \
                        locations, tree in data:
                    values[map_species2pos[species1]][
                        map_species2pos[species2]].append(
                            (cluster_id, -len(locations), locations, tree))

                # get links per species
                for s in options.species:
                    if options.loglevel >= 2:
                        options.stdlog.write("# processing species %s\n" % s)

                    for x, other in enumerate(map_pos2species):

                        if other == s:
                            continue

                        # sorting puts the entry with the longest locations
                        # string first within each cluster (second tuple
                        # field is -len(locations)).
                        vv = values[map_species2pos[s]][x]
                        vv.sort()

                        # write trees per cluster
                        if options.filename_output:
                            params = {"f": function,
                                      "l": location,
                                      "s": s,
                                      "o": other}
                            outfile = open(
                                options.filename_output % params, "w")
                        else:
                            outfile = sys.stdout
                            outfile.write(
                                "location: %s, function: %s, species1: %s, species2: %s\n" %
                                (location, function, s, other))

                        # only print out largest tree
                        last_cluster_id = None
                        for cluster_id, n, locations, tree in vv:
                            if cluster_id != last_cluster_id:
                                outfile.write("%s\t%s\t%s\n" %
                                              (cluster_id, locations, tree))
                                last_cluster_id = cluster_id

                        if options.filename_output:
                            outfile.close()

    # "lists", "lists-union" and "pairs" keep their last output file open
    # across iterations; close it here. Other methods have closed theirs.
    if options.filename_output and outfile is not None \
            and not outfile.closed:
        outfile.close()

    E.Stop()
plot.addSeparator() last_ndups = ndups map_gene2location = {} for l in in_locations.split(";"): gene_id, chr, strand, sbjct_from, sbjct_to = l.split(":") if chr not in map_contig2size: continue sbjct_from, sbjct_to = int(sbjct_from), int(sbjct_to) map_gene2location[gene_id] = (chr, strand, sbjct_from, sbjct_to) if not map_gene2location: continue tree = TreeTools.Newick2Tree(in_tree) # the last subset is all nodes again. s = TreeTools.GetSubsets(tree) is_first = True for children, height, branchlength in s[:-1]: if len(children) == 1: continue data = [] for c in map(lambda x: x.split(options.separator), children): if len(c) == 2: data.append((c[0], c[1], c[1], "CG")) elif len(c) == 1: data.append(("unk", c[0], c[0], "CG")) elif len(c) == 3: data.append((c[0], c[2], c[3], "CG"))