def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/export_clade_data.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--filename-groups", dest="filename_groups",
                      type="string",
                      help="filename with orthologous groups to extract.")

    parser.set_defaults(
        table_name_malis="malis_genes_aa",
        table_name_members="groups_members",
        mode="sequences",
        filename_groups=None,
        output_format="fasta",
        separator="|",
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    if options.filename_groups:
        data, errors = IOTools.ReadList(open(options.filename_groups, "r"))
        groups = map(lambda x: x.split(options.separator)[:2], data)

        result = getMembersOfGroups(dbhandle, groups, options)

    if options.output_format == "fasta":
        for schema, gene_id, sequence in result:
            options.stdout.write(">%s%s%s\n%s\n" %
                                 (schema, options.separator, gene_id,
                                  re.sub("-", "", sequence)))

    E.Stop()
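# A hypothetical invocation (file name assumed for illustration): groups are
# read as "schema|gene_id" lines, member sequences are fetched from postgres
# and written as FASTA with alignment gaps removed:
#
#   python optic/export_clade_data.py --filename-groups=groups.txt > clade.fasta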
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_test.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use [t-test=t-test,wilcox=wilcox]",
                      choices=("t-test", "wilcox"))
    parser.add_option("-1", "--infile", dest="filename_input", type="string",
                      help="input filename with vector of values.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename with vector of values.")
    parser.add_option("--header", dest="header", type="string",
                      help="header of value column [default=%default].")

    parser.set_defaults(
        method="t-test",
        filename_input=None,
        header="value",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    if options.filename_input:
        infile = open(options.filename_input, "r")
    else:
        infile = sys.stdin

    values, errors = IOTools.ReadList(infile, map_function=float)

    if options.filename_input:
        infile.close()

    if errors:
        E.warn("errors in input: %s" % ";".join(map(str, errors)))

    # split extra arguments into keyword and positional arguments
    # that are passed through to the R test functions
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.filename_input2:
        infile = open(options.filename_input2, "r")
        values2, errors2 = IOTools.ReadList(infile, map_function=float)
        infile.close()
    else:
        values2 = None

    stat = Stats.Summary(values)

    power, diff_at_power95 = None, None

    if options.method == "t-test":
        if values2:
            result = R.t_test(values, values2, *xargs, **kwargs)
        else:
            result = R.t_test(values, *xargs, **kwargs)

            # compute power of the one-sample test
            power = R.power_t_test(n=len(values),
                                   delta=abs(stat["mean"]),
                                   sd=stat["stddev"],
                                   sig_level=0.05)['power']

            diff_at_power95 = R.power_t_test(n=len(values),
                                             power=0.95,
                                             sd=stat["stddev"],
                                             sig_level=0.05)['delta']

    if options.method == "wilcox":
        result = R.wilcox_test(values, *xargs, **kwargs)

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key, value in sorted(result.items()):
        if key == "data.name":
            continue
        if key == "p.value":
            options.stdout.write("%s\t%5.2e\n" % (str(key), value))
        else:
            options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    for key, value in stat.items():
        options.stdout.write("%s\t%s\n" % (str(key), str(value)))

    if power is not None:
        options.stdout.write("1-power\t%5.2e\n" % (1.0 - power))
        options.stdout.write("diff_at_power95\t%f\n" % diff_at_power95)

    E.Stop()
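# Example usage (hypothetical file names; values are read one per line, and
# extra "key=value" arguments on the command line are passed through to the
# underlying R test function):
#
#   python r_test.py --method=t-test --infile=sample1.txt --infile2=sample2.txt
#   python r_test.py --method=wilcox < values.txt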
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="string",
                      help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]")
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("-p", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
    )

    (options, args) = E.Start(parser,
                              add_pipe_options=True,
                              add_psql_options=True)

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(
            open(options.filename_input_map, "r"),
            map_functions=(str, float))

    values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"),
                                        map_category=map_category2value)
    values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"),
                                        map_category=map_category2value)

    E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" %
           (len(values1), len(errors1), len(values2), len(errors2)))

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2, paired=False)

    R.assign("v1", values1)
    R.assign("v2", values2)

    R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

    R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
    R("""qqplot(v1, v2, main='Quantile-quantile plot'); lines(c(0,1), c(0,1));""")
    R("""hist(v1, freq=FALSE, width=0.5, density=10,
              main='Relative frequency histogram')""")
    R("""hist(v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5,
              density=20, angle=135)""")
    R("""hist(v1, freq=TRUE, width=0.5, density=10,
              main='Absolute frequency histogram')""")
    R("""hist(v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5,
              density=20, angle=135)""")

    print "## Results for %s" % result['method']
    for x in ['p.value', 'statistic', 'alternative', 'method']:
        print x, result[x]

    E.Stop()
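# Example usage (hypothetical file names); writes the four diagnostic plots
# to test.png and the test results to stdout:
#
#   python r_mann_whitney_u.py --method=mwu -1 dist1.txt -2 dist2.txt \
#       --hardcopy=test.png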
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: r_compare_distributions.py 2782 2009-09-10 11:40:29Z andreas $")

    parser.add_option("-m", "--method", dest="method", type="choice",
                      help="method to use: ks=Kolmogorov-Smirnov, "
                      "mwu=Mann-WhitneyU, shapiro=Shapiro-Wilk, "
                      "paired-mwu=paired Mann-WhitneyU, paired-t=paired t-test "
                      "[default=%default]",
                      choices=("ks", "mwu", "shapiro", "paired-mwu", "paired-t"))
    parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string",
                      help="write hardcopy to file.", metavar="FILE")
    parser.add_option("-1", "--infile1", dest="filename_input1", type="string",
                      help="input filename for distribution 1.")
    parser.add_option("-2", "--infile2", dest="filename_input2", type="string",
                      help="input filename for distribution 2.")
    parser.add_option("--plot-legend", dest="legend", type="string",
                      help="legend for histograms.")
    parser.add_option("-f", "--infile-map", dest="filename_input_map",
                      type="string",
                      help="input filename for mapping categories to values.")
    parser.add_option("-n", "--norm-test", dest="norm_test", action="store_true",
                      help="test if a set of values is normally distributed. "
                      "Mean and variance are calculated from the data.")
    parser.add_option("-b", "--num-bins", dest="num_bins", type="int",
                      help="number of bins (for plotting purposes only).")
    parser.add_option("--bin-size", dest="bin_size", type="float",
                      help="bin size for plot.")
    parser.add_option("--min-value", dest="min_value", type="float",
                      help="minimum value for plot.")
    parser.add_option("--max-value", dest="max_value", type="float",
                      help="maximum value for plot.")
    parser.add_option("--skip-plot", dest="plot", action="store_false",
                      help="skip plotting.")
    parser.add_option("--header-names", dest="header", type="string",
                      help="header of value column [default=%default].")
    parser.add_option("--title", dest="title", type="string",
                      help="plot title [default=%default].")

    parser.set_defaults(
        method="ks",
        filename_input1=None,
        filename_input2=None,
        filename_input_map=None,
        legend=None,
        norm_test=False,
        num_bins=0,
        legend_range="2,2",
        bin_size=None,
        min_value=None,
        plot=True,
        header="value",
        title=None,
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    # split extra arguments into keyword and positional arguments
    # that are passed through to the R test functions
    kwargs = {}
    xargs = []
    for arg in args:
        if "=" in arg:
            key, value = arg.split("=")
            kwargs[key] = value
        else:
            xargs.append(arg)

    if options.legend:
        options.legend = options.legend.split(",")

    map_category2value = {}
    if options.filename_input_map:
        map_category2value = IOTools.ReadMap(
            open(options.filename_input_map, "r"),
            map_functions=(str, float))
        f = str
    else:
        f = float

    if options.filename_input1:
        infile1 = IOTools.openFile(options.filename_input1, "r")
    else:
        infile1 = sys.stdin

    values1, errors1 = IOTools.ReadList(infile1,
                                        map_function=f,
                                        map_category=map_category2value)

    if options.filename_input1:
        infile1.close()

    if errors1 and options.loglevel >= 3:
        options.stdlog.write("# errors in input1: %s\n" %
                             ";".join(map(str, errors1)))

    if options.norm_test:
        # compare against a sample from a normal distribution with the
        # same mean and standard deviation as the input values
        mean = R.mean(values1)
        stddev = R.sd(values1)
        options.stdlog.write(
            "# creating %i samples from normal distribution with mean %f and stddev %f\n" %
            (len(values1), mean, stddev))
        values2 = R.rnorm(len(values1), mean, stddev)
        errors2 = ()
    else:
        values2, errors2 = IOTools.ReadList(
            open(options.filename_input2, "r"),
            map_function=f,
            map_category=map_category2value)

    if errors2 and options.loglevel >= 3:
        options.stdlog.write("# errors in input2: %s\n" %
                             ";".join(map(str, errors2)))

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i\n" %
            (len(values1), len(errors1), len(values2), len(errors2)))

    if options.method in ("paired-mwu", "paired-t"):
        if len(values1) != len(values2):
            raise ValueError("number of values must be equal for paired tests.")

    if options.hardcopy:
        R.png(options.hardcopy, width=1024, height=768)

    if options.method == "ks":
        result = R.ks_test(values1, values2, *xargs, **kwargs)
    elif options.method == "mwu":
        result = R.wilcox_test(values1, values2,
                               paired=False, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-mwu":
        result = R.wilcox_test(values1, values2,
                               paired=True, correct=True,
                               *xargs, **kwargs)
    elif options.method == "paired-t":
        result = R.t_test(values1, values2, paired=True, *xargs, **kwargs)
    elif options.method == "shapiro":
        if len(values1) > 5000:
            E.warn("shapiro-wilk test only accepts < 5000 values, "
                   "a random sample has been created.")
            values1 = random.sample(values1, 5000)
        result = R.shapiro_test(values1, *xargs, **kwargs)

    if options.plot:
        R.assign("v1", values1)
        R.assign("v2", values2)

        if options.title:
            # set the size of the outer margins - the title needs to be
            # added at the end after the plots have been created
            R.par(oma=R.c(0, 0, 4, 0))

        R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True))

        R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot")
        R("""qqplot(v1, v2, main='Quantile-quantile plot'); lines(c(0,1), c(0,1));""")

        # compute breaks:
        min_value = min(min(values1), min(values2))
        if options.min_value is not None:
            min_value = min(min_value, options.min_value)

        max_value = max(max(values1), max(values2))
        if options.max_value is not None:
            max_value = max(max_value, options.max_value)

        extra_options = ""
        if options.num_bins and not (options.min_value or options.max_value):
            extra_options += ", breaks=%i" % options.num_bins

        elif options.num_bins and (options.min_value or options.max_value):
            bin_size = float((max_value - min_value)) / (options.num_bins + 1)
            breaks = [min_value + x * bin_size
                      for x in range(options.num_bins)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        elif options.bin_size is not None:
            num_bins = int(((max_value - min_value) / options.bin_size)) + 1
            breaks = [min_value + x * options.bin_size
                      for x in range(num_bins + 1)]
            extra_options += ", breaks=c(%s)" % ",".join(map(str, breaks))

        R("""h1 <- hist(v1, freq=FALSE, density=20,
                        main='Relative frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist(v2, freq=FALSE, add=TRUE, density=20, col='red',
                        offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend((max(c(h1$breaks[-1], h2$breaks[-1])) -
                         min(c(h1$breaks[1], h2$breaks[1]))) / 2,
                        max(max(h1$density), max(h2$density)) / 2,
                        c('%s'), fill=c('white','red'))""" %
              ("','".join(options.legend)))

        R("""h1 <- hist(v1, freq=TRUE, density=20,
                        main='Absolute frequency histogram' %s)""" % extra_options)
        R("""h2 <- hist(v2, freq=TRUE, add=TRUE, density=20, col='red',
                        offset=0.5, angle=135 %s)""" % extra_options)
        if options.legend:
            R("""legend((max(c(h1$breaks[-1], h2$breaks[-1])) -
                         min(c(h1$breaks[1], h2$breaks[1]))) / 2,
                        max(max(h1$counts), max(h2$counts)) / 2,
                        c('%s'), fill=c('white','red'))""" %
              ("','".join(options.legend)))

        if options.title:
            R.mtext(options.title, 3, outer=True, line=1, cex=1.5)

    if options.loglevel >= 1:
        options.stdout.write("## Results for %s\n" % result['method'])

    options.stdout.write("%s\t%s\n" % ("key", options.header))

    for key in list(result.keys()):
        if key == "data.name":
            continue
        options.stdout.write("\t".join((key, str(result[key]))) + "\n")

    stat = Stats.Summary(values1)
    for key, value in list(stat.items()):
        options.stdout.write("%s1\t%s\n" % (str(key), str(value)))

    stat = Stats.Summary(values2)
    for key, value in list(stat.items()):
        options.stdout.write("%s2\t%s\n" % (str(key), str(value)))

    if options.plot:
        if options.hardcopy:
            R.dev_off()

    E.Stop()
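# Example usage (hypothetical file names). The second sample can come from a
# file (-2) or, with --norm-test, from a normal distribution fitted to the
# first sample:
#
#   python r_compare_distributions.py --method=ks -1 a.txt -2 b.txt
#   python r_compare_distributions.py --method=ks --norm-test < a.txt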
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: malis2malis.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-d", "--pattern-output", dest="pattern_output",
                      type="string",
                      help="filename pattern for output multiple alignment files.")
    parser.add_option("-f", "--filename-filter", dest="filename_filter",
                      type="string",
                      help="filename with strings to filter by.")
    parser.add_option("--list-filter", dest="list_filter", type="string",
                      help="list of strings to filter by.")

    parser.set_defaults(
        pattern_output="%s.mali",
        methods="",
        parameters="",
        filename_filter=None,
        list_filter=None,
    )

    addOptions(parser)

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise ValueError(
            "please specify a pattern to find the malis using --pattern-mali")

    ########################################################################
    # Read components
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents(options)

    ########################################################################
    # Read filtering information
    if options.filename_filter:
        id_filter, nerrors = IOTools.ReadList(
            open(options.filename_filter, "r"))
        if options.loglevel >= 1:
            options.stdlog.write(
                "# read %i identifiers to filter each multiple alignment with.\n" %
                len(id_filter))
            options.stdlog.flush()
    elif options.list_filter:
        id_filter = options.list_filter.split(",")
    else:
        id_filter = None

    ########################################################################
    # Read regions to mask
    map_component2masks = readMasks(options, map_component2input_id)

    ########################################################################
    # Read regions to extract
    map_component2extracts = readExtracts(options, map_component2input_id)

    ########################################################################
    # Read regions to annotate
    map_component2annotations = readAnnotations(options,
                                                map_component2input_id)

    ########################################################################
    # Prepare for run
    component_ids = map_component2seq_id.keys()
    component_ids.sort()

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids to start with.\n" %
                             (len(component_ids)))

    component_ids, map_sample2reference = selectComponents(
        component_ids,
        map_component2seq_id,
        map_component2input_id,
        id_filter,
        options)

    if options.test:
        component_ids = component_ids[:options.test]

    if options.loglevel >= 1:
        options.stdlog.write("# %i component ids selected for output.\n" %
                             (len(component_ids)))

    ninput = 0
    noutput = 0
    nskipped = 0
    nskipped_length = 0

    for component_id in component_ids:

        ninput += 1

        if options.loglevel >= 3:
            options.stdlog.write("# processing component %s\n" %
                                 (component_id))

        mali = getMali(component_id,
                       map_component2seq_id,
                       map_component2input_id,
                       id_filter,
                       options)

        if mali is None:
            E.warn("empty mali returned for component %s" % (component_id))
            nskipped += 1
            continue

        if mali.getNumColumns() == 0:
            E.warn("skipping output of empty alignment for component %s" %
                   (component_id))
            nskipped += 1
            continue

        mali.setName(str(component_id))

        ###################################################################
        # add annotations
        if map_component2annotations is not None:
            annotateAlignment(mali, map_component2annotations, options)

        ###################################################################
        # mask the alignment
        maskAlignment(mali,
                      map_component2masks,
                      map_component2extracts,
                      map_sample2reference,
                      options)

        if mali.getNumColumns() < options.minimum_mali_length:
            nskipped_length += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped, because length %i less than threshold.\n" %
                    (component_id, mali.getNumColumns()))
            continue

        ###################################################################
        # prepare the mali for output
        if "%s" not in options.pattern_output:
            append = True
        else:
            append = False

        output_filename = re.sub("%s", component_id, options.pattern_output)
        input_id = map_component2input_id[component_id]

        if options.loglevel >= 2:
            options.stdlog.write("# component %s: input from %s, goes to %s\n" %
                                 (component_id, input_id, output_filename))

        dirname = os.path.dirname(output_filename)

        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)

        if not os.path.exists(output_filename):
            mali.writeToFile(open(output_filename, "w"),
                             format=options.output_format)
            noutput += 1
        else:
            if append:
                mali.writeToFile(open(output_filename, "a"),
                                 format=options.output_format)
                noutput += 1
            else:
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# skipping because output for component %s already exists: %s\n" %
                        (component_id, output_filename))
                nskipped += 1

        # if we only sample, stop once we have reached the desired number
        if options.sample and noutput == options.sample:
            break

    E.info("ninput=%i, noutput=%i, nskipped=%i, nskipped_length=%i" %
           (ninput, noutput, nskipped, nskipped_length))

    E.Stop()
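# Example usage (hypothetical file names; --pattern-mali and the component,
# mask and sample options are assumed to be registered by addOptions()):
#
#   python malis2malis.py --pattern-mali="data/%s.fasta" \
#       --pattern-output="out/%s.mali" --filename-filter=keep_ids.txt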
aggregate="mean", value_format="%5.2f", method="counts") (options, args) = E.Start(parser) if not options.filename_map: raise "please supply filename mapping probesets to identifiers." map_probe2locus = IOTools.ReadMap(open(options.filename_map, "r")) matrix, row_headers, col_headers = MatlabTools.readMatrix( sys.stdin, format="full", headers=options.headers) if options.filename_tissues: tissues, nerrors = IOTools.ReadList(open(options.filename_tissues, "r")) tissues = set(tissues) columns = [] for x in range(len(col_headers)): if col_headers[x] in tissues: columns.append(x) else: columns = range(len(col_headers)) nrows, ncols = len(row_headers), len(col_headers) ninput, noutput, nkept = 0, 0, 0 no_map = [] degenerate = []
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: mali2summary.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-i", "--input-format", dest="input_format",
                      type="choice",
                      choices=("plain", "fasta", "clustal", "stockholm"),
                      help="input format of multiple alignment")
    parser.add_option("-a", "--alphabet", dest="alphabet", type="choice",
                      choices=("aa", "na"),
                      help="alphabet to use [default=%default].")
    parser.add_option("-p", "--pattern-mali", dest="pattern_mali",
                      type="string",
                      help="filename pattern for input multiple alignment files.")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        mask_chars="nN",
        gap_chars="-.",
        alphabet="na",
        pattern_mali=None,
    )

    (options, args) = E.Start(parser)

    if options.pattern_mali:
        prefix_header = "prefix\t"
        prefix_row = "\t"
    else:
        prefix_header = ""
        prefix_row = ""

    options.stdout.write(
        "%sncol_mean\tpcol_mean\tncol_median\tpcol_median\t"
        "nrow_mean\tprow_mean\tnrow_median\tprow_median\n" %
        (prefix_header, ))

    ninput, nskipped, noutput, nempty = 0, 0, 0, 0

    if options.pattern_mali:

        ids, errors = IOTools.ReadList(sys.stdin)

        E.debug("read %i identifiers." % len(ids))

        nsubstitutions = len(re.findall("%s", options.pattern_mali))

        for id in ids:

            filename = options.pattern_mali % tuple([id] * nsubstitutions)
            ninput += 1

            if not os.path.exists(filename):
                nskipped += 1
                continue

            # read multiple alignment in various formats
            mali = Mali.Mali()
            mali.readFromFile(open(filename, "r"),
                              format=options.input_format)

            if mali.isEmpty():
                nempty += 1
                continue

            E.debug("read mali with %i entries from %s." %
                    (len(mali), filename))

            if analyzeMali(mali, options, prefix_row="%s\t" % id):
                noutput += 1

    else:

        # read multiple alignment in various formats
        mali = Mali.Mali()
        mali.readFromFile(sys.stdin, format=options.input_format)
        ninput += 1

        if mali.isEmpty():
            nempty += 1
        else:
            E.debug("read mali with %i entries." % (len(mali)))

            if analyzeMali(mali, options, prefix_row=""):
                noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i, nempty=%i." %
           (ninput, noutput, nskipped, nempty))

    E.Stop()
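# Example usage: summarize a single alignment from stdin, or many alignments
# by piping in identifiers that are substituted into --pattern-mali
# (hypothetical file names):
#
#   python mali2summary.py --input-format=fasta < aln.fasta
#   cat ids.txt | python mali2summary.py --pattern-mali="malis/%s.fasta"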
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gpipe/filter_fasta.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=("longest-transcript", "ids", "quality"),
                      help="method to apply to sequences "
                      "['longest-transcript', 'ids', 'quality'].")
    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")
    parser.add_option("-t", "--type", dest="type", type="choice",
                      choices=("aa", "na"),
                      help="sequence type (aa or na).")

    parser.set_defaults(
        methods="",
        parameters="",
        type="na",
        aa_mask_chars="xX",
        aa_mask_char="x",
        na_mask_chars="nN",
        na_mask_char="n",
        gap_chars="-.",
        gap_char="-",
        template_identifier="ID%06i",
        separator="|",
    )

    (options, args) = E.Start(parser)
    options.parameters = options.parameters.split(",")

    iterator = FastaIterator.FastaIterator(sys.stdin)

    if options.method == "quality":
        filter_quality = set(options.parameters)
    else:
        filter_quality = None

    sequences = []
    ninput, noutput, nskipped = 0, 0, 0

    while 1:
        try:
            cur_record = iterator.next()
        except StopIteration:
            break

        ninput += 1

        if filter_quality:
            id = re.split(" ", cur_record.title)[0]
            species, transcript, gene, quality = id.split(options.separator)
            if quality not in filter_quality:
                nskipped += 1
                continue

        sequences.append(cur_record)

    take = None

    if options.method == "longest-transcript":

        take = []
        lengths = []
        for x in range(len(sequences)):
            l = len(re.sub(" ", "", sequences[x].sequence))
            id = re.split(" ", sequences[x].title)[0]
            species, transcript, gene = id.split(options.separator)[:3]
            lengths.append((species, gene, -l, x))

        lengths.sort()

        # after sorting by (species, gene, -length), keep the first,
        # i.e. longest, transcript per species/gene pair
        last_species = None
        last_gene = None

        for species, gene, l, x in lengths:
            if last_species == species and last_gene == gene:
                continue
            take.append(x)
            last_species, last_gene = species, gene

    elif options.method == "ids":

        take = []
        ids, nerrors = IOTools.ReadList(open(options.parameters[0], "r"))
        del options.parameters[0]

        ids = set(ids)

        for x in range(len(sequences)):
            id = re.split(" ", sequences[x].title)[0]
            if id in ids:
                take.append(x)

    if take is not None:
        sequences = map(lambda x: sequences[x], take)

    noutput = len(sequences)

    for sequence in sequences:
        options.stdout.write(">%s\n%s\n" %
                             (sequence.title, sequence.sequence))

    if options.loglevel >= 1:
        options.stdlog.write("# ninput=%i, noutput=%i, nskipped=%i\n" %
                             (ninput, noutput, nskipped))

    E.Stop()
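# Example usage (hypothetical file names). For method "ids", the first
# parameter is a file with the identifiers to keep:
#
#   python gpipe/filter_fasta.py --method=longest-transcript < in.fasta > out.fasta
#   python gpipe/filter_fasta.py --method=ids --parameters=keep_ids.txt < in.fasta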
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/analyze_genetrees.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-r", "--species-regex", dest="species_regex",
                      type="string",
                      help="regular expression to extract species from identifier.")
    parser.add_option("--gene-regex", dest="gene_regex", type="string",
                      help="regular expression to extract gene from identifier.")
    parser.add_option("--filename-filter-positives",
                      dest="filename_filter_positives", type="string",
                      help="filename with positive list of trees to analyze.")
    parser.add_option("-s", "--filename-species-tree",
                      dest="filename_species_tree", type="string",
                      help="filename with species tree.")
    parser.add_option("--filename-species2colour",
                      dest="filename_species2colour", type="string",
                      help="filename with map of species to colours. If not "
                      "given, random colours are assigned to species.")
    parser.add_option("-t", "--species-tree", dest="species_tree",
                      type="string",
                      help="species tree.")
    parser.add_option("-e", "--filename-locations", dest="filename_locations",
                      type="string",
                      help="filename with map of transcript information to "
                      "location information.")
    parser.add_option("--no-create", dest="create", action="store_false",
                      help="do not create files, but append to them.")
    parser.add_option("--max-separation", dest="max_separation", type="int",
                      help="maximum allowable separation between syntenic "
                      "segments for border plot (set to 0, if synteny is enough).")
    parser.add_option("--filename-species2url", dest="filename_species2url",
                      type="string",
                      help="filename with mapping information of species to URL.")
    parser.add_option("--prefix", dest="prefix", type="string",
                      help="prefix to add as first column.")
    parser.add_option("--outgroup-species", dest="outgroup_species",
                      type="string",
                      help="species to use as outgroups. Separate multiple "
                      "species by ','.")
    parser.add_option("--subtrees-trees", dest="subtrees_trees",
                      action="store_true",
                      help="write trees for subtrees.")
    parser.add_option("--subtrees-identifiers", dest="subtrees_identifiers",
                      action="store_true",
                      help="write identifiers of subtrees.")
    parser.add_option("--svg-add-ids", dest="svg_add_ids",
                      action="store_true",
                      help="add node ids to svg plot.")
    parser.add_option("--svg-otus", dest="svg_otus", type="string",
                      help="otus to output in svg species tree.")
    parser.add_option("--svg-branch-lengths", dest="svg_branch_lengths",
                      type="choice",
                      choices=("contemporary", "uniform", "median"),
                      help="branch lengths in species tree.")
    parser.add_option("--print-totals", dest="print_totals",
                      action="store_true",
                      help="output totals sections.")
    parser.add_option("--print-subtotals", dest="print_subtotals",
                      action="store_true",
                      help="output subtotals sections.")
    parser.add_option("--print-best", dest="print_best", action="store_true",
                      help="output best node assignment for each node in "
                      "gene tree.")
    parser.add_option("--print-svg", dest="print_svg", action="store_true",
                      help="output svg files.")
    parser.add_option("--print-species-svg", dest="print_species_svg",
                      action="store_true",
                      help="output species svg files.")
    parser.add_option("--output-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern for separate output of sections "
                      "[default: %default]. Set to None, if output to stdout. "
                      "Can contain one %s to be substituted with section.")
    parser.add_option("--output-pattern-svg", dest="output_pattern_svg",
                      type="string",
                      help="filename for svg output. If it contains %s, this "
                      "is replaced by gene_tree name.")
    parser.add_option("--filename-node-types", dest="filename_node_types",
                      type="string",
                      help="filename with node type information from a "
                      "previous run.")
    parser.add_option("--analyze-resolution-data",
                      dest="analyze_resolution_data",
                      type="choice", action="append",
                      choices=("stats", "histograms"),
                      help="stdin is resolution data.")
    parser.add_option("--filter-quality", dest="filter_quality",
                      type="choice",
                      choices=("all", "genes", "pseudogenes"),
                      help="filter predictions by gene type.")
    parser.add_option("--filter-location", dest="filter_location",
                      type="choice",
                      choices=("all", "local", "non-local", "cis", "unplaced"),
                      help="filter predictions by location.")
    parser.add_option("--remove-unplaced", dest="remove_unplaced",
                      action="store_true",
                      help="remove predictions on unplaced contigs.")
    parser.add_option("--skip-without-outgroups", dest="skip_without_outgroups",
                      action="store_true",
                      help="skip clusters without outgroups.")

    parser.set_defaults(
        filter_quality="all",
        filter_location="all",
        remove_unplaced=False,
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        filename_species_tree=None,
        priority={
            "Speciation": 0,
            "SpeciationDeletion": 1,
            "Transcripts": 2,
            "DuplicationLineage": 3,
            "Duplication": 4,
            "DuplicationDeletion": 5,
            "DuplicationInconsistency": 6,
            "Outparalogs": 7,
            "InconsistentTranscripts": 8,
            "Inconsistency": 9,
            "Masked": 10,
        },
        species_tree=None,
        filename_species2colour=None,
        filename_locations=None,
        max_separation=0,
        filename_species2url=None,
        separator="|",
        prefix=None,
        output_pattern=None,
        output_pattern_svg=None,
        outgroup_species=None,
        svg_add_ids=False,
        svg_branch_lengths="median",
        svg_otus=None,
        subtrees=False,
        print_svg=False,
        print_subtotals=False,
        print_totals=False,
        print_best=False,
        subtrees_identifiers=False,
        create=True,
        min_branch_length=0.00,
        filename_node_types=None,
        format_branch_length="%6.4f",
        nodetypes_inconsistency=("InconsistentTranscripts", "Inconsistency"),
        analyze_resolution_data=None,
        warning_small_branch_length=0.01,
        filename_filter_positives=None,
        skip_without_outgroups=False,
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    if options.outgroup_species:
        options.outgroup_species = set(options.outgroup_species.split(","))

    if options.svg_otus:
        options.svg_otus = set(options.svg_otus.split(","))

    rx_species = re.compile(options.species_regex)
    extract_species = lambda x: rx_species.match(x).groups()[0]

    if options.gene_regex:
        rx_gene = re.compile(options.gene_regex)
        extract_gene = lambda x: rx_gene.match(x).groups()[0]
    else:
        extract_gene = None

    extract_quality = lambda x: x.split(options.separator)[3]

    #########################################################################
    # read positive list of malis
    if options.filename_filter_positives:
        filter_positives, nerrors = IOTools.ReadList(
            open(options.filename_filter_positives, "r"))
        filter_positives = set(filter_positives)
    else:
        filter_positives = None

    #########################################################################
    # read location info
    if options.filename_locations:
        map_id2location = TreeReconciliation.readLocations(
            open(options.filename_locations, "r"),
            extract_species)
    else:
        map_id2location = {}

    if (options.remove_unplaced or options.filter_location != "all") \
            and not options.filename_locations:
        raise ValueError("please supply a file with location information.")

    #########################################################################
    # delete output files
    if options.create and options.output_pattern:
        for section in ("details", "subtrees", "subids",
                        "trees", "nodes", "categories"):
            fn = options.output_pattern % section
            if os.path.exists(fn):
                if options.loglevel >= 1:
                    options.stdlog.write("# deleting file %s.\n" % fn)
                os.remove(fn)

    if options.loglevel >= 1:
        options.stdlog.write("# reading gene trees.\n")
        options.stdlog.flush()

    gene_nexus = TreeTools.Newick2Nexus(sys.stdin)
    Tree.updateNexus(gene_nexus)

    if options.loglevel >= 1:
        options.stdlog.write("# read %i gene trees from stdin.\n" %
                             len(gene_nexus.trees))
        options.stdlog.flush()

    #########################################################################
    # main loop over gene trees
    ninput, nfiltered, nskipped, noutput = 0, 0, 0, 0
    nskipped_filter, nskipped_outgroups = 0, 0

    # total counts
    total_heights_per_species = {}
    total_relheights_per_species = {}
    total_heights_per_tree = []
    total_relheights_per_tree = []

    for gene_tree in gene_nexus.trees:

        ninput += 1

        xname = re.sub("_tree.*", "", gene_tree.name)
        xname = re.sub("subtree_", "", xname)

        if filter_positives and xname not in filter_positives:
            nskipped_filter += 1
            continue

        if options.loglevel >= 6:
            gene_tree.display()

        #####################################################################
        # get identifier for this tree and update prefixes accordingly
        if options.prefix:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix1\tprefix2\t"
                prefix_row = options.prefix + "\t" + gene_tree.name + "\t"
                prefix_prefix = options.prefix + "_" + gene_tree.name + "_"
                prefix_name = options.prefix + "_" + gene_tree.name
            else:
                prefix_header = "prefix\t"
                prefix_row = options.prefix + "\t"
                prefix_prefix = options.prefix + "_"
                prefix_name = options.prefix
        else:
            if len(gene_nexus.trees) > 0:
                prefix_header = "prefix\t"
                prefix_row = gene_tree.name + "\t"
                prefix_prefix = gene_tree.name + "_"
                prefix_name = gene_tree.name
            else:
                prefix_header, prefix_row, prefix_prefix, prefix_name = \
                    "", "", "", ""

        #####################################################################
        # apply filters to gene tree
        TreeReconciliation.filterTree(gene_tree, options, map_id2location)

        otus = TreeTools.GetTaxa(gene_tree)

        if len(otus) <= 1:
            nfiltered += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty after filtering - skipped.\n" %
                    gene_tree.name)
            continue

        this_species_list = map(extract_species, otus)

        # check, if only outgroups
        if options.outgroup_species:
            if not set(this_species_list).difference(options.outgroup_species):
                nfiltered += 1
                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# tree %s: only outgroups after filtering - skipped.\n" %
                        gene_tree.name)
                continue

        if options.skip_without_outgroups and \
                not set(this_species_list).intersection(options.outgroup_species):
            nskipped_outgroups += 1
            if options.loglevel >= 1:
                options.stdlog.write("# tree %s: no outgroups - skipped.\n" %
                                     gene_tree.name)
            continue

        #####################################################################
        # reroot gene tree, if outgroups have been given.
        if options.outgroup_species:
            TreeReconciliation.rerootTree(gene_tree, extract_species, options)

        #####################################################################
        # compute distance to root for each node
        distance_to_root = TreeTools.GetDistanceToRoot(gene_tree)

        #####################################################################
        # compute counts
        # heights per tree
        heights_per_tree = []
        # relative heights per tree
        relheights_per_tree = []
        # distance to root
        heights_per_species = {}
        # distance to root (relative to maximum distance to root)
        relheights_per_species = {}

        analysis_set, gene_set, pseudogene_set, other_set = \
            TreeReconciliation.getAnalysisSets(gene_tree, extract_quality,
                                               options)

        if len(analysis_set) == 0:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: empty analysis set - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        reference_height = TreeReconciliation.getReferenceHeight(
            distance_to_root,
            gene_tree,
            gene_set,
            options,
            extract_species,
            method="median")

        if reference_height is None:
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# tree %s: reference height not computable or 0 - skipped.\n" %
                    gene_tree.name)
            nskipped += 1
            continue

        for node_id in analysis_set:

            node = gene_tree.node(node_id)
            species = extract_species(node.data.taxon)
            height = distance_to_root[node_id]

            if height < options.warning_small_branch_length:
                options.stdlog.write(
                    "# tree %s: small distance %s to root at node %i: %s\n" %
                    (gene_tree.name,
                     options.format_branch_length % height,
                     node_id, node.data.taxon))

            relheight = height / reference_height
            try:
                heights_per_species[species].append(height)
            except KeyError:
                heights_per_species[species] = [height]
                relheights_per_species[species] = []

            relheights_per_species[species].append(relheight)

            # do not use outgroup species
            if options.outgroup_species and \
                    species in options.outgroup_species:
                continue

            heights_per_tree.append(height)
            relheights_per_tree.append(relheight)

        if options.loglevel >= 1:
            options.stdlog.write(
                "# tree %s: reference_height=%s\n" %
                (gene_tree.name,
                 options.format_branch_length % reference_height))
            options.stdlog.flush()

        if options.print_subtotals:
            printCounts(heights_per_species, relheights_per_species,
                        heights_per_tree, relheights_per_tree,
                        options, prefix_header, prefix_row)

        #####################################################################
        # update total counts
        TreeReconciliation.appendCounts(total_heights_per_species,
                                        heights_per_species)
        TreeReconciliation.appendCounts(total_relheights_per_species,
                                        relheights_per_species)
        TreeReconciliation.appendCounts(total_heights_per_tree,
                                        heights_per_tree)
        TreeReconciliation.appendCounts(total_relheights_per_tree,
                                        relheights_per_tree)

        noutput += 1

    if options.print_totals:

        if options.prefix:
            prefix_header = "prefix1\tprefix2\t"
            prefix_row = options.prefix + "\t" + "total" + "\t"
            prefix_prefix = options.prefix + "_" + "total" + "_"
            prefix_name = options.prefix + "_" + "total"
        else:
            prefix_header = "prefix\t"
            prefix_row = "total" + "\t"
            prefix_prefix = "total" + "_"
            prefix_name = "total"

        printCounts(total_heights_per_species, total_relheights_per_species,
                    total_heights_per_tree, total_relheights_per_tree,
                    options, prefix_header, prefix_row)

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, nfiltered=%i, nskipped=%i, nskipped_filter=%i, "
            "nskipped_outgroups=%i, noutput=%i\n" %
            (ninput, nfiltered, nskipped, nskipped_filter,
             nskipped_outgroups, noutput))

    E.Stop()
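# Example usage (hypothetical file and species names); gene trees are read
# from stdin in Newick format:
#
#   cat genetrees.nwk | python optic/analyze_genetrees.py \
#       --outgroup-species=species1,species2 --print-totals --print-subtotals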
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: malis2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    addOptions(parser)

    parser.add_option("--filename-coordinates", dest="filename_coordinates",
                      type="string",
                      help="filename of coordinates that constitute the "
                      "multiple alignment.")
    parser.add_option("--filename-identifiers", dest="filename_identifiers",
                      type="string",
                      help="filename with list of identifiers to use.")
    parser.add_option("-x", "--pattern-identifier", dest="pattern_identifier",
                      type="string",
                      help="pattern to extract identifier from a sequence header.")
    parser.add_option("-w", "--width", dest="width", type="int",
                      help="width of an alignment column (choose 3 for codon "
                      "alignments) [default=%default].")
    parser.add_option("-m", "--method", dest="methods", type="choice",
                      choices=("filter-variants", ),
                      help="methods to apply")
    parser.add_option("-p", "--parameters", dest="parameters", type="string",
                      help="parameter stack for methods that require one.")
    parser.add_option("--mask-acgtn", dest="mask_acgtn", action="store_true",
                      help="mask. Anything not [ACGTN] will be N.")

    parser.set_defaults(
        pattern_identifier="(^\S+)",
        methods=[],
        parameters="",
        filename_identifiers=None,
        filename_coordinates=None,
        mask_acgtn=False,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    if not options.pattern_mali:
        raise ValueError(
            "please specify a pattern to find the malis using --pattern-mali")

    ########################################################################
    # Read components
    map_seq_id2component, map_component2seq_id, map_component2input_id = \
        readComponents(options)

    ########################################################################
    # Read regions to mask
    map_component2masks = readMasks(options, map_component2input_id)

    ########################################################################
    # Read regions to extract
    map_component2extracts = readExtracts(options, map_component2input_id)

    ########################################################################
    # read identifiers
    if options.filename_identifiers:
        identifiers, nerrors = IOTools.ReadList(
            open(options.filename_identifiers, "r"))
        identifiers_set = set(identifiers)
    else:
        identifiers = None
        identifiers_set = None

    ########################################################################
    # Prepare for run
    rx_identifier = re.compile(options.pattern_identifier)

    # build list of concatenated malis
    sequences = {}
    if identifiers:
        for id in identifiers_set:
            sequences[id] = []
    else:
        identifiers_set = set()
        for seq_id in map_seq_id2component.keys():
            id = rx_identifier.search(seq_id).groups()[0]
            sequences[id] = []
            identifiers_set.add(id)
        identifiers = list(identifiers_set)
        identifiers.sort()

    component_ids = map_component2seq_id.keys()
    component_ids.sort()

    if options.test:
        component_ids = component_ids[:options.test]

    ########################################################################
    # Build list of components to output.
    component_ids, map_sample2reference = selectComponents(
        component_ids,
        map_component2seq_id,
        map_component2input_id,
        None,
        options)

    nskipped = 0
    new_component_ids = []

    for component_id in component_ids:

        try:
            mali = getMali(component_id,
                           map_component2seq_id,
                           map_component2input_id,
                           None,
                           options)
        except OSError, msg:
            E.warn("could not find mali %s: %s" % (component_id, msg))
            nskipped += 1
            continue

        ###################################################################
        # check if all identifiers in component are present in mali
        # and build a temporary alignment with all of those found
        component_set = set(map_component2seq_id[component_id])
        if len(component_set.difference(set(mali.getIdentifiers()))) != 0:
            nskipped += 1
            continue

        found = {}
        is_double = None
        temp_mali = Mali.Mali()
        temp_mali.setName(str(component_id))

        for seq_id in map_component2seq_id[component_id]:
            id = rx_identifier.search(seq_id).groups()[0]

            if id not in identifiers_set:
                continue

            if id in found:
                if options.skip_doubles:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# component %s: removed double entry %s\n" %
                            (component_id, seq_id))
                    continue
                else:
                    is_double = id
                    break

            if options.output_format == "codeml":
                if len(mali[seq_id]) % 3 != 0:
                    raise ValueError(
                        "length of sequence %s is not a multiple of 3: %i" %
                        (seq_id, len(mali[seq_id])))

            # change identifier to id
            found[id] = True
            entry = mali.getEntry(seq_id)
            temp_mali.addSequence(id, entry.mFrom, entry.mTo, entry.mString)

        if is_double:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because it contains double entry %s\n" %
                    (component_id, is_double))
            continue

        if set(found.keys()) != identifiers_set:
            nskipped += 1
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# component %s: skipped because incomplete: %s\n" %
                    (component_id, str(found.keys())))
            continue

        ###################################################################
        # mask the temporary alignment
        maskAlignment(temp_mali,
                      map_component2masks,
                      map_component2extracts,
                      map_sample2reference,
                      options)

        for id, o in temp_mali.items():
            if options.mask_acgtn:
                s = re.sub("[^ACGTNacgtn]", "N", o.mString)
            else:
                s = o.mString
            sequences[id].append(s)

        new_component_ids.append(component_id)

        # if we only sample, stop once we have reached the desired number
        if options.sample and len(new_component_ids) == options.sample:
            break
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/regions2gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="pattern to look for sequence filename.")
    parser.add_option("-i", "--ids", dest="ids", type="string",
                      help="comma separated list of prediction ids. Use 'all' "
                      "to use all predictions.")
    parser.add_option("-f", "--filename-ids", dest="filename_ids",
                      type="string",
                      help="filename with prediction ids.")
    parser.add_option("-t", "--type", dest="type", type="choice",
                      choices=("genes", "mrnas", "introns", "intronic",
                               "exons", "exonic", "intergenic",
                               "exons-third-codons"),
                      help="type to output.")
    parser.add_option("-e", "--extend-region", dest="extend_region",
                      type="int",
                      help="regions are extended by this margin at either end.")
    parser.add_option("-r", "--shorten-region", dest="shorten_region",
                      type="int",
                      help="regions are shortened by this margin at either end.")
    parser.add_option("-m", "--min-length", dest="min_length", type="int",
                      help="minimum length of segment.")
    parser.add_option("-s", "--schema", dest="schema", type="string",
                      help="schema to take data from.")
    parser.add_option("-o", "--output-format", dest="output_format",
                      type="choice",
                      choices=("fasta", "table", "region"),
                      help="output formats.")
    parser.add_option("--fasta-format", dest="fasta_format", type="choice",
                      choices=("id-coordinates", "coordinates",
                               "schema-coordinates"),
                      help="output formats for fasta formatted headers.")
    parser.add_option("--orthologs", dest="orthologs", action="store_true",
                      help="look up orthologs of prediction ids.")
    parser.add_option("--multiple", dest="multiple", action="store_true",
                      help="look up predictions in multiple species. "
                      "Identifiers should be given as "
                      "schema|prediction_id[|additional_fields]. "
                      "Note that the genome file locations have to be "
                      "consistent.")
    parser.add_option("--id-format", dest="id_format", type="choice",
                      choices=("id", "schema-id", "full"),
                      help="output format for ids.")
    parser.add_option("--taboo-regions", dest="taboo_regions", type="choice",
                      choices=("same", "both"),
                      help="check for overlap in same/both strands.")
    parser.add_option("--filename-taboo-regions",
                      dest="filename_taboo_regions", type="string",
                      help="filename with information about taboo regions.")
    parser.add_option("--filename-properties", dest="filename_properties",
                      type="string",
                      help="filename with mapping information between "
                      "features and properties.")
    parser.add_option("--invert-properties", dest="invert_properties",
                      action="store_true",
                      help="instead of printing features which have "
                      "properties, print those that have not.")
    parser.add_option("--output-coordinate-format",
                      dest="output_coordinate_format", type="choice",
                      choices=("full", "long"),
                      help="output format of coordinates. Output format is "
                      "contig:strand:from:to in zero based/forward/reverse "
                      "strand coordinates in open/closed notation. "
                      "'long' includes the contig length as fifth field.")

    parser.set_defaults(
        genome_file="genome",
        identifiers=None,
        filename_ids="-",
        ids=None,
        extend_region=0,
        shorten_region=0,
        tablename_predictions="predictions",
        tablename_exons="exons",
        tablename_genes="genes",
        tablename_quality="quality",
        schema=None,
        output_format="fasta",
        fasta_format="id-coordinates",
        type="mrnas",
        min_length=1,
        id_format="id",
        multiple=False,
        separator="|",
        filename_taboo_regions=None,
        output_coordinate_format="full",
        filename_properties=None,
        invert_properties=False,
        report_step=10000)

    (options, args) = E.Start(parser, add_psql_options=True)

    if options.orthologs:
        options.id_format = "schema-id"

    # database handle for connecting to postgres
    dbhandle = pgdb.connect(options.psql_connection)

    # Step 1: input of predictions
    # read identifiers from file, command line arguments or stdin.
    if options.ids in ("all", "nr"):
        prediction_ids = options.ids
        if options.loglevel >= 1:
            options.stdlog.write("# using all prediction ids.\n")
            options.stdlog.flush()
    elif options.ids:
        prediction_ids = options.ids.split(",")
    elif len(args) > 0:
        prediction_ids = args
    elif options.filename_ids:
        prediction_ids = []

        if options.filename_ids == "-":
            prediction_ids += IOTools.ReadList(sys.stdin)[0]
        elif options.filename_ids:
            prediction_ids += IOTools.ReadList(
                open(options.filename_ids, "r"))[0]

        if len(prediction_ids) == 0:
            raise ValueError("no prediction identifiers given.")

        if options.loglevel >= 1:
            options.stdlog.write("# read %i prediction ids.\n" %
                                 len(prediction_ids))
            options.stdlog.flush()

    if options.filename_taboo_regions:
        # Note: the input has to be in forward coordinates in order for
        # option "both" to work.
        taboo_regions = Regions.RegionFilter()
        if options.taboo_regions == "both":
            ignore_strand = True
        else:
            ignore_strand = False
        taboo_regions.readFromFile(open(options.filename_taboo_regions, "r"),
                                   ignore_strand=ignore_strand)
    else:
        taboo_regions = None

    map_feature2property = getMapFeature2Property(options)

    processPredictions(dbhandle, options.schema, options, prediction_ids,
                       taboo_regions, map_feature2property)

    E.Stop()
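# Example usage (hypothetical schema and ids; requires the PostgreSQL
# connection options added by E.Start(..., add_psql_options=True)):
#
#   python optic/regions2gff.py --schema=myschema --type=exons \
#       --ids=1,2,3 --output-format=region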
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    # assumption: the parser is constructed here as in the sibling scripts;
    # the excerpt omits this statement
    parser = E.OptionParser(usage=globals()["__doc__"])

    parser.add_option("-s", "--species", dest="species", type="string",
                      help="schema of master species.")

    parser.set_defaults(
        tablename_orthologs="orthology_pairwise1v5.orthologlinks_first",
        filename_ids="-",
        schemas=None,
        species=None,
    )

    (options, args) = E.Start(parser, add_psql_options=True)

    dbhandle = pgdb.connect(options.psql_connection)

    if options.filename_ids == "-":
        ids, errors = IOTools.ReadList(sys.stdin)

    extra_options = ["schema1 = '%s'" % options.species,
                     "prediction_id1 IN ('%s')" % "','".join(ids)]

    if options.schemas:
        extra_options.append("schema2 IN ('%s')" %
                             "','".join(options.schemas))

    statement = """SELECT prediction_id1, schema2, prediction_id2, gene_id2,
    gd1, gd2, td1, td2
    FROM %s
    WHERE schema1 != schema2 AND %s
    ORDER BY prediction_id1""" % (options.tablename_orthologs,
                                  " AND ".join(extra_options))

    cc = dbhandle.cursor()
    cc.execute(statement)
    result = cc.fetchall()
    cc.close()

    if options.schemas:
        schemas = options.schemas
    else:
        schemas = set(map(lambda x: x[1], result))

    # compute counts
    degeneracies = {}
    for x in ids:
        degeneracies[x] = {}
        for s in schemas:
            degeneracies[x][s] = (0, 0, 0, 0)

    for prediction_id1, schema2, prediction_id2, gene_id2, gd1, gd2, td1, td2 in result:
        degeneracies[prediction_id1][schema2] = (gd1, gd2, td1, td2)

    # output
    options.stdout.write("%s\t%s\n" % ("prediction_id", "\t".join(schemas)))
    for x in ids:
        options.stdout.write("%s" % x)
        for s in schemas:
            options.stdout.write("\t%s:%s:%s:%s" % degeneracies[x][s])
        options.stdout.write("\n")

    E.Stop()
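# Example usage (hypothetical; the script name is not recorded in the
# excerpt). Prediction ids are read from stdin and the master schema is
# given with --species:
#
#   cat prediction_ids.txt | python script.py --species=myschema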
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: optic/analyze_ribosomes.py 2781 2009-09-10 11:33:14Z andreas $", usage=globals()["__doc__"]) parser.add_option("-s", "--schemas", dest="schemas", type="string", help="schemas in the set.") parser.add_option("-e", "--field-extract", dest="field_extract", type="string", help="pattern for the field to extract.") parser.add_option("-c", "--field-compare", dest="field_compare", type="string", help="pattern for the field to compare.") parser.add_option("-i", "--filename-identifiers", dest="filename_identifiers", type="string", help="identifiers in the positive set.") parser.add_option("-u", "--filename-subset", dest="filename_subset", type="string", help="subset in the positive set.") parser.add_option("--filter-min-ratio", dest="filter_min_ratio", type="float", help="minimum boundary for filter.") parser.add_option("--filter-max-ratio", dest="filter_max_ratio", type="float", help="maximum boundary for filter.") parser.add_option( "-o", "--output-fields", dest="output_fields", type="string", help= "output fields, choices are: zscore, val, nvals, sum, min, max, stddev, mean, median." ) parser.add_option( "--output-pattern", dest="output_pattern", type="string", help= "pattern for table headers, should contain %s for schema and %s for field anme." ) parser.add_option( "-f", "--output-format", dest="output_format", type="choice", choices=("table", "list", "values"), help="output format. Tabular form (one row per ortholog) or list form." ) parser.add_option("--format", dest="format", type="string", help="output format for numbers.") parser.add_option("--remove-na", dest="remove_na", action="store_true", help="remove entries with any na values.") parser.set_defaults( field_extract="%s_length", field_compare="%s_length", filename_identifiers=None, filename_subset=None, filter_min_ratio=0.00, filter_max_ratio=0.00, schemas="", output_fields="", output_pattern="%s_%s", output_format="table", format="%6.4f", remove_na=False, ) (options, args) = E.Start(parser, add_csv_options=True) options.schemas = options.schemas.split(",") if not options.schemas: raise "please supply schemas." 
if options.output_fields: options.output_fields = options.output_fields.split(",") else: options.output_fields = () fields, table = CSV.ReadTable(sys.stdin) map_fields2column = {} for x in fields: map_fields2column[x] = len(map_fields2column) if options.loglevel >= 1: options.stdlog.write("# read a %i x %i table.\n" % (len(table), len(fields))) if options.filename_subset: subset, nerrors = IOTools.ReadList(open(options.filename_subset, "r")) subset = set(subset) table = filter(lambda x: x[0] in subset, table) if options.loglevel >= 1: options.stdlog.write( "# subset of %i entries reduced table to a %i x %i table.\n" % (len(subset), len(table), len(fields))) if options.filename_identifiers: identifiers, nerrors = IOTools.ReadList( open(options.filename_identifiers, "r")) else: identifiers = [] identifiers = set(identifiers) # extract rows with positive identifiers positive_rows = filter(lambda x: x[0] in identifiers, table) if options.loglevel >= 1: options.stdlog.write( "# subset of %i identifiers gives %i positive entries.\n" % (len(identifiers), len(positive_rows))) if options.output_format == "table": options.stdout.write("id") for schema in options.schemas: if options.output_fields: for field in options.output_fields: options.stdout.write("\t" + options.output_pattern % (schema, field)) else: options.stdout.write("\t%s" % (schema)) options.stdout.write("\n") else: options.stdout.write("schema\tvalue\n") if identifiers: for row in positive_rows: if options.output_format == "table": options.stdout.write(row[0]) for schema in options.schemas: # set fields for extraction f_extract = map_fields2column[options.field_extract % schema] f_compare = map_fields2column[options.field_compare % schema] # get region for extraction if row[f_compare] != "na": r = float(row[f_compare]) if options.filter_min_ratio or options.filter_max_ratio: mi = r * options.filter_min_ratio ma = r * options.filter_max_ratio f = lambda x: x[f_compare] != "na" and float( x[f_compare] ) >= mi and float(x[f_compare]) <= ma and x[ 0] not in identifiers and x[f_extract] != "na" else: f = lambda x: x[0] not in identifiers and x[f_extract ] != "na" # extract values: filter by minimum and maximum range and remove # positive identifiers. 
                    v = float(row[f_extract])
                    values = map(lambda x: float(x[f_extract]),
                                 filter(f, table))
                    stats = Stats.DistributionalParameters(values)
                else:
                    v = None

                for field in options.output_fields:

                    if v is not None:
                        if field == "zscore":
                            f = options.format % stats.getZScore(v)
                        elif field == "diff":
                            f = options.format % (v - stats["mean"])
                        elif field == "reldiff":
                            f = options.format % ((v - stats["mean"]) /
                                                  stats["mean"])
                        elif field == "val":
                            f = options.format % v
                        else:
                            f = options.format % stats[field]
                    else:
                        f = "na"

                    if options.output_format == "table":
                        options.stdout.write("\t%s" % f)
                    elif options.output_format == "list":
                        options.stdout.write("%s\t%s\n" % (schema, f))
                    elif options.output_format == "values":
                        options.stdout.write(
                            "%s\t%s\t%5.2f\t%s\n" %
                            (row[0], schema, v,
                             ",".join(map(lambda x: options.format % x,
                                          values))))

            if options.output_format == "table":
                options.stdout.write("\n")

    else:

        extract_columns = []
        for schema in options.schemas:
            extract_columns.append(
                map_fields2column[options.field_extract % schema])

        # simply dump a subset of values
        for row in table:

            skip = False

            if options.filter_min_ratio or options.filter_max_ratio:

                master = options.schemas[0]
                v = row[map_fields2column[options.field_compare % master]]
                if v == "na":
                    continue
                v = float(v)

                mi = v * options.filter_min_ratio
                ma = v * options.filter_max_ratio

                for schema in options.schemas[1:]:

                    r = row[map_fields2column[options.field_compare % schema]]
                    if r == "na":
                        if options.remove_na:
                            skip = True
                        continue
                    r = float(r)

                    if r < mi or r > ma:
                        skip = True
                        if options.loglevel >= 3:
                            if options.output_format == "table":
                                options.stdout.write("* ")
                            options.stdout.write("%s\t" % row[0])
                            options.stdout.write("\t".join(
                                [row[y] for y in extract_columns]))
                            options.stdout.write("\n")
                        break

            if skip:
                continue

            if options.output_format == "table":
                options.stdout.write("%s\t" % row[0])
                options.stdout.write("\t".join(
                    [row[y] for y in extract_columns]))
                options.stdout.write("\n")
            elif options.output_format == "list":
                has_na = False
                for x in range(len(options.schemas)):
                    v = row[extract_columns[x]]
                    if v == "na":
                        has_na = True
                if has_na and options.remove_na:
                    continue
                for x in range(len(options.schemas)):
                    options.stdout.write(
                        "%s\t%s\n" %
                        (options.schemas[x], row[extract_columns[x]]))

    E.Stop()
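# Stats.DistributionalParameters is not shown in this file. A minimal
# stand-in for the zscore/diff/reldiff fields computed above - how far
# a value sits from the mean of a background distribution, in units of
# standard deviations (summary_stats and zscore are illustrative names):
import math

def summary_stats(values):
    n = float(len(values))
    mean = sum(values) / n
    stddev = math.sqrt(sum((x - mean) ** 2 for x in values) / n)
    return mean, stddev

def zscore(v, values):
    mean, stddev = summary_stats(values)
    return (v - mean) / stddev if stddev > 0 else 0.0

# toy usage:
background = [100.0, 102.0, 98.0, 101.0, 99.0]
print("%6.4f" % zscore(110.0, background))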
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: matrix2matrix.py 2782 2009-09-10 11:40:29Z andreas $" ) parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=( "normalize-by-min-diagonal", "normalize-by-column", "log", "ln", "negzero2value", "set-diagonal", "subtract-matrix", "mix-matrix", "normalize-by-matrix", "normalize-by-column-max", "normalize-by-row-max", "normalize-by-column-min", "normalize-by-row-min", "normalize-by-column-median", "normalize-by-row-median", "normalize-by-column-mean", "normalize-by-row-mean", "normalize-by-column-total", "normalize-by-row-total", "correspondence-analysis", "normalize-by-value", "add-value", "sort-rows", "sort-columns", "transpose", "upper-bound", "lower-bound", "subtract-first-col", "multiply-by-value", "divide-by-value", "mask-rows", "mask-columns", "mask-rows-and-columns", "symmetrize-mean", "symmetrize-max", "symmetrize-min", ), help="""method to use [default=%default]""") parser.add_option("-s", "--scale", dest="scale", type="float", help="factor to scale matrix by [default=%default].") parser.add_option("-f", "--format", dest="format", type="string", help="output number format [default=%default].") parser.add_option("--filename-rows", dest="filename_rows", type="string", help="filename with rows to mask [default=%default].") parser.add_option("--filename-columns", dest="filename_columns", type="string", help="filename with columns to mask [default=%default].") parser.add_option("-p", "--parameters", dest="parameters", type="string", help="Parameters for various functions.") parser.add_option("-t", "--headers", dest="headers", action="store_true", help="matrix has row/column headers.") parser.add_option("--no-headers", dest="headers", action="store_false", help="matrix has no row/column headers.") parser.add_option("-a", "--value", dest="value", type="float", help="value to use for various algorithms.") parser.add_option("-i", "--input-format", dest="input_format", type="choice", choices=("full", "sparse", "phylip"), help="""input format for matrix.""") parser.add_option("-o", "--output-format", dest="output_format", type="choice", choices=("full", "sparse", "phylip"), help="""output format for matrix.""") parser.add_option( "--missing", dest="missing", type="float", help= "value to use for missing values. If not set, missing values will cause the script to fail [default=%default]." 
    )

    parser.set_defaults(
        methods=[],
        scale=1.0,
        headers=True,
        format="%6.4f",
        output_format="full",
        input_format="full",
        value=0.0,
        parameters="",
        write_separators=True,
        filename_rows=None,
        filename_columns=None,
        missing=None,
    )

    (options, args) = E.Start(parser)

    options.parameters = options.parameters.split(",")

    lines = filter(lambda x: x[0] != "#", sys.stdin.readlines())

    if len(lines) == 0:
        raise IOError("no input")

    chunks = filter(lambda x: lines[x][0] == ">", range(len(lines)))

    if not chunks:
        options.write_separators = False
        chunks = [-1]

    chunks.append(len(lines))

    if options.filename_rows:
        row_names, n = IOTools.ReadList(open(options.filename_rows, "r"))
    if options.filename_columns:
        column_names, n = IOTools.ReadList(open(options.filename_columns, "r"))

    for chunk in range(len(chunks) - 1):

        try:
            raw_matrix, row_headers, col_headers = MatlabTools.readMatrix(
                StringIO.StringIO(
                    "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]])),
                format=options.input_format,
                headers=options.headers,
                missing=options.missing)
        except ValueError, msg:
            E.warn("matrix could not be read: %s" % msg)
            continue

        nrows, ncols = raw_matrix.shape

        E.debug("read matrix: %i x %i, %i row titles, %i column titles" %
                (nrows, ncols, len(row_headers), len(col_headers)))

        parameter = 0

        for method in options.methods:

            matrix = numpy.reshape(numpy.array(raw_matrix), raw_matrix.shape)

            if method in ("normalize-by-matrix", "subtract-matrix",
                          "mix-matrix", "add-matrix"):

                other_matrix, other_row_headers, other_col_headers = \
                    MatlabTools.ReadMatrix(
                        open(options.parameters[parameter], "r"),
                        headers=options.headers)

                other_nrows, other_ncols = other_matrix.shape

                if options.loglevel >= 2:
                    options.stdlog.write(
                        "# read second matrix from %s: %i x %i, "
                        "%i row titles, %i column titles.\n" %
                        (options.parameters[parameter],
                         other_nrows, other_ncols,
                         len(other_row_headers), len(other_col_headers)))

                parameter += 1

            elif method == "normalize-by-min-diagonal":
                for x in range(nrows):
                    for y in range(ncols):
                        m = min(raw_matrix[x, x], raw_matrix[y, y])
                        if m > 0:
                            matrix[x, y] = raw_matrix[x, y] / m

            elif method == "normalize-by-column":
                if nrows != ncols:
                    raise ValueError("only supported for symmetric matrices.")
                for x in range(nrows):
                    for y in range(ncols):
                        if raw_matrix[y, y] > 0:
                            matrix[x, y] = raw_matrix[x, y] / raw_matrix[y, y]

            elif method == "normalize-by-value":
                matrix = raw_matrix / float(options.parameters[parameter])
                parameter += 1

            elif method == "normalize-by-row":
                if nrows != ncols:
                    raise ValueError("only supported for symmetric matrices.")
for x in range(nrows): for y in range(ncols): if raw_matrix[y, y] > 0: matrix[x, y] = raw_matrix[x, y] / raw_matrix[x, x] elif method == "subtract-first-col": for x in range(nrows): for y in range(ncols): matrix[x, y] -= raw_matrix[x, 0] elif method.startswith("normalize-by-column"): if method.endswith("max"): f = max elif method.endswith("min"): f = min elif method.endswith("median"): f = scipy.median elif method.endswith("mean"): f = scipy.mean elif method.endswith("total"): f = sum for y in range(ncols): m = f(matrix[:, y]) if m != 0: for x in range(nrows): matrix[x, y] = matrix[x, y] / m elif method.startswith("normalize-by-row"): if method.endswith("max"): f = max elif method.endswith("min"): f = min elif method.endswith("median"): f = scipy.median elif method.endswith("mean"): f = scipy.mean elif method.endswith("total"): f = sum for x in range(nrows): m = f(matrix[x, :]) if m != 0: for y in range(ncols): matrix[x, y] = raw_matrix[x, y] / m elif method == "negzero2value": # set zero/negative values to a value for x in range(nrows): for y in range(ncols): if matrix[x, y] <= 0: matrix[x, y] = options.value elif method == "minmax": # set zero/negative values to a value for x in range(nrows): for y in range(ncols): matrix[x, y], matrix[y, x] = \ min(matrix[x, y], matrix[y, x]), \ max(matrix[x, y], matrix[y, x]) elif method == "log": # apply log to all values. for x in range(nrows): for y in range(ncols): if matrix[x, y] > 0: matrix[x, y] = math.log10(matrix[x, y]) elif method == "ln": for x in range(nrows): for y in range(ncols): if matrix[x, y] > 0: matrix[x, y] = math.log(matrix[x, y]) elif method == "transpose": matrix = numpy.transpose(matrix) row_headers, col_headers = col_headers, row_headers nrows, ncols = ncols, nrows elif method == "mul": matrix = numpy.dot(matrix, numpy.transpose(matrix)) col_headers = row_headers elif method == "multiply-by-value": matrix *= options.value elif method == "divide-by-value": matrix /= options.value elif method == "add-value": matrix += options.value elif method == "angle": # write angles between col vectors v1 = numpy.sqrt(numpy.sum(numpy.power(matrix, 2), 0)) matrix = numpy.dot(numpy.transpose(matrix), matrix) row_headers = col_headers nrows = ncols for x in range(nrows): for y in range(ncols): matrix[x, y] /= v1[x] * v1[y] elif method == "euclid": # convert to euclidean distance matrix matrix = numpy.zeros((ncols, ncols), numpy.float) for c1 in range(0, ncols - 1): for c2 in range(c1 + 1, ncols): for r in range(0, nrows): d = raw_matrix[r][c1] - raw_matrix[r][c2] matrix[c1, c2] += (d * d) matrix[c2, c1] = matrix[c1, c2] matrix = numpy.sqrt(matrix) row_headers = col_headers nrows = ncols elif method.startswith("symmetrize"): f = method.split("-")[1] if f == "max": f = max elif f == "min": f = min elif f == "mean": f = lambda x, y: float(x + y) / 2 if nrows != ncols: raise ValueError( "symmetrize only available for symmetric matrices") if row_headers != col_headers: raise ValueError( "symmetrize not available for permuted matrices") for x in range(nrows): for y in range(ncols): matrix[x, y] = matrix[y, x] = f(matrix[x, y], matrix[y, x]) elif method == "sub": matrix = options.value - matrix elif method in ("lower-bound", "upper-bound"): boundary = float(options.parameters[parameter]) new_value = float(options.parameters[parameter + 1]) parameter += 2 if method == "upper-bound": for x in range(nrows): for y in range(ncols): if matrix[x, y] > boundary: matrix[x, y] = new_value else: for x in range(nrows): for y in range(ncols): if matrix[x, y] < 
boundary: matrix[x, y] = new_value elif method == "subtract-matrix": matrix = matrix - other_matrix elif method == "add-matrix": matrix = matrix + other_matrix elif method == "normalize-by-matrix": # set 0s to 1 in the other matrix for x in range(nrows): for y in range(ncols): if other_matrix[x, y] == 0: other_matrix[x, y] = 1.0 matrix = matrix / other_matrix elif method == "mix-matrix": for x in range(len(other_row_headers) - 1): for y in range(x + 1, len(other_col_headers)): matrix[x, y] = other_matrix[x, y] elif method == "set-diagonal": value = float(options.parameters[parameter]) for x in range(min(nrows, ncols)): matrix[x, x] = value parameter += 1 elif method == "transpose": matrix = numpy.transpose(raw_matrix) row_headers, col_headers = col_headers, row_headers elif method == "correspondence-analysis": row_indices, col_indices = CorrespondenceAnalysis.GetIndices( raw_matrix) map_row_new2old = numpy.argsort(row_indices) map_col_new2old = numpy.argsort(col_indices) matrix, row_headers, col_headers = CorrespondenceAnalysis.GetPermutatedMatrix( raw_matrix, map_row_new2old, map_col_new2old, row_headers=row_headers, col_headers=col_headers) elif method == "mask-rows": r = set(row_names) for x in range(len(row_headers)): if row_headers[x] in r: matrix[x, :] = options.value elif method == "mask-columns": r = set(column_names) for x in range(len(col_headers)): if col_headers[x] in r: matrix[:, x] = options.value elif method == "mask-rows-and-columns": r = set(row_names) c = set(column_names) for x in range(len(row_headers)): for y in range(len(col_headers)): if row_headers[x] in r and col_headers[y] in c: matrix[x, y] = options.value raw_matrix = numpy.reshape(numpy.array(matrix), matrix.shape) else: # for simple re-formatting jobs matrix = raw_matrix if options.write_separators: options.stdout.write(lines[chunks[chunk]]) MatlabTools.writeMatrix(sys.stdout, matrix, value_format=options.format, format=options.output_format, row_headers=row_headers, col_headers=col_headers)
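# The per-element loops above predate numpy broadcasting. As a sketch,
# "normalize-by-column-total" is simply a division of each column by
# its sum, guarding against all-zero columns (normalize_by_column_total
# is an illustrative name, assuming a plain 2-d float array):
import numpy

def normalize_by_column_total(matrix):
    totals = matrix.sum(axis=0)
    totals[totals == 0] = 1.0  # leave all-zero columns untouched
    return matrix / totals

# toy usage: columns of the result sum to 1
m = numpy.array([[1.0, 2.0], [3.0, 2.0]])
print(normalize_by_column_total(m))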
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: codonbias_acai2tsv.py 865 2007-01-15 13:44:43Z andreas $" ) parser.add_option("-o", "--input-file-trace", dest="input_filename_trace", type="string", help="input filename for cai.", metavar="FILE") parser.add_option("-e", "--input-file-genes", dest="input_filename_genes", type="string", help="input filename for genes information from cai.", metavar="FILE") parser.add_option("-c", "--input-file-codons", dest="input_filename_codons", type="string", help="input filename for codon usage information.", metavar="FILE") parser.add_option("--input-file-sequences", dest="input_filename_sequences", type="string", help="input filename with sequences.", metavar="FILE") parser.add_option("-t", "--input-file-subset", dest="input_filename_subset", type="string", help="input filename with subset.", metavar="FILE") parser.add_option("--codon-table-format", dest="codon_table_format", type="choice", choices=("list", "matrix"), help="output options for output codon tables.") parser.add_option("--codon-table-type", dest="codon_table_type", type="choice", choices=("counts", "frequencies", "weights", "absolute-frequencies"), help="type of codon table.") parser.add_option("-r", "--reference", dest="reference", type="string", help="dump CAI reference weights for species.") parser.add_option("-s", "--select", dest="select", type="string", help="fields to select from genes table.") parser.add_option("-m", "--map", dest="input_filename_map", type="string", help="filename with mapping information for gene names.", metavar="FILE") parser.add_option("-i", "--invert-map", dest="invert_map", action="store_true", help="invert map.") parser.add_option( "-d", "--dominant-set", dest="dominant_set", type="float", help="only print out dominant set (# fraction of most biased genes).") parser.add_option( "--reverse-set", dest="reverse_set", action="store_true", help="print the reverse set, i.e., then non-dominant set.") parser.add_option( "-u", "--codon-usage", dest="codon_usage", type="string", help="print codon usage for the full/biased set of genes [full|biased]." ) parser.add_option( "-w", "--weights", dest="weights", type="string", help= "print weights [final-list|final-matrix|random|compute|weights|frequencies|absolute-frequencies]." 
    )

    parser.add_option("--weights-matrix2table", dest="weights_matrix2table",
                      action="store_true",
                      help="convert a weights matrix to a weights table.")

    parser.add_option("--get-preferred-codons", dest="get_preferred_codons",
                      type="string",
                      help="compute overview of preferred codons.")

    parser.set_defaults(input_filename="-",
                        input_filename_trace=None,
                        input_filename_genes=None,
                        input_filename_codons=None,
                        input_filename_map=None,
                        input_filename_subset=None,
                        input_filename_sequences=None,
                        invert_map=False,
                        select=None,
                        codon_usage=None,
                        weights=None,
                        reverse_set=False,
                        pseudocounts=1,
                        codon_table_format="list",
                        codon_table_type="weights",
                        weights_matrix2table=False,
                        random_size=1000,
                        get_preferred_codons=None,
                        dominant_set=0.0)

    (options, args) = E.Start(parser)

    if options.select:
        options.select = options.select.split(",")

    outfile = options.stdout

    ###################################################################
    # convert weights table to a codon table
    if options.weights_matrix2table:
        lines = options.stdin.readlines()
        data = []
        for line in lines:
            if line[0] == "#":
                continue
            data += list(map(float, line[:-1].split(",")))

        weights = {}
        x = 0
        for cc in OUTPUT_ORDER_CODON_MATRIX:
            for c in cc:
                weights[c] = data[x]
                x += 1

        outfile.write("CODON\tWEIGHT\n")
        codons = weights.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, weights[codon]))

        E.Stop()
        sys.exit(0)

    ###################################################################
    map_genes = {}

    if options.input_filename_map:
        data = map(
            lambda x: x[:-1].split("\t")[:2],
            filter(lambda x: x[0] != "#",
                   open(options.input_filename_map, "r").readlines()))

        for a, b in data:
            if options.invert_map:
                a, b = b, a
            map_genes[a] = b

    result = WrapperAdaptiveCAI.AdaptiveCAIResult()

    if options.input_filename_genes:
        gene_file = open(options.input_filename_genes, "r")
    else:
        gene_file = None

    if options.input_filename_codons:
        codon_file = open(options.input_filename_codons, "r")
    else:
        codon_file = None

    if options.input_filename_trace:
        trace_file = open(options.input_filename_trace, "r")
    else:
        trace_file = None

    if options.input_filename_subset:
        l, e = IOTools.ReadList(open(options.input_filename_subset, "r"))
        subset = set(l)
        if options.loglevel >= 1:
            options.stdlog.write("# read %i entries into subset from %s.\n" %
                                 (len(subset), options.input_filename_subset))
    else:
        subset = None

    result.Read(gene_file=gene_file, codon_file=codon_file,
                trace_file=trace_file)

    if gene_file:
        gene_file.close()
    if codon_file:
        codon_file.close()
    if trace_file:
        trace_file.close()

    if options.reference:
        if options.reference not in CODON_PREFERENCES:
            raise KeyError("unknown species %s: possible species are: %s" %
                           (options.reference, str(CODON_PREFERENCES.keys())))

        weights = Genomics.CalculateCAIWeightsFromCounts(
            CODON_PREFERENCES[options.reference], options.pseudocounts)

        for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
            outfile.write(",".join(
                map(lambda z: "%5.3f" % z,
                    [weights[codon.upper()]
                     for codon in OUTPUT_ORDER_CODON_MATRIX[x]])))
            outfile.write("\n")

    if options.dominant_set and gene_file:
        cai_threshold = result.GetDominantThreshold(options.dominant_set)
    else:
        if options.reverse_set:
            cai_threshold = 1.0
        else:
            cai_threshold = 0.0

    if options.select:

        fields = []
        titles = []
        for x in options.select:
            f = re.match("(\S+) (AS|as) (\S+)", x)
            if f:
                fields.append(f.groups()[0].upper())
                titles.append(f.groups()[2])
            else:
                fields.append(x.upper())
                titles.append(x)

        outfile.write("GENENAME\t" + string.join(titles, "\t") + "\n")

        for genename, data in result.mGeneInfo.items():

            if genename in map_genes:
                genename = map_genes[genename]

            if options.reverse_set:
                if data["CAICLASS"] >= cai_threshold:
                    continue
            else:
                if data["CAICLASS"] < cai_threshold:
                    continue

            outfile.write(genename)
            for c in fields:
                outfile.write("\t%s" % str(data[c]))
            outfile.write("\n")

    if options.weights:

        format = options.codon_table_format

        if options.weights in ("compute-counts", "compute-weights",
                               "compute-frequencies"):

            # compute codon usage weights from a set of sequences
            codons = CODON_PREFERENCES["dmelanogaster"].keys()

            counts = {}
            for x in codons:
                counts[x] = 0

            if options.input_filename_sequences:
                sequences = Genomics.ReadPeptideSequences(
                    open(options.input_filename_sequences, "r"),
                    filter=subset)

                for key, sequence in sequences.items():
                    sequence = re.sub(" ", "", sequence)
                    if len(sequence) % 3 != 0:
                        raise ValueError(
                            "sequence %s is not a multiple of 3" % key)
                    for codon in [sequence[x:x + 3]
                                  for x in range(0, len(sequence), 3)]:
                        counts[codon.upper()] += 1

            if options.weights == "compute-frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "compute-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            else:
                weights = counts

        elif options.weights in ("final-list", "final-matrix"):

            weights = result.mFinalWeights
            if options.weights == "final-list":
                format = "list"
            else:
                format = "matrix"

        elif options.weights == "random":

            # get random weights
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for x in codons:
                counts[x] = random.randint(1, options.random_size)

            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights == "biased":

            # get biased weights from a uniform codon usage table
            counts = Genomics.GetUniformCodonUsage()
            weights = Genomics.CalculateCAIWeightsFromCounts(
                counts, options.pseudocounts)
            format = "matrix"

        elif options.weights in ("uniform-weights", "uniform-frequencies"):

            # get uniform weights
            codons = Genomics.GetUniformCodonUsage()
            if options.weights == "uniform-weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    codons, options.pseudocounts)
                format = "matrix"
            else:
                weights = codons
                format = "list"

        elif options.weights in ("counts", "frequencies",
                                 "absolute-frequencies"):

            # get weights as frequencies, computed from scratch. In the
            # caijava file, the absolute frequency f / gene_length is
            # given. Thus the total number of codons is f * gene_length.
            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():
                if options.reverse_set:
                    if data["CAICLASS"] >= cai_threshold:
                        continue
                else:
                    if data["CAICLASS"] < cai_threshold:
                        continue
                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.weights == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.weights == "counts":
                weights = counts
            elif options.weights == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do
                # not normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m
                format = "list"

        elif options.weights == "subset":

            codons = CODON_PREFERENCES["dmelanogaster"].keys()
            counts = {}
            for c in codons:
                counts[c] = 0

            for genename, data in result.mGeneInfo.items():
                found = genename in subset
                if (not found and not options.reverse_set) or \
                        (found and options.reverse_set):
                    continue
                l = data["GENELENGTH"]
                for c in codons:
                    counts[c] += int(data[c] * l)

            if options.codon_table_type == "frequencies":
                weights = Genomics.CalculateCodonFrequenciesFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "weights":
                weights = Genomics.CalculateCAIWeightsFromCounts(
                    counts, options.pseudocounts)
            elif options.codon_table_type == "counts":
                weights = counts
            elif options.codon_table_type == "absolute-frequencies":
                # compute absolute frequencies (with pseudo-counts, but do
                # not normalize per aa)
                weights = {}
                m = sum(counts.values())
                for k, v in counts.items():
                    weights[k] = float(v) / m
                format = "list"

        else:
            raise ValueError("unknown weights %s" % options.weights)

        if format == "list":
            outfile.write("CODON\tWEIGHT\n")
            codons = weights.keys()
            codons.sort()
            for codon in codons:
                outfile.write("%s\t%f\n" % (codon, weights[codon]))

        elif format == "matrix":
            for x in range(len(OUTPUT_ORDER_CODON_MATRIX)):
                outfile.write(",".join(
                    map(lambda z: "%5.3f" % z,
                        [weights[codon.upper()]
                         for codon in OUTPUT_ORDER_CODON_MATRIX[x]])))
                outfile.write("\n")

    if options.codon_usage:
        outfile.write("CODON\tFREQUENCY\n")

        if options.codon_usage == "biased":
            usages = result.mCodonUsages[-1]
        elif options.codon_usage == "full":
            usages = result.mCodonUsages[0]
        elif options.codon_usage == "weights":
            usages = WrapperAdaptiveCAI.CalculateWeightsFromUsage(
                result.mCodonUsages[0])
        else:
            raise ValueError("unknown option '%s' for codon-usage." %
                             options.codon_usage)

        codons = usages.keys()
        codons.sort()
        for codon in codons:
            outfile.write("%s\t%f\n" % (codon, usages[codon]))

    E.Stop()
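# The frequency/weight conversions above rely on Genomics helpers that
# are not shown in this file. A hedged sketch of per-amino-acid relative
# frequencies with pseudocounts (the usual basis for CAI-style weights):
# for each amino acid, each synonymous codon's count plus a pseudocount
# is divided by the total for that amino acid. codon_frequencies and
# codon2aa are illustrative names, not the library's API.
def codon_frequencies(counts, codon2aa, pseudocounts=1):
    totals = {}
    for codon, count in counts.items():
        aa = codon2aa[codon]
        totals[aa] = totals.get(aa, 0) + count + pseudocounts
    freqs = {}
    for codon, count in counts.items():
        freqs[codon] = float(count + pseudocounts) / totals[codon2aa[codon]]
    return freqs

# toy usage with two synonymous glycine codons:
# GGU -> (8+1)/12 = 0.75, GGC -> (2+1)/12 = 0.25
print(codon_frequencies({"GGU": 8, "GGC": 2}, {"GGU": "G", "GGC": "G"}))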
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: diff_transcript_sets.py 2781 2009-09-10 11:33:14Z andreas $")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percent columns")

    parser.add_option("-d", "--dump-sets", dest="dump_sets",
                      action="append", type="choice",
                      choices=("rest_genes1", "rest_genes2",
                               "intersection", "union"),
                      help="dump sets of transcripts/genes")

    parser.add_option("-o", "--output-filename-pattern", dest="output_pattern",
                      type="string",
                      help="output pattern to use for dumped sets. "
                      "Should contain one %s.")

    parser.set_defaults(
        separator="|",
        add_percent=False,
        dump_sets=[],
        output_pattern="%s",
    )

    (options, args) = E.Start(parser)

    options.filename1, options.filename2 = args

    ids1, nerrors1 = IOTools.ReadList(open(options.filename1, "r"))
    ids2, nerrors2 = IOTools.ReadList(open(options.filename2, "r"))

    genes1, transcripts1 = countGenesTranscripts(ids1, options)
    genes2, transcripts2 = countGenesTranscripts(ids2, options)

    options.stdout.write(
        "species\tntranscripts1\tngenes1\tntranscripts2\tngenes2"
        "\ttr_inter\ttr_union\ttr_rest1\ttr_rest2"
        "\tg_inter\tg_union\tg_rest1\tg_rest2")
    options.stdout.write("\ttr_rest1\ttr_rest2\tg_rest1\tg_rest2")
    options.stdout.write("\n")

    for species in set(genes1.keys()).union(set(genes2.keys())):

        nt1, nt2, ng1, ng2 = "na", "na", "na", "na"

        if species in genes1:
            g1 = genes1[species]
            t1 = transcripts1[species]
            nt1 = "%i" % len(transcripts1[species])
            ng1 = "%i" % len(genes1[species])
        else:
            t1, g1 = None, None

        if species in genes2:
            g2 = genes2[species]
            t2 = transcripts2[species]
            nt2 = "%i" % len(transcripts2[species])
            ng2 = "%i" % len(genes2[species])
        else:
            t2, g2 = None, None

        if species in transcripts1 and species in transcripts2:
            ct = "%i" % len(t1.intersection(t2))
            ut = "%i" % len(t2.union(t1))
            rt1 = "%i" % len(t1.difference(t2))
            rt2 = "%i" % len(t2.difference(t1))
        else:
            ct, ut, rt1, rt2 = ["na"] * 4

        if species in genes1 and species in genes2:
            cg = "%i" % len(g1.intersection(g2))
            ug = "%i" % len(g2.union(g1))
            rg1 = "%i" % len(g1.difference(g2))
            rg2 = "%i" % len(g2.difference(g1))
        else:
            cg, ug, rg1, rg2 = ["na"] * 4

        options.stdout.write("\t".join((species, nt1, ng1, nt2, ng2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((ct, ut, rt1, rt2)))
        options.stdout.write("\t")
        options.stdout.write("\t".join((cg, ug, rg1, rg2)))

        if options.add_percent:
            if species in genes1 and species in genes2:
                rg1 = "%5.2f" % (100.0 * len(g1.difference(g2)) / len(g1))
                rg2 = "%5.2f" % (100.0 * len(g2.difference(g1)) / len(g2))

            if species in transcripts1 and species in transcripts2:
                rt1 = "%5.2f" % (100.0 * len(t1.difference(t2)) / len(t1))
                rt2 = "%5.2f" % (100.0 * len(t2.difference(t1)) / len(t2))

            options.stdout.write("\t")
            options.stdout.write("\t".join((rt1, rt2, rg1, rg2)))

        options.stdout.write("\n")

    for choice in options.dump_sets:

        output_set = None

        if choice == "rest_genes1" and g1 and g2:
            output_set = getTranscriptsForGenes(g1.difference(g2), ids1,
                                                options)
        elif choice == "rest_genes2" and g1 and g2:
            output_set = getTranscriptsForGenes(g2.difference(g1), ids2,
                                                options)

        if output_set:
            outfile = IOTools.openFile(options.output_pattern % (choice), "w")
            for x in output_set:
                outfile.write("%s\n" % (x, ))
            outfile.close()

    E.Stop()
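# The per-species columns above are plain set algebra. A minimal sketch
# with toy transcript sets (t1/t2 are illustrative):
t1 = set(["tx1", "tx2", "tx3"])
t2 = set(["tx2", "tx3", "tx4"])
print(len(t1 & t2))  # tr_inter: 2
print(len(t1 | t2))  # tr_union: 4
print(len(t1 - t2))  # tr_rest1: 1
print(len(t2 - t1))  # tr_rest2: 1
print("%5.2f" % (100.0 * len(t1 - t2) / len(t1)))  # percent rest1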
def main(argv=None): if not argv: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--merge-exons", dest="merge_exons", action="store_true", help="merge overlapping exons of all transcripts " "within a gene. " "The merged exons will be output. " "Input needs to sorted by gene [default=%default].") parser.add_option("-t", "--merge-transcripts", dest="merge_transcripts", action="store_true", help="merge all transcripts within a gene. " "The entry will span the whole gene " "(exons and introns). " "The transcript does not include the UTR unless " "--with-utr is set. [default=%default].") parser.add_option("--merge-genes", dest="merge_genes", action="store_true", help="merge overlapping genes if their exons overlap. " "A gene with a single transcript containing all exons " "of the overlapping transcripts will be output. " "This operation ignores strand information " "The input needs te sorted by transcript " "[default=%default].") parser.add_option("--merge-exons-distance", dest="merge_exons_distance", type="int", help="distance in nucleotides between " "exons to be merged [default=%default].") parser.add_option("-j", "--join-exons", dest="join_exons", action="store_true", help="join all exons per transcript. " "A new transcript will be " "output that spans a whole transcript. " "Input needs to be sorted by transcript " "[default=%default].") parser.add_option("--unset-genes", dest="unset_genes", type="string", help="unset gene identifiers, keeping " "transcripts intact. " "New gene identifiers are set to the " "pattern given. For example, " "'--unset-genes=%06i' [default=%default].") parser.add_option("--sort", dest="sort", type="choice", choices=("gene", "gene+transcript", "transcript", "position", "contig+gene", "position+gene", "gene+position"), help="sort input data [default=%default].") parser.add_option("-u", "--with-utr", dest="with_utr", action="store_true", help="include utr in merged transcripts " "[default=%default].") parser.add_option("--intersect-transcripts", dest="intersect_transcripts", action="store_true", help="intersect all transcripts within a gene. " "The entry will only span those bases " "that are covered by all transcrips." "The transcript does not include the UTR unless " "--with-utr is set. This method " "will remove all other features (stop_codon, etc.) " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-i", "--merge-introns", dest="merge_introns", action="store_true", help="merge and output all introns within a " "gene. The output will contain " "all intronic regions within a gene. Single exon genes " "are skipped. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-g", "--set-transcript-to-gene", "--set-transcript2gene", dest="set_transcript2gene", action="store_true", help="set the transcript_id to the " "gene_id [default=%default].") parser.add_option("--set-protein-to-transcript", dest="set_protein2transcript", action="store_true", help="set the protein_id to the " "transcript_id [default=%default].") parser.add_option("--add-protein-id", dest="add_protein_id", type="string", help="add a protein_id for each transcript_id. 
" "The argument is a filename containing a mapping " "between " "transcript_id to protein_id [default=%default].") parser.add_option("-G", "--set-gene-to-transcript", "--set-gene2transcript", dest="set_gene2transcript", action="store_true", help="set the gene_id to the " "transcript_id [default=%default].") parser.add_option("-d", "--set-score2distance", dest="set_score2distance", action="store_true", help="set the score field for each feature to the " "distance to " "transcription start site [default=%default].") parser.add_option("--exons2introns", dest="exons2introns", action="store_true", help="for each gene build an 'intronic' transcript " "containing the union of all intronic regions " "of all transcripts in a gene." "The features are labeled as 'intron'." "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-f", "--filter", dest="filter", type="choice", choices=("gene", "transcript", "longest-gene", "longest-transcript", "representative-transcript"), help="apply a filter to the input file. Available " "filters are: " "'gene': filter by gene_id, " "'transcript': filter by transcript_id, " "'longest-gene': output the longest gene for " "overlapping genes ," "'longest-transcript': output the longest " "transcript per gene," "'representative-transcript': output the " "representative transcript per gene. " "The representative transcript is the transcript " "that shares most exons with " "the other transcripts in a gene. " "The input needs to be sorted by gene. " "[default=%default].") parser.add_option("-r", "--rename", dest="rename", type="choice", choices=("gene", "transcript"), help="rename genes or transcripts with a map " "given by the option `--apply`. " "Those that can not be renamed are removed " "[default=%default].") parser.add_option("--renumber-genes", dest="renumber_genes", type="string", help="renumber genes according to the given pattern. " "[default=%default].") parser.add_option("--renumber-transcripts", dest="renumber_transcripts", type="string", help="renumber transcripts according to the " "given pattern. " "[default=%default].") parser.add_option("-a", "--apply", dest="filename_filter", type="string", metavar="tsv", help="filename of ids to map/filter [default=%default].") parser.add_option("--invert-filter", dest="invert_filter", action="store_true", help="when using --filter, invert selection " "(like grep -v). 
" "[default=%default].") parser.add_option("--sample-size", dest="sample_size", type="int", help="extract a random sample of size # if the option " "'--filter' is set[default=%default].") parser.add_option("--intron-min-length", dest="intron_min_length", type="int", help="minimum length for introns (for --exons2introns) " "[default=%default].") parser.add_option("--min-exons-length", dest="min_exons_length", type="int", help="minimum length for gene (sum of exons) " "(--sample-size) [default=%default].") parser.add_option( "--intron-border", dest="intron_border", type="int", help="number of residues to exclude at intron at either end " "(--exons2introns) [default=%default].") parser.add_option("--transcripts2genes", dest="transcripts2genes", action="store_true", help="cluster overlapping transcripts into genes.") parser.add_option("--reset-strand", dest="reset_strand", action="store_true", help="remove strandedness of features (set to '.') when " "using --transcripts2genes" "[default=%default].") parser.add_option("--remove-overlapping", dest="remove_overlapping", type="string", metavar="gff", help="remove all transcripts that overlap intervals " "in a gff-formatted file." "The comparison ignores strand " "[default=%default].") parser.add_option("--permit-duplicates", dest="strict", action="store_false", help="permit duplicate genes. " "[default=%default]") parser.add_option("--remove-duplicates", dest="remove_duplicates", type="choice", choices=("gene", "transcript", "ucsc", "coordinates"), help="remove duplicates by gene/transcript. " "If ``ucsc`` is chosen, transcripts ending on _dup# are " "removed. This is necessary to remove duplicate entries " "that are next to each other in the sort order " "[%default]") parser.add_option("--rename-duplicates", dest="rename_duplicates", action="store_true", help="rename duplicate gene_ids and transcript_ids by " "addition of a numerical suffix") parser.set_defaults( sort=None, merge_exons=False, join_exons=False, merge_exons_distance=0, merge_transcripts=False, set_score2distance=False, set_gene2transcript=False, set_transcript2gene=False, set_protein2transcript=False, add_protein_id=None, filename_filter=None, filter=None, exons2introns=None, merge_genes=False, intron_border=None, intron_min_length=None, sample_size=0, min_exons_length=0, transripts2genes=False, reset_strand=False, with_utr=False, invert_filter=False, remove_duplicates=None, remove_overlapping=None, renumber_genes=None, unset_genes=None, renumber_transcripts=None, strict=True, intersect_transcripts=False, rename_duplicates=False, ) (options, args) = E.Start(parser, argv=argv) ninput, noutput, nfeatures, ndiscarded = 0, 0, 0, 0 if options.set_transcript2gene: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("transcript_id", gff.gene_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.remove_duplicates: counts = collections.defaultdict(int) if options.remove_duplicates == "ucsc": store = [] remove = set() f = lambda x: x[0].transcript_id gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) outf = lambda x: "\n".join([str(y) for y in x]) for entry in gffs: ninput += 1 store.append(entry) id = f(entry) if "_dup" in id: remove.add(re.sub("_dup\d+", "", id)) remove.add(id) for entry in store: id = f(entry) if id not in remove: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s" % (id)) else: if options.remove_duplicates == "gene": gffs = 
GTF.gene_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0][0].gene_id outf = lambda x: "\n".join( ["\n".join([str(y) for y in xx]) for xx in x]) elif options.remove_duplicates == "transcript": gffs = GTF.transcript_iterator(GTF.iterator(options.stdin), strict=False) f = lambda x: x[0].transcript_id outf = lambda x: "\n".join([str(y) for y in x]) elif options.remove_duplicates == "coordinates": gffs = GTF.chunk_iterator(GTF.iterator(options.stdin)) f = lambda x: x[0].contig + "_" + \ str(x[0].start) + "-" + str(x[0].end) outf = lambda x: "\n".join([str(y) for y in x]) store = [] for entry in gffs: ninput += 1 store.append(entry) id = f(entry) counts[id] += 1 # Assumes GTF file sorted by contig then start last_id = "" if options.remove_duplicates == "coordinates": for entry in store: id = f(entry) if id == last_id: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) else: options.stdout.write(outf(entry) + "\n") noutput += 1 last_id = id else: for entry in store: id = f(entry) if counts[id] == 1: options.stdout.write(outf(entry) + "\n") noutput += 1 else: ndiscarded += 1 E.info("discarded duplicates for %s: %i" % (id, counts[id])) elif options.sort: for gff in GTF.iterator_sorted(GTF.iterator(options.stdin), sort_order=options.sort): ninput += 1 options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_gene2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("gene_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.set_protein2transcript: for gff in GTF.iterator(options.stdin): ninput += 1 gff.setAttribute("protein_id", gff.transcript_id) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 elif options.add_protein_id: transcript2protein = IOTools.readMap(open(options.add_protein_id, "r")) missing = set() for gff in GTF.iterator(options.stdin): ninput += 1 if gff.transcript_id not in transcript2protein: if gff.transcript_id not in missing: E.debug(("removing transcript '%s' due to " "missing protein id") % gff.transcript_id) missing.add(gff.transcript_id) ndiscarded += 1 continue gff.setAttribute("protein_id", transcript2protein[gff.transcript_id]) options.stdout.write("%s\n" % str(gff)) noutput += 1 nfeatures += 1 E.info("transcripts removed due to missing protein ids: %i" % len(missing)) elif options.join_exons: for exons in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(exons[0].strand) contig = exons[0].contig transid = exons[0].transcript_id geneid = exons[0].gene_id biotype = exons[0].source all_start, all_end = min([x.start for x in exons ]), max([x.end for x in exons]) y = GTF.Entry() y.contig = contig y.source = biotype y.feature = "transcript" y.start = all_start y.end = all_end y.strand = strand y.transcript_id = transid y.gene_id = geneid options.stdout.write("%s\n" % str(y)) elif options.merge_genes: # merges overlapping genes # gffs = GTF.iterator_sorted_chunks(GTF.flat_gene_iterator( GTF.iterator(options.stdin)), sort_by="contig-strand-start") def iterate_chunks(gff_chunks): last = gff_chunks.next() to_join = [last] for gffs in gff_chunks: d = gffs[0].start - last[-1].end if gffs[0].contig == last[0].contig and \ gffs[0].strand == last[0].strand: assert gffs[0].start >= last[0].start, \ ("input file should be sorted by contig, strand " "and position: d=%i:\nlast=\n%s\nthis=\n%s\n") % \ (d, "\n".join([str(x) for x in last]), "\n".join([str(x) for x in gffs])) 
if gffs[0].contig != last[0].contig or \ gffs[0].strand != last[0].strand or \ d > 0: yield to_join to_join = [] last = gffs to_join.append(gffs) yield to_join raise StopIteration for chunks in iterate_chunks(gffs): ninput += 1 if len(chunks) > 1: gene_id = "merged_%s" % chunks[0][0].gene_id transcript_id = "merged_%s" % chunks[0][0].transcript_id info = ",".join([x[0].gene_id for x in chunks]) else: gene_id = chunks[0][0].gene_id transcript_id = chunks[0][0].transcript_id info = None intervals = [] for c in chunks: intervals += [(x.start, x.end) for x in c] intervals = Intervals.combine(intervals) # take single strand strand = chunks[0][0].strand for start, end in intervals: y = GTF.Entry() y.fromGTF(chunks[0][0], gene_id, transcript_id) y.start = start y.end = end y.strand = strand if info: y.addAttribute("merged", info) options.stdout.write("%s\n" % str(y)) nfeatures += 1 noutput += 1 elif options.renumber_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 if gtf.gene_id not in map_old2new: map_old2new[gtf.gene_id] = options.renumber_genes % ( len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[gtf.gene_id]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.unset_genes: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = gtf.transcript_id if key not in map_old2new: map_old2new[key] = options.unset_genes % (len(map_old2new) + 1) gtf.setAttribute("gene_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.renumber_transcripts: map_old2new = {} for gtf in GTF.iterator(options.stdin): ninput += 1 key = (gtf.gene_id, gtf.transcript_id) if key not in map_old2new: map_old2new[key] = options.renumber_transcripts % ( len(map_old2new) + 1) gtf.setAttribute("transcript_id", map_old2new[key]) options.stdout.write("%s\n" % str(gtf)) noutput += 1 elif options.transcripts2genes: transcripts = set() genes = set() reset_strand = options.reset_strand for gtfs in GTF.iterator_transcripts2genes(GTF.iterator( options.stdin)): ninput += 1 for gtf in gtfs: if reset_strand: gtf.strand = "." 
options.stdout.write("%s\n" % str(gtf)) transcripts.add(gtf.transcript_id) genes.add(gtf.gene_id) nfeatures += 1 noutput += 1 E.info("transcripts2genes: transcripts=%i, genes=%i" % (len(transcripts), len(genes))) elif options.rename: map_old2new = IOTools.readMap(open(options.filename_filter, "r")) if options.rename == "transcript": is_gene_id = False elif options.rename == "gene": is_gene_id = True for gff in GTF.iterator(options.stdin): ninput += 1 if is_gene_id: if gff.gene_id in map_old2new: gff.setAttribute("gene_id", map_old2new[gff.gene_id]) else: E.debug("removing missing gene_id %s" % gff.gene_id) ndiscarded += 1 continue else: if gff.transcript_id in map_old2new: gff.setAttribute("transcript_id", map_old2new[gff.transcript_id]) else: E.debug("removing missing transcript_id %s" % gff.transcript_id) ndiscarded += 1 continue noutput += 1 options.stdout.write("%s\n" % str(gff)) elif options.filter: keep_genes = set() if options.filter == "longest-gene": iterator = GTF.flat_gene_iterator(GTF.iterator(options.stdin)) coords = [] gffs = [] for gff in iterator: gff.sort(key=lambda x: x.start) coords.append((gff[0].contig, min([x.start for x in gff]), max([x.end for x in gff]), gff[0].gene_id)) gffs.append(gff) coords.sort() last_contig = None max_end = 0 longest_gene_id = None longest_length = None for contig, start, end, gene_id in coords: ninput += 1 if contig != last_contig or start >= max_end: if longest_gene_id: keep_genes.add(longest_gene_id) longest_gene_id = gene_id longest_length = end - start max_end = end else: if end - start > longest_length: longest_length, longest_gene_id = end - start, gene_id last_contig = contig max_end = max(max_end, end) keep_genes.add(longest_gene_id) invert = options.invert_filter for gff in gffs: keep = gff[0].gene_id in keep_genes if (keep and not invert) or (not keep and invert): noutput += 1 for g in gff: nfeatures += 1 options.stdout.write("%s\n" % g) else: ndiscarded += 1 elif options.filter in ("longest-transcript", "representative-transcript"): iterator = GTF.gene_iterator(GTF.iterator(options.stdin)) def selectLongestTranscript(gene): r = [] for transcript in gene: transcript.sort(key=lambda x: x.start) length = transcript[-1].end - transcript[0].start r.append((length, transcript)) r.sort() return r[-1][1] def selectRepresentativeTranscript(gene): '''select a representative transcript. The representative transcript represent the largest number of exons over all transcripts. ''' all_exons = [] for transcript in gene: all_exons.extend([(x.start, x.end) for x in transcript if x.feature == "exon"]) exon_counts = {} for key, exons in itertools.groupby(all_exons): exon_counts[key] = len(list(exons)) transcript_counts = [] for transcript in gene: count = sum([ exon_counts[(x.start, x.end)] for x in transcript if x.feature == "exon" ]) transcript_counts.append((count, transcript)) transcript_counts.sort() return transcript_counts[-1][1] if options.filter == "longest-transcript": _select = selectLongestTranscript elif options.filter == "representative-transcript": _select = selectRepresentativeTranscript for gene in iterator: ninput += 1 # sort in order to make reproducible which # gene is chosen. 
transcript = _select(sorted(gene)) noutput += 1 for g in transcript: nfeatures += 1 options.stdout.write("%s\n" % g) elif options.filter in ("gene", "transcript"): if options.filename_filter: ids, nerrors = IOTools.ReadList( open(options.filename_filter, "r")) E.info("read %i ids" % len(ids)) ids = set(ids) by_gene = options.filter == "gene" by_transcript = options.filter == "transcript" invert = options.invert_filter reset_strand = options.reset_strand for gff in GTF.iterator(options.stdin): ninput += 1 keep = False if by_gene: keep = gff.gene_id in ids if by_transcript: keep = gff.transcript_id in ids if (invert and keep) or (not invert and not keep): continue if reset_strand: gff.strand = "." options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.sample_size: if options.filter == "gene": iterator = GTF.flat_gene_iterator( GTF.iterator(options.stdin)) elif options.filter == "transcript": iterator = GTF.transcript_iterator( GTF.iterator(options.stdin)) if options.min_exons_length: iterator = GTF.iterator_min_feature_length( iterator, min_length=options.min_exons_length, feature="exon") data = [x for x in iterator] ninput = len(data) if len(data) > options.sample_size: data = random.sample(data, options.sample_size) for d in data: noutput += 1 for dd in d: nfeatures += 1 options.stdout.write(str(dd) + "\n") else: assert False, "please supply either a filename " "with ids to filter with (--apply) or a sample-size." elif options.exons2introns: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin)): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") input_ranges = Intervals.combine(cds_ranges + exon_ranges) if len(input_ranges) > 1: last = input_ranges[0][1] output_ranges = [] for start, end in input_ranges[1:]: output_ranges.append((last, start)) last = end if options.intron_border: b = options.intron_border output_ranges = [(x[0] + b, x[1] - b) for x in output_ranges] if options.intron_min_length: l = options.intron_min_length output_ranges = [ x for x in output_ranges if x[1] - x[0] > l ] for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "intron" entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 else: ndiscarded += 1 elif options.set_score2distance: for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 strand = Genomics.convertStrand(gffs[0].strand) all_start, all_end = min([x.start for x in gffs ]), max([x.end for x in gffs]) if strand != ".": t = 0 if strand == "-": gffs.reverse() for gff in gffs: gff.score = t t += gff.end - gff.start if strand == "-": gffs.reverse() for gff in gffs: options.stdout.write("%s\n" % str(gff)) nfeatures += 1 noutput += 1 elif options.remove_overlapping: index = GTF.readAndIndex( GTF.iterator(IOTools.openFile(options.remove_overlapping, "r"))) for gffs in GTF.transcript_iterator(GTF.iterator(options.stdin)): ninput += 1 found = False for e in gffs: if index.contains(e.contig, e.start, e.end): found = True break if found: ndiscarded += 1 else: noutput += 1 for e in gffs: nfeatures += 1 options.stdout.write("%s\n" % str(e)) elif options.intersect_transcripts: for gffs in GTF.gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 r = [] for g in gffs: if options.with_utr: ranges = GTF.asRanges(g, "exon") else: ranges = GTF.asRanges(g, "CDS") r.append(ranges) result = r[0] for x in r[1:]: 
result = Intervals.intersect(result, x) entry = GTF.Entry() entry.copy(gffs[0][0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = "exon" for start, end in result: entry.start = start entry.end = end options.stdout.write("%s\n" % str(entry)) nfeatures += 1 noutput += 1 elif options.rename_duplicates: gene_ids = list() transcript_ids = list() gtfs = list() for gtf in GTF.iterator(options.stdin): gtfs.append(gtf) if gtf.feature == "CDS": gene_ids.append(gtf.gene_id) transcript_ids.append(gtf.transcript_id) dup_gene = [item for item in set(gene_ids) if gene_ids.count(item) > 1] dup_transcript = [ item for item in set(transcript_ids) if transcript_ids.count(item) > 1 ] E.info("Number of duplicated gene_ids: %i" % len(dup_gene)) E.info("Number of duplicated transcript_ids: %i" % len(dup_transcript)) gene_dict = dict(zip(dup_gene, ([0] * len(dup_gene)))) transcript_dict = dict(zip(dup_transcript, ([0] * len(dup_transcript)))) for gtf in gtfs: if gtf.feature == "CDS": if gtf.gene_id in dup_gene: gene_dict[gtf.gene_id] = gene_dict[gtf.gene_id] + 1 gtf.setAttribute( 'gene_id', gtf.gene_id + "." + str(gene_dict[gtf.gene_id])) if gtf.transcript_id in dup_transcript: transcript_dict[gtf.transcript_id] = \ transcript_dict[gtf.transcript_id] + 1 gtf.setAttribute( 'transcript_id', gtf.transcript_id + "." + str(transcript_dict[gtf.transcript_id])) options.stdout.write("%s\n" % gtf) else: for gffs in GTF.flat_gene_iterator(GTF.iterator(options.stdin), strict=options.strict): ninput += 1 cds_ranges = GTF.asRanges(gffs, "CDS") exon_ranges = GTF.asRanges(gffs, "exon") # sanity checks strands = set([x.strand for x in gffs]) contigs = set([x.contig for x in gffs]) if len(strands) > 1: raise ValueError( "can not merge gene '%s' on multiple strands: %s" % (gffs[0].gene_id, str(strands))) if len(contigs) > 1: raise ValueError( "can not merge gene '%s' on multiple contigs: %s" % (gffs[0].gene_id, str(contigs))) strand = Genomics.convertStrand(gffs[0].strand) if cds_ranges and options.with_utr: cds_start, cds_end = cds_ranges[0][0], cds_ranges[-1][1] midpoint = (cds_end - cds_start) / 2 + cds_start utr_ranges = [] for start, end in Intervals.truncate(exon_ranges, cds_ranges): if end - start > 3: if strand == ".": feature = "UTR" elif strand == "+": if start < midpoint: feature = "UTR5" else: feature = "UTR3" elif strand == "-": if start < midpoint: feature = "UTR3" else: feature = "UTR5" utr_ranges.append((feature, start, end)) output_feature = "CDS" output_ranges = cds_ranges else: output_feature = "exon" output_ranges = exon_ranges utr_ranges = [] result = [] if options.merge_exons: # need to combine per feature - skip # utr_ranges = Intervals.combineAtDistance( # utr_ranges, # options.merge_exons_distance) output_ranges = Intervals.combineAtDistance( output_ranges, options.merge_exons_distance) for feature, start, end in utr_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.feature = feature entry.transcript_id = "merged" entry.start = start entry.end = end result.append(entry) for start, end in output_ranges: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = "merged" entry.feature = output_feature entry.start = start entry.end = end result.append(entry) elif options.merge_transcripts: entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][0] entry.end = output_ranges[-1][1] result.append(entry) elif options.merge_introns: if len(output_ranges) >= 2: 
entry = GTF.Entry() entry.copy(gffs[0]) entry.clearAttributes() entry.transcript_id = entry.gene_id entry.start = output_ranges[0][1] entry.end = output_ranges[-1][0] result.append(entry) else: ndiscarded += 1 continue result.sort(key=lambda x: x.start) for x in result: options.stdout.write("%s\n" % str(x)) nfeatures += 1 noutput += 1 E.info("ninput=%i, noutput=%i, nfeatures=%i, ndiscarded=%i" % (ninput, noutput, nfeatures, ndiscarded)) E.Stop()
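# Intervals.combine and Intervals.combineAtDistance carry most of the
# merging modes above. A hedged re-implementation of the core idea -
# merging overlapping or touching (start, end) half-open intervals after
# sorting (combine_intervals is an illustrative name, not the library's):
def combine_intervals(intervals):
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

# toy usage:
print(combine_intervals([(10, 20), (15, 30), (40, 50)]))
# [(10, 30), (40, 50)]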
def selectComponents(component_ids,
                     map_component2seq_id,
                     map_component2input_id,
                     id_filter,
                     options):
    """select a set of components from component_ids."""

    map_sample2reference = {}

    if options.sample:
        if options.sample_method == "simple-without-replacement":
            random.shuffle(component_ids)
        elif options.sample_method == "length-without-replacement":
            map_component_id2length = {}
            for component_id in component_ids:
                mali = getMali(component_id,
                               map_component2seq_id,
                               map_component2input_id,
                               id_filter,
                               options)
                if not mali:
                    continue
                map_component_id2length[component_id] = mali.getWidth()

            reference_ids, nerrors = IOTools.ReadList(
                open(options.filename_sample_reference, "r"))

            # do not sample from the reference set
            sampled_components = set(reference_ids)
            new_component_ids = []

            ninput, noutput, nskipped = 0, 0, 0

            # now go through reference set
            for ref_id in reference_ids:
                ninput += 1

                if ref_id not in map_component_id2length:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# reference component %s not found.\n" %
                            (str(ref_id)))
                    nskipped += 1
                    continue

                ref_length = map_component_id2length[ref_id]
                ref_length_min = ref_length - 50
                ref_length_max = ref_length + 50

                # find all components with a length similar to ref_length,
                # excluding previously sampled ones and components without
                # a recorded length
                test_components = filter(
                    lambda x: x in map_component_id2length and
                    ref_length_min < map_component_id2length[x] < ref_length_max,
                    component_ids)

                test_components = list(
                    set(test_components).difference(sampled_components))

                if len(test_components) == 0:
                    if options.loglevel >= 1:
                        options.stdlog.write(
                            "# reference component %s: skipped - no others "
                            "with equivalent length around %i found.\n" %
                            (ref_id, ref_length))
                    nskipped += 1
                    continue

                random.shuffle(test_components)
                component_id = test_components[0]

                sampled_components.add(component_id)
                map_sample2reference[component_id] = ref_id

                if options.loglevel >= 1:
                    options.stdlog.write(
                        "# reference component mapping: %s\t%s\t%i\t%i\t%i\n" %
                        (ref_id, component_id, ref_length,
                         map_component_id2length[component_id],
                         len(test_components)))

                new_component_ids.append(component_id)
                noutput += 1

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sampling results: ninput=%i, noutput=%i, nskipped=%i\n" %
                    (ninput, noutput, nskipped))

            component_ids = new_component_ids
            options.sample = len(new_component_ids)

    return component_ids, map_sample2reference
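# The length-matched sampling above picks, for every reference
# alignment, a random unused alignment of similar width. A compact
# sketch of just the matching step (sample_length_matched and the toy
# dicts are illustrative; the real function also loads alignments):
import random

def sample_length_matched(reference_lengths, candidate_lengths, window=50):
    """reference_lengths, candidate_lengths: dicts of id -> length."""
    sampled, mapping = set(), {}
    for ref_id, ref_len in reference_lengths.items():
        pool = [c for c, l in candidate_lengths.items()
                if abs(l - ref_len) < window and c not in sampled]
        if not pool:
            continue  # no length-equivalent candidate left
        pick = random.choice(pool)
        sampled.add(pick)
        mapping[pick] = ref_id
    return mapping

# toy usage: only c1 falls within +/- 50 of the reference length
print(sample_length_matched({"r1": 120}, {"c1": 100, "c2": 400}))
# {'c1': 'r1'}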
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/orthologs2list.py 2781 "
        "2009-09-10 11:33:14Z andreas $")

    parser.add_option("-s", "--species-regex", dest="species_regex",
                      type="string",
                      help="regular expression to extract species from "
                      "identifier.")
    parser.add_option("-g", "--gene-regex", dest="gene_regex",
                      type="string",
                      help="regular expression to extract gene from "
                      "identifier.")
    parser.add_option("-b", "--only-best", dest="only_best",
                      action="store_true",
                      help="write only the best pair for a pairing.")
    parser.add_option("-w", "--no-within", dest="within",
                      action="store_false",
                      help="do not write within species pairs.")
    parser.add_option("-d", "--distances", dest="filename_distances",
                      type="string",
                      help="filename with distances between transcripts.")
    parser.add_option("-c", "--no-combine-genes", dest="combine_genes",
                      action="store_false",
                      help="do not combine orthologous clusters which "
                      "contain the same gene.")
    parser.add_option("--filename-restrict-filter1",
                      dest="filename_restrict_filter1", type="string",
                      help="filename with ids to filter out.")
    parser.add_option("--filename-restrict-filter2",
                      dest="filename_restrict_filter2", type="string",
                      help="filename with ids to filter out.")
    parser.add_option("-f", "--format", dest="format", type="choice",
                      choices=("graph", "components"),
                      help="output format.")
    parser.add_option("-m", "--mode", dest="mode", type="choice",
                      choices=("orthologs", "orphans"),
                      help="analyze either 'orthologs' or 'orphans'.")
    parser.add_option("--genome1", dest="genome1", type="string",
                      help="first genome.")
    parser.add_option("--genome2", dest="genome2", type="string",
                      help="second genome.")

    parser.set_defaults(
        species_regex="^([^|]+)\|",
        gene_regex="^[^|]+\|[^|]+\|([^|]+)\|",
        only_best=None,
        filename_distances=None,
        within=True,
        combine_genes=True,
        report_step=100000,
        use_networkx=False,
        separator="|",
        genome1=None,
        genome2=None,
        mode="orthologs",
        filename_restrict_filter1=None,
        filename_restrict_filter2=None,
        format="graph",
    )

    (options, args) = E.Start(parser, add_pipe_options=True)

    rs = re.compile(options.species_regex)
    rg = re.compile(options.gene_regex)

    t0 = time.time()

    # retrieve matches between pairs:
    pairs = {}
    max_dist = 0
    if options.filename_distances and options.only_best:
        infile = open(options.filename_distances, "r")
        for line in infile:
            if line[0] == "#":
                continue
            a, b, d = line[:-1].split("\t")[:3]
            d = float(d)
            if a < b:
                key = "%s-%s" % (a, b)
            else:
                key = "%s-%s" % (b, a)
            max_dist = max(d, max_dist)
            pairs[key] = d
        infile.close()

    cluster_id = 0

    ninput, noutput, nmissed, nskipped, nsingletons = 0, 0, 0, 0, 0

    # Read positive filter information:
    filter_restrict1 = {}
    if options.filename_restrict_filter1:
        xx, e = IOTools.ReadList(
            open(options.filename_restrict_filter1, "r"))
        for x in xx:
            filter_restrict1[Orthologs.Transcript(x).mTranscript] = True

    filter_restrict2 = {}
    if options.filename_restrict_filter2:
        xx, e = IOTools.ReadList(
            open(options.filename_restrict_filter2, "r"))
        for x in xx:
            filter_restrict2[Orthologs.Transcript(x).mTranscript] = True

    if options.loglevel >= 1:
        options.stdlog.write("# read filtering information: %i/%i\n" %
                             (len(filter_restrict1),
                              len(filter_restrict2)))

    t1 = time.time()

    if options.loglevel >= 1:
        options.stdlog.write("# finished input in %i seconds.\n" %
                             (t1 - t0))

    orthologs = []
    if options.mode == "orthologs":
        orthologs = Orthologs.ReadInterpretation(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)
    else:
        orthologs = Orthologs.ReadOrphans(
            sys.stdin,
            options.separator,
            genome1=options.genome1,
            genome2=options.genome2,
            filter_restrict_transcripts1=filter_restrict1,
            filter_restrict_transcripts2=filter_restrict2)

    ninput = len(orthologs)

    # keep max_dist as an upper bound over all weights. The original
    # assigned the bare map() result here, overwriting the float
    # computed from the distance file with a list - presumably a bug.
    if orthologs:
        max_dist = max(max_dist, max([x[4] for x in orthologs]))

    t2 = time.time()
    if options.loglevel >= 1:
        options.stdlog.write("# reading %i groups in %i seconds.\n" %
                             (ninput, t2 - t1))

    if options.combine_genes:

        if options.use_networkx:

            nclusters = len(orthologs)

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# before combining genes: %i clusters\n" %
                    len(orthologs))
                options.stdlog.flush()

            # build links between all genes. Ignore warnings from
            # networkx/matplotlib that a display can not be found.
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                import networkx

            graph = networkx.Graph()

            # This procedure skips genes with "0". This is a patch,
            # because these genes should not be there in the first
            # place.
            iteration = 0

            for transcripts1, transcripts2, genes1, genes2, weight \
                    in orthologs:

                iteration += 1
                if options.loglevel >= 1:
                    if iteration % options.report_step == 0:
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters,
                             100 * iteration / nclusters,
                             time.time() - t2))
                        options.stdlog.flush()

                for g in genes1.keys():
                    graph.add_node((1, g))
                for g in genes2.keys():
                    graph.add_node((2, g))

                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        # within-species link; the original tagged the
                        # second gene as species 2 here, which would
                        # conflate genes across species
                        graph.add_edge((1, g1), (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((1, g1), (2, g2))

                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge((2, g1), (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds.\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()
            components = networkx.connected_components(graph)

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# calculated connected components in %i seconds\n" %
                    (time.time() - tt2))
                options.stdlog.flush()

        else:
            graph = GraphTools.ExternalGraph()

            iteration = 0
            nclusters = len(orthologs)

            for transcripts1, transcripts2, genes1, genes2, weight \
                    in orthologs:

                iteration += 1
                if options.loglevel >= 1:
                    if iteration % options.report_step == 0:
                        options.stdlog.write(
                            "# iteration: %i/%i (%i%%) in %i seconds.\n" %
                            (iteration, nclusters,
                             100 * iteration / nclusters,
                             time.time() - t1))
                        options.stdlog.flush()

                f = "%s;%s"

                for g1 in genes1.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes1.keys():
                        if g2 == "0":
                            continue
                        # within-species link (see note above)
                        graph.add_edge(f % (1, g1), f % (1, g2))
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (1, g1), f % (2, g2))

                for g1 in genes2.keys():
                    if g1 == "0":
                        continue
                    for g2 in genes2.keys():
                        if g2 == "0":
                            continue
                        graph.add_edge(f % (2, g1), f % (2, g2))

            if options.loglevel >= 1:
                options.stdlog.write("# created graph in %i seconds\n" %
                                     (time.time() - t2))
                options.stdlog.flush()

            tt2 = time.time()
            graph.finalize()
            components = graph.connected_components()

            if options.loglevel >= 1:
                options.stdlog.write(
                    "# retrieved %i connected components in %i seconds\n" %
                    (len(components), time.time() - tt2))
                options.stdlog.flush()

            for x in range(len(components)):
                components[x] = map(lambda y: y.split(";"),
                                    components[x])

        tt2 = time.time()

        map_gene2cluster = {}
        for x in range(len(components)):
            for a, b in components[x]:
                map_gene2cluster[b] = x

        new_orthologs = [[[], [], 0] for x in range(len(components))]

        singletons = []

        for transcripts1, transcripts2, genes1, genes2, weight \
                in orthologs:

            if genes1:
                try:
                    cluster_id = map_gene2cluster[genes1.keys()[0]]
                except KeyError:
                    singletons.append(genes1)
                    # do not fall through to the previous cluster_id
                    continue
            elif genes2:
                try:
                    cluster_id = map_gene2cluster[genes2.keys()[0]]
                except KeyError:
                    singletons.append(genes2)
                    continue
            else:
                raise ValueError("both genes1 and genes2 are empty")

            new_orthologs[cluster_id][0] += transcripts1
            new_orthologs[cluster_id][1] += transcripts2
            new_orthologs[cluster_id][2] = weight

        nsingletons = len(singletons)

        # use each cluster's own weight (x[2]); the original re-used
        # the loop variable `weight` left over from the loop above,
        # which assigned the last weight seen to every cluster
        orthologs = map(
            lambda x: (x[0], x[1],
                       Orthologs.GetGenes(x[0]),
                       Orthologs.GetGenes(x[1]),
                       x[2]),
            new_orthologs)

        if options.loglevel >= 1:
            options.stdlog.write("# combining genes in %i seconds\n" %
                                 (time.time() - tt2))
            options.stdlog.flush()

        if options.loglevel >= 1:
            options.stdlog.write(
                "# after combining genes: %i clusters, %i singletons\n" %
                (len(orthologs), nsingletons))

    t3 = time.time()
    if options.loglevel >= 1:
        options.stdlog.write("# gene clustering in %i seconds.\n" %
                             (t3 - t2))

    cluster_id = 0

    def getCode(s):
        if len(s) == 1:
            return "1"
        elif len(s) == 0:
            return "0"
        else:
            return "m"

    for transcripts1, transcripts2, genes1, genes2, weight in orthologs:

        cluster_id += 1

        g1 = getCode(genes1)
        g2 = getCode(genes2)
        t1 = getCode(transcripts1)
        t2 = getCode(transcripts2)

        if options.format == "graph":

            # find best transcripts
            best_transcripts = {}

            if options.only_best:
                # print only best match between each possible set of
                # genes in ortholog pair
                for gg1, tt1 in genes1.items():
                    for gg2, tt2 in genes2.items():

                        best = max_dist
                        best_pair = None

                        for x in tt1:
                            for y in tt2:
                                if x < y:
                                    key = "%s-%s" % (x, y)
                                else:
                                    key = "%s-%s" % (y, x)
                                if key in pairs:
                                    if best > pairs[key]:
                                        best = pairs[key]
                                        best_pair = (x, y)

                        if best_pair:
                            # record the transcripts of the best pair;
                            # the original indexed with the loop
                            # variables x and y, i.e. the last pair
                            # tested, not the best one
                            best_transcripts[best_pair[0]] = 1
                            best_transcripts[best_pair[1]] = 1
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (best_pair[0], best_pair[1],
                                 weight, g1, g2,
                                 str(t1), str(t2), cluster_id))
                            noutput += 1
                        else:
                            options.stdlog.write(
                                "# missed link between: %s %s\n" %
                                (str(genes1), str(genes2)))
                            nmissed += 1
            else:
                for x in transcripts1:
                    for y in transcripts2:
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (x, y, weight, g1, g2,
                             str(t1), str(t2), cluster_id))
                        noutput += 1

            if options.within:
                # add self links for first species
                for x in range(len(transcripts1) - 1):
                    for y in range(x + 1, len(transcripts1)):
                        if not best_transcripts or \
                                (transcripts1[x] in best_transcripts and
                                 transcripts1[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts1[x]),
                                 str(transcripts1[y]),
                                 weight, g1, g2,
                                 str(t1), str(t2), cluster_id))
                            noutput += 1

                # add self links for second species
                for x in range(len(transcripts2) - 1):
                    for y in range(x + 1, len(transcripts2)):
                        if not best_transcripts or \
                                (transcripts2[x] in best_transcripts and
                                 transcripts2[y] in best_transcripts):
                            options.stdout.write(
                                "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                                (str(transcripts2[x]),
                                 str(transcripts2[y]),
                                 weight, g1, g2,
                                 str(t1), str(t2), cluster_id))
                            noutput += 1

                # if orphans, also add links for genes with a single
                # transcript
                if options.mode == "orphans":
                    if len(transcripts1) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts1[x]),
                             str(transcripts1[y]),
                             weight, g1, g2,
                             str(t1), str(t2), cluster_id))
                    elif len(transcripts2) == 1:
                        x, y = 0, 0
                        options.stdout.write(
                            "%s\t%s\t%6.4f\t%s%s\t%s%s\t%i\n" %
                            (str(transcripts2[x]),
                             str(transcripts2[y]),
                             weight, g1, g2,
                             str(t1), str(t2), cluster_id))

        elif options.format == "components":
            for gg1, tt1 in genes1.items():
                for t in tt1:
                    options.stdout.write("%s\t%i\n" %
                                         (str(t), cluster_id))
            for gg2, tt2 in genes2.items():
                for t in tt2:
                    # the original omitted the newline here
                    options.stdout.write("%s\t%i\n" %
                                         (str(t), cluster_id))

    if options.loglevel >= 1:
        options.stdout.write(
            "# ninput=%i, noutput=%i, nmissed=%i, nskipped=%i, "
            "nsingletons=%i\n" %
            (ninput, noutput, nmissed, nskipped, nsingletons))

    E.Stop()
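# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): combining
# clusters that share a gene is a connected-components problem. A small
# union-find gives the same grouping as the networkx /
# GraphTools.ExternalGraph code paths above, without the dependency.
# All names below are illustrative.


def connected_components(edges):
    """Return connected components for an iterable of (u, v) edges."""
    parent = {}

    def find(x):
        # path-halving find; setdefault registers unseen nodes
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]
            x = parent[x]
        return x

    for u, v in edges:
        ru, rv = find(u), find(v)
        if ru != rv:
            parent[ru] = rv

    components = {}
    for node in parent:
        components.setdefault(find(node), []).append(node)
    return list(components.values())

# For example, clusters {A,B}, {B,C} and {D} collapse to two
# components (order may vary):
#   connected_components([("A", "B"), ("B", "C"), ("D", "D")])
#   -> [["A", "B", "C"], ["D"]]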
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: graph_check.py 2782 "
        "2009-09-10 11:40:29Z andreas $")

    parser.add_option("--filename-missing", dest="filename_missing",
                      type="string",
                      help="missing entries.")
    parser.add_option("--filename-found", dest="filename_found",
                      type="string",
                      help="found entries.")
    parser.add_option("--report-step1", dest="report_step1", type="int",
                      help="report interval for input.")
    parser.add_option("--report-step2", dest="report_step2", type="int",
                      help="report interval for processing.")
    parser.add_option("-n", "--filename-vertices",
                      dest="filename_vertices", type="string",
                      help="filename with vertices.")
    parser.add_option("-u", "--num-fields", dest="num_fields", type="int",
                      help="number of fields to expect.")
    parser.add_option("-o", "--filename-output-pattern",
                      dest="filename_output_pattern", type="string",
                      help="filenames for output (should contain one %s "
                      "for one section).")
    parser.add_option("-s", "--sort-order", dest="sort_order",
                      type="choice",
                      choices=("numeric", "alphanumeric"),
                      help="sort order - if numeric, vertices are cast "
                      "to int.")

    parser.set_defaults(
        filename_vertices=None,
        report_step1=100000,
        report_step2=10000,
        filename_output_pattern="%s",
        subsets=False,
        num_fields=11,
        sort_order="alphanumeric",
    )

    (options, args) = E.Start(parser)

    if options.loglevel >= 1:
        options.stdlog.write("# output goes to:\n")
        options.stdlog.write("# errors: %s\n" %
                             (options.filename_output_pattern % "errors"))
        options.stdlog.write("# missed query: %s\n" %
                             (options.filename_output_pattern %
                              "missed_queries"))
        options.stdlog.write("# missed sbjct: %s\n" %
                             (options.filename_output_pattern %
                              "missed_sbjcts"))
        options.stdlog.write("# missed self: %s\n" %
                             (options.filename_output_pattern %
                              "missed_self"))

    outfile_errors = open(options.filename_output_pattern % "errors", "w")

    if options.sort_order == "numeric":
        f = int
    else:
        f = str

    if options.filename_vertices:
        vv, errors = IOTools.ReadList(
            open(options.filename_vertices, "r"), map_function=f)
        # use flags for vertices
        # 1st bit: is query: 1
        # 2nd bit: is sbjct: 2
        # 3rd bit: has self: 4
        vertices = {}
        for v in vv:
            vertices[v] = 0
    else:
        # the original raised a bare string, which is not a valid
        # exception type
        raise ValueError("for the time being, specify a vertex file.")
    options.stdout.write(
        "nqueries\tnsbjcts\tnvertices\tnlinks\tnlines\tnerrors\t"
        "ncomments\tis_sorted\tnexpected\tnmissed_queries\t"
        "nmissed_sbjcts\tnmissed_self\n")

    ncomments, nlinks, nerrors, nlines = 0, 0, 0, 0
    is_sorted = True
    last = None

    for line in sys.stdin:

        nlines += 1

        if line[0] == "#":
            ncomments += 1
            continue

        nlinks += 1

        data = line[:-1].split("\t")

        if len(data) != options.num_fields:
            nerrors += 1
            outfile_errors.write(line)
            outfile_errors.flush()
            continue

        q, s = f(data[0]), f(data[1])

        if q == s:
            vertices[q] |= 4

        vertices[q] |= 1
        vertices[s] |= 2

        # explicit None check so a falsy vertex (0 or "") does not
        # disable the sort test
        if last is not None and last > q:
            is_sorted = False
            outfile_errors.write(
                "# sort inconsistency between %s and %s at line %i\n" %
                (last, q, nlines))
            outfile_errors.flush()
            if options.loglevel >= 1:
                options.stdlog.write(
                    "# sort inconsistency between %s and %s at line %i\n" %
                    (last, q, nlines))
                options.stdlog.flush()

        if options.report_step1 and nlines % options.report_step1 == 0:
            writeInfo(options.stdlog, vertices, nlinks, nlines,
                      nerrors, ncomments, is_sorted)

        last = q

    missed_queries, missed_sbjcts, missed_self = writeInfo(
        options.stdout, vertices, nlinks, nlines,
        nerrors, ncomments, is_sorted)

    # close before removing, so the file can be deleted cleanly
    outfile_errors.close()
    if nerrors == 0:
        os.remove(options.filename_output_pattern % "errors")

    if missed_queries:
        writeSet(
            open(options.filename_output_pattern % "missed_queries", "w"),
            missed_queries)

    if missed_sbjcts:
        writeSet(
            open(options.filename_output_pattern % "missed_sbjcts", "w"),
            missed_sbjcts)

    if missed_self:
        writeSet(
            open(options.filename_output_pattern % "missed_self", "w"),
            missed_self)

    E.Stop()
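# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the vertex
# bookkeeping above packs three booleans into one integer per vertex.
# A minimal, self-contained illustration of the flag scheme; the
# categories mirror the checks writeInfo() is assumed to perform.


IS_QUERY, IS_SBJCT, HAS_SELF = 1, 2, 4


def classify_vertices(flags):
    """Split a dict of vertex -> flags into the missed categories.

    A vertex is "missed" as a query, a sbjct, or a self-link when the
    corresponding bit was never set while scanning the graph.
    """
    missed_queries = [v for v, x in flags.items() if not x & IS_QUERY]
    missed_sbjcts = [v for v, x in flags.items() if not x & IS_SBJCT]
    missed_self = [v for v, x in flags.items() if not x & HAS_SELF]
    return missed_queries, missed_sbjcts, missed_self

# For example, after seeing the links a-a and a-b:
#   flags = {"a": IS_QUERY | IS_SBJCT | HAS_SELF, "b": IS_SBJCT}
#   classify_vertices(flags) -> (["b"], [], ["b"])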
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: optic/annotate_clusters.py 2781 "
        "2009-09-10 11:33:14Z andreas $")

    parser.add_option("-r", "--species-regex", dest="species_regex",
                      type="string",
                      help="regular expression to extract species from "
                      "identifier.")
    parser.add_option("--filename-map", dest="filename_map_id2cluster",
                      type="string",
                      help="filename with mapping information from id "
                      "to cluster.")
    parser.add_option("--filename-interpro", dest="filename_interpro",
                      type="string",
                      help="filename with interpro domain information.")
    parser.add_option("--filename-pfam", dest="filename_pfam",
                      type="string",
                      help="filename with pfam domain information.")

    parser.set_defaults(
        master_species="dmel_vs_dmel4",
        separator="|",
        filename_map_id2cluster="input.map",
        filename_interpro="/home/andreas/projects/flies/data_1v5/interpro.list",
        filename_pfam="/home/andreas/projects/flies/data_1v5/pfam.list",
        write_no_annotation=True,
        separator_fields=";",
    )

    (options, args) = E.Start(parser,
                              add_psql_options=True,
                              add_csv_options=True)

    clusters, nerrors = IOTools.ReadList(sys.stdin)

    map_id2cluster, map_cluster2id = IOTools.ReadMap(
        open(options.filename_map_id2cluster, "r"), both_directions=True)

    if len(clusters) == 0:
        clusters = map_cluster2id.keys()
        clusters.sort()

    # initialize both maps so the checks below do not raise a
    # NameError when no annotation file is given
    map_id2interpro, map_id2pfam = {}, {}

    if options.filename_interpro:
        map_id2interpro = readAnnotationInterpro(
            open(options.filename_interpro, "r"))

    if options.filename_pfam:
        map_id2pfam = readAnnotationPfam(open(options.filename_pfam, "r"))

    ninput, noutput, nnomaster, nnoannotation = 0, 0, 0, 0
    nskipped = 0

    options.stdout.write("cluster\tgenes")
    if map_id2interpro:
        options.stdout.write("\tinterpro\tidescription")
    if map_id2pfam:
        options.stdout.write("\tpfam\tpdescription")
    options.stdout.write("\n")

    for cluster in clusters:

        ninput += 1

        if cluster not in map_cluster2id:
            if options.loglevel >= 1:
                options.stdlog.write("# cluster %s not in map.\n" %
                                     cluster)
            nskipped += 1
            continue

        genes = set()

        for id in map_cluster2id[cluster]:
            s, t, g, q = id.split(options.separator)
            if s != options.master_species:
                continue
            genes.add(g)

        if not genes:
            nnomaster += 1
            continue

        annotations_interpro = {}
        if map_id2interpro:
            for gene in genes:
                if gene in map_id2interpro:
                    for annotation in map_id2interpro[gene]:
                        annotations_interpro[
                            annotation.mIdentifier] = annotation

        annotations_pfam = {}
        if map_id2pfam:
            for gene in genes:
                if gene in map_id2pfam:
                    for annotation in map_id2pfam[gene]:
                        annotations_pfam[
                            annotation.mIdentifier] = annotation

        nannotations = max(len(annotations_pfam),
                           len(annotations_interpro))

        if nannotations == 0 and not options.write_no_annotation:
            nnoannotation += 1
            continue

        options.stdout.write("%s\t%s" % (cluster, ";".join(genes)))

        if map_id2interpro:
            printAnnotations(options.stdout, annotations_interpro,
                             options)
        if map_id2pfam:
            printAnnotations(options.stdout, annotations_pfam, options)

        options.stdout.write("\n")

        noutput += 1

    if options.loglevel >= 1:
        options.stdlog.write(
            "# ninput=%i, noutput=%i, nskipped=%i, nnomaster=%i, "
            "nnoannotation=%i\n" %
            (ninput, noutput, nskipped, nnomaster, nnoannotation))

    E.Stop()
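# ----------------------------------------------------------------------
# Illustrative sketch (not part of the original script): cluster
# members above are "|"-separated identifiers unpacked as four fields.
# The field names (species, transcript, gene, quality) are an
# assumption based on how the script uses them; a tolerant parser for
# that convention might look like this.


def parse_member_id(identifier, separator="|"):
    """Split an id into (species, transcript, gene, quality).

    Raises ValueError when the identifier does not have exactly four
    separator-delimited fields, mirroring the tuple unpacking above.
    """
    fields = identifier.split(separator)
    if len(fields) != 4:
        raise ValueError("expected 4 fields in %r, got %i" %
                         (identifier, len(fields)))
    species, transcript, gene, quality = fields
    return species, transcript, gene, quality

# e.g. parse_member_id("dmel_vs_dmel4|CG1234-RA|CG1234|CG")
#      -> ("dmel_vs_dmel4", "CG1234-RA", "CG1234", "CG")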