# Module-level imports assumed by this script (pre-package CGAT layout;
# adjust to "import CGAT.Experiment as E" etc. for later releases).
# AddAnnotation, maskColumns, removeUnalignedPairs, filterMali and
# maskMali are defined elsewhere in this script.
import os
import sys
import time

import Experiment as E
import IOTools
import Mali


def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: mali2mali.py 2782 2009-09-10 11:40:29Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-i", "--input-format", dest="input_format", type="choice",
        choices=("plain", "fasta", "clustal", "stockholm", "phylip"),
        help="input format of multiple alignment [default=%default].")

    parser.add_option(
        "-o", "--output-format", dest="output_format", type="choice",
        choices=("plain", "fasta", "stockholm", "phylip", "nexus",
                 "plain-fasta"),
        help="output format of multiple alignment [default=%default].")

    parser.add_option(
        "--with-ranges", dest="with_ranges", action="store_true",
        help="output alignment ranges (suffix /from-to after identifier) "
        "[default=%default].")

    parser.add_option(
        "--without-ranges", dest="with_ranges", action="store_false",
        help="do not output alignment ranges (suffix /from-to after "
        "identifier) [default=%default].")

    parser.add_option(
        "-u", "--allow-duplicates", dest="allow_duplicates",
        action="store_true",
        help="permit duplicate entries [default=%default].")

    parser.add_option(
        "-m", "--method", dest="methods", type="string",
        help="methods to apply. Several methods can be specified in a "
        "','-separated list [default=%default].")

    parser.add_option(
        "-p", "--parameters", dest="parameters", type="string",
        help="parameter stack for methods that require one "
        "[default=%default].")

    parser.add_option(
        "-a", "--mask-char", dest="mask_char", type="string",
        help="character to identify/set masked characters "
        "[default=%default].")

    parser.set_defaults(
        input_format="fasta",
        output_format="fasta",
        methods="",
        parameters="",
        mask_char="x",
        gap_chars="-.nN",
        with_ranges=True,
        allow_duplicates=False,
    )

    (options, args) = E.Start(parser)

    options.methods = options.methods.split(",")
    options.parameters = options.parameters.split(",")

    # 1. read multiple alignment in various formats
    if options.allow_duplicates:
        mali = Mali.SequenceCollection()
    else:
        mali = Mali.Mali()

    t1 = time.time()

    mali.readFromFile(options.stdin, format=options.input_format)

    E.info("read mali with %i entries in %i seconds." %
           (len(mali), time.time() - t1))

    if len(mali) == 0:
        raise ValueError("empty multiple alignment")

    for method in options.methods:

        t1 = time.time()

        if method == "remove-unaligned-ends":
            mali.removeUnalignedEnds()
        elif method == "remove-end-gaps":
            mali.removeEndGaps()
        elif method == "remove-all-gaps":
            mali.removeGaps(minimum_gaps=len(mali))
        elif method == "remove-any-gaps":
            mali.removeGaps(minimum_gaps=1)
        elif method == "remove-some-gaps":
            minimum_gaps = int(options.parameters[0])
            del options.parameters[0]
            mali.removeGaps(minimum_gaps=minimum_gaps)
        elif method == "remove-empty-sequences":
            mali.removeEmptySequences()
        elif method == "upper":
            mali.upperCase()
        elif method == "lower":
            mali.lowerCase()
        elif method == "mark-codons":
            mali.markCodons()
        elif method == "remove-stops":
            mali.removePattern(
                lambda x: x.upper() in ("TAG", "TAA", "TGA"),
                allowed_matches=0,
                minimum_matches=1,
                delete_frame=3,
                search_frame=3)
        elif method == "shift-alignment":
            map_id2offset = IOTools.ReadMap(open(options.parameters[0], "r"),
                                            map_functions=(str, int))
            del options.parameters[0]
            mali.shiftAlignment(map_id2offset)
        elif method == "propagate-masks":
            mali.propagateMasks(mask_char=options.mask_char)
        elif method == "recount":
            mali.recount()
        elif method in ("mark-transitions", "filter-odd-transitions",
                        "filter-even-transitions", "keep-even-segments",
                        "keep-odd-segments"):

            if os.path.exists(options.parameters[0]):
                map_id2transitions = IOTools.readMultiMap(
                    open(options.parameters[0], "r"),
                    map_functions=(str, int))
            else:
                map_id2transitions = {}
                r = map(int, options.parameters[0].split(':'))
                r.sort()
                map_id2transitions["mali"] = r

            del options.parameters[0]
            if method == "mark-transitions":
                mali.markTransitions(map_id2transitions)
            elif method in ("filter-odd-transitions", "keep-even-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-odd")
            elif method in ("filter-even-transitions", "keep-odd-segments"):
                mali.markTransitions(map_id2transitions, mode="keep-even")

        elif method == "propagate-transitions":
            mali.propagateTransitions()

        elif method == "map-annotation":
            # map annotations in one mali (stockholm-format) to the
            # annotations in another. Note: the first two sequence
            # identifiers must be shared and the sequences must be of
            # the same length.
            other_mali = Mali.Mali()
            other_mali.readFromFile(open(options.parameters[0], "r"),
                                    format="stockholm")
            del options.parameters[0]
            mali.copyAnnotations(other_mali)

        elif method == "add-annotation":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            AddAnnotation(mali, annotation_type, annotation_file)

        elif method == "mask-columns":
            annotation_type, annotation_file = options.parameters[:2]
            del options.parameters[:2]
            maskColumns(mali, annotation_type, annotation_file)

        elif method == "remove-unaligned-pairs":
            removeUnalignedPairs(mali, options)

        elif method == "filter-3rd":
            filterMali(mali, "3rd")

        elif method == "filter-4d":
            filterMali(mali, "4d")

        elif method in ("mask-seg", "mask-bias"):
            a, b = method.split("-")
            maskMali(mali, b)

        elif method == "exclude-with-stop":
            mali.filter(method="with-stop")

        elif method == "exclude-with-frameshift":
            mali.filter(method="with-frameshift")

        E.info("applied method %s in %i seconds." %
               (method, time.time() - t1))

    mali.writeToFile(options.stdout,
                     format=options.output_format,
                     write_ranges=options.with_ranges)

    E.Stop()
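# The method loop above shares a single parameter stack: each method that
# needs arguments pops them off the front of options.parameters, so the
# order of --parameters entries must match the order of --method entries.
# A minimal sketch of that convention (the method/file names below are
# made-up examples, not part of the script):

def consume(parameters, n=1):
    """Pop the first n entries from the shared parameter stack, mirroring
    the "use options.parameters[0], then del options.parameters[0]"
    pattern in main()."""
    values = parameters[:n]
    del parameters[:n]
    return values

# e.g. --method=remove-some-gaps,shift-alignment --parameters=2,offsets.map
params = "2,offsets.map".split(",")
minimum_gaps = int(consume(params)[0])  # consumed by remove-some-gaps
offset_file = consume(params)[0]        # consumed by shift-alignment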
# Module-level imports assumed by this script (pre-package CGAT layout).
# readWorkspace, readSegments, indexIntervals, the Sampler* and Counter*
# classes, CountingResults, plotCounts and the count transforms are
# defined elsewhere in this script.
import os
import sys
import collections
import itertools

import numpy
import scipy.stats
import matplotlib.pyplot as plt

import Experiment as E
import IOTools


def main(argv=sys.argv):

    parser = E.OptionParser(
        version="%prog version: $Id: annotator_distance.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-a", "--filename-annotations", dest="filename_annotations",
        type="string",
        help="filename mapping gene ids to annotations (a tab-separated "
        "table with two columns) [default=%default].")

    parser.add_option(
        "-r", "--resolution", dest="resolution", type="int",
        help="resolution of count vector [default=%default].")

    parser.add_option(
        "-b", "--num-bins", dest="num_bins", type="int",
        help="number of bins in count vector [default=%default].")

    parser.add_option(
        "-i", "--num-samples", dest="num_samples", type="int",
        help="sample size to compute [default=%default].")

    parser.add_option(
        "-w", "--workspace", dest="filename_workspace", type="string",
        help="filename with workspace information [default=%default].")

    parser.add_option(
        "--workspace-builder", dest="workspace_builder", type="choice",
        choices=("gff", "gtf-intergenic", "gtf-intronic", "gtf-genic"),
        help="given a gff/gtf file build a workspace [default=%default].")

    parser.add_option(
        "--workspace-labels", dest="workspace_labels", type="choice",
        choices=("none", "direction", "annotation"),
        help="labels to use for the workspace [default=%default].")

    parser.add_option(
        "--sampler", dest="sampler", type="choice",
        choices=("permutation", "gaps"),
        help="sampler to use. The sampler determines the null model of how "
        "segments are distributed in the workspace [default=%default]")

    parser.add_option(
        "--counter", dest="counters", type="choice", action="append",
        choices=("transcription", "closest-distance", "all-distances"),
        help="counter to use. The counter computes the quantity of "
        "interest [default=%default]")

    parser.add_option(
        "--analysis", dest="analysis", type="choice", action="append",
        choices=("proximity", "area-under-curve"),
        help="analysis to perform [default=%default]")

    parser.add_option(
        "--transform-counts", dest="transform_counts", type="choice",
        choices=("raw", "cumulative"),
        help="cumulate counts [default=%default].")

    parser.add_option(
        "-s", "--segments", dest="filename_segments", type="string",
        help="filename with segment information [default=%default].")

    parser.add_option(
        "--xrange", dest="xrange", type="string",
        help="xrange to plot [default=%default]")

    parser.add_option(
        "-o", "--logscale", dest="logscale", type="string",
        help="use logscale on x, y or xy [default=%default]")

    parser.add_option(
        "-p", "--plot", dest="plot", action="store_true",
        help="output plots [default=%default]")

    parser.add_option(
        "--hardcopy", dest="hardcopy", type="string",
        help="output hardcopies to file [default=%default]")

    parser.add_option(
        "--no-fdr", dest="do_fdr", action="store_false",
        help="do not compute FDR rates [default=%default]")

    parser.add_option(
        "--segments-format", dest="segments_format", type="choice",
        choices=("gtf", "bed"),
        help="format of segments file [default=%default].")

    parser.add_option(
        "--truncate", dest="truncate", action="store_true",
        help="truncate segments extending beyond a workspace "
        "[default=%default]")

    parser.add_option(
        "--remove-overhangs", dest="remove_overhangs", action="store_true",
        help="remove segments extending beyond a workspace "
        "[default=%default]")

    parser.add_option(
        "--keep-ambiguous", dest="keep_ambiguous", action="store_true",
        help="keep segments extending to more than one workspace "
        "[default=%default]")

    parser.set_defaults(
        filename_annotations=None,
        filename_workspace="workspace.gff",
        filename_segments="FastDown.gtf",
        filename_annotations_gtf="../data/tg1_territories.gff",
        workspace_builder="gff",
        workspace_labels="none",
        sampler="permutation",
        truncate=False,
        num_bins=10000,
        num_samples=10,
        resolution=100,
        plot_samples=False,
        plot_envelope=True,
        counters=[],
        transform_counts="raw",
        xrange=None,
        plot=False,
        logscale=None,
        output_all=False,
        do_test=False,
        analysis=[],
        do_fdr=True,
        hardcopy="%s.png",
        segments_format="gtf",
        remove_overhangs=False,
    )

    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ###########################################
    # setup options
    if options.sampler == "permutation":
        sampler = SamplerPermutation
    elif options.sampler == "gaps":
        sampler = SamplerGaps

    if options.xrange:
        options.xrange = map(float, options.xrange.split(","))

    if len(options.counters) == 0:
        raise ValueError("please specify at least one counter.")

    if len(options.analysis) == 0:
        raise ValueError("please specify at least one analysis.")

    if options.workspace_labels == "annotation" and \
            not options.filename_annotations:
        raise ValueError(
            "please specify --filename-annotations if "
            "--workspace-labels=annotation.")

    ###########################################
    # read data
    if options.workspace_labels == "annotation":
        def constant_factory(value):
            return itertools.repeat(value).next

        def dicttype():
            return collections.defaultdict(constant_factory(("unknown",)))

        map_id2annotations = IOTools.readMultiMap(
            open(options.filename_annotations, "r"),
            dtype=dicttype)
    else:
        map_id2annotations = {}

    workspace = readWorkspace(open(options.filename_workspace, "r"),
                              options.workspace_builder,
                              options.workspace_labels,
                              map_id2annotations)

    E.info("read workspace for %i contigs" % (len(workspace)))

    indexed_workspace = indexIntervals(workspace, with_values=True)
    segments = readSegments(open(options.filename_segments, "r"),
                            indexed_workspace,
                            format=options.segments_format,
                            keep_ambiguous=options.keep_ambiguous,
                            truncate=options.truncate,
                            remove_overhangs=options.remove_overhangs)

    nsegments = 0
    for contig, vv in segments.iteritems():
        nsegments += len(vv)

    E.info("read %i segments for %i contigs" % (nsegments, len(workspace)))
    indexed_segments = indexIntervals(segments, with_values=False)

    if nsegments == 0:
        E.warn("no segments read - no computation done.")
        E.Stop()
        return

    # build labels
    labels = collections.defaultdict(int)
    for contig, vv in workspace.iteritems():
        for start, end, v in vv:
            for l in v[0]:
                labels[l] += 1
            for l in v[1]:
                labels[l] += 1

    E.info("found %i workspace labels" % len(labels))

    ###########################################
    # setup counting containers
    counters = []
    for cc in options.counters:

        if cc == "transcription":
            counter = CounterTranscription
        elif cc == "closest-distance":
            counter = CounterClosestDistance
        elif cc == "all-distances":
            counter = CounterAllDistances

        if nsegments < 256:
            dtype = numpy.uint8
        elif nsegments < 65536:
            dtype = numpy.uint16
        elif nsegments < 4294967296:
            dtype = numpy.uint32
        else:
            dtype = numpy.int

        E.debug("chosen dtype %s" % str(dtype))

        E.info("sample space is %i bases: %i bins at %i resolution" %
               (options.num_bins * options.resolution,
                options.num_bins,
                options.resolution,
                ))

        E.info("allocating counts: %i bytes (%i labels, %i samples, %i bins)" %
               (options.num_bins * len(labels) * dtype().itemsize *
                (options.num_samples + 1),
                len(labels),
                options.num_samples,
                options.num_bins,
                ))

        c = CountingResults(labels)
        c.mObservedCounts = counter(
            labels, options.num_bins, options.resolution, dtype=dtype)

        simulated_counts = []
        for x in range(options.num_samples):
            simulated_counts.append(
                counter(labels, options.num_bins, options.resolution,
                        dtype=dtype))
        c.mSimulatedCounts = simulated_counts
        c.mName = c.mObservedCounts.mName

        counters.append(c)

    E.info("allocated memory successfully")

    segments_per_workspace = []
    segment_sizes = []
    segments_per_label = collections.defaultdict(int)
    workspaces_per_label = collections.defaultdict(int)

    ############################################
    # get observed and simulated counts
    nworkspaces, nempty_workspaces, nempty_contigs, nmiddle = 0, 0, 0, 0
    iteration2 = 0
    for contig, vv in workspace.iteritems():

        iteration2 += 1
        E.info("counting %i/%i: %s %i segments" %
               (iteration2, len(workspace), contig, len(vv)))

        if len(vv) == 0:
            continue

        iteration1 = 0
        for work_start, work_end, v in vv:

            left_labels, right_labels = v[0], v[1]

            iteration1 += 1

            # ignore empty segments
            if contig not in indexed_segments:
                nempty_contigs += 1
                continue

            r = indexed_segments[contig].find(work_start, work_end)
            segments_per_workspace.append(len(r))

            if not r:
                nempty_workspaces += 1
                continue

            # collect segments and stats
            nworkspaces += 1
            observed = [(x.start, x.end) for x in r]
            observed.sort()
            segments_per_workspace.append(len(observed))
            segment_sizes.extend([x[1] - x[0] for x in observed])

            # collect basic counts
            for label in list(left_labels) + list(right_labels):
                workspaces_per_label[label] += 1
                segments_per_label[label] += len(observed)

            # add observed counts
            for counter in counters:
                counter.mObservedCounts.addCounts(
                    observed, work_start, work_end,
                    left_labels, right_labels)

            # create sampler
            s = sampler(observed, work_start, work_end)

            # add simulated counts
            for iteration in range(options.num_samples):
                simulated = s.sample()
                for counter in counters:
                    counter.mSimulatedCounts[iteration].addCounts(
                        simulated, work_start, work_end,
                        left_labels, right_labels)

    E.info("counting finished")
    E.info("nworkspaces=%i, nmiddle=%i, nempty_workspaces=%i, "
           "nempty_contigs=%i" %
           (nworkspaces, nmiddle, nempty_workspaces, nempty_contigs))

    ######################################################
    # transform counts
    if options.transform_counts == "cumulative":
        transform = cumulative_transform
    elif options.transform_counts == "raw":
        transform = normalize_transform

    ####################################################
    # analysis
    if "proximity" in options.analysis:
        outfile_proximity = E.openOutputFile("proximity")
        outfile_proximity.write("\t".join(
            ("label", "observed", "pvalue", "expected",
             "CIlower", "CIupper", "qvalue",
             "segments", "workspaces")) + "\n")
    else:
        outfile_proximity = None

    if "area-under-curve" in options.analysis:
        outfile_auc = E.openOutputFile("auc")
        outfile_auc.write("label\tobserved\texpected\tCIlower\tCIupper\n")
    else:
        outfile_auc = None

    # qvalue: expected false positives at p-value
    # qvalue = expected false positives /
    if options.do_fdr:
        E.info("computing pvalues for fdr")
        for counter in counters:
            for label in labels:
                E.info("working on counter:%s label:%s" % (counter, label))

                # collect all P-Values of simulated results to compute FDR
                sim_pvalues = []
                medians = counter.getMedians(label)

                for median in medians:
                    pvalue = float(
                        scipy.stats.percentileofscore(
                            medians, median)) / 100.0
                    sim_pvalues.append(pvalue)

        sim_pvalues.sort()
    else:
        sim_pvalues = []

    # compute observed p-values
    for counter in counters:
        counter.update()

    obs_pvalues = []
    for counter in counters:
        for label in labels:
            obs_pvalues.append(counter.mStats[label].pvalue)
    obs_pvalues.sort()

    # compute FDR
    if options.do_fdr:
        for counter in counters:
            counter.updateFDR(obs_pvalues, sim_pvalues)

    for counter in counters:

        outofbounds_sim, totals_sim = 0, 0
        outofbounds_obs, totals_obs = 0, 0
        for label in labels:
            for sample in range(options.num_samples):
                if counter.mSimulatedCounts[sample].mOutOfBounds[label]:
                    E.debug("out of bounds: sample %i, label %s, counts=%i" %
                            (sample, label,
                             counter.mSimulatedCounts[sample].mOutOfBounds[label]))
                    outofbounds_sim += counter.mSimulatedCounts[
                        sample].mOutOfBounds[label]
                totals_sim += counter.mSimulatedCounts[sample].mTotals[label]

            outofbounds_obs += counter.mObservedCounts.mOutOfBounds[label]
            totals_obs += counter.mObservedCounts.mTotals[label]

        E.info("out of bounds observations: observed=%i/%i (%5.2f%%), "
               "simulations=%i/%i (%5.2f%%)" %
               (outofbounds_obs, totals_obs,
                100.0 * outofbounds_obs / totals_obs,
                outofbounds_sim, totals_sim,
                100.0 * outofbounds_sim / totals_sim,
                ))

        for label in labels:

            if outfile_auc:
                mmin, mmax, mmean = counter.getEnvelope(
                    label, transform=normalize_transform)

                obs = normalize_transform(
                    counter.mObservedCounts[label],
                    counter.mObservedCounts.mOutOfBounds[label])

                def block_iterator(a1, a2, a3, num_bins):
                    # yield blocks of consecutive bins where a1 exceeds a2,
                    # scored by the excess of a1 over a3
                    x = 0
                    while x < num_bins:
                        while x < num_bins and a1[x] <= a2[x]:
                            x += 1
                        start = x
                        while x < num_bins and a1[x] > a2[x]:
                            x += 1
                        end = x
                        total_a1 = a1[start:end].sum()
                        total_a3 = a3[start:end].sum()
                        if total_a1 > total_a3:
                            yield (total_a1 - total_a3, start, end,
                                   total_a1, total_a3)

                blocks = list(
                    block_iterator(obs, mmax, mmean, options.num_bins))

                if options.output_all:
                    for delta, start, end, total_obs, total_mean in blocks:
                        if end - start <= 1:
                            continue
                        outfile_auc.write(
                            "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                            (label,
                             start * options.resolution,
                             end * options.resolution,
                             (end - start) * options.resolution,
                             total_obs, total_mean, delta,
                             total_obs / total_mean,
                             100.0 * (total_obs / total_mean - 1.0)))

                # output best block
                blocks.sort()
                delta, start, end, total_obs, total_mean = blocks[-1]

                outfile_auc.write(
                    "%s\t%i\t%i\t%i\t%f\t%f\t%f\t%f\t%f\n" %
                    (label,
                     start * options.resolution,
                     end * options.resolution,
                     (end - start) * options.resolution,
                     total_obs, total_mean, delta,
                     total_obs / total_mean,
                     100.0 * (total_obs / total_mean - 1.0)))

            if outfile_proximity:
                # find error bars at median
                st = counter.mStats[label]
                outfile_proximity.write(
                    "%s\t%i\t%f\t%i\t%i\t%i\t%s\t%i\t%i\n" %
                    (label,
                     st.observed * options.resolution,
                     st.pvalue,
                     st.expected * options.resolution,
                     st.ci95lower * options.resolution,
                     st.ci95upper * options.resolution,
                     IOTools.prettyFloat(st.qvalue),
                     segments_per_label[label],
                     workspaces_per_label[label],
                     ))

    if options.plot:

        for counter in counters:
            plotCounts(counter, options, transform)

        # plot summary stats
        plt.figure()
        plt.title("distribution of workspace length")
        data = []
        for contig, segs in workspace.iteritems():
            if len(segs) == 0:
                continue
            data.extend([x[1] - x[0] for x in segs])

        vals, bins = numpy.histogram(
            data, bins=numpy.arange(0, max(data), 100), new=True)

        t = float(sum(vals))
        plt.plot(bins[:-1], numpy.cumsum(vals) / t)
        plt.gca().set_xscale('log')
        plt.legend()
        plt.xlabel("size of workspace")
        plt.ylabel("cumulative relative frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspace_size"))

        plt.figure()
        plt.title("segments per block")
        vals, bins = numpy.histogram(
            segments_per_workspace,
            bins=numpy.arange(0, max(segments_per_workspace), 1),
            new=True)
        plt.plot(bins[:-1], vals)
        plt.xlabel("segments per block")
        plt.ylabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_block"))

        plt.figure()
        plt.title("workspaces per label")
        plt.barh(range(0, len(labels)),
                 [workspaces_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("workspaces per label")
        plt.xlabel("absolute frequency")
        plt.gca().set_xscale('log')
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "workspaces_per_label"))

        plt.figure()
        plt.title("segments per label")
        plt.barh(range(0, len(labels)),
                 [segments_per_label[x] for x in labels],
                 height=0.5)
        plt.yticks(range(0, len(labels)), labels)
        plt.ylabel("segments per label")
        plt.xlabel("absolute frequency")
        if options.hardcopy:
            plt.savefig(
                os.path.expanduser(options.hardcopy % "segments_per_label"))

        if not options.hardcopy:
            plt.show()

    E.Stop()
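# The sampler selected via --sampler supplies the null model: it is
# constructed per workspace with (observed, work_start, work_end) and its
# sample() method returns a randomized segment list. The real
# SamplerPermutation is defined elsewhere in this script; the class below
# is only a hedged sketch of the idea behind the "permutation" null model
# (keep the observed segment lengths, shuffle their order, redistribute
# the free space at random) and is not the script's implementation.
import random


class PermutationSamplerSketch(object):

    def __init__(self, observed, work_start, work_end):
        # segment lengths are preserved; only placement is randomized
        self.lengths = [end - start for start, end in observed]
        self.work_start = work_start
        self.free = (work_end - work_start) - sum(self.lengths)

    def sample(self):
        random.shuffle(self.lengths)
        # split the free space into random gaps between segments
        cuts = sorted(random.randint(0, self.free) for _ in self.lengths)
        segments, pos, prev = [], self.work_start, 0
        for length, cut in zip(self.lengths, cuts):
            pos += cut - prev            # random gap before this segment
            segments.append((pos, pos + length))
            pos += length
            prev = cut
        return segments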
# Module-level imports assumed by this script (pre-package CGAT layout);
# PipelineEnrichment provides outputSegments.
import re
import sys
import collections
import itertools

import Experiment as E
import IOTools
import GTF
import GFF
import IndexedFasta
import PipelineEnrichment


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.OptionParser(
        version="%prog version: $Id: gff2annotator2tsv.py 2861 2010-02-23 17:36:32Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-g", "--genome-file", dest="genome_file", type="string",
        help="filename with genome.")

    parser.add_option(
        "-f", "--features", dest="feature", type="string",
        help="feature to collect [default=None].")

    parser.add_option(
        "-i", "--files", dest="files", action="append",
        help="use multiple annotations [default=None].")

    parser.add_option(
        "-a", "--annotations", dest="annotations", type="string",
        help="aggregate name for annotations if only a single file is "
        "provided from STDIN [default=None].")

    parser.add_option(
        "--input-filename-map", dest="input_filename_map", type="string",
        help="filename with a map of gene_ids to categories "
        "[default=None].")

    parser.add_option(
        "--output-filename-synonyms", dest="output_filename_synonyms",
        type="string",
        help="output filename for synonyms. For workspace building, the "
        "gff source will be used as the id (instead of the contig) "
        "[default=None].")

    parser.add_option(
        "-m", "--max-length", dest="max_length", type="int",
        help="maximum segment length [default=None].")

    parser.add_option(
        "-s", "--section", dest="section", type="choice",
        choices=("segments", "annotations", "annotations-genes",
                 "annotations-go", "workspace", "annotations-gff"),
        help="annotator section [default=None].")

    parser.add_option(
        "--subset", dest="subsets", type="string", action="append",
        help="add filenames to delimit subsets within the gff files. "
        "The syntax is filename.gff,label,filename.ids [default=None].")

    parser.add_option(
        "--remove-regex", dest="remove_regex", type="string",
        help="regular expression of contigs to remove [default=None].")

    parser.set_defaults(
        genome_file=None,
        feature=None,
        section="segments",
        annotations="annotations",
        max_length=100000,
        files=[],
        subsets=[],
        input_filename_map=None,
        output_filename_synonyms=None,
        input_format="gff",
        remove_regex=None,
    )

    (options, args) = E.Start(parser)

    options.files += args
    if len(options.files) == 0:
        options.files.append("-")
    options.files = list(
        itertools.chain(*[re.split("[,; ]+", x) for x in options.files]))

    if options.subsets:
        subsets = collections.defaultdict(list)
        for s in options.subsets:
            filename_gff, label, filename_ids = s.split(",")
            subsets[filename_gff].append((label, filename_ids))
        options.subsets = subsets

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
    else:
        fasta = None

    if options.section == "segments":
        prefix = "##Segs"
    elif options.section.startswith("annotations"):
        prefix = "##Id"
    elif options.section == "workspace":
        prefix = "##Work"
    else:
        raise ValueError("unknown section %s" % options.section)

    ninput, ncontigs, nsegments, ndiscarded = 0, 0, 0, 0

    if options.remove_regex:
        options.remove_regex = re.compile(options.remove_regex)

    if options.section in ("segments", "workspace"):

        iterator = GTF.iterator_filtered(GFF.iterator(options.stdin),
                                         feature=options.feature)

        if options.output_filename_synonyms:
            outfile_synonyms = open(options.output_filename_synonyms, "w")
            with_records = True
        else:
            outfile_synonyms = None
            with_records = False

        intervals = GTF.readAsIntervals(iterator, with_records=with_records)
        ninput, nsegments, ndiscarded, ncontigs = \
            PipelineEnrichment.outputSegments(
                options.stdout,
                intervals,
                options.section,
                outfile_synonyms=outfile_synonyms,
                max_length=options.max_length,
                remove_regex=options.remove_regex)

        if outfile_synonyms:
            outfile_synonyms.close()

    elif options.section == "annotations-go":

        assert options.input_filename_map, \
            "please supply option --input-filename-map"

        iterator = GTF.iterator_filtered(GTF.iterator(options.stdin),
                                         feature=options.feature)

        geneid2categories = IOTools.readMultiMap(
            open(options.input_filename_map, "r"))

        category2segments = collections.defaultdict(list)

        for contig, gffs in GTF.readAsIntervals(
                iterator, with_gene_id=True).items():

            if options.remove_regex and options.remove_regex.search(contig):
                continue

            for start, end, geneid in gffs:
                if geneid not in geneid2categories:
                    continue
                for category in geneid2categories[geneid]:
                    category2segments[category].append(nsegments)

                options.stdout.write(
                    "%s\t%i\t%s\t(%i,%i)\n" %
                    (prefix, nsegments, contig, start, end))
                nsegments += 1

        for category, segments in category2segments.iteritems():
            options.stdout.write(
                "##Ann\t%s\t%s\n" %
                (category, "\t".join(["%i" % x for x in segments])))
            E.info("set %s annotated with %i segments" %
                   (category, len(segments)))

    elif options.section == "annotations":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments
            is_gtf = False

            if filename == "-":
                iterator = GTF.iterator_filtered(GFF.iterator(sys.stdin),
                                                 feature=options.feature)
                filename = options.annotations
            elif filename.endswith(".gtf"):
                is_gtf = True
                # keep the file handle open - the iterator is consumed
                # lazily below
                infile = open(filename, "r")
                iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                                 feature=options.feature)
            else:
                infile = open(filename, "r")
                iterator = GTF.iterator_filtered(GFF.iterator(infile),
                                                 feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (filename, "\t".join(
                        ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))
            else:
                raise ValueError("don't know how to filter %s" % filename)

    elif options.section == "annotations-gff":

        for filename in options.files:
            if filename == "-":
                iterator = GTF.iterator(sys.stdin)
            else:
                iterator = GTF.iterator_filtered(
                    GFF.iterator(open(filename, "r")))

            segments = collections.defaultdict(list)
            for gff in iterator:
                segments[":".join((gff.source, gff.feature))].append(
                    (gff.contig, gff.start, gff.end))

            feature2segments = {}

            for feature, s in segments.iteritems():
                s.sort()

                s1 = nsegments

                for contig, start, end in s:
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    options.stdout.write(
                        "%s\t%i\t%s\t(%i,%i)\n" %
                        (prefix, nsegments, contig, start, end))
                    nsegments += 1

                feature2segments[feature] = (s1, nsegments)

            for feature, id_range in feature2segments.iteritems():
                start, end = id_range
                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (feature, "\t".join(
                        ["%i" % x for x in xrange(start, end)])))
                E.info("set %s annotated with %i segments" %
                       (feature, end - start))

    elif options.section == "annotations-genes":

        for filename in options.files:

            E.info("adding filename %s" % filename)

            start = nsegments

            assert filename.endswith(".gtf") or \
                filename.endswith(".gtf.gz"), \
                "requiring .gtf files for gene list filtering, " \
                "received %s" % filename

            infile = IOTools.openFile(filename)
            iterator = GTF.iterator_filtered(GTF.iterator(infile),
                                             feature=options.feature)

            E.debug("processing %s" % (filename))

            if not options.subsets or filename not in options.subsets:
                # output all
                for contig, gffs in GTF.readAsIntervals(iterator).items():
                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for x in gffs:
                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, x[0], x[1]))
                        nsegments += 1

                options.stdout.write(
                    "##Ann\t%s\t%s\n" %
                    (filename, "\t".join(
                        ["%i" % x for x in range(start, nsegments)])))
                E.info("set %s annotated with %i segments" %
                       (filename, nsegments - start))
            else:
                # create subsets
                E.debug("applying subsets for %s" % filename)
                geneid2label, label2segments = \
                    collections.defaultdict(list), {}
                for label, filename_ids in options.subsets[filename]:
                    gene_ids = IOTools.readList(open(filename_ids, "r"))
                    for gene_id in gene_ids:
                        geneid2label[gene_id].append(label)
                    label2segments[label] = []

                for contig, gffs in GTF.readAsIntervals(
                        iterator, with_gene_id=True).items():

                    if options.remove_regex and \
                            options.remove_regex.search(contig):
                        continue

                    for start, end, gene_id in gffs:
                        if gene_id not in geneid2label:
                            continue
                        for label in geneid2label[gene_id]:
                            label2segments[label].append(nsegments)

                        options.stdout.write(
                            "%s\t%i\t%s\t(%i,%i)\n" %
                            (prefix, nsegments, contig, start, end))
                        nsegments += 1

                for label, segments in label2segments.iteritems():
                    options.stdout.write(
                        "##Ann\t%s\t%s\n" %
                        (label, "\t".join(["%i" % x for x in segments])))
                    E.info("set %s (%s) annotated with %i segments" %
                           (label, filename, len(segments)))

    E.info("ninput=%i, ncontigs=%i, nsegments=%i, ndiscarded=%i" %
           (ninput, ncontigs, nsegments, ndiscarded))

    E.Stop()
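# All sections above emit the same two-part "annotator" format: one indexed
# line per segment ("<prefix>\t<id>\t<contig>\t(<start>,<end>)") followed by
# an "##Ann" line mapping a label to the segment ids it covers. A minimal
# sketch of that convention (contig and label names below are made-up
# examples, and write_annotator is a hypothetical helper, not part of the
# script):

def write_annotator(outfile, prefix, label, segments):
    """Write segments in annotator format, then one ##Ann line for label."""
    ids = []
    for nsegment, (contig, start, end) in enumerate(segments):
        outfile.write("%s\t%i\t%s\t(%i,%i)\n" %
                      (prefix, nsegment, contig, start, end))
        ids.append(nsegment)
    outfile.write("##Ann\t%s\t%s\n" %
                  (label, "\t".join(["%i" % x for x in ids])))

write_annotator(sys.stdout, "##Id", "example_label",
                [("chr1", 100, 200), ("chr2", 50, 80)])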