def main(argv):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--annotation-file", "--annotations",
                      dest="annotation_files", type="string", action="append",
                      help="filename with annotations [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments",
                      dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in "
                      "parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace",
                      dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a glob "
                      "in parentheses [default=%default].")

    parser.add_option("-i", "--isochore-file", "--isochores",
                      dest="isochore_files", type="string", action="append",
                      help="filename with isochore segments. Also accepts a glob "
                      "in parentheses [default=%default].")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=("storey", "BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice", choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to descriptions. "
                      "If given, the output table will contain additional columns "
                      "[default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks",
                      action="store_true",
                      help="ignore segment tracks - all segments belong to one "
                      "track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks",
                      action="store_true",
                      help="permit the same track to be in multiple files "
                      "[default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.add_option("--restrict-workspace", dest="restrict_workspace",
                      action="store_true",
                      help="restrict workspace to those segments that contain "
                      "both track and annotations [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice",
                      action="append", choices=("binom", "hyperg"),
                      help="counter to use [default=%default].")

    parser.add_option("--output-tables-pattern", dest="output_tables_pattern",
                      type="string",
                      help="output pattern for result tables. Used if multiple "
                      "counters are specified [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        counters=[],
        output_stats=[],
        output_bed=[],
        output_tables_pattern="%s.tsv.gz",
        output_order="fold",
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        truncate_workspace_to_annotations=False,
        truncate_segments_to_workspace=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    if len(options.counters) == 0:
        options.counters.append("binom")

    ############################################
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces, options, isochores)

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    ############################################
    # compute per contig

    # compute bases covered by workspace
    workspace2basecoverage, isochores = {}, []
    for contig, ww in workspace.iteritems():
        workspace2basecoverage[contig] = ww.sum()
        isochores.append(contig)

    # compute bases covered by annotations in workspace, per isochore
    annotation2basecoverage = collections.defaultdict(dict)
    for annotation, aa in annotations.iteritems():
        for isochore, a in aa.iteritems():
            # need to truncate to workspace?
            annotation2basecoverage[annotation][isochore] = a.sum()

    results_per_contig = collections.defaultdict(list)

    E.info("computing counts per isochore")

    # results per isochore
    def emptyResult(segment, annotation, isochore,
                    counter,
                    nsegments_in_workspace,
                    basecoverage_annotation,
                    basecoverage_workspace):
        return GREAT_RESULT._make((
            segment, annotation, isochore,
            counter,
            0,  # observed
            0,  # expected
            nsegments_in_workspace,
            0,  # nannotations_in_workspace
            0,  # nsegments_overlapping_annotation
            0,  # nannotations_overlapping_segments
            0,  # basecoverage_intersection
            0,  # basecoverage_segments
            basecoverage_annotation,
            basecoverage_workspace,
            0.0,
            1.0,
            1.0,
            1.0))

    for isochore in isochores:
        basecoverage_workspace = workspace2basecoverage[isochore]

        # iterate over all segment tracks on this isochore
        for segment, segmentdict in segments.iteritems():
            try:
                ss = segmentdict[isochore]
                # select segments overlapping workspace
                segments_in_workspace = GatSegmentList.SegmentList(clone=ss)
                segments_in_workspace.intersect(workspace[isochore])
                # number of segments in workspace
                nsegments_in_workspace = len(segments_in_workspace)
                basecoverage_segments = segments_in_workspace.sum()
            except KeyError:
                # no segments on this isochore
                ss = None
                nsegments_in_workspace = 0
                basecoverage_segments = 0

            for annotation, annotationdict in annotations.iteritems():

                # if annotation != "GO:0030957": continue

                try:
                    aa = annotationdict[isochore]
                except KeyError:
                    aa = None

                # p_A: proportion of bases covered by annotation
                try:
                    basecoverage_annotation = annotation2basecoverage[
                        annotation][isochore]
                except KeyError:
                    basecoverage_annotation = 0

                if ss is None or aa is None:
                    for counter in options.counters:
                        results_per_contig[(counter, segment, annotation)].append(
                            emptyResult(segment, annotation, isochore,
                                        counter,
                                        nsegments_in_workspace,
                                        basecoverage_annotation,
                                        basecoverage_workspace))
                    continue

                # select segments overlapping annotation
                segments_overlapping_annotation = GatSegmentList.SegmentList(
                    clone=ss)
                segments_overlapping_annotation.intersect(
                    annotations[annotation][isochore])

                # number of segments overlapping the annotation
                nsegments_overlapping_annotation = ss.intersectionWithSegments(
                    annotations[annotation][isochore],
                    mode=options.overlap_mode)

                # number of nucleotides at the intersection of segments,
                # annotation and workspace
                basecoverage_intersection = segments_overlapping_annotation.sum()

                annotations_overlapping_segments = GatSegmentList.SegmentList(
                    clone=aa)
                annotations_overlapping_segments.intersect(ss)
                nannotations_overlapping_segments = len(
                    annotations_overlapping_segments)

                nannotations_in_workspace = len(aa)
                if nannotations_in_workspace == 0:
                    for counter in options.counters:
                        results_per_contig[(counter, segment, annotation)].append(
                            emptyResult(segment, annotation, isochore,
                                        counter,
                                        nsegments_in_workspace,
                                        basecoverage_annotation,
                                        basecoverage_workspace))
                    continue

                fraction_coverage_annotation = basecoverage_annotation / \
                    float(basecoverage_workspace)
                fraction_hit_annotation = float(
                    nannotations_overlapping_segments) / nannotations_in_workspace

                for counter in options.counters:

                    if counter.startswith("binom"):
                        # GREAT binomial probability over "regions":
                        # n = number of segments in the workspace
                        #     (nsegments_in_workspace)
                        # p = fraction of the workspace covered by the annotation
                        #     (fraction_coverage_annotation)
                        # k = number of segments overlapping the annotation
                        #     (nsegments_overlapping_annotation)
                        # sf = survival function = 1 - cdf:
                        # probability of observing >= k successes in a sample
                        # of n where the probability of success is p.
                        pvalue = scipy.stats.binom.sf(
                            nsegments_overlapping_annotation - 1,
                            nsegments_in_workspace,
                            fraction_coverage_annotation)

                        expected = fraction_coverage_annotation * \
                            nsegments_in_workspace
                        observed = nsegments_overlapping_annotation

                    elif counter.startswith("hyperg"):
                        # hypergeometric probability over nucleotides
                        # (sampling without replacement):
                        # x = observed number of nucleotides in the overlap of
                        #     segments, annotations and workspace
                        # M = number of nucleotides in the workspace
                        # n = number of nucleotides in annotations (and workspace)
                        # N = number of nucleotides in segments (and workspace)
                        # P-value of obtaining > x overlapping nucleotides.
                        rv = scipy.stats.hypergeom(basecoverage_workspace,
                                                   basecoverage_annotation,
                                                   basecoverage_segments)

                        pvalue = rv.sf(basecoverage_intersection)
                        expected = rv.mean()
                        observed = basecoverage_intersection

                    if expected != 0:
                        fold = float(observed) / expected
                    else:
                        fold = 1.0

                    r = GREAT_RESULT._make((
                        segment, annotation, isochore,
                        counter,
                        observed,
                        expected,
                        nsegments_in_workspace,
                        nannotations_in_workspace,
                        nsegments_overlapping_annotation,
                        nannotations_overlapping_segments,
                        basecoverage_intersection,
                        basecoverage_segments,
                        basecoverage_annotation,
                        basecoverage_workspace,
                        fraction_coverage_annotation,
                        fold,
                        pvalue,
                        1.0))

                    # print "\t".join(map(str, r))
                    results_per_contig[(counter, segment, annotation)].append(r)

    E.info("merging counts per isochore")

    # compute sums over isochores
    results = []

    for niteration, pair in enumerate(results_per_contig.iteritems()):

        counter, segment, annotation = pair[0]
        data = pair[1]

        nsegments_in_workspace = sum([x.nsegments_in_workspace for x in data])
        nsegments_overlapping_annotation = sum([x.observed for x in data])
        nannotations_in_workspace = sum(
            [x.nannotations_in_workspace for x in data])
        nannotations_overlapping_segments = sum(
            [x.nannotations_overlapping_segments for x in data])

        basecoverage_intersection = sum(
            [x.basecoverage_intersection for x in data])
        basecoverage_segments = sum([x.basecoverage_segments for x in data])
        basecoverage_annotation = sum(
            [x.basecoverage_annotation for x in data])
        basecoverage_workspace = sum([x.basecoverage_workspace for x in data])

        fraction_coverage_annotation = basecoverage_annotation / \
            float(basecoverage_workspace)

        if counter.startswith("binom"):
            pvalue = scipy.stats.binom.sf(nsegments_overlapping_annotation - 1,
                                          nsegments_in_workspace,
                                          fraction_coverage_annotation)
            expected = fraction_coverage_annotation * nsegments_in_workspace
            observed = nsegments_overlapping_annotation
        elif counter.startswith("hyperg"):
            rv = scipy.stats.hypergeom(basecoverage_workspace,
                                       basecoverage_annotation,
                                       basecoverage_segments)

            pvalue = rv.sf(basecoverage_intersection)
            expected = rv.mean()
            observed = basecoverage_intersection

        if expected != 0:
            fold = float(observed) / expected
        else:
            fold = 1.0

        r = GREAT_RESULT._make((
            segment, annotation, "all",
            counter,
            observed,
            expected,
            nsegments_in_workspace,
            nannotations_in_workspace,
            nsegments_overlapping_annotation,
            nannotations_overlapping_segments,
            basecoverage_intersection,
            basecoverage_segments,
            basecoverage_annotation,
            basecoverage_workspace,
            fraction_coverage_annotation,
            fold,
            pvalue,
            1.0))

        results.append(r)

    IO.outputResults(results,
                     options,
                     GREAT_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
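
# Illustrative sketch (not part of the original script): the two counters above
# each reduce to a single scipy call. The numbers below are invented purely to
# show the call signatures; only scipy is assumed.
def _demo_great_counters():
    import scipy.stats

    # binom counter: P(X >= k) for k segments hitting the annotation out of
    # n segments in the workspace, with hit probability p = fraction of the
    # workspace covered by the annotation.
    n, k, p = 500, 37, 0.05
    pvalue_binom = scipy.stats.binom.sf(k - 1, n, p)
    expected_binom = p * n

    # hyperg counter: hypergeometric over nucleotides, i.e. drawing
    # segment_bases positions from a workspace of workspace_bases, of which
    # annotation_bases are annotated; intersection_bases is the observed overlap.
    workspace_bases, annotation_bases, segment_bases = 3000000, 150000, 60000
    intersection_bases = 4200
    rv = scipy.stats.hypergeom(workspace_bases, annotation_bases, segment_bases)
    pvalue_hyperg = rv.sf(intersection_bases)  # P(X > observed overlap)
    expected_hyperg = rv.mean()

    return (pvalue_binom, expected_binom), (pvalue_hyperg, expected_hyperg)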
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue",
                               "qvalue", "observed"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method",
                      type="choice", choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=("storey", "BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice", choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to descriptions. "
                      "If given, the output table will contain additional columns "
                      "[default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the "
                      "observed and expected overlap. Using a pseudo-count avoids "
                      "gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count

    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):

            # note that fold changes can be very large if there are 0 samples;
            # this is fine for getting the distributional params (mean, stddev)
            fold_changes1 = data1.observed / (data1.samples + pseudo_count)
            fold_changes2 = data2.observed / (data2.samples + pseudo_count)

            # add a separate fc pseudo-count to avoid 0 values
            fold_changes1 += 0.0001
            fold_changes2 += 0.0001

            # The test is whether the relative fold change rfc differs from 1.
            # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
            #                       = (obs1 / obs2) * (exp2 / exp1)
            # Thus, testing rfc == 1 is equivalent to testing
            # obs1 / obs2 == exp1 / exp2.
            #
            # Convert to log space for easier plotting.
            # Shift by the observed fold difference to get an idea of the
            # magnitude of the underlying fold change.
            delta_fold = data2.fold - data1.fold
            sampled_delta_fold = numpy.log(
                fold_changes1 / fold_changes2) + delta_fold
            observed_delta_fold = 0.0 + delta_fold

            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)

            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))

                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:

                    # if not annotation.startswith("Ram:"): continue

                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    # note that fold changes can be very large if there are
                    # 0 samples; this is fine for getting the distributional
                    # params (mean, stddev)
                    fold_changes1 = data1.observed / \
                        (data1.samples + pseudo_count)
                    fold_changes2 = data2.observed / \
                        (data2.samples + pseudo_count)

                    # add a separate fc pseudo-count to avoid 0 values
                    fold_changes1 += 0.0001
                    fold_changes2 += 0.0001

                    # The test is whether the relative fold change rfc differs
                    # from 1.
                    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
                    #                       = (obs1 / obs2) * (exp2 / exp1)
                    # Thus, testing rfc == 1 is equivalent to testing
                    # obs1 / obs2 == exp1 / exp2.
                    #
                    # Convert to log space for easier plotting.
                    # Shift by the observed fold difference to get an idea of
                    # the magnitude of the underlying fold change.
                    delta_fold = data2.fold - data1.fold
                    sampled_delta_fold = numpy.log(
                        fold_changes1 / fold_changes2) + delta_fold
                    observed_delta_fold = 0.0 + delta_fold

                    result = GatEngine.AnnotatorResult(track, annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)

                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
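
# Illustrative sketch (not part of the original script): how a single pairwise
# comparison above turns observed and sampled overlaps into a log relative fold
# change. The arrays, pseudo-counts and fold values below are invented for the
# example; only numpy is assumed, and data.samples is assumed to be an array of
# sampled overlap counts as used in the loop above.
def _demo_delta_fold():
    import numpy

    pseudo_count = 1.0

    # observed overlap and sampled overlaps for two annotations (made up)
    observed1, samples1 = 120.0, numpy.array([80.0, 95.0, 110.0, 90.0])
    observed2, samples2 = 60.0, numpy.array([70.0, 65.0, 75.0, 80.0])

    # per-sample fold changes, stabilised by a pseudo-count
    fold_changes1 = observed1 / (samples1 + pseudo_count) + 0.0001
    fold_changes2 = observed2 / (samples2 + pseudo_count) + 0.0001

    # reported fold changes per annotation (made-up values)
    fold1, fold2 = 1.3, 0.9
    delta_fold = fold2 - fold1

    # log relative fold change, shifted by the observed fold difference so
    # the sampled distribution is centred near the underlying change
    sampled_delta_fold = numpy.log(fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return observed_delta_fold, sampled_delta_fold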
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-l", "--sample-file", dest="sample_files",
                      type="string", action="append",
                      help="filename with sample files. Start processing from "
                      "samples [default=%default].")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method",
                      type="choice", choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("--results-file", dest="input_filename_results",
                      type="string",
                      help="start processing from results - no segments "
                      "required [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.add_option("--output-samples-pattern", dest="output_samples_pattern",
                      type="string",
                      help="output pattern for samples. Samples are stored in "
                      "bed format, one for each segment [default=%default]")

    parser.add_option("--plots", dest="plots", type="choice",
                      choices=("all", "bars-per-track", "bars", ),
                      help="plots to be created [default=%default].")

    parser.set_defaults(
        sample_files=[],
        num_samples=1000,
        output_stats=[],
        output_filename_counts=None,
        output_order="fold",
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        plots=[],
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    annotator_results = IO.readAnnotatorResults(options.input_filename_results)

    if "bars-per-track" in options.plots:
        plotBarplots(annotator_results, options)
    if "bars" in options.plots:
        plotBarplot(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
def main(argv):

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a", "--gene-file", "--annotations",
                      dest="annotation_files", type="string", action="append",
                      help="filename with annotations - here, location of genes "
                      "[default=%default].")

    parser.add_option("-s", "--segment-file", "--segments",
                      dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in "
                      "parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace",
                      dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a glob "
                      "in parentheses [default=%default].")

    parser.add_option("-g", "--number-of-genes", dest="number_of_genes",
                      type="int",
                      help="total number of genes [default=%default]")

    parser.add_option("-m", "--annotation-file", dest="annotation_file",
                      type="string",
                      help="filename mapping genes to annotations "
                      "[default=%default]")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=("track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. "
                      "[default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=("storey", "BH", "bonferroni", "holm", "hommel",
                               "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by "
                      "controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method",
                      type="choice", choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 "
                      "[default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions",
                      type="string",
                      help="filename mapping annotation terms to descriptions. "
                      "If given, the output table will contain additional columns "
                      "[default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks",
                      action="store_true",
                      help="ignore segment tracks - all segments belong to one "
                      "track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks",
                      action="store_true",
                      help="permit the same track to be in multiple files "
                      "[default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        annotation_file=None,
        num_samples=1000,
        nbuckets=100000,
        bucket_size=1,
        counter="nucleotide-overlap",
        output_stats=[],
        output_bed=[],
        output_filename_counts=None,
        output_order="fold",
        cache=None,
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        sampler="annotator",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        number_of_genes=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    # load segments
    options.segment_files = IO.expandGlobs(options.segment_files)
    options.annotation_files = IO.expandGlobs(options.annotation_files)
    options.workspace_files = IO.expandGlobs(options.workspace_files)

    # read one or more segment files
    segments = IO.readSegmentList("segments", options.segment_files, options)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or "
            "--ignore-segment-tracks" % len(segments))

    # load workspace
    workspaces = IO.readSegmentList("workspaces", options.workspace_files,
                                    options, options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    workspaces.collapse()

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")
    workspace = workspaces["collapsed"]

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ############################################
    # load table mapping a gene id to annotations
    gene2annotations = IOTools.readMultiMap(
        IOTools.openFile(options.annotation_file), has_header=True)
    annotations = set([y for x in gene2annotations.values() for y in x])
    E.info("loaded %i annotations for %i genes" %
           (len(annotations), len(gene2annotations)))

    ############################################
    # load bed file with gene coordinates
    assert len(options.annotation_files) == 1
    indexed_genes = collections.defaultdict(Intersecter)
    total_genes = 0
    # number of genes per contig
    contig2ngenes = collections.defaultdict(int)
    # number of genes carrying a particular annotation
    annotation2ngenes = collections.defaultdict(int)

    for line in IOTools.openFile(options.annotation_files[0]):
        if line.startswith("#"):
            continue
        contig, start, end, gene_id = line[:-1].split("\t")[:4]

        indexed_genes[contig].add_interval(
            Interval(int(start), int(end), gene_id))
        contig2ngenes[contig] += 1
        total_genes += 1
        try:
            for annotation in gene2annotations[gene_id]:
                annotation2ngenes[annotation] += 1
        except KeyError:
            pass

    E.info("indexed locations for %i contigs" % len(indexed_genes))

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    ############################################
    # compute results
    E.info("computing counts")

    results = []
    # iterate over segments
    for segment, segmentdict in segments.iteritems():

        # genes hit by segments, per annotation
        genes_hit_by_segments_with_annotations = collections.defaultdict(int)

        # genes hit by segments
        genes_hit_by_segments = 0

        for contig, ss in segmentdict.iteritems():
            for start, end in ss:
                overlapping_genes = list(
                    indexed_genes[contig].find(start, end))
                genes_hit_by_segments += len(overlapping_genes)
                for x in overlapping_genes:
                    gene_id = x.value
                    try:
                        for annotation in gene2annotations[gene_id]:
                            genes_hit_by_segments_with_annotations[
                                annotation] += 1
                    except KeyError:
                        pass

        # N = number of genes in genome
        N = total_genes
        # n = number of genes selected by segments
        n = genes_hit_by_segments

        for annotation in annotations:
            # K = number of genes carrying annotation
            K = annotation2ngenes[annotation]
            # k = number of genes selected by segments and with annotation
            k = genes_hit_by_segments_with_annotations[annotation]

            if n == 0 or N == 0 or K == 0:
                expected = 0
                fold = 1.0
                pvalue = 1.0
            else:
                expected = float(n * K) / N
                fold = k / expected
                pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)

            r = GENESET_RESULT._make((
                segment, annotation,
                N,
                K,
                n,
                k,
                expected,
                fold,
                pvalue,
                1.0))

            results.append(r)

    IO.outputResults(results,
                     options,
                     GENESET_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
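
# Illustrative sketch (not part of the original script): the per-annotation test
# above is a standard hypergeometric gene-set enrichment. The counts below are
# invented; only scipy is assumed.
def _demo_geneset_enrichment():
    import scipy.stats

    N = 20000   # total number of genes in the genome
    K = 300     # genes carrying the annotation
    n = 500     # genes hit by the segments
    k = 25      # genes hit by the segments that carry the annotation

    expected = float(n * K) / N   # expected hits under independence
    fold = k / expected
    # P(X >= k) when drawing n genes without replacement from N, of which K
    # carry the annotation
    pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)

    return expected, fold, pvalue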
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = SegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    ##################################################
    if options.output_tables_pattern is not None:
        if "%s" not in options.output_tables_pattern:
            raise ValueError(
                "output_tables_pattern should contain at least one '%s'")

    if options.output_samples_pattern is not None:
        if "%s" not in options.output_samples_pattern:
            raise ValueError(
                "output_samples_pattern should contain at least one '%s'")

    if options.output_counts_pattern is not None:
        if "%s" not in options.output_counts_pattern:
            raise ValueError(
                "output_counts_pattern should contain at least one '%s'")

    if options.random_seed is not None:
        # initialize python random number generator
        random.seed(options.random_seed)
        # initialize numpy random number generator
        numpy.random.seed(options.random_seed)

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = Engine.fromCounts(options.input_filename_counts)
    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)
    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        Engine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     Engine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()