def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is treated as a filename with counts and is
    loaded through ``gat.fromCounts``. For each file, p-values are
    recomputed if ``--pvalue-method`` is not ``empirical`` and q-values are
    recomputed with the chosen ``--qvalue-method``.

    * With exactly one input file, all pairs of results within that file
      are compared. The file must contain a single track, otherwise
      ``NotImplementedError`` is raised.
    * Otherwise, all pairs of files are compared. For each pair, every
      annotation that is present in both files under the same track is
      compared.

    The comparison results are written with ``IO.outputResults`` and
    plotted with ``IO.plotResults``. Nothing is returned.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--order", dest="output_order", type="choice",
        choices=("track", "annotation", "fold",
                 "pvalue", "qvalue", "observed"),
        help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option(
        "-p", "--pvalue-method", dest="pvalue_method", type="choice",
        choices=("empirical", "norm", ),
        help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q", "--qvalue-method", dest="qvalue_method", type="choice",
        choices=("storey", "BH", "bonferroni", "holm",
                 "hommel", "hochberg", "BY", "none"),
        help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option(
        "--qvalue-lambda", dest="qvalue_lambda", type="float",
        help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions", dest="input_filename_descriptions", type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--pseudo-count", dest="pseudo_count", type="float",
        help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
        " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option(
        "--output-plots-pattern", dest="output_plots_pattern", type="string",
        help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    E.info("received %i filenames with counts" % len(input_filenames_counts))

    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    # load the results of every input file and update their statistics
    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        # compute global fdr
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:

        E.info("performing pairwise comparison within a file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):
            observed_delta_fold, sampled_delta_fold = _computeDeltaFold(
                data1, data2, pseudo_count)

            # within a file, the two annotation names take the place of
            # the track and annotation fields of the result
            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)
            results.append(result)

    else:
        E.info("performing pairwise comparison between files")

        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b by track and annotation
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            # Compare every track that is present in both files. The track
            # name must not be hard-coded: looking up a missing name in a
            # defaultdict silently creates an empty entry, which would
            # result in no annotations being compared.
            shared_tracks = set(aa).intersection(bb)
            if not shared_tracks:
                E.warn("no shared tracks between %i and %i" %
                       (index1, index2))

            for track in sorted(shared_tracks):
                # get shared annotations, sorted for a reproducible order
                shared_annotations = sorted(
                    set(aa[track]).intersection(bb[track]))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:
                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    observed_delta_fold, sampled_delta_fold = \
                        _computeDeltaFold(data1, data2, pseudo_count)

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)
                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()


def _computeDeltaFold(data1, data2, pseudo_count):
    """Return ``(observed_delta_fold, sampled_delta_fold)`` for two results.

    *data1* and *data2* must provide the attributes ``observed``,
    ``samples`` and ``fold``. ``samples`` is presumably a numpy array with
    one value per sample - TODO confirm against the results returned by
    ``gat.fromCounts``. *pseudo_count* is added to the samples before
    dividing.

    ``observed_delta_fold`` is ``data2.fold - data1.fold`` as a float.
    ``sampled_delta_fold`` is the log ratio of the per-sample fold changes
    of *data1* and *data2*, shifted by ``observed_delta_fold``.
    """
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    # Test is if relative fold change rfc is different from 1
    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
    #                       = obs1 / obs2 * exp2 / exp1
    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
    #
    # Convert to log space for easier plotting
    # Move the observed fold ratio in order to get an idea of the magnitude
    # of the underlying fold change
    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return observed_delta_fold, sampled_delta_fold
def buildSegments(options):
    '''load segments, annotations, workspaces and isochores as
    configured in *options*.

    Glob patterns in the ``segment_files``, ``annotation_files``,
    ``workspace_files`` and ``sample_files`` attributes of *options* are
    expanded first; the attributes are overwritten with the result.

    The workspaces are collapsed and restricted to the ``collapsed``
    track. If ``options.isochore_files`` is set, isochores are loaded and
    intersected with the collapsed workspace.

    returns a tuple of segments, annotations, workspaces and isochores.
    isochores is None if no isochore files were given.

    raises ValueError if segment, annotation or workspace files are
    missing, if the segments sum to 0 or if ``len(segments)`` exceeds 1000.
    '''

    # expand glob patterns in all file lists
    for attribute in ("segment_files",
                      "annotation_files",
                      "workspace_files",
                      "sample_files"):
        setattr(options, attribute, expandGlobs(getattr(options, attribute)))

    # arguments sanity check
    required = (
        (options.segment_files,
         "please specify at least one segment file"),
        (options.annotation_files,
         "please specify at least one annotation file"),
        (options.workspace_files,
         "please specify at least one workspace file"),
    )
    for filenames, message in required:
        if not filenames:
            raise ValueError(message)

    # read one or more segment files
    segments = readSegmentList(
        "segments",
        options.segment_files,
        ignore_tracks=options.ignore_segment_tracks)
    segments.normalize()

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    number_of_segment_tracks = len(segments)
    if number_of_segment_tracks > 1000:
        raise ValueError("too many (%i) segment files - use track definitions "
                         "or --ignore-segment-tracks" %
                         number_of_segment_tracks)

    # annotation tracks are ignored if a label for the annotations is given
    has_annotations_label = options.annotations_label is not None
    annotations = readSegmentList(
        "annotations",
        options.annotation_files,
        enable_split_tracks=options.enable_split_tracks,
        ignore_tracks=has_annotations_label)

    if has_annotations_label:
        annotations.setName(options.annotations_label)

    if options.annotations_to_points:
        annotations.toPositions(options.annotations_to_points)

    if not options.overlapping_annotations:
        annotations.normalize()
    else:
        # only sort, do not merge
        annotations.sort()

    # NOTE(review): unlike the two calls above, *options* is passed here
    # as third positional argument - confirm that this matches the
    # signature of readSegmentList.
    workspaces = readSegmentList("workspaces",
                                 options.workspace_files,
                                 options,
                                 options.enable_split_tracks)
    workspaces.normalize()

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # without isochore files there is nothing more to build
    if not options.isochore_files:
        return segments, annotations, workspaces, None

    # read one or more isochore files
    isochores = Engine.IntervalCollection(name="isochores")
    E.info("%s: reading isochores from %i files" %
           ("isochores", len(options.isochore_files)))
    isochores.load(options.isochore_files)
    dumpStats(isochores, "stats_isochores_raw", options)

    # merge isochores and check if consistent (fully normalized)
    isochores.sort()

    # check that there are no overlapping segments within isochores
    isochores.check()

    # TODO: flag is_normalized not properly set
    isochores.normalize()

    # check that there are no overlapping segments between isochores

    # truncate isochores to workspace
    # crucial if isochores are larger than workspace.
    isochores.intersect(workspaces["collapsed"])

    return segments, annotations, workspaces, isochores
def main(argv=None):
    """script main.

    Parses command line options in sys.argv, unless *argv* is given.

    Every positional argument is treated as a filename with counts and is
    loaded through ``gat.fromCounts``. For each file, p-values are
    recomputed if ``--pvalue-method`` is not ``empirical`` and q-values are
    recomputed with the chosen ``--qvalue-method``.

    * With exactly one input file, all pairs of results within that file
      are compared. The file must contain a single track, otherwise
      ``NotImplementedError`` is raised.
    * Otherwise, all pairs of files are compared. For each pair, every
      annotation that is present in both files under the same track is
      compared.

    The comparison results are written with ``IO.outputResults`` and
    plotted with ``IO.plotResults``. Nothing is returned.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o", "--order", dest="output_order", type="choice",
        choices=("track", "annotation", "fold",
                 "pvalue", "qvalue", "observed"),
        help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option(
        "-p", "--pvalue-method", dest="pvalue_method", type="choice",
        choices=("empirical", "norm", ),
        help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q", "--qvalue-method", dest="qvalue_method", type="choice",
        choices=("storey", "BH", "bonferroni", "holm",
                 "hommel", "hochberg", "BY", "none"),
        help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option(
        "--qvalue-lambda", dest="qvalue_lambda", type="float",
        help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions", dest="input_filename_descriptions", type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--pseudo-count", dest="pseudo_count", type="float",
        help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
        " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option(
        "--output-plots-pattern", dest="output_plots_pattern", type="string",
        help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    E.info("received %i filenames with counts" % len(input_filenames_counts))

    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    # load the results of every input file and update their statistics
    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        # compute global fdr
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:

        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):
            observed_delta_fold, sampled_delta_fold = _computeDeltaFold(
                data1, data2, pseudo_count)

            # within a file, the two annotation names take the place of
            # the track and annotation fields of the result
            result = GatEngine.AnnotatorResult(data1.annotation,
                                               data2.annotation,
                                               "na",
                                               observed_delta_fold,
                                               sampled_delta_fold,
                                               reference=None,
                                               pseudo_count=0)
            results.append(result)

    else:
        E.info("performing pairwise comparison between multiple files")

        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b by track and annotation
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            # only tracks present in both files can be compared
            shared_tracks = set(aa).intersection(bb)
            if len(shared_tracks) == 0:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))

                # get shared annotations, sorted for a reproducible order
                shared_annotations = sorted(
                    set(aa[track]).intersection(bb[track]))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:
                    data1 = aa[track][annotation]
                    data2 = bb[track][annotation]

                    observed_delta_fold, sampled_delta_fold = \
                        _computeDeltaFold(data1, data2, pseudo_count)

                    result = GatEngine.AnnotatorResult(track,
                                                       annotation,
                                                       "na",
                                                       observed_delta_fold,
                                                       sampled_delta_fold,
                                                       reference=None,
                                                       pseudo_count=0)
                    results.append(result)

    if len(results) == 0:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()


def _computeDeltaFold(data1, data2, pseudo_count):
    """Return ``(observed_delta_fold, sampled_delta_fold)`` for two results.

    *data1* and *data2* must provide the attributes ``observed``,
    ``samples`` and ``fold``. ``samples`` is presumably a numpy array with
    one value per sample - TODO confirm against the results returned by
    ``gat.fromCounts``. *pseudo_count* is added to the samples before
    dividing.

    ``observed_delta_fold`` is ``data2.fold - data1.fold`` as a float.
    ``sampled_delta_fold`` is the log ratio of the per-sample fold changes
    of *data1* and *data2*, shifted by ``observed_delta_fold``.
    """
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    # Test is if relative fold change rfc is different from 1
    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
    #                       = obs1 / obs2 * exp2 / exp1
    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
    #
    # Convert to log space for easier plotting
    # Move the observed fold ratio in order to get an idea of the magnitude
    # of the underlying fold change
    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return observed_delta_fold, sampled_delta_fold
def buildSegments(options):
    '''load segments, annotations, workspaces and isochores as
    configured in *options*.

    Glob patterns in the ``segment_files``, ``annotation_files``,
    ``workspace_files`` and ``sample_files`` attributes of *options* are
    expanded first; the attributes are overwritten with the result.

    If ``options.ignore_segment_tracks`` is set, the segments are merged
    via ``segments.merge(delete=True)``. The workspaces are collapsed and
    restricted to the ``collapsed`` track. If ``options.isochore_files``
    is set, isochores are loaded and intersected with the collapsed
    workspace.

    returns a tuple of segments, annotations, workspaces and isochores.
    isochores is None if no isochore files were given.

    raises ValueError if segment, annotation or workspace files are
    missing, if the segments sum to 0 or if ``len(segments)`` exceeds 1000.
    '''

    # expand glob patterns in all file lists
    for attribute in ("segment_files",
                      "annotation_files",
                      "workspace_files",
                      "sample_files"):
        setattr(options, attribute, expandGlobs(getattr(options, attribute)))

    # arguments sanity check
    required = (
        (options.segment_files,
         "please specify at least one segment file"),
        (options.annotation_files,
         "please specify at least one annotation file"),
        (options.workspace_files,
         "please specify at least one workspace file"),
    )
    for filenames, message in required:
        if not filenames:
            raise ValueError(message)

    # read one or more segment files
    segments = readSegmentList("segments", options.segment_files, options)

    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    number_of_segment_tracks = len(segments)
    if number_of_segment_tracks > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or --ignore-segment-tracks" %
            number_of_segment_tracks)

    split_tracks = options.enable_split_tracks

    annotations = readSegmentList("annotations",
                                  options.annotation_files,
                                  options,
                                  split_tracks)

    workspaces = readSegmentList("workspaces",
                                 options.workspace_files,
                                 options,
                                 split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # without isochore files there is nothing more to build
    if not options.isochore_files:
        return segments, annotations, workspaces, None

    # read one or more isochore files
    isochores = GatEngine.IntervalCollection(name="isochores")
    E.info("%s: reading isochores from %i files" %
           ("isochores", len(options.isochore_files)))
    isochores.load(options.isochore_files)
    dumpStats(isochores, "stats_isochores_raw", options)

    # merge isochores and check if consistent (fully normalized)
    isochores.sort()

    # check that there are no overlapping segments within isochores
    isochores.check()

    # TODO: flag is_normalized not properly set
    isochores.normalize()

    # check that there are no overlapping segments between isochores

    # truncate isochores to workspace
    # crucial if isochores are larger than workspace.
    isochores.intersect(workspaces["collapsed"])

    return segments, annotations, workspaces, isochores