Ejemplo n.º 1
0
def readSegmentList(label,
                    filenames,
                    enable_split_tracks=False,
                    ignore_tracks=False):
    """Load one or more segment files into a named interval collection.

    Arguments
    ---------
    label : string
        Name given to the resulting IntervalCollection; also used as
        prefix of the log messages.
    filenames : list
        Filenames to load (:term:`bed` format).
    enable_split_tracks : bool
        Passed to the loader as ``allow_multiple``; if True, a track
        may be split across multiple files.
    ignore_tracks : bool
        Passed to the loader unchanged; if True, track information
        is ignored.

    Returns
    -------
    segments : IntervalCollection
        The populated collection.
    """
    collection = Engine.IntervalCollection(name=label)

    start_message = "%s: reading tracks from %i files" % (
        label, len(filenames))
    E.info(start_message)

    load_options = dict(allow_multiple=enable_split_tracks,
                        ignore_tracks=ignore_tracks)
    collection.load(filenames, **load_options)

    # the number of tracks is only known once loading has finished
    done_message = "%s: read %i tracks from %i files" % (
        label, len(collection), len(filenames))
    E.info(done_message)

    return collection
Ejemplo n.º 2
0
def readDescriptions(options):
    '''read descriptions from tab separated file.

    The file named by ``options.input_filename_descriptions`` is read
    line by line. Lines starting with ``#`` are skipped. The first
    remaining line is the header, all following lines map the value
    in the first column to the list of values in the other columns.

    Arguments
    ---------
    options : object
        Only the attribute ``input_filename_descriptions`` is read.
        If it is false (``None``, empty string), no file is opened.

    Returns
    -------
    description_header : list
        Columns 2..n of the header line (empty without input).
    descriptions : dict
        First column -> list of remaining columns for each data line.
    description_width : int
        Number of description columns (0 without input).

    Raises
    ------
    AssertionError
        If the lines do not all have the same number of columns.
    '''

    description_header, descriptions, description_width = [], {}, 0
    filename = options.input_filename_descriptions

    if filename:
        E.info("reading descriptions from %s" % filename)

        with IOTools.openFile(filename) as inf:
            first = True
            for line in inf:
                if line.startswith("#"):
                    continue

                # Remove the line terminator only. A blind ``line[:-1]``
                # would chop a real character off the last line of a
                # file that does not end in a newline.
                if line.endswith("\n"):
                    line = line[:-1]
                data = line.split("\t")

                if description_width:
                    assert len(data) - 1 == description_width, \
                        "inconsistent number of descriptions in %s" %\
                        filename
                else:
                    description_width = len(data) - 1

                if first:
                    description_header = data[1:]
                    first = False
                else:
                    descriptions[data[0]] = data[1:]

        # A width of 0 is re-set by every line (see above), hence a
        # mismatch with a single-column header is only caught here.
        assert len(description_header) == description_width, \
            "number of descriptions (%i) inconsistent with header (%s) in %s" % \
            (description_width, len(description_header), filename)

    return description_header, descriptions, description_width
Ejemplo n.º 3
0
def readSegmentList(label,
                    filenames,
                    enable_split_tracks=False,
                    ignore_tracks=False):
    """Build an interval collection from a list of segment files.

    Arguments
    ---------
    label : string
        Name of the IntervalCollection that is created. Every log
        message is prefixed with it.
    filenames : list
        Files in :term:`bed` format to be loaded.
    enable_split_tracks : bool
        If True, tracks may be spread over several files (handed to
        the loader as ``allow_multiple``).
    ignore_tracks : bool
        If True, track information is ignored by the loader.

    Returns
    -------
    segments : IntervalCollection
        Collection holding the loaded segments.
    """

    def report(template, *values):
        # all messages of this function start with the label
        E.info(template % ((label,) + values))

    segments = Engine.IntervalCollection(name=label)

    report("%s: reading tracks from %i files", len(filenames))
    segments.load(
        filenames,
        allow_multiple=enable_split_tracks,
        ignore_tracks=ignore_tracks,
    )
    report("%s: read %i tracks from %i files",
           len(segments), len(filenames))

    return segments
Ejemplo n.º 4
0
def readDescriptions(options):
    '''read descriptions from tab separated file.

    Nothing is read unless ``options.input_filename_descriptions`` is
    set. Otherwise comment lines (starting with ``#``) are ignored,
    the first other line supplies the header and each further line
    adds one entry keyed by its first column.

    Arguments
    ---------
    options : object
        Must provide the attribute ``input_filename_descriptions``.

    Returns
    -------
    description_header : list
        Header fields without the first column.
    descriptions : dict
        Maps the first column of a line to the list of its other fields.
    description_width : int
        Number of fields per line, not counting the first column.

    Raises
    ------
    AssertionError
        If the number of fields differs between lines.
    '''

    description_header, descriptions, description_width = [], {}, 0
    if not options.input_filename_descriptions:
        return description_header, descriptions, description_width

    E.info("reading descriptions from %s" %
           options.input_filename_descriptions)

    with IOTools.openFile(options.input_filename_descriptions) as inf:
        first = True
        for line in inf:
            if line.startswith("#"):
                continue

            # Only drop the newline if there is one: the last line of
            # a file may be unterminated and ``line[:-1]`` would then
            # silently truncate its last field.
            if line.endswith("\n"):
                line = line[:-1]
            data = line.split("\t")

            if description_width:
                assert len(data) - 1 == description_width, \
                    "inconsistent number of descriptions in %s" % \
                    options.input_filename_descriptions
            else:
                description_width = len(data) - 1

            if first:
                description_header = data[1:]
                first = False
            else:
                descriptions[data[0]] = data[1:]

    # lines following a single-column header are not checked inside
    # the loop (the width is still 0 there), so compare once more
    assert len(description_header) == description_width, \
        "number of descriptions (%i) inconsistent with header (%s) in %s" % \
        (description_width, len(description_header),
         options.input_filename_descriptions)

    return description_header, descriptions, description_width
Ejemplo n.º 5
0
def fromCounts(filename):
    '''build annotator results from a tab-separated table
    with counts.

    The first line of the file must be exactly
    ``track<TAB>annotation<TAB>observed<TAB>counts``. Every following
    line holds these four fields, with ``counts`` being a
    comma-separated list of numbers.

    Arguments
    ---------
    filename : string
        Name of the file to read.

    Returns
    -------
    annotator_results : list
        One ``Engine.AnnotatorResult`` per data line. The counter is
        set to ``"na"``, ``observed`` is a float and ``samples`` a
        float64 numpy array.

    Raises
    ------
    ValueError
        If the header is not the expected one or a data line does
        not have exactly four fields.
    '''

    annotator_results = []

    with IOTools.openFile(filename, "r") as infile:

        E.info("loading data")

        header = infile.readline()
        if header != "track\tannotation\tobserved\tcounts\n":
            # report the file name; formatting the file object itself
            # only yields its repr
            raise ValueError(
                "%s not a counts file: got %s" % (filename, header))

        for line in infile:
            # strip the newline only if present so that an
            # unterminated last line does not lose its final digit
            if line.endswith("\n"):
                line = line[:-1]
            track, annotation, observed, counts = line.split("\t")

            # numpy.float was a deprecated alias of the builtin float
            # and has been removed from numpy; float64 is the same
            # dtype
            samples = numpy.array(
                [float(x) for x in counts.split(",")],
                dtype=numpy.float64)

            annotator_results.append(Engine.AnnotatorResult(
                track=track,
                annotation=annotation,
                counter="na",
                observed=float(observed),
                samples=samples))

    return annotator_results
Ejemplo n.º 6
0
def readSegmentList(label, filenames, options, enable_split_tracks=False):
    """Read one or more segment files and normalize the result.

    Statistics are dumped twice via ``dumpStats``: once for the
    collection as loaded (section ``stats_<label>_raw``) and once
    after ``normalize()`` has been called (``stats_<label>_normed``).

    Arguments
    ---------
    label : string
        Name of the collection, also used in log messages and in the
        names of the statistics sections.
    filenames : list
        Files to load.
    options : object
        Handed through to ``dumpStats``.
    enable_split_tracks : bool
        Passed to the loader as ``split_tracks``.

    Returns
    -------
    segments : IntervalCollection
        The normalized collection.
    """
    segments = GatEngine.IntervalCollection(name=label)

    message = "%s: reading tracks from %i files" % (label, len(filenames))
    E.info(message)
    segments.load(filenames, split_tracks=enable_split_tracks)
    message = "%s: read %i tracks from %i files" % (
        label, len(segments), len(filenames))
    E.info(message)

    # statistics before ...
    section = "stats_%s_raw" % label
    dumpStats(segments, section, options)

    segments.normalize()

    # ... and after normalization
    section = "stats_%s_normed" % label
    dumpStats(segments, section, options)

    return segments
Ejemplo n.º 7
0
    def computeSamples(self, work, report_interval=100):
        '''compute samples according to work.

        If ``self.num_threads`` is 0, all items are processed one
        after the other in this process and the open output files
        are handed to ``computeSample`` directly. Otherwise a
        ``multiprocessing.Pool`` with ``self.num_threads`` worker
        processes is used. The workers receive the *names* of the
        output files and a shared lock instead of the file objects.

        Arguments
        ---------
        work : list
            Work items; each one is passed to ``computeSample`` as
            the first element of a 4-tuple.
        report_interval : int
            Progress is logged after every ``report_interval`` items.

        Returns
        -------
        results : list
            The return values of ``computeSample``. With a pool they
            are in order of completion, not in order of ``work``.
        '''
        n = len(work)

        E.debug('sampling will work on %i items' % n)

        results = []

        if self.num_threads == 0:
            for i, w in enumerate(work):
                r = computeSample(
                    (w, self.samples_outfile, self.outfile_sample_metrics,
                     None))
                if i % report_interval == 0:
                    E.info("%i/%i done (%5.2f)" % (i, n, 100.0 * i / n))
                results.append(r)
        else:
            E.info("generating processpool with %i threads for %i items" %
                   (self.num_threads, len(work)))

            manager = multiprocessing.Manager()

            lock = manager.Lock()

            pool = multiprocessing.Pool(self.num_threads)

            try:
                # use file names - not files when multiprocessing
                samples_outfile, metrics_outfile = None, None
                if self.samples_outfile:
                    samples_outfile = self.samples_outfile.name
                    # flush before handing the file name to the workers
                    self.samples_outfile.flush()
                if self.outfile_sample_metrics:
                    metrics_outfile = self.outfile_sample_metrics.name
                    self.outfile_sample_metrics.flush()

                ww = [(w, samples_outfile, metrics_outfile, lock)
                      for w in work]

                for i, r in enumerate(
                        pool.imap_unordered(computeSample, ww)):
                    if i % report_interval == 0:
                        E.info("%i/%i done (%5.2f)" %
                               (i, n, 100.0 * i / n))
                    results.append(r)
            except BaseException:
                # a failing work item (or an interrupt) must not leave
                # the worker processes behind: stop them right away
                pool.terminate()
                raise
            else:
                pool.close()
            finally:
                # valid after both close() and terminate()
                pool.join()

        return results
Ejemplo n.º 8
0
def _buildDeltaFoldResult(track, annotation, data1, data2, pseudo_count):
    """Compare the fold changes of two annotator results.

    Arguments
    ---------
    track, annotation
        Passed on as the first two arguments of the
        ``GatEngine.AnnotatorResult`` that is returned.
    data1, data2
        Results to compare. The attributes ``observed``, ``samples``
        and ``fold`` are read.
    pseudo_count : float
        Added to the sampled counts before computing fold changes.

    Returns
    -------
    GatEngine.AnnotatorResult
        Result with counter ``"na"`` whose observed value is the
        difference ``data2.fold - data1.fold`` and whose samples are
        the log ratios of the sampled fold changes shifted by that
        difference.
    """
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    # Test is if relative fold change rfc is different from 1
    # note: rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
    #                       = (obs1 / obs2) * (exp2 / exp1)
    # Thus, it is equivalent to test rfc = obs1/obs2 versus exp2 / exp1
    #
    # Convert to log space for easier plotting
    # Move the observed fold ratio in order to get an idea of the
    # magnitude of the underlying fold change
    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return GatEngine.AnnotatorResult(track, annotation,
                                     "na",
                                     observed_delta_fold,
                                     sampled_delta_fold,
                                     reference=None,
                                     pseudo_count=0)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    The positional arguments are files with counts. Each is loaded,
    its p-values are updated if a method other than ``empirical``
    was requested, and its q-values are computed. With a single
    file, all pairs of annotations within the file are compared;
    otherwise the annotations shared between each pair of files are
    compared. The comparisons are written as a table and plotted.

    Raises
    ------
    NotImplementedError
        If a file contains more than one track.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue", "observed"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-p", "--pvalue-method", dest="pvalue_method", type="choice",
                      choices=("empirical", "norm", ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--pseudo-count", dest="pseudo_count", type="float",
                      help="pseudo count. The pseudo count is added to both the observed and expected overlap. "
                      " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default].")

    parser.add_option("--output-plots-pattern", dest="output_plots_pattern", type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a file")

        # collect all annotations
        annotations = list(all_annotator_results[0])
        segments = set(x.track for x in annotations)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        for data1, data2 in itertools.combinations(annotations, 2):
            # within a file the two annotation names take the place
            # of track and annotation in the result
            results.append(_buildDeltaFoldResult(data1.annotation,
                                                 data2.annotation,
                                                 data1, data2,
                                                 pseudo_count))

    else:
        E.info("performing pairwise comparison between files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            if len(aa) != 1 or len(bb) != 1:
                raise NotImplementedError("multiple segments of interest")

            # NOTE(review): the track name is hard-coded. If the only
            # track in a file is not called "merged", the lookups
            # below yield empty dictionaries and no annotation is
            # shared - confirm that this is intended.
            track = "merged"
            # get shared annotations
            annotations1 = aa[track].keys()
            annotations2 = bb[track].keys()
            shared_annotations = list(
                set(annotations1).intersection(set(annotations2)))
            E.info("%i shared annotations" % len(shared_annotations))

            for annotation in shared_annotations:
                results.append(_buildDeltaFoldResult(track,
                                                     annotation,
                                                     aa[track][annotation],
                                                     bb[track][annotation],
                                                     pseudo_count))

    if not results:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
Ejemplo n.º 9
0
def _computeCounterStatistics(counter,
                              nsegments_overlapping_annotation,
                              nsegments_in_workspace,
                              fraction_coverage_annotation,
                              basecoverage_workspace,
                              basecoverage_annotation,
                              basecoverage_segments,
                              basecoverage_intersection):
    """Return (observed, expected, fold, pvalue) for one counter.

    Counters whose name starts with ``binom`` use a binomial test on
    the number of segments, those starting with ``hyperg`` a
    hypergeometric test on the number of nucleotides. ``fold`` is
    ``observed / expected``, or 1.0 if ``expected`` is 0.

    Raises
    ------
    ValueError
        If the counter is neither a binomial nor a hypergeometric one.
    """
    if counter.startswith("binom"):
        # GREAT binomial probability over "regions"
        # n = number of trials = nsegments_in_workspace
        # ppi = fraction of workspace covered by annotation
        #     = fraction_coverage_annotation
        # kpi = number of hits = nsegments_overlapping_annotation
        # sf = survival function = 1 - cdf
        # sf(kpi - 1) is the probability of observing >= kpi hits in
        # n trials where the probability of success is ppi.
        pvalue = scipy.stats.binom.sf(
            nsegments_overlapping_annotation - 1,
            nsegments_in_workspace,
            fraction_coverage_annotation)
        expected = fraction_coverage_annotation * nsegments_in_workspace
        observed = nsegments_overlapping_annotation

    elif counter.startswith("hyperg"):
        # hypergeometric probability over nucleotides
        # Sampling without replacement
        # x = observed number of nucleotides in overlap of segments,
        #     annotations and workspace
        # M = number of nucleotides in workspace
        # n = number of nucleotides in annotations (and workspace)
        # N = number of nucleotides in segments (and workspace)
        # P-value of obtaining >x number of nucleotides overlapping.
        rv = scipy.stats.hypergeom(basecoverage_workspace,
                                   basecoverage_annotation,
                                   basecoverage_segments)
        pvalue = rv.sf(basecoverage_intersection)
        expected = rv.mean()
        observed = basecoverage_intersection

    else:
        raise ValueError("unknown counter '%s'" % counter)

    if expected != 0:
        fold = float(observed) / expected
    else:
        fold = 1.0

    return observed, expected, fold, pvalue


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Segments, annotations, workspaces and isochores are built from
    the command line options. For every isochore, segment track and
    annotation the overlap is counted and tested with each requested
    counter (``binom`` unless given otherwise). The counts are then
    summed over all isochores, tested again and written as a table.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-a",
                      "--annotation-file",
                      "--annotations",
                      dest="annotation_files",
                      type="string",
                      action="append",
                      help="filename with annotations [default=%default].")

    parser.add_option(
        "-s",
        "--segment-file",
        "--segments",
        dest="segment_files",
        type="string",
        action="append",
        help=
        "filename with segments. Also accepts a glob in parentheses [default=%default]."
    )

    parser.add_option(
        "-w",
        "--workspace-file",
        "--workspace",
        dest="workspace_files",
        type="string",
        action="append",
        help=
        "filename with workspace segments. Also accepts a glob in parentheses [default=%default]."
    )

    parser.add_option(
        "-i",
        "--isochore-file",
        "--isochores",
        dest="isochore_files",
        type="string",
        action="append",
        help=
        "filename with isochore segments. Also accepts a glob in parentheses [default=%default]."
    )

    parser.add_option(
        "-o",
        "--order",
        dest="output_order",
        type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue"),
        help="order results in output by fold, track, etc. [default=%default]."
    )

    parser.add_option(
        "-q",
        "--qvalue-method",
        dest="qvalue_method",
        type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help=
        "method to perform multiple testing correction by controlling the fdr [default=%default]."
    )

    parser.add_option("--qvalue-lambda",
                      dest="qvalue_lambda",
                      type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method",
        dest="qvalue_pi0_method",
        type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")
    parser.add_option(
        "--descriptions",
        dest="input_filename_descriptions",
        type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--ignore-segment-tracks",
        dest="ignore_segment_tracks",
        action="store_true",
        help=
        "ignore segment tracks - all segments belong to one track [default=%default]"
    )

    parser.add_option(
        "--enable-split-tracks",
        dest="enable_split_tracks",
        action="store_true",
        help="permit the same track to be in multiple files [default=%default]"
    )

    parser.add_option("--output-bed",
                      dest="output_bed",
                      type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats",
                      dest="output_stats",
                      type="choice",
                      action="append",
                      choices=("all", "annotations", "segments", "workspaces",
                               "isochores", "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.add_option(
        "--restrict-workspace",
        dest="restrict_workspace",
        action="store_true",
        help="restrict workspace to those segments that contain both track"
        " and annotations [default=%default]")

    parser.add_option("--counter",
                      dest="counters",
                      type="choice",
                      action="append",
                      choices=("binom", "hyperg"),
                      help="counter to use [default=%default].")

    parser.add_option(
        "--output-tables-pattern",
        dest="output_tables_pattern",
        type="string",
        help=
        "output pattern for result tables. Used if there are multiple counters used [default=%default]."
    )

    parser.set_defaults(annotation_files=[],
                        segment_files=[],
                        workspace_files=[],
                        sample_files=[],
                        counters=[],
                        output_stats=[],
                        output_bed=[],
                        output_tables_pattern="%s.tsv.gz",
                        output_order="fold",
                        input_filename_counts=None,
                        input_filename_results=None,
                        pvalue_method="empirical",
                        output_plots_pattern=None,
                        output_samples_pattern=None,
                        qvalue_method="storey",
                        qvalue_lambda=None,
                        qvalue_pi0_method="smoother",
                        ignore_segment_tracks=False,
                        input_filename_descriptions=None,
                        conditional="unconditional",
                        conditional_extension=None,
                        conditional_expansion=None,
                        restrict_workspace=False,
                        enable_split_tracks=False,
                        shift_expansion=2.0,
                        shift_extension=0,
                        overlap_mode="midpoint",
                        truncate_workspace_to_annotations=False,
                        truncate_segments_to_workspace=False)

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    if len(options.counters) == 0:
        options.counters.append("binom")

    ############################################
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # filter segments by workspace
    workspace = IO.applyIsochores(segments, annotations, workspaces, options,
                                  isochores)

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    ############################################
    # compute per contig

    # compute bases covered by workspace
    # from here on, ``isochores`` is the list of keys of the workspace
    workspace2basecoverage, isochores = {}, []
    for contig, ww in workspace.iteritems():
        workspace2basecoverage[contig] = ww.sum()
        isochores.append(contig)

    # compute percentage of bases covered by annotations in workspace
    # per isochore
    annotation2basecoverage = collections.defaultdict(dict)
    for annotation, aa in annotations.iteritems():
        for isochore, a in aa.iteritems():
            # need to truncate to workspace?
            annotation2basecoverage[annotation][isochore] = a.sum()

    results_per_contig = collections.defaultdict(list)

    E.info("computing counts per isochore")

    # results per isochore

    def emptyResult(segment, annotation, isochore, counter,
                    nsegments_in_workspace, basecoverage_annotation,
                    basecoverage_workspace):
        return GREAT_RESULT._make((
            segment,
            annotation,
            isochore,
            counter,
            0,  # observed
            0,  # expected
            nsegments_in_workspace,
            0,  # nannotations_in_workspace
            0,  # nsegments_overlapping_annotation
            0,  # nannotations_overlapping_segments
            0,  # basecoverage_intersection
            0,  # basecoverage_segments
            basecoverage_annotation,
            basecoverage_workspace,
            0.0,
            1.0,
            1.0,
            1.0))

    def addEmptyResults(segment, annotation, isochore,
                        nsegments_in_workspace, basecoverage_annotation,
                        basecoverage_workspace):
        # one empty row per requested counter
        for counter in options.counters:
            results_per_contig[(counter, segment, annotation)].append(
                emptyResult(segment, annotation, isochore, counter,
                            nsegments_in_workspace,
                            basecoverage_annotation,
                            basecoverage_workspace))

    for isochore in isochores:
        basecoverage_workspace = workspace2basecoverage[isochore]

        # iterate over all segment tracks
        for segment, segmentdict in segments.iteritems():
            try:
                ss = segmentdict[isochore]
            except KeyError:
                # The track has no segments in this isochore. The
                # counts must be reset explicitly, otherwise they are
                # undefined (first miss) or carried over from the
                # previous track/isochore.
                ss = None
                nsegments_in_workspace = 0
                basecoverage_segments = 0
            else:
                # select segments overlapping workspace
                segments_in_workspace = GatSegmentList.SegmentList(clone=ss)
                segments_in_workspace.intersect(workspace[isochore])
                # number of segments in workspace
                nsegments_in_workspace = len(segments_in_workspace)
                basecoverage_segments = segments_in_workspace.sum()

            for annotation, annotationdict in annotations.iteritems():

                # if annotation != "GO:0030957": continue

                try:
                    aa = annotationdict[isochore]
                except KeyError:
                    aa = None

                # p_A: proportion of bases covered by annotation
                try:
                    basecoverage_annotation = annotation2basecoverage[
                        annotation][isochore]
                except KeyError:
                    basecoverage_annotation = 0

                # identity test: ``== None`` would call the comparison
                # operator of the segment list
                if ss is None or aa is None:
                    addEmptyResults(segment, annotation, isochore,
                                    nsegments_in_workspace,
                                    basecoverage_annotation,
                                    basecoverage_workspace)
                    continue

                # select segments overlapping annotation
                segments_overlapping_annotation = GatSegmentList.SegmentList(
                    clone=ss)
                segments_overlapping_annotation.intersect(
                    annotations[annotation][isochore])
                # number of segments in annotation
                nsegments_overlapping_annotation = ss.intersectionWithSegments(
                    annotations[annotation][isochore],
                    mode=options.overlap_mode)

                # number of nucleotides at the intersection of segments,
                # annotation and workspace
                basecoverage_intersection = \
                    segments_overlapping_annotation.sum()

                annotations_overlapping_segments = GatSegmentList.SegmentList(
                    clone=aa)
                annotations_overlapping_segments.intersect(ss)
                nannotations_overlapping_segments = len(
                    annotations_overlapping_segments)

                nannotations_in_workspace = len(aa)
                if nannotations_in_workspace == 0:
                    addEmptyResults(segment, annotation, isochore,
                                    nsegments_in_workspace,
                                    basecoverage_annotation,
                                    basecoverage_workspace)
                    continue

                fraction_coverage_annotation = basecoverage_annotation / \
                    float(basecoverage_workspace)

                for counter in options.counters:
                    observed, expected, fold, pvalue = \
                        _computeCounterStatistics(
                            counter,
                            nsegments_overlapping_annotation,
                            nsegments_in_workspace,
                            fraction_coverage_annotation,
                            basecoverage_workspace,
                            basecoverage_annotation,
                            basecoverage_segments,
                            basecoverage_intersection)

                    r = GREAT_RESULT._make(
                        (segment, annotation, isochore, counter, observed,
                         expected, nsegments_in_workspace,
                         nannotations_in_workspace,
                         nsegments_overlapping_annotation,
                         nannotations_overlapping_segments,
                         basecoverage_intersection, basecoverage_segments,
                         basecoverage_annotation, basecoverage_workspace,
                         fraction_coverage_annotation, fold, pvalue, 1.0))
                    # print "\t".join( map(str, r))
                    results_per_contig[(counter, segment,
                                        annotation)].append(r)

    E.info("merging counts per isochore")

    # compute sums
    results = []

    for (counter, segment, annotation), data in results_per_contig.items():

        nsegments_in_workspace = sum([x.nsegments_in_workspace for x in data])
        # NOTE(review): for the "hyperg" counter ``observed`` holds
        # nucleotides (basecoverage_intersection), not a number of
        # segments - confirm that summing ``observed`` is intended.
        nsegments_overlapping_annotation = sum([x.observed for x in data])
        nannotations_in_workspace = sum(
            [x.nannotations_in_workspace for x in data])
        nannotations_overlapping_segments = sum(
            [x.nannotations_overlapping_segments for x in data])

        basecoverage_intersection = sum(
            [x.basecoverage_intersection for x in data])
        basecoverage_segments = sum([x.basecoverage_segments for x in data])
        basecoverage_annotation = sum(
            [x.basecoverage_annotation for x in data])
        basecoverage_workspace = sum([x.basecoverage_workspace for x in data])

        fraction_coverage_annotation = basecoverage_annotation / \
            float(basecoverage_workspace)

        observed, expected, fold, pvalue = _computeCounterStatistics(
            counter,
            nsegments_overlapping_annotation,
            nsegments_in_workspace,
            fraction_coverage_annotation,
            basecoverage_workspace,
            basecoverage_annotation,
            basecoverage_segments,
            basecoverage_intersection)

        r = GREAT_RESULT._make(
            (segment, annotation, "all", counter, observed, expected,
             nsegments_in_workspace, nannotations_in_workspace,
             nsegments_overlapping_annotation,
             nannotations_overlapping_segments, basecoverage_intersection,
             basecoverage_segments, basecoverage_annotation,
             basecoverage_workspace, fraction_coverage_annotation, fold,
             pvalue, 1.0))

        results.append(r)

    IO.outputResults(results, options, GREAT_RESULT._fields,
                     description_header, description_width, descriptions)

    E.Stop()
Ejemplo n.º 10
0
def _buildPairwiseResult(track, annotation, data1, data2, pseudo_count):
    """Compare the fold changes of two annotator results.

    The sampled fold changes of both results are computed as
    ``observed / (samples + pseudo_count)``. The relative fold change is

        rfc = fc1 / fc2 = (obs1 / exp1) / (obs2 / exp2)
                        = obs1 / obs2 * exp2 / exp1

    The ratio is converted to log space for easier plotting and shifted
    by the observed difference in fold (``data2.fold - data1.fold``) in
    order to get an idea of the magnitude of the underlying fold change.

    Arguments
    ---------
    track : string
        Value stored as track of the new result.
    annotation : string
        Value stored as annotation of the new result.
    data1, data2 : annotator results
        Results to compare. The attributes `observed`, `samples` and
        `fold` are read.
    pseudo_count : float
        Pseudo count added to the samples before computing fold changes.

    Returns
    -------
    result : GatEngine.AnnotatorResult
        The result of the comparison, with counter set to "na".
    """
    # note that fold changes can be very large if there are 0 samples
    # this is fine for getting the distributional params (mean, stddev)
    fold_changes1 = data1.observed / (data1.samples + pseudo_count)
    fold_changes2 = data2.observed / (data2.samples + pseudo_count)

    # add a separate fc pseudo-count to avoid 0 values
    fold_changes1 += 0.0001
    fold_changes2 += 0.0001

    delta_fold = data2.fold - data1.fold
    sampled_delta_fold = numpy.log(
        fold_changes1 / fold_changes2) + delta_fold
    observed_delta_fold = 0.0 + delta_fold

    return GatEngine.AnnotatorResult(track,
                                     annotation,
                                     "na",
                                     observed_delta_fold,
                                     sampled_delta_fold,
                                     reference=None,
                                     pseudo_count=0)


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Each positional argument is a file with counts. If a single file is
    given, all pairs of results within that file are compared (the file
    must contain exactly one track). Otherwise, all pairs of files are
    compared on the tracks and annotations they share.

    Raises
    ------
    NotImplementedError
        If a single file is given and it contains more than one track.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(
        version=
        "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--order",
        dest="output_order",
        type="choice",
        choices=("track", "annotation", "fold", "pvalue", "qvalue",
                 "observed"),
        help="order results in output by fold, track, etc. [default=%default]."
    )

    parser.add_option("-p",
                      "--pvalue-method",
                      dest="pvalue_method",
                      type="choice",
                      choices=(
                          "empirical",
                          "norm",
                      ),
                      help="type of pvalue reported [default=%default].")

    parser.add_option(
        "-q",
        "--qvalue-method",
        dest="qvalue_method",
        type="choice",
        choices=("storey", "BH", "bonferroni", "holm", "hommel", "hochberg",
                 "BY", "none"),
        help=
        "method to perform multiple testing correction by controlling the fdr [default=%default]."
    )

    parser.add_option("--qvalue-lambda",
                      dest="qvalue_lambda",
                      type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option(
        "--qvalue-pi0-method",
        dest="qvalue_pi0_method",
        type="choice",
        choices=("smoother", "bootstrap"),
        help="fdr computation: method for estimating pi0 [default=%default].")

    parser.add_option(
        "--descriptions",
        dest="input_filename_descriptions",
        type="string",
        help="filename mapping annotation terms to descriptions. "
        " if given, the output table will contain additional columns "
        " [default=%default]")

    parser.add_option(
        "--pseudo-count",
        dest="pseudo_count",
        type="float",
        help=
        "pseudo count. The pseudo count is added to both the observed and expected overlap. "
        " Using a pseudo-count avoids gat reporting fold changes of 0 [default=%default]."
    )

    parser.add_option("--output-plots-pattern",
                      dest="output_plots_pattern",
                      type="string",
                      help="output pattern for plots [default=%default]")

    parser.set_defaults(
        pvalue_method="empirical",
        qvalue_method="BH",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        # pseudo count for fold change computation to avoid 0 fc
        pseudo_count=1.0,
        output_order="observed",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    input_filenames_counts = args

    ##################################################
    E.info("received %i filenames with counts" % len(input_filenames_counts))

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    all_annotator_results = []

    for input_filename_counts in input_filenames_counts:

        E.info("processing %s" % input_filename_counts)

        annotator_results = gat.fromCounts(input_filename_counts)

        ##################################################
        if options.pvalue_method != "empirical":
            E.info("updating pvalues to %s" % options.pvalue_method)
            GatEngine.updatePValues(annotator_results, options.pvalue_method)

        ##################################################
        # compute global fdr
        ##################################################
        E.info("computing FDR statistics")
        GatEngine.updateQValues(annotator_results,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

        all_annotator_results.append(annotator_results)

    pseudo_count = options.pseudo_count
    results = []

    if len(all_annotator_results) == 1:
        E.info("performing pairwise comparison within a single file")

        # collect all annotations
        annotations, segments = list(), set()
        for x in all_annotator_results[0]:
            segments.add(x.track)
            annotations.append(x)

        if len(segments) != 1:
            raise NotImplementedError("multiple segments of interest")

        # the two annotations being compared take the place of
        # track and annotation in the result
        for data1, data2 in itertools.combinations(annotations, 2):
            results.append(_buildPairwiseResult(data1.annotation,
                                                data2.annotation,
                                                data1,
                                                data2,
                                                pseudo_count))

    else:
        E.info("performing pairwise comparison between multiple files")

        ##################################################
        # perform pairwise comparison
        for index1, index2 in itertools.combinations(
                range(len(input_filenames_counts)), 2):
            E.info("comparing %i and %i" % (index1, index2))
            a, b = all_annotator_results[index1], all_annotator_results[index2]

            # index results in a and b
            aa = collections.defaultdict(dict)
            for x in a:
                aa[x.track][x.annotation] = x

            bb = collections.defaultdict(dict)
            for x in b:
                bb[x.track][x.annotation] = x

            tracks_a = set(aa.keys())
            tracks_b = set(bb.keys())
            shared_tracks = tracks_a.intersection(tracks_b)
            if not shared_tracks:
                E.warn("no shared tracks between {} and {}".format(
                    index1, index2))

            for track in sorted(shared_tracks):
                E.debug("computing results for track {}".format(track))
                # get shared annotations
                annotations1 = aa[track].keys()
                annotations2 = bb[track].keys()
                shared_annotations = list(
                    set(annotations1).intersection(set(annotations2)))
                E.info("%i shared annotations" % len(shared_annotations))

                for annotation in shared_annotations:
                    results.append(_buildPairwiseResult(
                        track,
                        annotation,
                        aa[track][annotation],
                        bb[track][annotation],
                        pseudo_count))

    if not results:
        E.critical("no results found")
        E.Stop()
        return

    IO.outputResults(results,
                     options,
                     GatEngine.AnnotatorResult.headers,
                     description_header,
                     description_width,
                     descriptions,
                     format_observed="%6.4f")

    IO.plotResults(results, options)

    # write footer and output benchmark information.
    E.Stop()
Ejemplo n.º 11
0
def buildSegments(options):
    '''load segments, annotations, workspaces and isochores from
    parameters defined in *options*.

    File name patterns in *options* are expanded in place. Segments and
    workspaces are normalized; annotations are normalized unless
    ``options.overlapping_annotations`` is set, in which case they are
    only sorted. All workspaces are collapsed into a single workspace
    called "collapsed". If isochore files are given, isochores are
    loaded and truncated to the collapsed workspace.

    Arguments
    ---------
    options : object
        Command line options. The attributes `segment_files`,
        `annotation_files`, `workspace_files`, `sample_files`,
        `isochore_files`, `ignore_segment_tracks`, `enable_split_tracks`,
        `annotations_label`, `annotations_to_points` and
        `overlapping_annotations` are read.

    Returns
    -------
    segments : IntervalCollection
    annotations : IntervalCollection
    workspaces : IntervalCollection
        Contains only the track "collapsed".
    isochores : IntervalCollection
        None if no isochore files have been given.

    Raises
    ------
    ValueError
        If segment, annotation or workspace files are missing, if the
        segments are empty or if there are more than 1000 segment tracks.
    '''

    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    segments = readSegmentList("segments",
                               options.segment_files,
                               ignore_tracks=options.ignore_segment_tracks)
    segments.normalize()

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError("too many (%i) segment files - use track definitions "
                         "or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations",
        options.annotation_files,
        enable_split_tracks=options.enable_split_tracks,
        ignore_tracks=options.annotations_label is not None)

    if options.annotations_label is not None:
        annotations.setName(options.annotations_label)

    if options.annotations_to_points:
        annotations.toPositions(options.annotations_to_points)

    if options.overlapping_annotations:
        # only sort, do not merge
        annotations.sort()
    else:
        annotations.normalize()

    # pass the flag by keyword: readSegmentList() takes no options
    # argument, so passing *options* positionally would bind it to
    # enable_split_tracks and shift the flag into ignore_tracks.
    workspaces = readSegmentList(
        "workspaces",
        options.workspace_files,
        enable_split_tracks=options.enable_split_tracks)
    workspaces.normalize()

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    if options.isochore_files:

        # read one or more isochore files
        isochores = Engine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # check that there are no overlapping segments between isochores

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    else:
        isochores = None

    return segments, annotations, workspaces, isochores
Ejemplo n.º 12
0
def plotResults(results, options):
    '''plot annotator results.

    Nothing is plotted unless ``options.output_plots_pattern`` is set
    and plotting is available (``HASPLOT``).

    For each result, a histogram of the samples is saved together with
    the observed value and a normal density using the expected value and
    standard deviation of the result. A final plot shows the histograms
    of P-values and Q-values over all results.

    Arguments
    ---------
    results : list
        Annotator results. The attributes `track`, `annotation`,
        `counter`, `samples`, `observed`, `expected`, `stddev`,
        `pvalue` and `qvalue` are read.
    options : object
        Command line options. ``options.output_plots_pattern`` is the
        filename pattern for plots; ``%s`` is substituted with a key
        identifying the plot.
    '''

    ##################################################
    # plot histograms
    if not (options.output_plots_pattern and HASPLOT):
        return

    def buildPlotFilename(options, key):
        # substitute literally - keys are derived from track and
        # annotation names and must not be interpreted as a regular
        # expression replacement template.
        filename = options.output_plots_pattern.replace("%s", key)
        filename = re.sub("[^a-zA-Z0-9-_./]", "_", filename)
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return filename

    E.info("plotting sample stats")

    for r in results:

        plt.figure()
        k = []
        if r.track != "merged":
            k.append(r.track)
        k.append(r.annotation)
        if r.counter != "na":
            k.append(r.counter)
        key = "-".join(k)

        s = r.samples
        # bin edges, used as x-values for the density curve below
        _, bins = numpy.histogram(s, bins=100)

        # plot bars. The histogram is normalized to be comparable
        # with the density curve. ("normed" has been removed from
        # matplotlib, "density" is its replacement.)
        plt.hist(s, bins=100, density=True, label=key)

        plt.axvline(r.observed, color='r', linewidth=2)

        # plot estimated
        sigma = r.stddev
        mu = r.expected
        plt.plot(bins,
                 1.0 / (sigma * numpy.sqrt(2 * numpy.pi)) *
                 numpy.exp(-(bins - mu)**2 / (2 * sigma**2)),
                 label="std distribution",
                 linewidth=2,
                 color='g')

        plt.legend()
        filename = buildPlotFilename(options, key)
        plt.savefig(filename)
        # release the figure - otherwise one figure per result is
        # kept in memory.
        plt.close()

    E.info("plotting P-value distribution")

    key = "pvalue"
    plt.figure()

    plt.hist([r.pvalue for r in results],
             bins=numpy.arange(0, 1.05, 0.025),
             label="pvalue")

    plt.hist([r.qvalue for r in results],
             bins=numpy.arange(0, 1.05, 0.025),
             label="qvalue",
             alpha=0.5)

    plt.legend()

    filename = buildPlotFilename(options, key)
    plt.savefig(filename)
    plt.close()
Ejemplo n.º 13
0
def applyIsochores(segments, annotations, workspaces,
                   options,
                   isochores=None,
                   truncate_segments_to_workspace=False,
                   truncate_workspace_to_annotations=False,
                   restrict_workspace=False,
                   ):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep
    only those overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace* or
    ``options.truncate_segments_to_workspace`` is set, truncate
    segments to workspace.

    If *restrict_workspace* is set, the workspace is confined
    to those parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace
    is truncated to keep only those parts that overlap annotations.

    Overlap statistics are written if ``options.output_stats``
    contains "overlap" or "all".

    Raises ValueError if *isochores* are given and do not overlap
    the workspaces, annotations or segments.

    returns the track "collapsed" of *workspaces*.
    '''

    # honour the keyword argument in addition to the setting in
    # options, which used to be the only value taken into account.
    truncate_segments = (truncate_segments_to_workspace or
                         options.truncate_segments_to_workspace)

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        #
        # Filter by segments first and annotations second, so that
        # only parts overlapping both remain. A temporary "merged"
        # track is removed again, an existing one is kept.
        for collection in (segments, annotations):
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
            "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(E.openOutputFile("overlap_%s" % track),
                                          segments[track])

    return workspace
Ejemplo n.º 14
0
    def sample(self, track, counts, counters, segs,
               annotations, workspace,
               outfiles):
        '''draw samples for *track* without conditioning and collect
        the counts.

        Annotations and workspace are cloned and converted back from
        isochores. The workspace generator is applied to *segs* and
        *workspace* and one unit of work is set up per sample. If
        ``self.num_threads`` is larger than 0, data is shared before
        sampling and unshared afterwards.

        *counts* and *outfiles* are not used.

        Returns a list with one dictionary per counter. Each dictionary
        maps an annotation track to the list of values sampled for it.
        Returns None if *workspace* is empty.
        '''

        E.info("performing unconditional sampling")
        counts_per_track = []
        for _ in counters:
            counts_per_track.append(collections.defaultdict(list))

        # rebuild non-isochore annotations and workspace
        contig_annotations = annotations.clone()
        contig_annotations.fromIsochores()
        contig_annotations.setName("contig_" + annotations.getName())

        contig_workspace = workspace.clone()
        contig_workspace.fromIsochores()

        nsegments = workspace.counts()
        nnucleotides = workspace.sum()
        E.info("workspace without conditioning: %i segments, %i nucleotides" %
               (nsegments, nnucleotides))

        generated = self.workspace_generator(segs, None, workspace)
        generated_segs, _, generated_workspace = generated

        # NOTE(review): both this message and the check for an empty
        # workspace look at *workspace*, not at the workspace returned
        # by the generator - confirm that this is intended.
        nsegments = workspace.counts()
        nnucleotides = workspace.sum()
        E.info("workspace after conditioning: %i segments, %i nucleotides" %
               (nsegments, nnucleotides))

        if workspace.sum() == 0:
            E.warn("empty workspace - no computation performed")
            return None

        # one unit of work per sample
        work = []
        for sample_id in range(self.num_samples):
            work.append(WorkData(track,
                                 sample_id,
                                 self.sampler,
                                 generated_segs,
                                 annotations,
                                 contig_annotations,
                                 generated_workspace,
                                 contig_workspace,
                                 counters,
                                 ))

        if self.num_threads > 0:
            E.info("setting up shared data for multi-processing")
            annotations.share()
            contig_annotations.share()
            contig_workspace.share("contig_workspace")
            generated_segs.share("generated_segments")
            generated_workspace.share("generated_workspace")

        E.info("sampling started")
        results = self.computeSamples(work)
        E.info("sampling completed")

        if self.num_threads > 0:
            E.info("retrieving private data")
            annotations.unshare()
            contig_annotations.unshare()
            contig_workspace.unshare()
            generated_segs.unshare()
            generated_workspace.unshare()

        # collate results: append the value of each sample to the
        # list kept per counter and annotation
        for result in results:
            for counter_id, _ in enumerate(counters):
                collected = counts_per_track[counter_id]
                for annotation in annotations.tracks:
                    collected[annotation].append(
                        result[counter_id][annotation])

        self.outputSampleStats(None, "", [])

        return counts_per_track
Ejemplo n.º 15
0
def run(segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator,
        **kwargs):
    '''run an enrichment analysis.

    Observed counts are computed for each counter. Then, for each
    track in *segments*, samples are drawn and counted using a
    conditional or unconditional sampler depending on
    ``workspace_generator.is_conditional``. Finally, observed and
    sampled counts are combined into annotator results.

    segments: an IntervalCollection
    workspace: an IntervalCollection
    annotations: an IntervalCollection
    sampler: sampler handed on to the conditional/unconditional sampler
    counters: list of counters. The attribute `name` of each is read.
    workspace_generator: callable taking segments, annotations and
       workspace and returning a tuple of the same three.

    kwargs recognized are:

    cache
       filename of cache

    num_samples
       number of samples to compute (default 10000)

    num_threads
       number of threads handed on to the sampler (default 0)

    output_counts_pattern
       output counts to filename. ``%s`` is replaced by the name of
       the counter.

    output_samples_pattern
       if given, output samples to these files, one per segment. ``%s``
       is replaced by the track.

    sample_files
       if given, read samples from these files. Requires
       output_samples_pattern in order to parse the filenames.

    outfiles
       dictionary of optional additional output files.

    pseudo_count
       pseudo_count to add to observed and expected values

    reference
       data with reference observed and expected values.

    Raises ValueError if sample_files are given without
    output_samples_pattern.

    returns a list of Engine.AnnotatorResultExtended.
    '''

    # get arguments
    num_samples = kwargs.get("num_samples", 10000)
    cache = kwargs.get("cache", None)
    output_counts_pattern = kwargs.get("output_counts_pattern", None)
    sample_files = kwargs.get("sample_files", [])
    pseudo_count = kwargs.get("pseudo_count", 1.0)
    reference = kwargs.get("reference", None)
    output_samples_pattern = kwargs.get("output_samples_pattern", None)
    outfiles = kwargs.get("outfiles", {})
    num_threads = kwargs.get("num_threads", 0)

    ##################################################
    # computing summary metrics for segments
    if "segment_metrics" in outfiles:
        E.info("computing summary metrics for segments")
        outfile = outfiles["segment_metrics"]
        outfile.write("track\tsection\tmetric\t%s\n" %
                      "\t".join(Stats.Summary().getHeaders()))
        for track in segments.tracks:
            IO.outputMetrics(outfile,
                             segments[track],
                             workspace,
                             track,
                             'segments',
                             )
        E.info("wrote summary metrics for segments to %s" % str(outfile))

    ##################################################
    # collect observed counts from segments
    E.info("collecting observed counts")
    observed_counts = [
        Engine.computeCounts(
            counter=counter,
            aggregator=sum,
            segments=segments,
            annotations=annotations,
            workspace=workspace,
            workspace_generator=workspace_generator)
        for counter in counters]

    ##################################################
    # sample and collect counts
    ##################################################
    E.info("starting sampling")

    if cache:
        E.info("samples are cached in %s" % cache)
        samples = Engine.SamplesCached(filename=cache)
    elif sample_files:
        if not output_samples_pattern:
            raise ValueError(
                "require output_samples_pattern if loading samples from files")
        # build regex. The placeholder is substituted literally as
        # re.sub() rejects "\S" within a replacement template.
        regex = re.compile(output_samples_pattern.replace("%s", r"(\S+)"))
        E.info("loading samples from %i files" % len(sample_files))
        samples = Engine.SamplesFile(
            filenames=sample_files,
            regex=regex)
    else:
        samples = Engine.Samples()

    sampled_counts = {}

    counts = E.Counter()

    ntracks = len(segments.tracks)

    if workspace_generator.is_conditional:
        sampler_class = ConditionalSampler
    else:
        sampler_class = UnconditionalSampler

    for ntrack, track in enumerate(segments.tracks):

        segs = segments[track]

        E.info("sampling: %s: %i/%i" % (track, ntrack + 1, ntracks))

        if output_samples_pattern and not sample_files:
            # substitute literally - track names must not be interpreted
            # as a regular expression replacement template
            filename = output_samples_pattern.replace("%s", track)
            E.debug("saving samples to %s" % filename)
            dirname = os.path.dirname(filename)
            if dirname and not os.path.exists(dirname):
                os.makedirs(dirname)
            if filename.endswith(".gz"):
                # NOTE(review): mode "w" is binary for gzip in python3
                # - confirm that the sampler writes bytes.
                samples_outfile = gzip.open(filename, "w")
            else:
                samples_outfile = open(filename, "w")
        else:
            samples_outfile = None

        # make sure that the file with samples is closed, also if
        # the track is skipped or sampling fails.
        try:
            outer_sampler = sampler_class(num_samples,
                                          samples,
                                          samples_outfile,
                                          sampler,
                                          workspace_generator,
                                          counters,
                                          outfiles,
                                          num_threads=num_threads)

            counts_per_track = outer_sampler.sample(
                track, counts, counters, segs, annotations, workspace,
                outfiles)
        finally:
            if samples_outfile is not None:
                samples_outfile.close()

        # skip empty tracks
        if counts_per_track is None:
            continue

        sampled_counts[track] = counts_per_track

    E.info("sampling finished")

    # build annotator results
    E.info("computing PValue statistics")

    annotator_results = list()
    for counter_id, (counter, observed_count) in enumerate(
            zip(counters, observed_counts)):
        for track, r in observed_count.items():
            for annotation, observed in r.items():
                temp_segs, temp_annos, temp_workspace = workspace_generator(
                    segments[track],
                    annotations[annotation],
                    workspace)

                # ignore empty results
                if temp_workspace.sum() == 0:
                    continue

                # if reference is given, p-value will indicate difference
                # The test that track and annotation are present is done
                # elsewhere
                if reference:
                    ref = reference[track][annotation]
                else:
                    ref = None

                annotator_results.append(Engine.AnnotatorResultExtended(
                    track=track,
                    annotation=annotation,
                    counter=counter.name,
                    observed=observed,
                    samples=sampled_counts[track][counter_id][annotation],
                    track_segments=temp_segs,
                    annotation_segments=temp_annos,
                    workspace=temp_workspace,
                    reference=ref,
                    pseudo_count=pseudo_count))

    # dump (large) table with counts
    if output_counts_pattern:
        for counter in counters:
            name = counter.name
            filename = output_counts_pattern.replace("%s", name)

            E.info("writing counts to %s" % filename)
            output = [x for x in annotator_results if x.counter == name]
            with IOTools.openFile(filename, "w") as outfile:
                outfile.write("track\tannotation\tobserved\tcounts\n")

                for o in output:
                    outfile.write("%s\t%s\t%i\t%s\n" %
                                  (o.track, o.annotation,
                                   o.observed,
                                   ",".join(["%i" % x for x in o.samples])))

    return annotator_results
Ejemplo n.º 16
0
def applyIsochores(
    segments,
    annotations,
    workspaces,
    options,
    isochores=None,
    truncate_segments_to_workspace=False,
    truncate_workspace_to_annotations=False,
    restrict_workspace=False,
):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    Arguments
    ---------
    segments, annotations : interval collections
        Modified in place (``toIsochores``, ``intersect``, ``filter``).
    workspaces : interval collection
        Must contain a track called ``"collapsed"``.
    options : object
        Parsed command line options. ``output_stats`` is read here
        and the object is passed on to ``dumpStats``/``dumpBed``.
    isochores : interval collection, optional
        If given, workspaces, annotations and segments are split by
        isochores.
    truncate_segments_to_workspace : bool
        If True, segments are truncated to the workspace (or to the
        isochores). For backwards compatibility the option
        ``options.truncate_segments_to_workspace`` enables truncation
        as well.
    truncate_workspace_to_annotations : bool
        If True, the workspace is truncated to keep only those parts
        that overlap annotations.
    restrict_workspace : bool
        If True, the workspace is confined to those parts that overlap
        both a segment and an annotation.

    Returns
    -------
    workspace
        The ``"collapsed"`` track of *workspaces*.

    Raises
    ------
    ValueError
        If isochores are given and the workspaces, annotations or
        segments are empty after applying them.
    '''

    # Honour the explicit argument; previously only the option of the
    # same name was consulted, so keep it as a fallback.
    truncate_segments = bool(
        truncate_segments_to_workspace or
        getattr(options, "truncate_segments_to_workspace", False))

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        for collection in (segments, annotations):
            # Filter by the collection currently visited. The previous
            # code always used `segments`, so annotations never
            # restricted the workspace.
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                # build a temporary merged track and remove it again
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        annotations["merged"].normalize()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
            "all" in options.output_stats:
        for track in segments.tracks:
            outfile = E.openOutputFile("overlap_%s" % track)
            try:
                workspaces.outputOverlapStats(outfile, segments[track])
            finally:
                # do not leak one open file handle per track
                outfile.close()

    return workspace
Ejemplo n.º 17
0
def buildSegments(options):
    '''load segments, annotations, workspaces and isochores from
    parameters defined in *options*.

    Glob expressions in the file lists of *options* are expanded in
    place. Workspaces are collapsed and restricted to the single
    track ``"collapsed"``. If isochore files are given, the isochores
    are sorted, checked, normalized and truncated to the collapsed
    workspace.

    Arguments
    ---------
    options : object
        Parsed command line options. Reads ``segment_files``,
        ``annotation_files``, ``workspace_files``, ``sample_files``,
        ``isochore_files``, ``ignore_segment_tracks`` and
        ``enable_split_tracks``.

    Returns
    -------
    segments, annotations, workspaces, isochores
        *isochores* is None if no isochore files were given.

    Raises
    ------
    ValueError
        If no segment, annotation or workspace file is given, if the
        segments are empty or if there are more than 1000 segment
        tracks.
    '''

    options.segment_files = expandGlobs(options.segment_files)
    options.annotation_files = expandGlobs(options.annotation_files)
    options.workspace_files = expandGlobs(options.workspace_files)
    options.sample_files = expandGlobs(options.sample_files)

    ##################################################
    # arguments sanity check
    if not options.segment_files:
        raise ValueError("please specify at least one segment file")
    if not options.annotation_files:
        raise ValueError("please specify at least one annotation file")
    if not options.workspace_files:
        raise ValueError("please specify at least one workspace file")

    # read one or more segment files
    # readSegmentList takes (label, filenames, enable_split_tracks,
    # ignore_tracks). Passing `options` positionally bound it to
    # enable_split_tracks, so flags are now given by keyword.
    segments = readSegmentList("segments", options.segment_files)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if segments.sum() == 0:
        E.critical("no segments in input file - run aborted")
        raise ValueError("segments file is empty - run aborted")

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or --ignore-segment-tracks" % len(segments))

    annotations = readSegmentList(
        "annotations",
        options.annotation_files,
        enable_split_tracks=options.enable_split_tracks)
    workspaces = readSegmentList(
        "workspaces",
        options.workspace_files,
        enable_split_tracks=options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    dumpStats(workspaces, "stats_workspaces_input", options)
    workspaces.collapse()
    dumpStats(workspaces, "stats_workspaces_collapsed", options)

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")

    # build isochores or intersect annotations/segments with workspace
    isochores = None
    if options.isochore_files:

        # read one or more isochore files
        isochores = GatEngine.IntervalCollection(name="isochores")
        E.info("%s: reading isochores from %i files" %
               ("isochores", len(options.isochore_files)))
        isochores.load(options.isochore_files)
        dumpStats(isochores, "stats_isochores_raw", options)

        # merge isochores and check if consistent (fully normalized)
        isochores.sort()

        # check that there are no overlapping segments within isochores
        isochores.check()

        # TODO: flag is_normalized not properly set
        isochores.normalize()

        # truncate isochores to workspace
        # crucial if isochores are larger than workspace.
        isochores.intersect(workspaces["collapsed"])

    return segments, annotations, workspaces, isochores
Ejemplo n.º 18
0
def plotResults(results, options):
    '''plot annotator results.

    For each result a histogram of the sampled counts is plotted
    together with the observed value and a normal distribution
    parameterized by the result's ``expected`` and ``stddev``.
    A final figure shows the distribution of P-values and Q-values.

    Nothing is done unless ``options.output_plots_pattern`` is set
    and plotting is available (``HASPLOT``).

    Arguments
    ---------
    results : list
        Results providing ``track``, ``annotation``, ``counter``,
        ``samples``, ``observed``, ``stddev``, ``expected``,
        ``pvalue`` and ``qvalue``.
    options : object
        Parsed command line options; ``output_plots_pattern`` is
        a filename pattern in which ``%s`` is replaced by the plot key.
    '''

    ##################################################
    # plot histograms
    if not (options.output_plots_pattern and HASPLOT):
        return

    def buildPlotFilename(options, key):
        # plain string substitution: the key must not be interpreted
        # as a regular expression replacement template.
        filename = options.output_plots_pattern.replace("%s", key)
        # replace everything except letters, digits and "_./-"
        filename = re.sub(r"[^a-zA-Z0-9_./-]", "_", filename)
        dirname = os.path.dirname(filename)
        if dirname and not os.path.exists(dirname):
            os.makedirs(dirname)
        return filename

    E.info("plotting sample stats")

    for r in results:

        fig = plt.figure()
        k = []
        if r.track != "merged":
            k.append(r.track)
        k.append(r.annotation)
        if r.counter != "na":
            k.append(r.counter)
        key = "-".join(k)

        s = r.samples
        # only the bin edges are needed for the fitted curve below
        _, bins = numpy.histogram(s, bins=100)

        # plot bars
        # `density` replaces the `normed` keyword, which has been
        # removed from matplotlib.
        plt.hist(s, bins=100, density=True, label=key)

        plt.axvline(r.observed, color='r', linewidth=2)

        # plot estimated
        sigma = r.stddev
        mu = r.expected
        plt.plot(bins,
                 1.0 / (sigma * numpy.sqrt(2 * numpy.pi)) *
                 numpy.exp(- (bins - mu) ** 2 / (2 * sigma ** 2)),
                 label="std distribution",
                 linewidth=2,
                 color='g')

        plt.legend()
        filename = buildPlotFilename(options, key)
        plt.savefig(filename)
        # release the figure - otherwise one figure per result
        # stays open until the process exits.
        plt.close(fig)

    E.info("plotting P-value distribution")

    key = "pvalue"
    fig = plt.figure()

    pvalue_bins = numpy.arange(0, 1.05, 0.025)
    plt.hist([r.pvalue for r in results],
             bins=pvalue_bins,
             label="pvalue")

    plt.hist([r.qvalue for r in results],
             bins=pvalue_bins,
             label="qvalue",
             alpha=0.5)

    plt.legend()

    filename = buildPlotFilename(options, key)
    plt.savefig(filename)
    plt.close(fig)
Ejemplo n.º 19
0
def outputResults(results,
                  options,
                  header,
                  description_header,
                  description_width,
                  descriptions,
                  format_observed="%i"):
    '''compute FDR and output results.

    Q-values are computed over the P-values of all *results*. The
    results are then written as a tab-separated table, one table per
    counter. With a single counter the table goes to
    ``options.stdout``, otherwise each counter is written to a file
    named after ``options.output_tables_pattern``.

    Arguments
    ---------
    results : list
        Results, either namedtuples (anything providing ``_replace``)
        or objects with assignable attributes. The list passed in is
        not re-ordered.
    options : object
        Parsed command line options. Reads ``qvalue_method``,
        ``qvalue_lambda``, ``qvalue_pi0_method``, ``output_order``,
        ``output_tables_pattern`` and ``stdout``.
    header : list
        Column names of a result.
    description_header : list
        Column names of the description columns.
    description_width : int
        Number of description columns, used to pad rows without
        a description.
    descriptions : dict
        Mapping of annotation to a list of description fields.
    format_observed : string
        Format assigned to ``format_observed`` of results that are
        not namedtuples.

    Raises
    ------
    ValueError
        If ``options.output_order`` is unknown or if several counters
        are present but no ``output_tables_pattern`` is set.
    '''

    pvalues = [x.pvalue for x in results]

    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = GatEngine.getQValues(pvalues,
                                   method=options.qvalue_method,
                                   vlambda=options.qvalue_lambda,
                                   pi0_method=options.qvalue_pi0_method)

    try:
        results = [x._replace(qvalue=qvalue)
                   for x, qvalue in zip(results, qvalues)]
        is_tuple = True
    except AttributeError:
        # not a namedtuple
        for x, qvalue in zip(results, qvalues):
            x.qvalue = qvalue
            x.format_observed = format_observed

        is_tuple = False

    sort_keys = {
        "track": lambda x: (x.track, x.annotation),
        "observed": lambda x: x.observed,
        "annotation": lambda x: (x.annotation, x.track),
        "fold": lambda x: x.fold,
        "pvalue": lambda x: x.pvalue,
        "qvalue": lambda x: x.qvalue,
    }

    counters = set(x.counter for x in results)
    single_counter = len(counters) == 1

    for counter in counters:

        # validate before any file is opened or written to
        try:
            sort_key = sort_keys[options.output_order]
        except KeyError:
            raise ValueError("unknown sort order %s" % options.output_order)

        if single_counter:
            outfile = options.stdout
            # work on a copy so that sorting does not re-order the
            # list handed in by the caller
            output = list(results)
        else:
            if options.output_tables_pattern is None:
                raise ValueError(
                    "results for multiple counters require "
                    "output_tables_pattern to be set")
            outfilename = options.output_tables_pattern.replace(
                "%s", counter)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [x for x in results if x.counter == counter]

        try:
            outfile.write(
                "\t".join(list(header) + list(description_header)) + "\n")

            output.sort(key=sort_key)

            for result in output:
                if is_tuple:
                    outfile.write("\t".join(map(str, result)))
                else:
                    outfile.write(str(result))

                if descriptions:
                    try:
                        outfile.write(
                            "\t" + "\t".join(descriptions[result.annotation]))
                    except KeyError:
                        # no description for this annotation: pad
                        # with empty columns
                        outfile.write(
                            "\t" + "\t".join([""] * description_width))
                outfile.write("\n")
        finally:
            # close files opened here even if writing fails, but
            # leave stdout alone
            if outfile is not options.stdout:
                outfile.close()
Ejemplo n.º 20
0
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Depending on the options, results are computed from pre-computed
    counts (``input_filename_counts``), re-read from a previous run
    (``input_filename_results``) or computed from segment files. The
    results are then written and plotted.

    Raises
    ------
    ValueError
        If one of the output patterns is set but lacks a ``%s``.
    OSError
        If the file given by ``options.null`` does not exist.
    """

    if not argv:
        argv = sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    ##################################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ##################################################
    size_pos, size_segment = GatSegmentList.getSegmentSize()
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, 2 ** (8 * size_pos)))

    ##################################################
    # set default counter
    # Assign a new list instead of appending: this also works if no
    # counters were set (None) and leaves a shared default untouched.
    if not options.counters:
        options.counters = ["nucleotide-overlap"]

    ##################################################
    # output patterns need a placeholder for the counter/track name
    for pattern_name in ("output_tables_pattern",
                         "output_samples_pattern",
                         "output_counts_pattern"):
        pattern = getattr(options, pattern_name)
        if pattern is not None and "%s" not in pattern:
            raise ValueError(
                "%s should contain at least one '%%s'" % pattern_name)

    ##################################################
    # read fold changes that results should be compared with
    if options.null != "default":
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)
    else:
        options.reference = None

    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = GatEngine.fromCounts(options.input_filename_counts)

    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)

    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    ##################################################
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        GatEngine.updatePValues(annotator_results, options.pvalue_method)

    ##################################################
    # output
    IO.outputResults(annotator_results,
                     options,
                     GatEngine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()
Ejemplo n.º 21
0
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Segments, annotations, workspaces and isochores are loaded
    according to *options*, the sampler, counters and workspace
    generator are instantiated and the analysis is run.

    Arguments
    ---------
    options : object
        Parsed command line options.
    args : list
        Positional command line arguments (not used here).

    Returns
    -------
    annotator_results
        The value returned by ``gat.run``.

    Raises
    ------
    ValueError
        For an unknown sampler, counter or conditional workspace,
        a missing ``--conditional-expansion`` for centered workspaces
        or an incomplete reference.
    '''

    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    outfiles = {}
    output_stats = options.output_stats
    for section in (
            "sample",
            "segment_metrics",
            "sample_metrics",
    ):
        # Entries of output_stats may also be regular expressions that
        # are matched against the section name. The previous code
        # searched the literal string "section" instead.
        if section in output_stats or \
                "all" in output_stats or \
                any(re.search(x, section) for x in output_stats):
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.
        truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = Engine.SamplerAnnotator(bucket_size=options.bucket_size,
                                          nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = Engine.SamplerShift(radius=options.shift_expansion,
                                      extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = Engine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = Engine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = Engine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = Engine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = Engine.SamplerUniform()
    else:
        # fail here instead of with an unbound local further down
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(Engine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(Engine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(Engine.CounterSegmentOverlap())
        elif counter == "annotation-overlap":
            counters.append(Engine.CounterAnnotationOverlap())
        elif counter == "segment-midoverlap":
            counters.append(Engine.CounterSegmentMidpointOverlap())
        elif counter == "annotation-midoverlap":
            counters.append(Engine.CounterAnnotationMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    # NOTE(review): for the centered workspaces only
    # conditional_expansion is checked although the message mentions
    # both options - confirm which combination is required.
    if options.conditional == "unconditional":
        workspace_generator = Engine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = Engine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension, options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if options.conditional_expansion is None:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")

        workspace_generator = Engine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension, options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete: every track/annotation
    # combination needs to be present
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
Ejemplo n.º 22
0
    def sample(self, track, counts, counters, segs, annotations, workspace,
               outfiles):
        '''conditional sampling - sample using only those
        segments that contain both a segment and an annotation.

        For each annotation track a workspace is derived with
        ``self.workspace_generator`` and ``self.num_samples`` samples
        are computed with ``self.computeSamples``.

        Arguments
        ---------
        track : string
            Name of the segment track; used to label the work items.
        counts
            Not used here.
        counters : list
            Counters to apply; results are collected per counter.
        segs
            Segments of *track*.
        annotations
            Collection of annotation tracks.
        workspace
            Workspace before conditioning.
        outfiles
            Not used here.

        Returns
        -------
        counts_per_track : list
            One dictionary per counter, mapping an annotation to the
            list of counts obtained in the samples. None is returned
            if the workspace is empty.
        '''

        E.info("performing conditional sampling")
        counts_per_track = [collections.defaultdict(list) for x in counters]

        # rebuild non-isochore annotations and workspace
        contig_annotations = annotations.clone()
        contig_annotations.fromIsochores()
        contig_annotations.setName("contig_" + annotations.getName())

        contig_workspace = workspace.clone()
        contig_workspace.fromIsochores()

        E.info("setting up shared data for multi-processing")
        annotations.share()
        contig_annotations.share()
        contig_workspace.share("contig_workspace")

        E.info("workspace without conditioning: %i segments, %i nucleotides" %
               (workspace.counts(),
                workspace.sum()))

        if workspace.sum() == 0:
            E.warn("empty workspace - no computation performed")
            return None

        # compute samples conditionally - need to proceed by annotation
        for annoid, annotation in enumerate(annotations.tracks):

            annos = annotations[annotation]

            # only the conditioned segments and workspace are needed
            temp_segs, _, temp_workspace = \
                self.workspace_generator(segs, annos, workspace)

            # set up sharing
            temp_segs.share("generated_segments")
            temp_workspace.share("generated_workspace")

            E.info("workspace for annotation %s: %i segments, %i nucleotides" %
                   (annotation,
                    temp_workspace.counts(),
                    temp_workspace.sum()))

            # annoid is an integer - it needs to be formatted, joining
            # it with the track name raised a TypeError.
            work_id = "%s_%s" % (track, annoid)
            work = [WorkData(work_id,
                             x,
                             self.sampler,
                             temp_segs,
                             annotations,
                             contig_annotations,
                             temp_workspace,
                             contig_workspace,
                             counters,
                             ) for x in range(self.num_samples)]

            E.info("sampling for annotation '%s' started" % annotation)
            results = self.computeSamples(work)
            E.info("sampling for annotation '%s' completed" % annotation)

            for result in results:
                for counter_id in range(len(counters)):
                    counts_per_track[counter_id][annotation].append(
                        result[counter_id][annotation])

        return counts_per_track
Ejemplo n.º 23
0
def main(argv):
    '''script main - gene set enrichment of segments.

    parses command line options in sys.argv, unless *argv* is given.

    Genes are read from a single bed-formatted file and annotations
    of genes from a table mapping gene identifiers to annotations.
    For each segment track and annotation, the number of genes
    overlapped by segments is tested for enrichment with a
    hypergeometric test. Results are written via ``IO.outputResults``.

    Raises
    ------
    ValueError
        If no table mapping genes to annotations is given, if not
        exactly one file with gene locations is given or if there are
        more than 1000 segment tracks.
    '''

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-a", "--gene-file", "--annotations", dest="annotation_files", type="string", action="append",
                      help="filename with annotations - here, location of genes [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-g", "--number-of-genes", dest="number_of_genes", type="int",
                      help="total number of genes [default=%default]")

    parser.add_option("-m", "--annotation-file", dest="annotation_file", type="string",
                      help="filename mapping genes to annotations [default=%default]")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")
    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
                      help="ignore segment tracks - all segments belong to one track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks", action="store_true",
                      help="permit the same track to be in multiple files [default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        annotation_file=None,
        num_samples=1000,
        nbuckets=100000,
        bucket_size=1,
        counter="nucleotide-overlap",
        output_stats=[],
        output_bed=[],
        output_filename_counts=None,
        output_order="fold",
        cache=None,
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        sampler="annotator",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        number_of_genes=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    # load segments
    options.segment_files = IO.expandGlobs(options.segment_files)
    options.annotation_files = IO.expandGlobs(options.annotation_files)
    options.workspace_files = IO.expandGlobs(options.workspace_files)

    # validate the input explicitly - assert statements are removed
    # when running with -O and a missing filename fails obscurely.
    if not options.annotation_file:
        raise ValueError(
            "please specify a file mapping genes to annotations "
            "(--annotation-file)")
    if len(options.annotation_files) != 1:
        raise ValueError(
            "please specify exactly one file with gene locations "
            "(--gene-file), got %i" % len(options.annotation_files))

    # read one or more segment files
    # readSegmentList takes its flags as (enable_split_tracks,
    # ignore_tracks); the options object is not an argument.
    segments = IO.readSegmentList("segments", options.segment_files)
    if options.ignore_segment_tracks:
        segments.merge(delete=True)
        E.info("merged all segments into one track with %i segments" %
               len(segments))

    if len(segments) > 1000:
        raise ValueError(
            "too many (%i) segment files - use track definitions or --ignore-segment-tracks" % len(segments))

    # load workspace
    workspaces = IO.readSegmentList(
        "workspaces",
        options.workspace_files,
        enable_split_tracks=options.enable_split_tracks)

    # intersect workspaces to build a single workspace
    E.info("collapsing workspaces")
    workspaces.collapse()

    # use merged workspace only, discard others
    workspaces.restrict("collapsed")
    workspace = workspaces["collapsed"]

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ############################################
    # load table mapping a gene id to annotations
    with IOTools.openFile(options.annotation_file) as inf:
        gene2annotations = IOTools.readMultiMap(inf, has_header=True)
    annotations = set([y for x in gene2annotations.values() for y in x])
    # keys of gene2annotations are genes, hence report the number
    # of annotations first
    E.info("loaded %i annotations for %i genes" %
           (len(annotations), len(gene2annotations)))

    ############################################
    # load bed file with gene coordinates
    indexed_genes = collections.defaultdict(Intersecter)
    total_genes = 0
    # compute number of genes with a particular annotation
    annotation2ngenes = collections.defaultdict(int)
    with IOTools.openFile(options.annotation_files[0]) as inf:
        for line in inf:
            if line.startswith("#"):
                continue
            # strip only the line terminator - the last line might
            # lack one
            contig, start, end, gene_id = line.rstrip("\n").split("\t")[:4]
            indexed_genes[contig].add_interval(
                Interval(int(start), int(end), gene_id))
            total_genes += 1
            # genes without annotations are counted, but do not
            # contribute to any annotation
            for annotation in gene2annotations.get(gene_id, ()):
                annotation2ngenes[annotation] += 1
    E.info("indexed locations for %i contigs" % len(indexed_genes))

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    ############################################
    # compute results
    E.info("computing counts")

    results = []
    # iterate over segments
    for segment, segmentdict in segments.iteritems():

        # genes hit by segments per annotation
        genes_hit_by_segments_with_annotations = collections.defaultdict(int)

        # genes hit by segments
        genes_hit_by_segments = 0

        for contig, ss in segmentdict.iteritems():
            for start, end in ss:
                overlapping_genes = list(
                    indexed_genes[contig].find(start, end))
                genes_hit_by_segments += len(overlapping_genes)
                for x in overlapping_genes:
                    gene_id = x.value
                    for annotation in gene2annotations.get(gene_id, ()):
                        genes_hit_by_segments_with_annotations[
                            annotation] += 1

        # N = number of genes in genome
        N = total_genes
        # n   = number of genes selected by segments
        n = genes_hit_by_segments

        for annotation in annotations:
            # K = number of genes carrying annotation
            K = annotation2ngenes[annotation]
            # k = number of genes selected by segments and with annotation
            k = genes_hit_by_segments_with_annotations[annotation]

            if n == 0 or N == 0 or K == 0:
                # nothing to test - report a neutral result
                expected = 0
                fold = 1.0
                pvalue = 1.0
            else:
                expected = float(n * K) / N
                fold = k / expected
                # probability of observing k or more annotated genes
                pvalue = scipy.stats.hypergeom.sf(k - 1, N, K, n)

            r = GENESET_RESULT._make((
                segment, annotation,
                N,
                K,
                n,
                k,
                expected,
                fold,
                pvalue,
                1.0))

            results.append(r)

    IO.outputResults(results,
                     options,
                     GENESET_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
Ejemplo n.º 24
0
def main(argv):
    """Run a GREAT-style enrichment analysis of segments against annotations.

    The command line is parsed, intervals are loaded through
    ``IO.buildSegments`` and filtered through ``IO.applyIsochores``. For
    every isochore and every (counter, segment track, annotation)
    combination a ``GREAT_RESULT`` is computed; the per-isochore results are
    then merged into a single result per combination (isochore ``"all"``)
    and passed to ``IO.outputResults``.

    Arguments
    ---------
    argv : list
        Command line arguments. If empty or None, ``sys.argv`` is used.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser(version="%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $",
                                   usage=globals()["__doc__"])

    parser.add_option("-a", "--annotation-file", "--annotations", dest="annotation_files", type="string", action="append",
                      help="filename with annotations [default=%default].")

    parser.add_option("-s", "--segment-file", "--segments", dest="segment_files", type="string", action="append",
                      help="filename with segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-w", "--workspace-file", "--workspace", dest="workspace_files", type="string", action="append",
                      help="filename with workspace segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-i", "--isochore-file", "--isochores", dest="isochore_files", type="string", action="append",
                      help="filename with isochore segments. Also accepts a glob in parentheses [default=%default].")

    parser.add_option("-o", "--order", dest="output_order", type="choice",
                      choices=(
                          "track", "annotation", "fold", "pvalue", "qvalue"),
                      help="order results in output by fold, track, etc. [default=%default].")

    parser.add_option("-q", "--qvalue-method", dest="qvalue_method", type="choice",
                      choices=(
                          "storey", "BH", "bonferroni", "holm", "hommel", "hochberg", "BY", "none"),
                      help="method to perform multiple testing correction by controlling the fdr [default=%default].")

    parser.add_option("--qvalue-lambda", dest="qvalue_lambda", type="float",
                      help="fdr computation: lambda [default=%default].")

    parser.add_option("--qvalue-pi0-method", dest="qvalue_pi0_method", type="choice",
                      choices=("smoother", "bootstrap"),
                      help="fdr computation: method for estimating pi0 [default=%default].")
    parser.add_option("--descriptions", dest="input_filename_descriptions", type="string",
                      help="filename mapping annotation terms to descriptions. "
                      " if given, the output table will contain additional columns "
                      " [default=%default]")

    parser.add_option("--ignore-segment-tracks", dest="ignore_segment_tracks", action="store_true",
                      help="ignore segment tracks - all segments belong to one track [default=%default]")

    parser.add_option("--enable-split-tracks", dest="enable_split_tracks", action="store_true",
                      help="permit the same track to be in multiple files [default=%default]")

    parser.add_option("--output-bed", dest="output_bed", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output bed files [default=%default].")

    parser.add_option("--output-stats", dest="output_stats", type="choice", action="append",
                      choices=("all",
                               "annotations", "segments",
                               "workspaces", "isochores",
                               "overlap"),
                      help="output overlap summary stats [default=%default].")

    parser.add_option("--restrict-workspace", dest="restrict_workspace", action="store_true",
                      help="restrict workspace to those segments that contain both track"
                      " and annotations [default=%default]")

    parser.add_option("--counter", dest="counters", type="choice", action="append",
                      choices=("binom", "hyperg"),
                      help="counter to use [default=%default].")

    parser.add_option("--output-tables-pattern", dest="output_tables_pattern", type="string",
                      help="output pattern for result tables. Used if there are multiple counters used [default=%default].")

    parser.set_defaults(
        annotation_files=[],
        segment_files=[],
        workspace_files=[],
        sample_files=[],
        counters=[],
        output_stats=[],
        output_bed=[],
        output_tables_pattern="%s.tsv.gz",
        output_order="fold",
        input_filename_counts=None,
        input_filename_results=None,
        pvalue_method="empirical",
        output_plots_pattern=None,
        output_samples_pattern=None,
        qvalue_method="storey",
        qvalue_lambda=None,
        qvalue_pi0_method="smoother",
        ignore_segment_tracks=False,
        input_filename_descriptions=None,
        conditional="unconditional",
        conditional_extension=None,
        conditional_expansion=None,
        restrict_workspace=False,
        enable_split_tracks=False,
        shift_expansion=2.0,
        shift_extension=0,
        overlap_mode="midpoint",
        truncate_workspace_to_annotations=False,
        truncate_segments_to_workspace=False
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    tstart = time.time()

    # fall back to the binomial counter if none was requested
    if len(options.counters) == 0:
        options.counters.append("binom")

    ############################################
    segments, annotations, workspaces, isochores = IO.buildSegments(options)
    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments, annotations, workspaces, options, isochores)

    ############################################
    description_header, descriptions, description_width = IO.readDescriptions(
        options)

    ############################################
    ############################################
    # compute per contig

    # compute bases covered by workspace; from here on ``isochores`` is the
    # list of keys present in the filtered workspace
    workspace2basecoverage, isochores = {}, []
    for contig, ww in workspace.iteritems():
        workspace2basecoverage[contig] = ww.sum()
        isochores.append(contig)

    # compute number of bases covered by annotations per isochore
    annotation2basecoverage = collections.defaultdict(dict)
    for annotation, aa in annotations.iteritems():
        for isochore, a in aa.iteritems():
            # need to truncate to workspace?
            annotation2basecoverage[annotation][isochore] = a.sum()

    # maps (counter, segment, annotation) to the list of per-isochore results
    results_per_contig = collections.defaultdict(list)

    E.info("computing counts per isochore")
    # results per isochore

    def emptyResult(segment, annotation, isochore,
                    counter,
                    nsegments_in_workspace,
                    basecoverage_annotation,
                    basecoverage_workspace):
        """Build a result without any overlap (fold, pvalue, qvalue = 1.0)."""
        return GREAT_RESULT._make((
            segment, annotation, isochore,
            counter,
            0,  # observed
            0,  # expected
            nsegments_in_workspace,
            0,  # nannotations_in_workspace
            0,  # nsegments_overlapping_annotation
            0,  # nannotations_overlapping_segments
            0,  # basecoverage_intersection
            0,  # basecoverage_segments
            basecoverage_annotation,
            basecoverage_workspace,
            0.0,
            1.0,
            1.0,
            1.0))

    for isochore in isochores:
        basecoverage_workspace = workspace2basecoverage[isochore]

        # iterate over all segment tracks
        for segment, segmentdict in segments.iteritems():
            try:
                ss = segmentdict[isochore]
            except KeyError:
                # The track has no segments on this isochore. Reset the
                # counts explicitly so that values from a previous track
                # (or undefined names on the first iteration) are not used.
                ss = None
                nsegments_in_workspace = 0
                basecoverage_segments = 0
            else:
                # select segments overlapping workspace
                segments_in_workspace = GatSegmentList.SegmentList(clone=ss)
                segments_in_workspace.intersect(workspace[isochore])
                # number of segments in workspace
                nsegments_in_workspace = len(segments_in_workspace)
                basecoverage_segments = segments_in_workspace.sum()

            for annotation, annotationdict in annotations.iteritems():

                try:
                    aa = annotationdict[isochore]
                except KeyError:
                    aa = None

                # number of bases covered by annotation on this isochore
                try:
                    basecoverage_annotation = annotation2basecoverage[
                        annotation][isochore]
                except KeyError:
                    basecoverage_annotation = 0

                if ss is None or aa is None:
                    # nothing to intersect: record an empty result for
                    # every counter
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment,
                                            annotation,
                                            isochore,
                                            counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                # select segments overlapping annotation
                segments_overlapping_annotation = GatSegmentList.SegmentList(
                    clone=ss)
                segments_overlapping_annotation.intersect(
                    annotations[annotation][isochore])
                # number of segments in annotation
                nsegments_overlapping_annotation = ss.intersectionWithSegments(
                    annotations[annotation][isochore],
                    mode=options.overlap_mode)

                # number of nucleotides in the intersection of segments
                # and annotation
                basecoverage_intersection = \
                    segments_overlapping_annotation.sum()

                annotations_overlapping_segments = GatSegmentList.SegmentList(
                    clone=aa)
                annotations_overlapping_segments.intersect(ss)
                nannotations_overlapping_segments = len(
                    annotations_overlapping_segments)

                nannotations_in_workspace = len(aa)
                if nannotations_in_workspace == 0:
                    for counter in options.counters:
                        results_per_contig[
                            (counter, segment, annotation)].append(
                                emptyResult(segment,
                                            annotation,
                                            isochore,
                                            counter,
                                            nsegments_in_workspace,
                                            basecoverage_annotation,
                                            basecoverage_workspace))
                    continue

                fraction_coverage_annotation = basecoverage_annotation / \
                    float(basecoverage_workspace)
                # computed but not part of the result tuple
                fraction_hit_annotation = float(
                    nannotations_overlapping_segments) / \
                    nannotations_in_workspace

                for counter in options.counters:
                    if counter.startswith("binom"):
                        # binomial probability over segments
                        # trials  = nsegments_in_workspace
                        # p       = fraction of workspace covered by
                        #           annotation
                        # k       = nsegments_overlapping_annotation
                        # sf(k - 1) is the probability of observing at
                        # least k successes.
                        pvalue = scipy.stats.binom.sf(
                            nsegments_overlapping_annotation - 1,
                            nsegments_in_workspace,
                            fraction_coverage_annotation)

                        expected = fraction_coverage_annotation * \
                            nsegments_in_workspace
                        observed = nsegments_overlapping_annotation

                    elif counter.startswith("hyperg"):

                        # hypergeometric probability over nucleotides
                        # Sampling without replacement
                        # x, M, n, N
                        # x = observed number of nucleotides in overlap of
                        #     segments and annotations
                        # M = number of nucleotides in workspace
                        # n = number of nucleotides in annotations
                        # N = number of nucleotides in segments (and
                        #     workspace)
                        # P-value of obtaining >x number of nucleotides
                        # overlapping.
                        rv = scipy.stats.hypergeom(basecoverage_workspace,
                                                   basecoverage_annotation,
                                                   basecoverage_segments)

                        pvalue = rv.sf(basecoverage_intersection)
                        expected = rv.mean()
                        observed = basecoverage_intersection

                    if expected != 0:
                        fold = float(observed) / expected
                    else:
                        fold = 1.0

                    r = GREAT_RESULT._make((
                        segment, annotation, isochore,
                        counter,
                        observed,
                        expected,
                        nsegments_in_workspace,
                        nannotations_in_workspace,
                        nsegments_overlapping_annotation,
                        nannotations_overlapping_segments,
                        basecoverage_intersection,
                        basecoverage_segments,
                        basecoverage_annotation,
                        basecoverage_workspace,
                        fraction_coverage_annotation,
                        fold,
                        pvalue,
                        1.0))
                    results_per_contig[
                        (counter, segment, annotation)].append(r)

    E.info("merging counts per isochore")

    # compute sums over all isochores
    results = []

    for key, data in results_per_contig.iteritems():

        counter, segment, annotation = key

        nsegments_in_workspace = sum([x.nsegments_in_workspace for x in data])
        # NOTE(review): for the hyperg counter ``observed`` holds a number
        # of nucleotides, so this sum is not a segment count in that case -
        # confirm whether the per-isochore segment count should be summed
        # instead.
        nsegments_overlapping_annotation = sum([x.observed for x in data])
        nannotations_in_workspace = sum(
            [x.nannotations_in_workspace for x in data])
        nannotations_overlapping_segments = sum(
            [x.nannotations_overlapping_segments for x in data])

        basecoverage_intersection = sum(
            [x.basecoverage_intersection for x in data])
        basecoverage_segments = sum([x.basecoverage_segments for x in data])
        basecoverage_annotation = sum(
            [x.basecoverage_annotation for x in data])
        basecoverage_workspace = sum([x.basecoverage_workspace for x in data])

        fraction_coverage_annotation = basecoverage_annotation / \
            float(basecoverage_workspace)

        if counter.startswith("binom"):
            pvalue = scipy.stats.binom.sf(nsegments_overlapping_annotation - 1,
                                          nsegments_in_workspace,
                                          fraction_coverage_annotation)
            expected = fraction_coverage_annotation * nsegments_in_workspace
            observed = nsegments_overlapping_annotation
        elif counter.startswith("hyperg"):
            rv = scipy.stats.hypergeom(basecoverage_workspace,
                                       basecoverage_annotation,
                                       basecoverage_segments)

            pvalue = rv.sf(basecoverage_intersection)
            expected = rv.mean()
            observed = basecoverage_intersection

        if expected != 0:
            fold = float(observed) / expected
        else:
            fold = 1.0

        r = GREAT_RESULT._make((
            segment, annotation, "all",
            counter,
            observed,
            expected,
            nsegments_in_workspace,
            nannotations_in_workspace,
            nsegments_overlapping_annotation,
            nannotations_overlapping_segments,
            basecoverage_intersection,
            basecoverage_segments,
            basecoverage_annotation,
            basecoverage_workspace,
            fraction_coverage_annotation,
            fold,
            pvalue,
            1.0))

        results.append(r)

    IO.outputResults(results,
                     options,
                     GREAT_RESULT._fields,
                     description_header,
                     description_width,
                     descriptions)

    E.Stop()
Ejemplo n.º 25
0
def outputResults(results,
                  options,
                  header,
                  description_header,
                  description_width,
                  descriptions,
                  format_observed="%i"):
    '''compute FDR and output results.

    Q-values are computed over the p-values of all results with
    ``Engine.getQValues`` and stored in the ``qvalue`` field of each
    result. Results are then written as tab-separated tables, one table
    per counter. With a single counter the table goes to
    ``options.stdout``; otherwise each counter is written to the file
    obtained by substituting the counter name for ``%s`` in
    ``options.output_tables_pattern``.

    Arguments
    ---------
    results : list
        Results to output. Either namedtuples (updated via ``_replace``)
        or objects with settable ``qvalue`` attribute. Each result needs
        the attributes ``pvalue``, ``counter`` and ``annotation`` plus
        whatever ``options.output_order`` sorts by.
    options : object
        Parsed options. ``qvalue_method``, ``qvalue_lambda``,
        ``qvalue_pi0_method``, ``output_order``, ``output_tables_pattern``
        and ``stdout`` are read.
    header : list
        Column names of a result.
    description_header : list
        Column names of the additional description columns.
    description_width : int
        Number of empty columns to write for annotations without
        description.
    descriptions : dict
        Maps an annotation to its list of description fields. If empty,
        no description columns are written.
    format_observed : string
        Stored as ``format_observed`` attribute on results that are not
        namedtuples.

    Raises
    ------
    ValueError
        If ``options.output_order`` is not a known sort order.
    '''

    pvalues = [x.pvalue for x in results]

    ##################################################
    # compute global fdr
    ##################################################
    E.info("computing FDR statistics")
    qvalues = Engine.getQValues(pvalues,
                                method=options.qvalue_method,
                                vlambda=options.qvalue_lambda,
                                pi0_method=options.qvalue_pi0_method)

    try:
        results = [
            x._replace(qvalue=qvalue) for x, qvalue in zip(results, qvalues)
        ]
        is_tuple = True
    except AttributeError:
        # not a namedtuple
        for x, qvalue in zip(results, qvalues):
            x.qvalue = qvalue
            x.format_observed = format_observed

        is_tuple = False

    counters = set([x.counter for x in results])

    # sort key for each supported output order
    sort_keys = {
        "track": lambda x: (x.track, x.annotation),
        "observed": lambda x: x.observed,
        "annotation": lambda x: (x.annotation, x.track),
        "fold": lambda x: x.fold,
        "pvalue": lambda x: x.pvalue,
        "qvalue": lambda x: x.qvalue,
    }

    for counter in counters:

        if len(counters) == 1:
            outfile = options.stdout
            output = results
        else:
            # plain substitution: the counter name is not a regular
            # expression replacement template
            outfilename = options.output_tables_pattern.replace("%s", counter)
            E.info("output for counter %s goes to outfile %s" %
                   (counter, outfilename))
            outfile = IOTools.openFile(outfilename, "w")
            output = [x for x in results if x.counter == counter]

        # make sure that a file opened here is closed even if sorting or
        # writing fails
        try:
            outfile.write("\t".join(list(header) + list(description_header)) +
                          "\n")

            try:
                sort_key = sort_keys[options.output_order]
            except KeyError:
                raise ValueError(
                    "unknown sort order %s" % options.output_order)
            output.sort(key=sort_key)

            for result in output:
                if is_tuple:
                    outfile.write("\t".join(map(str, result)))
                else:
                    outfile.write(str(result))

                if descriptions:
                    try:
                        outfile.write(
                            "\t" +
                            "\t".join(descriptions[result.annotation]))
                    except KeyError:
                        # no description: pad with empty columns
                        outfile.write(
                            "\t" + "\t".join([""] * description_width))
                outfile.write("\n")
        finally:
            if outfile is not options.stdout:
                outfile.close()
Ejemplo n.º 26
0
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Intervals are loaded with ``IO.buildSegments`` and filtered with
    ``IO.applyIsochores``. Sampler, counters and workspace generator are
    instantiated according to *options* and the analysis is delegated to
    ``gat.run``.

    Arguments
    ---------
    options : object
        Parsed command line options.
    args : list
        Positional command line arguments (not used here).

    Returns
    -------
    annotator_results
        The value returned by ``gat.run``.

    Raises
    ------
    ValueError
        If the sampler, a counter or the conditional workspace is unknown,
        if a centered conditional workspace is requested without extension
        or expansion, or if the reference is incomplete.
    '''

    tstart = time.time()

    ##################################################
    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    ##################################################
    # open various additional output files
    ##################################################
    outfiles = {}
    for section in ("sample",
                    "segment_metrics",
                    "sample_metrics",
                    ):
        # a section is written if it is requested by name, if "all" is
        # requested or if any requested pattern matches the section name
        if section in options.output_stats or \
            "all" in options.output_stats or \
                len([x for x in options.output_stats
                     if re.search(x, section)]) > 0:
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    ##################################################
    # initialize sampler
    if options.sampler == "annotator":
        sampler = GatEngine.SamplerAnnotator(
            bucket_size=options.bucket_size,
            nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = GatEngine.SamplerShift(
            radius=options.shift_expansion,
            extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = GatEngine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = GatEngine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = GatEngine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = GatEngine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = GatEngine.SamplerUniform()
    else:
        # fail here instead of with a NameError when the sampler is used
        raise ValueError("unknown sampler '%s'" % options.sampler)

    ##################################################
    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(GatEngine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(GatEngine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(GatEngine.CounterSegmentOverlap())
        elif counter == "annotations-overlap":
            counters.append(GatEngine.CounterAnnotationsOverlap())
        elif counter == "segment-midoverlap":
            counters.append(GatEngine.CounterSegmentMidpointOverlap())
        elif counter == "annotations-midoverlap":
            counters.append(GatEngine.CounterAnnotationsMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    ##################################################
    # initialize workspace generator
    # the centered workspaces need an extension or an expansion
    centered_unconfigured = (options.conditional_extension is None and
                             options.conditional_expansion is None)

    if options.conditional == "unconditional":
        workspace_generator = GatEngine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if centered_unconfigured:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if centered_unconfigured:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")

        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    ##################################################
    # check if reference is complete: every segment track and every
    # annotation of that track needs an entry
    ##################################################
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    ##################################################
    # compute
    ##################################################
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
Ejemplo n.º 27
0
def main(argv=None):
    """Entry point of the script.

    Command line options are taken from *argv*; if *argv* is empty or
    None, ``sys.argv`` is used instead. Depending on the options, results
    are built from pre-computed counts, read from a previous results file
    or computed from segment files, and are then written and plotted.
    """

    argv = argv or sys.argv

    parser = gat.buildParser(usage=globals()["__doc__"])

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    # optional descriptions of annotation terms
    (description_header,
     descriptions,
     description_width) = IO.readDescriptions(options)

    # report the sizes used by the segment list implementation
    size_pos, size_segment = SegmentList.getSegmentSize()
    max_coord = 2**(8 * size_pos)
    E.debug("sizes: pos=%i segment=%i, max_coord=%i" %
            (size_pos, size_segment, max_coord))

    # set default counter
    if not options.counters:
        options.counters.append("nucleotide-overlap")

    # every output pattern that is set needs a '%s' placeholder
    for option_name, message in (
            ("output_tables_pattern",
             "output_tables_pattern should contain at least one '%s'"),
            ("output_samples_pattern",
             "output_samples_pattern should contain at least one '%s'"),
            ("output_counts_pattern",
             "output_counts_pattern should contain at least one '%s'")):
        pattern = getattr(options, option_name)
        if pattern is not None and "%s" not in pattern:
            raise ValueError(message)

    seed = options.random_seed
    if seed is not None:
        # seed both the python and the numpy random number generator
        random.seed(seed)
        numpy.random.seed(seed)

    # read fold changes that results should be compared with
    if options.null == "default":
        options.reference = None
    else:
        if not os.path.exists(options.null):
            raise OSError("file %s not found" % options.null)
        E.info("reading reference results from %s" % options.null)
        options.reference = IO.readAnnotatorResults(options.null)

    # obtain results from the first source that is available
    if options.input_filename_counts:
        # use pre-computed counts
        annotator_results = Engine.fromCounts(options.input_filename_counts)
    elif options.input_filename_results:
        # use previous results (re-computes fdr)
        E.info("reading gat results from %s" % options.input_filename_results)
        annotator_results = IO.readAnnotatorResults(
            options.input_filename_results)
    else:
        # do full gat analysis
        annotator_results = fromSegments(options, args)

    # replace empirical p-values if another method was requested
    if options.pvalue_method != "empirical":
        E.info("updating pvalues to %s" % options.pvalue_method)
        Engine.updatePValues(annotator_results, options.pvalue_method)

    # output
    IO.outputResults(annotator_results,
                     options,
                     Engine.AnnotatorResultExtended.headers,
                     description_header,
                     description_width,
                     descriptions)

    IO.plotResults(annotator_results, options)

    # write footer and output benchmark information.
    E.Stop()