Ejemplo n.º 1
0
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Segments, annotations, workspaces and isochores are loaded with
    ``IO.buildSegments``, restricted with ``IO.applyIsochores`` and
    then passed to ``gat.run`` together with the sampler, the counters
    and the workspace generator selected in *options*.

    *options* is the parsed option object. *args* is not used by this
    function.

    returns the result of ``gat.run``.

    raises ValueError for an unknown sampler, counter or conditional
    workspace, for a centered conditional workspace without extension
    and expansion, and for a reference that lacks a track or an
    annotation.
    '''

    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    # A section is opened if it is listed literally, if "all" is
    # requested, or if a requested entry matches the section name when
    # used as a regular expression.
    outfiles = {}
    for section in ("sample",
                    "segment_metrics",
                    "sample_metrics",
                    ):
        if section in options.output_stats or \
                "all" in options.output_stats or \
                any(re.search(x, section) for x in options.output_stats):
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" % "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = GatEngine.SamplerAnnotator(
            bucket_size=options.bucket_size,
            nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = GatEngine.SamplerShift(
            radius=options.shift_expansion,
            extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = GatEngine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = GatEngine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = GatEngine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = GatEngine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = GatEngine.SamplerUniform()
    else:
        # fail here with a clear message instead of an unbound local
        # variable error when the sampler is passed on further below
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(GatEngine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(GatEngine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(GatEngine.CounterSegmentOverlap())
        elif counter == "annotations-overlap":
            counters.append(GatEngine.CounterAnnotationsOverlap())
        elif counter == "segment-midoverlap":
            counters.append(GatEngine.CounterSegmentMidpointOverlap())
        elif counter == "annotations-midoverlap":
            counters.append(GatEngine.CounterAnnotationsMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    # centered workspaces need at least one of extension/expansion
    centered_params_missing = (options.conditional_extension is None and
                               options.conditional_expansion is None)

    if options.conditional == "unconditional":
        workspace_generator = GatEngine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = GatEngine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if centered_params_missing:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension,
            options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if centered_params_missing:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = GatEngine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension,
            options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete: every segment track needs an
    # entry, and each entry needs every annotation track
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results
Ejemplo n.º 2
0
def dumpStats(coll, section, options):
    '''pass the output file for *section* to ``coll.outputStats``.

    Nothing happens unless *section* was requested through
    ``options.output_stats``: listed literally, covered by the entry
    ``"all"``, or matched by an entry used as a regular expression.
    '''
    requested = options.output_stats
    if section not in requested and "all" not in requested:
        # fall back to treating every requested entry as a pattern
        hits = [pattern for pattern in requested
                if re.search(pattern, section)]
        if not hits:
            return
    coll.outputStats(E.openOutputFile(section))
Ejemplo n.º 3
0
def dumpBed(coll, section, options):
    '''pass the output file ``<section>.bed`` to ``coll.save``.

    Nothing happens unless *section* was requested through
    ``options.output_bed``: listed literally, covered by the entry
    ``"all"``, or matched by an entry used as a regular expression.
    '''
    requested = options.output_bed
    if section not in requested and "all" not in requested:
        # fall back to treating every requested entry as a pattern
        hits = [pattern for pattern in requested
                if re.search(pattern, section)]
        if not hits:
            return
    coll.save(E.openOutputFile(section + ".bed"))
Ejemplo n.º 4
0
def applyIsochores(
    segments,
    annotations,
    workspaces,
    options,
    isochores=None,
    truncate_segments_to_workspace=False,
    truncate_workspace_to_annotations=False,
    restrict_workspace=False,
):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep only those
    overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace* is set, either as argument or
    as ``options.truncate_segments_to_workspace``, truncate segments
    to workspace.

    If *restrict_workspace* is set, the workspace is confined
    to those parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace
    is truncated to keep only those parts that overlap annotations.

    *options* additionally supplies ``output_stats`` and, through
    ``dumpStats``/``dumpBed``, the selection of optional output files.

    returns a workspace divided into isochores.

    raises ValueError if, after applying isochores, workspaces,
    annotations or segments are empty.
    '''

    # Honour the explicit argument, but keep falling back to the
    # option so that callers relying on ``options`` alone are unaffected.
    truncate_segments = (truncate_segments_to_workspace or
                         options.truncate_segments_to_workspace)

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        # Filter by segments first, then by annotations. A temporary
        # "merged" track is removed again; an existing one is kept.
        for collection in (segments, annotations):
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        annotations["merged"].normalize()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
            "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(
                E.openOutputFile("overlap_%s" % track), segments[track])

    return workspace
Ejemplo n.º 5
0
def dumpBed(coll, section, options):
    '''save *coll* to ``<section>.bed`` if that section was requested.

    A section counts as requested when ``options.output_bed`` contains
    it literally, contains ``"all"``, or contains an entry that matches
    the section name as a regular expression.
    '''
    wanted = section in options.output_bed
    if not wanted:
        wanted = "all" in options.output_bed
    if not wanted:
        matching = [entry for entry in options.output_bed
                    if re.search(entry, section)]
        wanted = len(matching) > 0

    if wanted:
        coll.save(E.openOutputFile(section + ".bed"))
Ejemplo n.º 6
0
def dumpStats(coll, section, options):
    '''output statistics of *coll* if *section* was requested.

    A section counts as requested when ``options.output_stats``
    contains it literally, contains ``"all"``, or contains an entry
    that matches the section name as a regular expression. The opened
    output file is handed to ``coll.outputStats``.
    '''
    wanted = section in options.output_stats
    if not wanted:
        wanted = "all" in options.output_stats
    if not wanted:
        matching = [entry for entry in options.output_stats
                    if re.search(entry, section)]
        wanted = len(matching) > 0

    if wanted:
        coll.outputStats(E.openOutputFile(section))
Ejemplo n.º 7
0
def applyIsochores(segments, annotations, workspaces,
                   options,
                   isochores=None,
                   truncate_segments_to_workspace=False,
                   truncate_workspace_to_annotations=False,
                   restrict_workspace=False,
                   ):
    '''apply isochores to segments and annotations.

    Segments and annotations are filtered in place to keep
    only those overlapping the workspace.

    If *isochores* are given, isochores are applied.

    If *truncate_segments_to_workspace* is set, either as argument or
    as ``options.truncate_segments_to_workspace``, truncate segments
    to workspace.

    If *restrict_workspace* is set, the workspace is confined
    to those parts that overlap both a segment and an annotation.

    If *truncate_workspace_to_annotations* is set, the workspace
    is truncated to keep only those parts that overlap annotations.

    *options* additionally supplies ``output_stats`` and, through
    ``dumpStats``/``dumpBed``, the selection of optional output files.

    returns a workspace divided into isochores.

    raises ValueError if, after applying isochores, workspaces,
    annotations or segments are empty.
    '''

    # Honour the explicit argument, but keep falling back to the
    # option so that callers relying on ``options`` alone are unaffected.
    truncate_segments = (truncate_segments_to_workspace or
                         options.truncate_segments_to_workspace)

    if isochores:
        # intersect isochores and workspaces, segments and annotations
        # workspace and annotations are truncated
        # with segments it is optional.
        E.info("adding isochores to workspace")
        workspaces.toIsochores(isochores, truncate=True)
        annotations.toIsochores(isochores, truncate=True)
        segments.toIsochores(isochores, truncate=truncate_segments)

        if workspaces.sum() == 0:
            raise ValueError("isochores and workspaces do not overlap")
        if annotations.sum() == 0:
            raise ValueError("isochores and annotations do not overlap")
        if segments.sum() == 0:
            raise ValueError("isochores and segments do not overlap")

        dumpStats(workspaces, "stats_workspaces_isochores", options)
        dumpStats(annotations, "stats_annotations_isochores", options)
        dumpStats(segments, "stats_segments_isochores", options)

        dumpBed(workspaces, "workspaces_isochores", options)
        dumpBed(annotations, "annotations_isochores", options)
        dumpBed(segments, "segments_isochores", options)

    else:
        # intersect workspace and segments/annotations
        # annotations and segments are truncated by workspace
        if truncate_segments:
            segments.intersect(workspaces["collapsed"])
        else:
            segments.filter(workspaces["collapsed"])

        annotations.intersect(workspaces["collapsed"])

        dumpStats(annotations, "stats_annotations_truncated", options)
        dumpStats(segments, "stats_segments_truncated", options)

    workspace = workspaces["collapsed"]

    if restrict_workspace:

        E.info("restricting workspace")
        # this is very cumbersome - refactor merge and collapse
        # to return an IntervalDictionary instead of adding it
        # to the list of tracks
        # Filter by segments first, then by annotations. A temporary
        # "merged" track is removed again; an existing one is kept.
        for collection in (segments, annotations):
            if "merged" in collection:
                workspace.filter(collection["merged"])
            else:
                collection.merge()
                workspace.filter(collection["merged"])
                del collection["merged"]

        dumpStats(workspaces, "stats_workspaces_restricted", options)

    if truncate_workspace_to_annotations:

        E.info("truncating workspace to annotations")
        annotations.merge()
        workspace.intersect(annotations["merged"])
        del annotations["merged"]

        dumpStats(workspaces, "stats_workspaces_truncated", options)

    # output overlap stats
    # output segment densities per workspace
    if "overlap" in options.output_stats or \
            "all" in options.output_stats:
        for track in segments.tracks:
            workspaces.outputOverlapStats(E.openOutputFile("overlap_%s" % track),
                                          segments[track])

    return workspace
Ejemplo n.º 8
0
def fromSegments(options, args):
    '''run analysis from segment files.

    This is the most common use case.

    Segments, annotations, workspaces and isochores are loaded with
    ``IO.buildSegments``, restricted with ``IO.applyIsochores`` and
    then passed to ``gat.run`` together with the sampler, the counters
    and the workspace generator selected in *options*.

    *options* is the parsed option object. *args* is not used by this
    function.

    returns the result of ``gat.run``.

    raises ValueError for an unknown sampler, counter or conditional
    workspace, for a centered conditional workspace without extension
    and expansion, and for a reference that lacks a track or an
    annotation.
    '''

    tstart = time.time()

    # build segments
    segments, annotations, workspaces, isochores = IO.buildSegments(options)

    E.info("intervals loaded in %i seconds" % (time.time() - tstart))

    # open various additional output files
    # A section is opened if it is listed literally, if "all" is
    # requested, or if a requested entry matches the section name when
    # used as a regular expression.
    outfiles = {}
    for section in (
            "sample",
            "segment_metrics",
            "sample_metrics",
    ):
        if section in options.output_stats or \
                "all" in options.output_stats or \
                any(re.search(x, section) for x in options.output_stats):
            outfiles[section] = E.openOutputFile(section)

    if 'sample_metrics' in outfiles:
        outfiles['sample_metrics'].write(
            "track\tsection\tmetric\t%s\n" %
            "\t".join(Stats.Summary().getHeaders()))

    # filter segments by workspace
    workspace = IO.applyIsochores(
        segments,
        annotations,
        workspaces,
        options,
        isochores,
        truncate_segments_to_workspace=options.truncate_segments_to_workspace,
        truncate_workspace_to_annotations=options.
        truncate_workspace_to_annotations,
        restrict_workspace=options.restrict_workspace)

    # initialize sampler
    if options.sampler == "annotator":
        sampler = Engine.SamplerAnnotator(bucket_size=options.bucket_size,
                                          nbuckets=options.nbuckets)
    elif options.sampler == "shift":
        sampler = Engine.SamplerShift(radius=options.shift_expansion,
                                      extension=options.shift_extension)
    elif options.sampler == "segments":
        sampler = Engine.SamplerSegments()
    elif options.sampler == "local-permutation":
        sampler = Engine.SamplerLocalPermutation()
    elif options.sampler == "global-permutation":
        sampler = Engine.SamplerGlobalPermutation()
    elif options.sampler == "brute-force":
        sampler = Engine.SamplerBruteForce()
    elif options.sampler == "uniform":
        sampler = Engine.SamplerUniform()
    else:
        # fail here with a clear message instead of an unbound local
        # variable error when the sampler is passed on further below
        raise ValueError("unknown sampler '%s'" % options.sampler)

    # initialize counter
    counters = []
    for counter in options.counters:
        if counter == "nucleotide-overlap":
            counters.append(Engine.CounterNucleotideOverlap())
        elif counter == "nucleotide-density":
            counters.append(Engine.CounterNucleotideDensity())
        elif counter == "segment-overlap":
            counters.append(Engine.CounterSegmentOverlap())
        elif counter == "annotation-overlap":
            counters.append(Engine.CounterAnnotationOverlap())
        elif counter == "segment-midoverlap":
            counters.append(Engine.CounterSegmentMidpointOverlap())
        elif counter == "annotation-midoverlap":
            counters.append(Engine.CounterAnnotationMidpointOverlap())
        else:
            raise ValueError("unknown counter '%s'" % counter)

    # initialize workspace generator
    # The error message promises that either option suffices, so only
    # fail when both extension and expansion are missing.
    centered_params_missing = (options.conditional_extension is None and
                               options.conditional_expansion is None)

    if options.conditional == "unconditional":
        workspace_generator = Engine.UnconditionalWorkspace()
    elif options.conditional == "cooccurance":
        workspace_generator = Engine.ConditionalWorkspaceCooccurance()
    elif options.conditional == "annotation-centered":
        if centered_params_missing:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceAnnotationCentered(
            options.conditional_extension, options.conditional_expansion)
    elif options.conditional == "segment-centered":
        if centered_params_missing:
            raise ValueError(
                "please specify either --conditional-expansion or "
                "--conditional-extension")
        workspace_generator = Engine.ConditionalWorkspaceSegmentCentered(
            options.conditional_extension, options.conditional_expansion)
    else:
        raise ValueError("unknown conditional workspace '%s'" %
                         options.conditional)

    # check if reference is complete: every segment track needs an
    # entry, and each entry needs every annotation track
    if options.reference:
        for track in segments.tracks:
            if track not in options.reference:
                raise ValueError("missing track '%s' in reference" % track)
            r = options.reference[track]
            for annotation in annotations.tracks:
                if annotation not in r:
                    raise ValueError(
                        "missing annotation '%s' in annotations for "
                        "track='%s'" % (annotation, track))

    # compute
    annotator_results = gat.run(
        segments,
        annotations,
        workspace,
        sampler,
        counters,
        workspace_generator=workspace_generator,
        num_samples=options.num_samples,
        cache=options.cache,
        outfiles=outfiles,
        output_counts_pattern=options.output_counts_pattern,
        output_samples_pattern=options.output_samples_pattern,
        sample_files=options.sample_files,
        conditional=options.conditional,
        conditional_extension=options.conditional_extension,
        reference=options.reference,
        pseudo_count=options.pseudo_count,
        num_threads=options.num_threads)

    return annotator_results