Beispiel #1
0
def retained_intron(annotation, density,
                    exon_offset, intron_offset,
                    annotation_type="rmats"):
    """
    Build a per-event density matrix for Retained Intron (RI) events.

    Every line of ``annotation`` that does not start with 'event_name',
    'ID' or 'annotation' is treated as one event. Two regions are
    collected per event and placed side by side:

    |_]----||----[_|

    - the 3' site of the upstream interval
    - the 5' site of the downstream interval

    Parameters
    ----------
    annotation : str
        path of file containing the annotation
    density : density.ReadDensity
        object containing the positive and negative BigWig files
    exon_offset : int
        how far into the exon boundary to plot
    intron_offset : int
        how far from the exon boundary to plot
    annotation_type : str
        may be rmats format or any additional defined format in Feature

    Returns
    -------
    pandas.DataFrame
        One row per RI event, indexed by the right-stripped annotation
        line. The upstream 3' block comes first, followed by the
        downstream 5' block; columns are relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')
    upstream_3p = {}
    downstream_5p = {}

    with open(annotation) as handle:
        for raw_line in handle:
            # header rows carry no event
            if raw_line.startswith(header_prefixes):
                continue

            # the whole stripped line is used as the event key
            event = raw_line.rstrip()
            upstream_exon, downstream_exon = Feature.Retained_intron(
                event, annotation_type
            ).get_bedtools()

            # 3' site of the upstream region
            upstream_3p[event] = intervals.three_prime_site(
                density, downstream_exon, upstream_exon,
                exon_offset, intron_offset
            )

            # 5' site of the downstream region
            downstream_5p[event] = intervals.five_prime_site(
                density, upstream_exon, downstream_exon,
                exon_offset, intron_offset
            )

    # dict keys become rows after the transpose
    matrix = pd.concat(
        [pd.DataFrame(upstream_3p).T, pd.DataFrame(downstream_5p).T],
        axis=1
    )
    matrix.columns = range(0, matrix.shape[1])
    return matrix
Beispiel #2
0
def unscaled_region(
        annotation, density, annotation_type,
        upstream_offset, downstream_offset
):
    """
    Return a dataframe of densities around both ends of each feature.

    Every line of ``annotation`` that does not start with 'event_name',
    'ID' or 'annotation' is parsed into a single interval. The 5' site
    and the 3' site of that interval are then fetched, both with
    ``stop_at_midpoint=True``.

    Parameters
    ----------
    annotation : basestring
        filename of the annotation file to use
    density : density.ReadDensity
        object that contains positive and negative normalized density *.bw
    annotation_type : basestring
        name of the annotation feature described by density.Feature
    upstream_offset : int
        number of bases into the exon to return densities for. NOTE: this
        nomenclature assumes we're plotting (+) exon|intron junctions, so
        exons are 'upstream'. However 'upstream_offset' still refers to
        exons in (-) intron|exon features.
    downstream_offset : int
        number of bases from the exon boundary to return. NOTE: this
        nomenclature assumes we're plotting (+) exon|intron junctions, so
        introns are 'downstream'. However 'downstream_offset' still refers to
        introns in (-) intron|exon features.

    Returns
    -------
    pandas.DataFrame
        One row per annotation line, indexed by the right-stripped line.
        The 5' site block comes first, followed by the 3' site block;
        columns are relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')
    five_prime = {}   # per event: region around the 5' site
    three_prime = {}  # per event: region around the 3' site

    with open(annotation) as handle:
        for raw_line in handle:
            if raw_line.startswith(header_prefixes):
                continue

            event = raw_line.rstrip()
            feature_interval = Feature.Feature(
                event, annotation_type
            ).get_bedtool()

            # five prime site region
            # [      ]---|----[  |     ]
            five_prime[event] = intervals.five_prime_site(
                density, None, feature_interval, upstream_offset,
                downstream_offset, stop_at_midpoint=True
            )

            # three prime site region
            three_prime[event] = intervals.three_prime_site(
                density, None, feature_interval, upstream_offset,
                downstream_offset, stop_at_midpoint=True
            )

    # both regions live in one frame so they can be normalized together
    matrix = pd.concat(
        [pd.DataFrame(five_prime).T, pd.DataFrame(three_prime).T],
        axis=1
    )
    matrix.columns = range(0, matrix.shape[1])
    return matrix
Beispiel #3
0
def skipped_exon(annotation, density, exon_offset, intron_offset,
                 annotation_type="rmats"):
    """
    Creates an r x c pandas dataframe of r events for a skipped
    exon feature. An SE matrix will contain four distinct regions:

    |_]----||----[__||__]----||----[_|

    - [..exon_offset]--intron_offset--... 3' site of upstream exon
    - ...--intron_offset--[exon_offset..] 5' site of skipped exon
    - [..exon_offset]--intron_offset--... 3' site of skipped exon
    - ..--intron_offset--[exon_offset..] 5' site of downstream exon

    Lines of ``annotation`` starting with 'event_name', 'ID' or
    'annotation' are treated as headers and ignored. An event that
    cannot be parsed is reported on stdout and skipped, so it does not
    appear in the returned dataframe.

    Parameters
    ----------
    annotation : str
        path of file containing the annotation
    density : density.ReadDensity
        object containing positive and negative BigWig files
    exon_offset : int
        how far into the exon boundary to plot
    intron_offset : int
        how far after the exon boundary to plot
    annotation_type : str
        may be rmats format or any additional defined format in Feature

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed SE event, indexed by the
        right-stripped annotation line. The four regions are laid out
        in the order listed above; columns are relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')

    three_upstream = {}
    five_skipped = {}
    three_skipped = {}
    five_downstream = {}

    with open(annotation) as f:
        for line in f:
            if line.startswith(header_prefixes):
                continue

            event = line.rstrip()
            try:
                upstream_interval, interval, downstream_interval = \
                    Feature.Skipped_exon(
                        event,
                        annotation_type
                    ).get_bedtools()
            except Exception as e:
                # Best effort: report the bad event and move on. Falling
                # through here would either hit unbound names (first
                # event) or silently reuse the previous event's
                # intervals under this event's key.
                print(
                    "Having trouble parsing event: "
                    "{} (assumed type: {}): {}".format(
                        event, annotation_type, e
                    )
                )
                continue

            # three prime site of the upstream exon
            three_upstream[event] = intervals.three_prime_site(
                density, interval, upstream_interval,
                exon_offset, intron_offset
            )
            # five prime site of the skipped exon
            five_skipped[event] = intervals.five_prime_site(
                density, upstream_interval, interval,
                exon_offset, intron_offset
            )
            # three prime site of the skipped exon
            three_skipped[event] = intervals.three_prime_site(
                density, downstream_interval, interval,
                exon_offset, intron_offset
            )
            # five prime site of the downstream exon
            five_downstream[event] = intervals.five_prime_site(
                density, interval, downstream_interval,
                exon_offset, intron_offset
            )

    # dict keys (events) become rows after the transpose
    ra = pd.concat(
        [
            pd.DataFrame(three_upstream).T,
            pd.DataFrame(five_skipped).T,
            pd.DataFrame(three_skipped).T,
            pd.DataFrame(five_downstream).T,
        ],
        axis=1
    )
    ra.columns = range(0, ra.shape[1])
    return ra
Beispiel #4
0
def alt_3p_splice_site(annotation, density, exon_offset, intron_offset,
                       annotation_type="rmats"):
    """
    Build a per-event density matrix for alternative 3' splice site
    (A3SS) events. An A3SS matrix contains three distinct regions:

    __|__]------||-----[__|____
    __|__]------|    |------[__|

    - the [..exon_offset]--intron_offset--... 3' site of an upstream exon
    - the ..--intron_offset--[exon_offset..] 5' site of the alt1 spliced exon
    - the ..--intron_offset--[exon_offset..] 5' site of the alt2 spliced exon

    Lines of ``annotation`` starting with 'event_name', 'ID' or
    'annotation' are treated as headers and ignored.

    Parameters
    ----------
    annotation : str
        path of file containing the annotation
    density : density.ReadDensity
        object containing positive and negative BigWig files
    exon_offset : int
        how far into the exon boundary to plot
    intron_offset : int
        how far after the exon boundary to plot
    annotation_type : str
        may be rmats format or any additional defined format in Feature

    Returns
    -------
    pandas.DataFrame
        One row per A3SS event, indexed by the right-stripped annotation
        line. The three regions are laid out in the order listed above;
        columns are relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')
    upstream_3p = {}
    alt1_5p = {}
    alt2_5p = {}

    with open(annotation) as handle:
        for raw_line in handle:
            if raw_line.startswith(header_prefixes):
                continue

            event = raw_line.rstrip()
            upstream_exon, alt1_exon, alt2_exon = Feature.Alt_3p_splice_site(
                event, annotation_type
            ).get_bedtools()

            # upstream region
            upstream_3p[event] = intervals.three_prime_site(
                density, alt1_exon, upstream_exon, exon_offset, intron_offset
            )
            # five prime site of alt1 (longer exon)
            alt1_5p[event] = intervals.five_prime_site(
                density, upstream_exon, alt1_exon, exon_offset, intron_offset
            )
            # five prime site of alt2 (shorter exon)
            alt2_5p[event] = intervals.five_prime_site(
                density, upstream_exon, alt2_exon, exon_offset, intron_offset
            )

    # dict keys (events) become rows after the transpose
    blocks = [
        pd.DataFrame(upstream_3p).T,
        pd.DataFrame(alt1_5p).T,
        pd.DataFrame(alt2_5p).T,
    ]
    matrix = pd.concat(blocks, axis=1)
    matrix.columns = range(0, matrix.shape[1])

    return matrix
Beispiel #5
0
def mutually_exc_exon(annotation, density, exon_offset, intron_offset,
                      annotation_type="rmats"):
    """
    Build a per-event density matrix for mutually exclusive exon (MXE)
    events. An MXE matrix contains six distinct regions:

    |_]----||----[__||__]----||----[__||__]----||----[_|

    - [..exon_offset]--intron_offset--... 3' site of the upstream exon
    - ...--intron_offset--[exon_offset..] 5' site of the upstream mxe
    - [..exon_offset]--intron_offset--... 3' site of the upstream mxe
    - ...--intron_offset--[exon_offset..] 5' site of the downstream mxe
    - [..exon_offset]--intron_offset--... 3' site of the downstream mxe
    - ..--intron_offset--[exon_offset..] 5' site of the downstream exon

    Lines of ``annotation`` starting with 'event_name', 'ID' or
    'annotation' are treated as headers and ignored.

    Parameters
    ----------
    annotation : basestring
        path of file containing the annotation
    density : density.ReadDensity
        object containing positive and negative BigWig files
    exon_offset : int
        how far into the exon boundary to plot
    intron_offset : int
        how far after the exon boundary to plot
    annotation_type : basestring
        Must be "rmats" or any additional defined format (see: density.Feature)

    Returns
    -------
    pandas.DataFrame
        One row per MXE event, indexed by the right-stripped annotation
        line. The six regions are laid out in the order listed above;
        columns are relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')

    upstream_3p = {}
    mxe1_5p = {}
    mxe1_3p = {}
    mxe2_5p = {}
    mxe2_3p = {}
    downstream_5p = {}

    with open(annotation) as handle:
        for raw_line in handle:
            if raw_line.startswith(header_prefixes):
                continue

            event = raw_line.rstrip()
            parsed = Feature.Mutually_exclusive_exon(
                event, annotation_type
            ).get_bedtools()
            upstream_exon, mxe1_exon, mxe2_exon, downstream_exon = parsed

            # three prime site of the upstream exon
            upstream_3p[event] = intervals.three_prime_site(
                density, mxe1_exon, upstream_exon,
                exon_offset, intron_offset
            )
            # five prime site of mxe1 (upstream mxe)
            mxe1_5p[event] = intervals.five_prime_site(
                density, upstream_exon, mxe1_exon,
                exon_offset, intron_offset
            )
            # three prime site of mxe1 (upstream mxe)
            mxe1_3p[event] = intervals.three_prime_site(
                density, mxe2_exon, mxe1_exon,
                exon_offset, intron_offset
            )
            # five prime site of mxe2 (downstream mxe)
            mxe2_5p[event] = intervals.five_prime_site(
                density, mxe1_exon, mxe2_exon,
                exon_offset, intron_offset
            )
            # three prime site of mxe2 (downstream mxe)
            mxe2_3p[event] = intervals.three_prime_site(
                density, downstream_exon, mxe2_exon,
                exon_offset, intron_offset
            )
            # five prime site of the downstream exon
            downstream_5p[event] = intervals.five_prime_site(
                density, mxe2_exon, downstream_exon,
                exon_offset, intron_offset
            )

    # dict keys (events) become rows after the transpose; the list order
    # fixes the left-to-right layout of the six regions
    blocks = [
        pd.DataFrame(upstream_3p).T,
        pd.DataFrame(mxe1_5p).T,
        pd.DataFrame(mxe1_3p).T,
        pd.DataFrame(mxe2_5p).T,
        pd.DataFrame(mxe2_3p).T,
        pd.DataFrame(downstream_5p).T,
    ]
    matrix = pd.concat(blocks, axis=1)
    matrix.columns = range(0, matrix.shape[1])
    return matrix
Beispiel #6
0
def phastcon_region(annotation, density, annotation_type, exon_offset,
                    intron_offset, peak, mask_df):
    """
    Produces a matrix corresponding to a region that contains a peak,
    optionally keeping only the values that overlap that peak.

    For each non-header line of ``annotation`` the 3' site of the
    upstream interval and the 5' site of the downstream interval are
    fetched with pads filled with -1. When ``mask_df`` is truthy, each
    of the two profiles is masked against ``peak``:

    - if the peak values over the region sum to more than 0, positions
      whose peak value is not > 0 are set to NaN (the downstream profile
      additionally sets negative density values to NaN);
    - otherwise the whole profile is replaced with NaN.

    Parameters
    ----------
    annotation : str
        path of file containing the annotation; lines starting with
        'event_name', 'ID' or 'annotation' are ignored
    density
        passed through to intervals.three_prime_site / five_prime_site
    annotation_type
        passed through to Feature.Phastcon
    exon_offset
        passed through to intervals.three_prime_site / five_prime_site
    intron_offset
        passed through to intervals.three_prime_site / five_prime_site
    peak: density.Peak
        queried via peak.values(chrom, start, end, strand)
    mask_df
        truth-tested flag that switches the peak masking on

    Returns
    -------
    pandas.DataFrame
        Outer merge (on the event index) of the upstream and downstream
        profiles, with every -1 replaced by NaN and the columns
        relabelled 0..n-1.
    """
    header_prefixes = ('event_name', 'ID', 'annotation')

    def _mask_to_peak(values, event_name, side, drop_negative):
        # Keep only the positions of `values` that are covered by the peak.
        region = intervals.bedtool_from_renamed_twobed_index(
            event_name, side
        )
        overlap = peak.values(
            region.chrom, region.start, region.end, region.strand
        )
        if sum(overlap) > 0:
            for pos in overlap.index:
                if drop_negative:
                    keep = overlap.loc[pos] > 0 and values[pos] >= 0
                else:
                    keep = overlap.loc[pos] > 0
                values[pos] = values[pos] if keep else np.nan
            return values
        # no peak signal anywhere in the region: blank the whole profile
        return [np.nan for _ in values]

    upstream_3p = {}
    downstream_5p = {}

    with open(annotation) as handle:
        for raw_line in handle:
            if raw_line.startswith(header_prefixes):
                continue

            event = raw_line.rstrip()
            upstream_exon, downstream_exon = Feature.Phastcon(
                event, annotation_type
            ).get_bedtools()

            # three prime site of the upstream region
            profile = intervals.three_prime_site(
                density, downstream_exon, upstream_exon,
                exon_offset, intron_offset, fill_pads_with=-1
            )
            if mask_df:
                profile = _mask_to_peak(profile, event, 'upstream', False)
            upstream_3p[event] = profile

            # five prime site of the downstream region
            profile = intervals.five_prime_site(
                density, upstream_exon, downstream_exon,
                exon_offset, intron_offset, fill_pads_with=-1
            )
            if mask_df:
                profile = _mask_to_peak(profile, event, 'downstream', True)
            downstream_5p[event] = profile

    merged = pd.merge(
        pd.DataFrame(upstream_3p).T,
        pd.DataFrame(downstream_5p).T,
        how='outer',
        left_index=True,
        right_index=True
    )
    # -1 marks padded positions (see fill_pads_with above); note that any
    # other value equal to -1 is turned into NaN as well
    merged = merged.replace(-1, np.nan)
    merged.columns = range(0, merged.shape[1])
    return merged
    """