def _get_counts_and_sequence(gtf_iterator, bam, fasta,
                             seperate_UTRs=False):
    '''Yield (profile, sequence) pairs for transcripts and their introns.

    Called by pentamer_enrichment. For every transcript produced by
    ``gtf_iterator`` the generator yields:

    * the result of ``count_transcript(transcript, bam)`` paired with the
      concatenation of the sequences of all exon ranges, and then
    * if the intron counts do not sum to zero, one pair per intron made of
      the slice of the intron counts covering that intron (its index
      shifted so that it is relative to the intron start) and the intron
      sequence.

    ``seperate_UTRs`` is accepted but is not used by this function.
    '''

    for transcript in gtf_iterator:

        lead = transcript[0]
        E.debug("Counting transcript %s" % lead.transcript_id)
        contig = lead.contig
        strand = lead.strand

        # Exons: fetch each exon's sequence and splice them together.
        exon_pieces = []
        for exon in GTF.asRanges(transcript, "exon"):
            exon_pieces.append(
                fasta.getSequence(contig, strand, exon[0], exon[1]))
        spliced = "".join(exon_pieces)
        transcript_profile = count_transcript(transcript, bam)
        yield (transcript_profile, spliced)

        # Introns: a single counting call covers every intron at once.
        # presumably count_intervals returns a pandas Series indexed by
        # genome position -- confirm against its definition.
        introns = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, introns, contig, strand)

        if intron_counts.sum() != 0:
            for intron in introns:
                start, end = intron[0], intron[1]
                intron_seq = fasta.getSequence(contig, strand, start, end)
                intron_profile = intron_counts.loc[float(start):float(end)]
                # Re-base the index so it is relative to the intron start.
                intron_profile.index = intron_profile.index - start
                yield (intron_profile, intron_seq)
Beispiel #2
0
def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Generate (profile, sequence) tuples for pentamer_enrichment.

    Each transcript from ``gtf_iterator`` contributes one tuple for its
    exons (the ``count_transcript`` result and the joined exon sequences)
    followed, when the intron counts have a non-zero sum, by one tuple per
    intron. Intron profiles are slices of the intron counts whose index has
    had the intron start subtracted.

    The ``seperate_UTRs`` argument is not used by the body.
    '''

    for transcript in gtf_iterator:

        head = transcript[0]
        E.debug("Counting transcript %s" % head.transcript_id)
        contig = head.contig
        strand = head.strand

        # exons
        exon_ranges = GTF.asRanges(transcript, "exon")
        exon_seqs = [fasta.getSequence(contig, strand, rng[0], rng[1])
                     for rng in exon_ranges]
        sequence = "".join(exon_seqs)
        yield (count_transcript(transcript, bam), sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals, contig, strand)

        # Nothing to report when no intron position carries any count.
        has_intron_signal = not (intron_counts.sum() == 0)
        if not has_intron_signal:
            continue

        for interval in intron_intervals:
            left = interval[0]
            right = interval[1]

            seq = fasta.getSequence(contig, strand, left, right)
            # Label-based slice of the counts for this intron.
            profile = intron_counts.loc[float(left):float(right)]
            # Make positions relative to the start of the intron.
            profile.index = profile.index - left
            yield (profile, seq)
Beispiel #3
0
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the processing index for the specified sample, using the
    provided interval_iterator to get the cleavage sites. The iterator
    can be GTF or BED, as long as its entries have start, end, contig and
    strand attributes. The cleavage site is taken to be the end attribute
    for entries on the "+" strand and the start attribute for entries on
    the "-" strand.

    Counts are collected for ``window_size`` bases on either side of each
    cleavage site. Raises ValueError if an entry's strand is neither "+"
    nor "-".

    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me.

    Returns the value of ``np.log2`` of the pooled ratio; the denominator
    is clamped to a minimum of 1. '''

    n_pm = 0
    n_m = 0

    for site in interval_iterator:

        if site.strand == "+":
            pos = site.end
        elif site.strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site)+"\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, site.strand)

        # We are currently in genome coordinates, not transcript
        # coordinates, so the counts must be split by index *label*
        # (genome position) and not by row number: positional slicing with
        # a genome coordinate would put every count on one side.
        # presumably count_intervals returns a pandas Series indexed by
        # genome position -- confirm against its definition.
        below_site = counts[counts.index < pos].sum()
        from_site = counts[counts.index >= pos].sum()

        if site.strand == "+":
            n_up, n_down = below_site, from_site
        else:
            # On the "-" strand, upstream of the site in the direction of
            # transcription is the higher genome coordinates.
            n_up, n_down = from_site, below_site

        n_pm += n_down
        n_m += n_up-n_down

    pi = np.log2(float(n_pm)/float(max(1, n_m)))

    return pi
Beispiel #4
0
def _get_counts_and_sequence(gtf_iterator, bam, fasta,
                             seperate_UTRs=False):
    '''Yield (profile, sequence) pairs for transcripts and their introns.

    Called by pentamer_enrichment. Entries without a ``transcript_id``
    attribute are dropped, and a transcript left with no entries is
    skipped. Sequences are fetched on the "+" strand and passed through
    ``revcomp`` for transcripts on the "-" strand. A transcript is skipped
    entirely if fetching its exon sequences raises KeyError.

    Only profiles with a sum greater than zero are yielded: first the
    ``count_transcript`` result with the spliced exon sequence, then one
    pair per intron whose index is relative to the intron start.

    ``seperate_UTRs`` is accepted but is not used by this function.
    '''

    for transcript in gtf_iterator:

        transcript = [entry for entry in transcript
                      if hasattr(entry, "transcript_id")]
        if not transcript:
            continue

        lead = transcript[0]
        contig = lead.contig
        strand = lead.strand
        on_minus = strand == "-"

        # Exons: fetch forward-strand sequence for every exon range.
        exon_ranges = GTF.asRanges(transcript, "exon")
        try:
            pieces = [fasta.getSequence(contig, "+", rng[0], rng[1])
                      for rng in exon_ranges]
            sequence = "".join(pieces)
        except KeyError:
            # The sequence lookup failed for this transcript; skip it.
            continue
        if on_minus:
            sequence = revcomp(sequence)

        transcript_profile = count_transcript(transcript, bam)
        if transcript_profile.sum() > 0:
            yield (transcript_profile, sequence)

        # Introns: one counting call covers all introns.
        introns = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, introns, contig, strand)

        if intron_counts.sum() != 0:
            for intron in introns:
                start, end = intron[0], intron[1]

                intron_seq = fasta.getSequence(contig, "+", start, end)
                if on_minus:
                    intron_seq = revcomp(intron_seq)

                # NOTE(review): the index below is an offset from the
                # genomic intron start and is not reversed for "-" strand
                # transcripts, while the sequence is reverse-complemented
                # -- confirm that consumers expect this.
                intron_profile = intron_counts.loc[float(start):float(end)]
                intron_profile.index = intron_profile.index - start
                if intron_profile.sum() > 0:
                    yield (intron_profile, intron_seq)
Beispiel #5
0
def _get_counts_and_sequence(gtf_iterator, bam, fasta, seperate_UTRs=False):
    '''Generate (profile, sequence) tuples for pentamer_enrichment.

    For each transcript, only the entries that have a ``transcript_id``
    attribute are kept; transcripts with none are skipped, as are
    transcripts whose exon sequence lookup raises KeyError. Sequence is
    read from the "+" strand and reverse-complemented with ``revcomp`` when
    the transcript strand is "-".

    A tuple is emitted for the exons and for each intron, in that order,
    but only when the corresponding profile sums to more than zero. Intron
    profiles have the intron start subtracted from their index.

    The ``seperate_UTRs`` argument is not used by the body.
    '''

    for transcript in gtf_iterator:

        transcript = [rec for rec in transcript
                      if hasattr(rec, "transcript_id")]
        if not transcript:
            continue

        first = transcript[0]
        contig = first.contig
        strand = first.strand

        # exons
        exons = GTF.asRanges(transcript, "exon")
        try:
            fragments = []
            for exon in exons:
                fragments.append(
                    fasta.getSequence(contig, "+", exon[0], exon[1]))
            sequence = "".join(fragments)
        except KeyError:
            # Sequence could not be retrieved; move on to the next one.
            continue
        sequence = revcomp(sequence) if strand == "-" else sequence

        exon_counts = count_transcript(transcript, bam)
        if exon_counts.sum() > 0:
            yield (exon_counts, sequence)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals, contig, strand)

        intron_total = intron_counts.sum()
        if intron_total == 0:
            continue

        for interval in intron_intervals:
            left = interval[0]
            right = interval[1]

            seq = fasta.getSequence(contig, "+", left, right)
            if strand == "-":
                seq = revcomp(seq)

            # NOTE(review): offsets are measured from the genomic intron
            # start on both strands, whereas the sequence is flipped for
            # "-" -- confirm this is what downstream code expects.
            profile = intron_counts.loc[float(left):float(right)]
            profile.index = profile.index - left
            if profile.sum() > 0:
                yield (profile, seq)
Beispiel #6
0
def _get_profiles_and_conveter(gtf_iterator, bam):
    '''Yield count profiles with matching coordinate converters.

    For every transcript from ``gtf_iterator`` (restricted to entries that
    have a ``transcript_id`` attribute; empty transcripts are skipped) the
    generator yields 5-tuples of
    ``(profile, converter, region, contig, strand)``:

    * one for the exons, where ``region`` is
      ``LiteExon(0, converter.length)``, if the ``count_transcript``
      profile sums to more than zero;
    * one per intron with a positive profile sum, using a converter built
      with ``introns=True`` and a ``LiteExon`` spanning the intron in the
      converter's coordinates.
    '''

    for transcript in gtf_iterator:

        transcript = [entry for entry in transcript
                      if hasattr(entry, "transcript_id")]
        if not transcript:
            continue

        lead = transcript[0]
        gene_id = lead.gene_id
        contig = lead.contig
        strand = lead.strand

        E.debug("Crunching gene: %s:" % gene_id)

        # Exons
        exon_profile = count_transcript(transcript, bam)
        if exon_profile.sum() > 0:
            exon_converter = TranscriptCoordInterconverter(transcript)
            whole_transcript = LiteExon(0, exon_converter.length)
            yield (exon_profile, exon_converter, whole_transcript,
                   contig, strand)

        # Introns
        introns = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, introns, contig, strand)
        if intron_counts.sum() == 0:
            continue

        # Move the counts index from genome positions into the converter's
        # intron coordinate system.
        intron_converter = TranscriptCoordInterconverter(transcript,
                                                         introns=True)
        genome_positions = intron_counts.index.values
        intron_counts.index = intron_converter.genome2transcript(
            genome_positions)

        for interval in introns:
            # Convert using the last base inside the interval, order the
            # two converted ends, then restore the exclusive end.
            inclusive = (interval[0], interval[1] - 1)
            ordered = sorted(intron_converter.genome2transcript(inclusive))
            span = (ordered[0], ordered[1] + 1)

            intron_profile = intron_counts.loc[float(span[0]):float(span[1])]
            if intron_profile.sum() > 0:
                yield (intron_profile, intron_converter, LiteExon(*span),
                       contig, strand)
Beispiel #7
0
def _get_profiles_and_conveter(gtf_iterator, bam):
    '''Generate (profile, converter, region, contig, strand) tuples.

    Entries lacking a ``transcript_id`` attribute are removed from each
    transcript, and transcripts with nothing left are skipped. The exon
    profile from ``count_transcript`` is emitted first (when its sum is
    positive) together with a ``TranscriptCoordInterconverter`` and a
    ``LiteExon`` covering ``0`` to ``converter.length``. Intron counts are
    then re-indexed through a converter created with ``introns=True`` and
    each intron with a positive sum is emitted with its own ``LiteExon``.
    '''

    for transcript in gtf_iterator:

        transcript = [rec for rec in transcript
                      if hasattr(rec, "transcript_id")]
        if not transcript:
            continue

        first = transcript[0]
        gene_id, contig, strand = first.gene_id, first.contig, first.strand

        E.debug("Crunching gene: %s:" % gene_id)

        # exons
        profile = count_transcript(transcript, bam)
        has_exon_signal = profile.sum() > 0
        if has_exon_signal:
            converter = TranscriptCoordInterconverter(transcript)
            region = LiteExon(0, converter.length)
            yield (profile, converter, region, contig, strand)

        # introns
        intron_intervals = GTF.toIntronIntervals(transcript)
        intron_counts = count_intervals(bam, intron_intervals, contig, strand)
        if intron_counts.sum() != 0:

            converter = TranscriptCoordInterconverter(transcript, introns=True)
            # Re-label the counts with converted coordinates.
            intron_counts.index = converter.genome2transcript(
                intron_counts.index.values)

            for intron in intron_intervals:
                # Use the last included base for the conversion and put
                # the converted ends in ascending order.
                last_base = intron[1] - 1
                converted = converter.genome2transcript((intron[0], last_base))
                converted = sorted(converted)
                # Back to an exclusive end coordinate.
                lower = converted[0]
                upper = converted[1] + 1

                profile = intron_counts.loc[float(lower):float(upper)]
                if profile.sum() > 0:
                    yield (profile, converter, LiteExon(lower, upper),
                           contig, strand)
Beispiel #8
0
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the ratio of processed transcripts to non-processed

    Parameters
    ----------
    interval_iterator : CGAT.Bed or CGAT.GTF-like iterator
        The iterator must yield objects that have start, end, contig and
        strand attributes. The processing site is taken to be ``end`` on
        the "+" strand and ``start`` on the "-" strand.
    bam : *_getter-like function
        A getter function returned by the `make_getter` function, this will
        be used to retrieve cross-link counts.
    window_size : int, optional
        How far up and downstream of the processing site to consider.

    Returns
    -------
    float
        processing index computed from the counts pooled over all
        processing sites given. The denominator is clamped to a minimum
        of 1.

    Raises
    ------
    ValueError
        If a site has a strand other than "+" or "-".

    Notes
    -----
    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me. '''

    n_pm = 0
    n_m = 0

    for site in interval_iterator:

        if site.strand == "+":
            pos = site.end
        elif site.strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site) + "\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, site.strand)

        # We are currently in genome coordinates, not transcript
        # coordinates. Split the counts by index label (genome position):
        # slicing by row number with a genome coordinate would assign
        # every count to a single side of the site.
        # presumably count_intervals returns a pandas Series indexed by
        # genome position -- confirm against its definition.
        is_below = counts.index < pos
        n_below = counts[is_below].sum()
        n_at_or_above = counts[~is_below].sum()

        if site.strand == "+":
            n_up = n_below
            n_down = n_at_or_above
        else:
            # "-" strand: upstream in the direction of transcription is
            # the higher genome coordinates.
            n_up = n_at_or_above
            n_down = n_below

        n_pm += n_down
        n_m += n_up - n_down

    pi = np.log2(float(n_pm) / float(max(1, n_m)))

    return pi
Beispiel #9
0
def processing_index(interval_iterator, bam, window_size=50):
    r'''Calculate the ratio of processed transcripts to non-processed

    Parameters
    ----------
    interval_iterator : CGAT.Bed or CGAT.GTF-like iterator
        The iterator must yield objects that have start, end, contig and
        strand attributes. Processing index will be calculated around the
        ``end`` of "+" strand entries and the ``start`` of "-" strand
        entries.
    bam : *_getter-like function
        A getter function returned by the `make_getter` function, this will
        be used to retrieve cross-link counts.
    window_size : int, optional
        How far up and downstream of the processing site to consider.

    Returns
    -------
    float
        processing index computed from the summed counts of all
        processing sites given; the denominator is never allowed to drop
        below 1.

    Raises
    ------
    ValueError
        If an entry's strand is neither "+" nor "-".

    Notes
    -----
    The processing index for G genes is defined as:

    .. math::

       pi = log_2( \frac{\sum_{i=1}^{G} N_i^{PM}}{\sum_{i=1}^{G} N_i^M})

    after Baejen et al Mol Cell 5(55):745-757. However, Baejen et al
    normalise this number to the total number of genes, which seems
    wrong to me. '''

    n_pm = 0
    n_m = 0

    for site in interval_iterator:

        if site.strand == "+":
            pos = site.end
        elif site.strand == "-":
            pos = site.start
        else:
            raise ValueError(
                "processing index not valid for unstranded cleavage points in "
                "entry\n" + str(site)+"\n")

        upstream_interval = (pos - window_size, pos)
        downstream_interval = (pos, pos + window_size)

        counts = count_intervals(bam, [upstream_interval, downstream_interval],
                                 site.contig, site.strand)

        # We are currently in genome coordinates, not transcript
        # coordinates, so select by index label rather than by row number
        # (a genome coordinate used as a row offset would leave one of the
        # two sides empty).
        # presumably count_intervals returns a pandas Series indexed by
        # genome position -- confirm against its definition.
        low_side = counts[counts.index < pos].sum()
        high_side = counts[counts.index >= pos].sum()

        # Upstream/downstream are relative to the direction of
        # transcription, so the two sides swap on the "-" strand.
        if site.strand == "+":
            n_up, n_down = low_side, high_side
        else:
            n_up, n_down = high_side, low_side

        n_pm += n_down
        n_m += n_up-n_down

    pi = np.log2(float(n_pm)/float(max(1, n_m)))

    return pi