Esempio n. 1
0
def _score_splice_site(seq, splicetype="donor"):
    """
    Score a candidate splice site sequence with the matching PSSM

    The acceptor matrix (IC_ACCEPTOR) is used only when splicetype is
    exactly 'acceptor'; every other value is scored with the donor
    matrix (IC_DONOR).

    @type seq:  string
    @param seq: DNA sequence of EXACT length of the PSSM

    @type splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @rtype:     float
    @return:    PSSM score for this splice site, as computed by pssmscore()
    """
    # pick the information-content matrix first, then score once
    matrix = IC_ACCEPTOR if splicetype == 'acceptor' else IC_DONOR
    return pssmscore(seq, matrix)
Esempio n. 2
0
def scan_pssm_splice_site(seq,
                          splicetype="donor",
                          override_pattern_offset=(),
                          min_pssm_score=None,
                          allow_non_canonical=False,
                          non_canonical_min_pssm_score=0.0):
    """
    Find splice sites by a PSSM on input sequence

    A window with the length of the PSSM is moved over the input sequence
    one position at a time. In each window the splice site itself is the
    part that remains after removing pattern_offset[0] characters on the
    left and pattern_offset[1] characters on the right. Windows whose
    splice site equals the canonical one ('GT' for donors, 'AG' for
    acceptors) are scored with the regular PSSM; other windows are only
    considered for donors and only when allow_non_canonical is set.

    @type seq:  string
    @param seq: DNA sequence to scan (any length; a sequence shorter than
                the PSSM simply yields no sites)

    @type splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @type min_pssm_score:   float
    @param min_pssm_score:  canonical sites scoring below this value are
                            discarded; None (default) disables the filter

    @type allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; when True, donor windows
                                without the canonical splice site are
                                scored with the non-canonical donor PSSM

    @type non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score:  non-canonical donor sites scoring
                                          below this value are discarded;
                                          None disables the filter

    @type override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!

    @rtype:     list
    @return:    list with SpliceDonors or SpliceAcceptors; donors are
                returned in reversed scan order (last window first)
    """
    is_acceptor = splicetype == 'acceptor'
    if is_acceptor:
        pssm_matrix = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
    else:
        pssm_matrix = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"

    # Non-canonical sites are only ever scored for donors, so the
    # non-canonical donor matrix is loaded only when it can be used.
    # NOTE(review): every non-canonical donor window is scored against
    # this matrix, not only 'GC' sites - confirm that this is intended.
    score_non_canonical = bool(allow_non_canonical) and splicetype == 'donor'
    ic_ncgc_donor = None
    if score_non_canonical:
        ic_ncgc_donor = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    # A right-hand offset of 0 must mean "up to the end of the window";
    # a literal [a:-0] slice would be [a:0] and always give an empty string.
    site_slice = slice(pattern_offset[0], -pattern_offset[1] or None)

    pssmlength = len(pssm_matrix)
    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[site_slice]

        if splicesite == canonical:
            score = _score_splice_site(seqpart, splicetype=splicetype)
            threshold = min_pssm_score
        elif score_non_canonical:
            score = pssmscore(seqpart, ic_ncgc_donor)
            threshold = non_canonical_min_pssm_score
        else:
            # non-canonical site that was not requested or cannot be scored
            continue

        # check if site must be stored
        if threshold is not None and score < threshold:
            continue

        if is_acceptor:
            site = SpliceAcceptor(offset,
                                  seqpart,
                                  acceptor=splicesite,
                                  pssm_score=score)
        else:
            site = SpliceDonor(offset,
                               seqpart,
                               donor=splicesite,
                               pssm_score=score)
        sites.append(site)

    # donor sites are handed back in reversed scan order
    if splicetype == 'donor':
        sites.reverse()

    return sites