Esempio n. 1
0
 def __init__(self, ic=[], ignore_unambiguity=True, relativescore=True):
     """
     Initialise the scorer through Pssm.__init__.

     When `ic` is empty (or otherwise falsy) the module-level IC_ACCEPTOR
     matrix is used instead; `ignore_unambiguity` and `relativescore` are
     passed on to Pssm.__init__ unchanged.
     """
     matrix = ic if ic else IC_ACCEPTOR
     Pssm.__init__(
         self,
         ic=matrix,
         ignore_unambiguity=ignore_unambiguity,
         relativescore=relativescore
     )
Esempio n. 2
0
def scan_pssm_splice_site(seq,splicetype="donor",
    override_pattern_offset=(),min_pssm_score=None,
    allow_non_canonical=False,non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,relativescore=False,):
    """
    Find splice sites by a PSSM on input sequence

    A window with the length of the PSSM is slid over `seq` one position
    at a time. Each (uppercased) window whose splice site dinucleotide is
    canonical (or, if requested, a non-canonical GC donor) is scored and,
    when it passes the applicable threshold, reported as a site object.

    @type  seq:  string
    @param seq: DNA sequence to scan; sequences shorter than the PSSM
                yield no sites

    @type  splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @type  min_pssm_score:   float
    @param min_pssm_score: minimal score for a canonical site to be
                reported; None disables this threshold

    @type  allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; when True, non-canonical
                GC donors are reported too (no effect on acceptors)

    @type  non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score: minimal score for a non-canonical
                donor to be reported; None disables this threshold

    @type  override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!
                Replaces the default (left,right) number of PSSM positions
                flanking the splice site dinucleotide

    @type  ignore_unambiguity:  boolean
    @param ignore_unambiguity: passed on to the Pssm scoring class

    @type  relativescore:  boolean
    @param relativescore: passed on to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; donors are
             returned in reversed scanning order

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        PSSM_MATRIX     = IC_ACCEPTOR
        pattern_offset  = IC_ACCEPTOR_PATTERN_OFFSET
        canonical       = "AG"
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX     = IC_DONOR
        pattern_offset  = IC_DONOR_PATTERN_OFFSET
        canonical       = "GT"
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call form of raise: valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for the canonical sites
    canssPssm = Pssm(ic=PSSM_MATRIX,ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)

    # Non-canonical sites are only ever scored for donors, so only then
    # is the GC-donor matrix parsed from file.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and splicetype == 'donor':
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    # Explicit end coordinate for the splice site slice; a negative slice
    # index (seqpart[a:-b]) silently gives '' when the right offset is 0.
    site_start = pattern_offset[0]
    site_end   = max(0, pssmlength - pattern_offset[1])

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1 ):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset+pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            score = canssPssm.score(seqpart)
            threshold = min_pssm_score
        elif splicesite in noncanonical:
            # `noncanonical` is empty unless GC donors were requested
            score = noncanssPssm.score(seqpart)
            threshold = non_canonical_min_pssm_score
        else:
            continue

        # check if site must be stored
        if threshold is not None and score < threshold:
            continue

        if splicetype=='acceptor':
            a = SpliceAcceptor(offset,seqpart,acceptor=splicesite,pssm_score=score)
            sites.append(a)
        else:
            d = SpliceDonor(offset,seqpart,donor=splicesite,pssm_score=score)
            sites.append(d)

    # donor sites are returned in reversed order
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
Esempio n. 3
0
 def __init__(self,ic=[],ignore_unambiguity=True,relativescore=True):
     """
     Set up this Pssm, defaulting to the acceptor matrix.

     A falsy `ic` (such as the empty default) is replaced by IC_ACCEPTOR
     before all three arguments are handed to Pssm.__init__.
     """
     if not ic:
         ic = IC_ACCEPTOR
     options = dict(ignore_unambiguity=ignore_unambiguity,
                    relativescore=relativescore)
     Pssm.__init__(self, ic=ic, **options)
Esempio n. 4
0
def scan_pssm_splice_site(
    seq,
    splicetype="donor",
    override_pattern_offset=(),
    min_pssm_score=None,
    allow_non_canonical=False,
    non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,
    relativescore=False,
):
    """
    Find splice sites by a PSSM on input sequence

    The sequence is scanned with a sliding window as long as the PSSM.
    Every uppercased window that carries the canonical dinucleotide (or a
    non-canonical GC donor, when requested) at the splice site position is
    scored; windows passing the applicable threshold become site objects.

    @type  seq:  string
    @param seq: DNA sequence to scan; a sequence shorter than the PSSM
                gives an empty result

    @type  splicetype:   string
    @param splicetype:  'donor' or 'acceptor'

    @type  min_pssm_score:   float
    @param min_pssm_score: lowest accepted score for canonical sites;
                None means no threshold

    @type  allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; True additionally reports
                non-canonical GC donors (ignored for acceptors)

    @type  non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score: lowest accepted score for
                non-canonical donors; None means no threshold

    @type  override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!
                Used instead of the default (left,right) count of PSSM
                positions around the splice site dinucleotide

    @type  ignore_unambiguity:  boolean
    @param ignore_unambiguity: forwarded to the Pssm scoring class

    @type  relativescore:  boolean
    @param relativescore: forwarded to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; the donor list is
             reversed before it is returned

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    is_acceptor = splicetype == 'acceptor'
    if is_acceptor:
        PSSM_MATRIX = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # raise with the call form so the code parses on Python 2 and 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for canonical sites
    canssPssm = Pssm(ic=PSSM_MATRIX,
                     ignore_unambiguity=ignore_unambiguity,
                     relativescore=relativescore)

    # The GC-donor matrix is only needed (and only parsed from file) when
    # non-canonical donors are actually going to be scored.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and not is_acceptor:
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                            ignore_unambiguity=ignore_unambiguity,
                            relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    # Slice the splice site with an explicit end index: the negative form
    # seqpart[a:-b] collapses to an empty string for a right offset of 0.
    left = pattern_offset[0]
    right = max(0, pssmlength - pattern_offset[1])

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[left:right]

        if splicesite == canonical:
            scorer, threshold = canssPssm, min_pssm_score
        elif splicesite in noncanonical:
            # only reachable when GC donors were requested
            scorer, threshold = noncanssPssm, non_canonical_min_pssm_score
        else:
            continue

        score = scorer.score(seqpart)
        # check if site must be stored
        if threshold is not None and score < threshold:
            continue

        if is_acceptor:
            site = SpliceAcceptor(offset,
                                  seqpart,
                                  acceptor=splicesite,
                                  pssm_score=score)
        else:
            site = SpliceDonor(offset,
                               seqpart,
                               donor=splicesite,
                               pssm_score=score)
        sites.append(site)

    # donor sites are returned in reversed order
    if not is_acceptor:
        sites.reverse()

    # and return
    return sites
Esempio n. 5
0
def scan_pssm_tss(seq,override_pattern_offset=(),min_pssm_score=None,
    allow_non_canonical=False,non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,relativescore=False,):
    """
    Find TSS's (translational start sites) by a PSSM on input sequence

    Every start codon in the uppercased sequence that has enough flanking
    sequence on both sides (as given by the pattern offset) is scored
    together with its flanks; codons passing the applicable threshold are
    reported. Overlapping codons (e.g. the GTG within ATGTG) are all
    evaluated.

    @type seq:  string
    @param seq: DNA sequence to scan

    @type min_pssm_score:   float
    @param min_pssm_score: minimal score for an ATG site to be reported;
                None disables this threshold

    @type allow_non_canonical:  boolean
    @param allow_non_canonical: True or False; when True, GTG and TTG
                start codons are reported too

    @type non_canonical_min_pssm_score:   float
    @param non_canonical_min_pssm_score: minimal score for a GTG/TTG site
                to be reported; None disables this threshold

    @type override_pattern_offset:  tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!
                Replaces the default (left,right) number of PSSM positions
                flanking the start codon

    @type ignore_unambiguity:  boolean
    @param ignore_unambiguity: passed on to the Pssm scoring class

    @type relativescore:  boolean
    @param relativescore: passed on to the Pssm scoring class

    @rtype:  list
    @return: list with TranslationalStartSites
    """
    pattern_offset  = IC_TSS_PATTERN_OFFSET
    canonical       = "ATG"
    regex_pattern   = "ATG"
    # obtain Pssm (scoring) class
    tssPssm = Pssm(ic=IC_TSS,ignore_unambiguity=ignore_unambiguity,relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    # scoring class per non-canonical start codon; stays empty when only
    # canonical ATG sites are requested
    noncanonicalPssms = {}
    if allow_non_canonical:
        # http://en.wikipedia.org/wiki/Start_codon
        # Blattner, F. R.; Plunkett, G.; Bloch, C. A.; Perna, N. T.; Burland, V.; Riley, M.; Collado-vides, J.; Glasner, J. D. et al (1997). "The Complete Genome Sequence of Escherichia coli K-12". Science 277 (5331): 1453-62
        regex_pattern   = "[ATG]TG"
        for firstbase in ("G","T"):
            # Copy the ATG matrix and swap, in the column of the codon's
            # first base, the values of 'A' and the alternative base.
            # Both sides of the swap must address the SAME copy.
            altPssm = deepcopy(tssPssm)
            column  = altPssm.ic[pattern_offset[0]]
            column['A'],column[firstbase] = column[firstbase],column['A']
            noncanonicalPssms[firstbase+"TG"] = altPssm

    sites = []

    # Only search where a full right flank is still available. An explicit
    # end index is used because the slice [0:-0] would be empty.
    searchend = max(0, len(seq) - pattern_offset[1])
    # Lookahead pattern: finditer() only yields non-overlapping matches,
    # which would skip a start codon that overlaps the previous one.
    codonregex = compile("(?=(%s))" % regex_pattern)

    for atg in finditer(codonregex,seq.upper()[0:searchend]):
        # skip codons without a full left flank
        if atg.start() < pattern_offset[0]: continue
        tss = atg.group(1)
        offset = atg.start()-pattern_offset[0]
        # NOTE(review): seqpart keeps the case of the input sequence,
        # whereas the codon search is done on the uppercased sequence.
        seqpart = seq[offset:atg.start()+len(tss)+pattern_offset[1]]

        if tss == canonical:
            # score this TSS
            score = tssPssm.score(seqpart)
            threshold = min_pssm_score
        elif tss in noncanonicalPssms:
            score = noncanonicalPssms[tss].score(seqpart)
            threshold = non_canonical_min_pssm_score
        else:
            continue

        # continue if score is too low
        if threshold is not None and score < threshold:
            continue

        # phase of this tss
        phase = (offset+pattern_offset[0]) % 3

        # instantiate tss object
        t = TranslationalStartSite(offset,seqpart,pssm_score=score,phase=phase)
        sites.append(t)

    # and return
    return sites