def __init__(self, ic=[], ignore_unambiguity=True, relativescore=True):
    """
    Initialize the Pssm base class, defaulting to the IC_ACCEPTOR matrix

    @type  ic: list
    @param ic: value passed on to Pssm.__init__ as `ic`; any empty/false
               value (including the default) selects IC_ACCEPTOR instead

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: forwarded unchanged to Pssm.__init__

    @type  relativescore: boolean
    @param relativescore: forwarded unchanged to Pssm.__init__
    """
    # fall back to the module-level acceptor matrix when none was supplied
    matrix = ic or IC_ACCEPTOR
    Pssm.__init__(
        self,
        ic=matrix,
        ignore_unambiguity=ignore_unambiguity,
        relativescore=relativescore,
    )
def scan_pssm_splice_site(seq, splicetype="donor",
        override_pattern_offset=(), min_pssm_score=None,
        allow_non_canonical=False, non_canonical_min_pssm_score=0.0,
        ignore_unambiguity=False, relativescore=False):
    """
    Find splice sites by a PSSM on input sequence

    A window of PSSM length is taken at every offset of `seq`; the splice
    site inside each window is compared to the canonical dinucleotide
    (and, on request, the non-canonical GC donor) and scored by the PSSM.

    @type  seq: string
    @param seq: DNA sequence to scan; a sequence shorter than the PSSM
                yields an empty list

    @type  splicetype: string
    @param splicetype: 'donor' or 'acceptor'

    @type  override_pattern_offset: tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!

    @type  min_pssm_score: float
    @param min_pssm_score: canonical sites scoring lower are discarded;
                None disables this filter

    @type  allow_non_canonical: boolean
    @param allow_non_canonical: True or False; only GC donors are
                recognized as non-canonical sites

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: non-canonical sites scoring lower
                are discarded; None disables this filter

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: passed on to the Pssm scoring class

    @type  relativescore: boolean
    @param relativescore: passed on to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; donors are
             returned in reversed (descending offset) order

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        PSSM_MATRIX = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # initialize Pssm (scoring) class
        canssPssm = Pssm(ic=IC_ACCEPTOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # initialize Pssm (scoring) class
        canssPssm = Pssm(ic=IC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call form of raise: valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    if allow_non_canonical:
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        # initialize Pssm (scoring) class
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    site_start = pattern_offset[0]
    # a right-hand offset of 0 must mean "up to the end of the window";
    # a literal -0 slice end would always give an empty splice site
    site_end = -pattern_offset[1] or None

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            # score this canonical splice site
            score = canssPssm.score(seqpart)
            # check if site must be stored
            if min_pssm_score or min_pssm_score == 0.0:
                if score < min_pssm_score:
                    continue
        elif (allow_non_canonical and splicetype == 'donor'
                and splicesite in noncanonical):
            # score non-canonical donor site
            score = noncanssPssm.score(seqpart)
            # check if site must be stored
            if (non_canonical_min_pssm_score
                    or non_canonical_min_pssm_score == 0.0):
                if score < non_canonical_min_pssm_score:
                    continue
        else:
            # non-canonical site that is not requested / not recognized
            continue

        if splicetype == 'acceptor':
            a = SpliceAcceptor(offset, seqpart,
                    acceptor=splicesite, pssm_score=score)
            sites.append(a)
        else:
            d = SpliceDonor(offset, seqpart,
                    donor=splicesite, pssm_score=score)
            sites.append(d)

    # reverse sites for Donor
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
def __init__(self, ic=[], ignore_unambiguity=True, relativescore=True):
    """
    Initialize the Pssm base class, defaulting to the IC_ACCEPTOR matrix

    @type  ic: list
    @param ic: value passed on to Pssm.__init__ as `ic`; any empty/false
               value (including the default) selects IC_ACCEPTOR instead

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: forwarded unchanged to Pssm.__init__

    @type  relativescore: boolean
    @param relativescore: forwarded unchanged to Pssm.__init__
    """
    # fall back to the module-level acceptor matrix when none was supplied
    matrix = ic or IC_ACCEPTOR
    Pssm.__init__(
        self,
        ic=matrix,
        ignore_unambiguity=ignore_unambiguity,
        relativescore=relativescore,
    )
def scan_pssm_splice_site(seq, splicetype="donor",
        override_pattern_offset=(), min_pssm_score=None,
        allow_non_canonical=False, non_canonical_min_pssm_score=0.0,
        ignore_unambiguity=False, relativescore=False):
    """
    Find splice sites by a PSSM on input sequence

    A window of PSSM length is taken at every offset of `seq`; the splice
    site inside each window is compared to the canonical dinucleotide
    (and, on request, the non-canonical GC donor) and scored by the PSSM.

    @type  seq: string
    @param seq: DNA sequence to scan; a sequence shorter than the PSSM
                yields an empty list

    @type  splicetype: string
    @param splicetype: 'donor' or 'acceptor'

    @type  override_pattern_offset: tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!

    @type  min_pssm_score: float
    @param min_pssm_score: canonical sites scoring lower are discarded;
                None disables this filter

    @type  allow_non_canonical: boolean
    @param allow_non_canonical: True or False; only GC donors are
                recognized as non-canonical sites

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: non-canonical sites scoring lower
                are discarded; None disables this filter

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: passed on to the Pssm scoring class

    @type  relativescore: boolean
    @param relativescore: passed on to the Pssm scoring class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; donors are
             returned in reversed (descending offset) order

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        PSSM_MATRIX = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # initialize Pssm (scoring) class
        canssPssm = Pssm(ic=IC_ACCEPTOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)
        # import output SpliceAcceptor object
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # initialize Pssm (scoring) class
        canssPssm = Pssm(ic=IC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)
        # import output SpliceDonor object
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call form of raise: valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    if allow_non_canonical:
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        # initialize Pssm (scoring) class
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    site_start = pattern_offset[0]
    # a right-hand offset of 0 must mean "up to the end of the window";
    # a literal -0 slice end would always give an empty splice site
    site_end = -pattern_offset[1] or None

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            # score this canonical splice site
            score = canssPssm.score(seqpart)
            # check if site must be stored
            if min_pssm_score or min_pssm_score == 0.0:
                if score < min_pssm_score:
                    continue
        elif (allow_non_canonical and splicetype == 'donor'
                and splicesite in noncanonical):
            # score non-canonical donor site
            score = noncanssPssm.score(seqpart)
            # check if site must be stored
            if (non_canonical_min_pssm_score
                    or non_canonical_min_pssm_score == 0.0):
                if score < non_canonical_min_pssm_score:
                    continue
        else:
            # non-canonical site that is not requested / not recognized
            continue

        if splicetype == 'acceptor':
            a = SpliceAcceptor(offset, seqpart,
                    acceptor=splicesite, pssm_score=score)
            sites.append(a)
        else:
            d = SpliceDonor(offset, seqpart,
                    donor=splicesite, pssm_score=score)
            sites.append(d)

    # reverse sites for Donor
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
def scan_pssm_tss(seq, override_pattern_offset=(), min_pssm_score=None,
        allow_non_canonical=False, non_canonical_min_pssm_score=0.0,
        ignore_unambiguity=False, relativescore=False):
    """
    Find TSS's by a PSSM on input sequence

    Start codons are located with a regular expression on the uppercased
    sequence; each hit with enough flanking sequence on both sides is
    scored by the TSS PSSM.

    @type  seq: string
    @param seq: DNA sequence to scan

    @type  override_pattern_offset: tuple
    @param override_pattern_offset: tuple with 2 integers; use cautiously!!

    @type  min_pssm_score: float
    @param min_pssm_score: canonical (ATG) sites scoring lower are
                discarded; None disables this filter

    @type  allow_non_canonical: boolean
    @param allow_non_canonical: True or False; when True, GTG and TTG are
                accepted as start codons too

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: GTG/TTG sites scoring lower are
                discarded; None disables this filter

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: passed on to the Pssm scoring class

    @type  relativescore: boolean
    @param relativescore: passed on to the Pssm scoring class

    @rtype:  list
    @return: list with TranslationalStartSites
    """
    pattern_offset = IC_TSS_PATTERN_OFFSET
    canonical = "ATG"
    regex_pattern = "ATG"

    # obtain Pssm (scoring) class
    tssPssm = Pssm(ic=IC_TSS, ignore_unambiguity=ignore_unambiguity,
            relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    if allow_non_canonical:
        # http://en.wikipedia.org/wiki/Start_codon
        # Blattner, F. R.; Plunkett, G.; Bloch, C. A.; Perna, N. T.;
        # Burland, V.; Riley, M.; Collado-vides, J.; Glasner, J. D. et al
        # (1997). "The Complete Genome Sequence of Escherichia coli K-12".
        # Science 277 (5331): 1453-62
        noncanonicalGTG = "GTG"
        noncanonicalTTG = "TTG"
        regex_pattern = "[ATG]TG"

        # Build one matrix per non-canonical codon by swapping, at the
        # first codon position, the entry of 'A' with the entry of the
        # replacing base.
        tssPssmGTG = deepcopy(tssPssm)
        column = tssPssmGTG.ic[pattern_offset[0]]
        column['A'], column['G'] = column['G'], column['A']

        # the swap must act on the TTG copy only; the original code wrote
        # the 'T' entry into the GTG copy by mistake
        tssPssmTTG = deepcopy(tssPssm)
        column = tssPssmTTG.ic[pattern_offset[0]]
        column['A'], column['T'] = column['T'], column['A']

    # a right-hand offset of 0 must mean "scan up to the end"; a literal
    # -0 slice end would leave an empty string to search in
    scan_end = -pattern_offset[1] or None

    sites = []
    for atg in finditer(compile(regex_pattern), seq.upper()[0:scan_end]):
        # not enough sequence in front of this start codon to score it
        if atg.start() < pattern_offset[0]:
            continue
        # NOTE(review): seqpart is cut from `seq` in its original case,
        # whereas matching is done on the uppercased copy - confirm that
        # Pssm.score copes with lowercase input
        seqpart = seq[atg.start() - pattern_offset[0]:
                      atg.end() + pattern_offset[1]]
        tss = atg.group()
        offset = atg.start() - pattern_offset[0]

        if tss == canonical:
            # score this TSS
            score = tssPssm.score(seqpart)
            # continue if score is too low
            if min_pssm_score or min_pssm_score == 0.0:
                if score < min_pssm_score:
                    continue
        else:
            # only reachable when allow_non_canonical widened the regex
            if tss == noncanonicalGTG:
                score = tssPssmGTG.score(seqpart)
            elif tss == noncanonicalTTG:
                score = tssPssmTTG.score(seqpart)
            else:
                continue
            # continue if score is too low
            if (non_canonical_min_pssm_score
                    or non_canonical_min_pssm_score == 0.0):
                if score < non_canonical_min_pssm_score:
                    continue

        # phase of this tss (position of the start codon modulo 3)
        phase = (offset + pattern_offset[0]) % 3
        # instantiate tss object
        t = TranslationalStartSite(offset, seqpart, pssm_score=score,
                phase=phase)
        sites.append(t)

    # and return
    return sites