def scan_pssm_splice_site(seq, splicetype="donor",
        override_pattern_offset=(), min_pssm_score=None,
        allow_non_canonical=False, non_canonical_min_pssm_score=0.0,
        ignore_unambiguity=False, relativescore=False):
    """
    Find splice sites by sliding a PSSM over the input sequence

    Every window of PSSM length in `seq` is inspected. The dinucleotide at
    the position given by the pattern offset decides how the window is
    handled: canonical sites (GT for donors, AG for acceptors) are scored
    with the canonical PSSM; when `allow_non_canonical` is set, GC donors
    are scored with the non-canonical GC donor PSSM; all other windows are
    skipped. A sequence shorter than the PSSM yields an empty list.

    @type  seq: string
    @param seq: DNA sequence; scanned in windows of the PSSM length

    @type  splicetype: string
    @param splicetype: 'donor' or 'acceptor'

    @type  override_pattern_offset: tuple
    @param override_pattern_offset: tuple with 2 integers (number of
                positions left and right of the splice site dinucleotide
                in the pattern); use cautiously!!

    @type  min_pssm_score: float
    @param min_pssm_score: canonical sites scoring lower are discarded;
                None (default) disables this filter

    @type  allow_non_canonical: boolean
    @param allow_non_canonical: True or False; only affects donors

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: non-canonical sites scoring lower
                are discarded; None disables this filter

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: passed on unchanged to the Pssm class

    @type  relativescore: boolean
    @param relativescore: passed on unchanged to the Pssm class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; donors are returned
             in reversed scanning order, acceptors in scanning order

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        PSSM_MATRIX = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # import output SpliceAcceptor object (function-level import, as in
        # the original code)
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # import output SpliceDonor object (function-level import, as in
        # the original code)
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call-style raise is valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for the canonical sites
    canssPssm = Pssm(ic=PSSM_MATRIX,
            ignore_unambiguity=ignore_unambiguity,
            relativescore=relativescore)

    # Non-canonical sites exist for donors only, so the GC donor matrix is
    # loaded only when it can actually be used.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and splicetype == 'donor':
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        noncanssPssm = Pssm(ic=IC_NCGC_DONOR,
                ignore_unambiguity=ignore_unambiguity,
                relativescore=relativescore)

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    # Absolute slice bounds of the splice site inside a window. A negative
    # end index (seqpart[a:-b]) breaks for b == 0, because seqpart[a:-0] is
    # the empty string; an absolute end index has no such corner case.
    site_start = pattern_offset[0]
    site_end = pssmlength - pattern_offset[1]

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset+pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            # score this canonical splice site and apply its threshold
            score = canssPssm.score(seqpart)
            if min_pssm_score is not None and score < min_pssm_score:
                continue
        elif splicesite in noncanonical:
            # score non-canonical donor site and apply its threshold
            score = noncanssPssm.score(seqpart)
            if non_canonical_min_pssm_score is not None and \
            score < non_canonical_min_pssm_score:
                continue
        else:
            # neither canonical nor a requested non-canonical site
            continue

        if splicetype == 'acceptor':
            site = SpliceAcceptor(offset, seqpart, acceptor=splicesite,
                    pssm_score=score)
        else:
            site = SpliceDonor(offset, seqpart, donor=splicesite,
                    pssm_score=score)
        sites.append(site)

    # donor sites are returned in reversed order
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
import sys
from pssm import parse_ic_data, parse_ic_file

# Obtain the IC matrix: from the file named by the single command line
# argument, or otherwise from stdin (lines starting with '#' are skipped).
if len(sys.argv) == 2:
    IC = parse_ic_file(sys.argv[1])
else:
    ic_data = []
    for line in sys.stdin.readlines():
        if line[0] == '#':
            continue
        ic_data.append(line.strip())
    IC = parse_ic_data("\n".join(ic_data))

# Build, for each matrix column, a list of num_seqs residues in which each
# base occurs round(2**(value-2) * num_seqs) times; columns that end up
# shorter than num_seqs are padded with "n".
buffer = []
num_seqs = 1000
for col in range(len(IC)):
    buffer.append([])
    vdict = IC[col]
    for base, value in vdict.iteritems():
        freq = pow(2, value - 2.0)
        cnt = int(round(freq * num_seqs))
        buffer[-1].extend([base] * cnt)
    while len(buffer[-1]) < num_seqs:
        buffer[-1].append("n")

# Assemble sequences by popping one residue from every column until the
# first column is exhausted; sequences are keyed 1, 2, 3, ...
seqs = {}
cnt = 1
while buffer[0]:
    seq = "".join([buffer[col].pop() for col in range(len(IC))])
    seqs[cnt] = seq
    # advance the key; without this every sequence would overwrite seqs[1]
    cnt += 1
IncompatibleSpliceSitePhases, ) from pssm import parse_ic_file, Pssm, pssmscore # Import Global variables from settings.splicesites import ( IC_DONOR_PATTERN_OFFSET, IC_DONOR_DATA_FILE, IC_DONOR_NCGC_DATA_FILE, IC_ACCEPTOR_PATTERN_OFFSET, IC_ACCEPTOR_DATA_FILE, ) # parse IC PSSM files of cannonical sites IC_ACCEPTOR = parse_ic_file(IC_ACCEPTOR_DATA_FILE) IC_DONOR = parse_ic_file(IC_DONOR_DATA_FILE) IC_NC_GC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE) class SpliceSiteBase(BasicGFF): """ """ def __init__(self,start,phase=None,strand='+',pattern=None, pattern_offset=(0,0),pssm_score=None,gff={}): """ Initialization function of Basal SpliceSite logic Recommended is to use only one of the inheriting classes @type start: number @param start: start coord of site (e.g GT) or pattern (e.g. tgtGTcgat) @type phase: number
import sys
from pssm import parse_ic_data, parse_ic_file

# Load the IC matrix. With exactly one command line argument it is read
# from that file; in every other case it is parsed from stdin, ignoring
# lines that start with '#'.
if len(sys.argv) != 2:
    ic_data = []
    for line in sys.stdin.readlines():
        if line[0] != '#':
            ic_data.append(line.strip())
    IC = parse_ic_data("\n".join(ic_data))
else:
    IC = parse_ic_file(sys.argv[1])

num_seqs = 1000
buffer = []
for col in range(len(IC)):
    # Collect the residues of this column: each base is repeated
    # round(2**(value-2) * num_seqs) times.
    column = []
    for base, value in IC[col].iteritems():
        column.extend([base] * int(round(pow(2, value - 2.0) * num_seqs)))
    # Pad with "n" up to num_seqs residues (a non-positive shortage adds
    # nothing, since a list times a negative number is empty).
    column.extend(["n"] * (num_seqs - len(column)))
    buffer.append(column)

# Draw one residue from the end of every column per sequence, until the
# first column has run empty. Keys of seqs count up from 1.
seqs = {}
cnt = 1
while buffer[0]:
    residues = []
    for column in buffer:
        residues.append(column.pop())
    seq = "".join(residues)
    seqs[cnt] = seq
    cnt += 1
def scan_pssm_splice_site(
    seq,
    splicetype="donor",
    override_pattern_offset=(),
    min_pssm_score=None,
    allow_non_canonical=False,
    non_canonical_min_pssm_score=0.0,
    ignore_unambiguity=False,
    relativescore=False,
):
    """
    Find splice sites by sliding a PSSM over the input sequence

    Every window of PSSM length in `seq` is inspected. The dinucleotide at
    the position given by the pattern offset decides how the window is
    handled: canonical sites (GT for donors, AG for acceptors) are scored
    with the canonical PSSM; when `allow_non_canonical` is set, GC donors
    are scored with the non-canonical GC donor PSSM; all other windows are
    skipped. A sequence shorter than the PSSM yields an empty list.

    @type  seq: string
    @param seq: DNA sequence; scanned in windows of the PSSM length

    @type  splicetype: string
    @param splicetype: 'donor' or 'acceptor'

    @type  override_pattern_offset: tuple
    @param override_pattern_offset: tuple with 2 integers (number of
                positions left and right of the splice site dinucleotide
                in the pattern); use cautiously!!

    @type  min_pssm_score: float
    @param min_pssm_score: canonical sites scoring lower are discarded;
                None (default) disables this filter

    @type  allow_non_canonical: boolean
    @param allow_non_canonical: True or False; only affects donors

    @type  non_canonical_min_pssm_score: float
    @param non_canonical_min_pssm_score: non-canonical sites scoring lower
                are discarded; None disables this filter

    @type  ignore_unambiguity: boolean
    @param ignore_unambiguity: passed on unchanged to the Pssm class

    @type  relativescore: boolean
    @param relativescore: passed on unchanged to the Pssm class

    @rtype:  list
    @return: list with SpliceDonors or SpliceAcceptors; donors are returned
             in reversed scanning order, acceptors in scanning order

    @raise InproperlyAppliedArgument: splicetype not 'donor' or 'acceptor'
    """
    if splicetype == 'acceptor':
        PSSM_MATRIX = IC_ACCEPTOR
        pattern_offset = IC_ACCEPTOR_PATTERN_OFFSET
        canonical = "AG"
        # import output SpliceAcceptor object (function-level import, as in
        # the original code)
        from acceptor import SpliceAcceptor
    elif splicetype == 'donor':
        PSSM_MATRIX = IC_DONOR
        pattern_offset = IC_DONOR_PATTERN_OFFSET
        canonical = "GT"
        # import output SpliceDonor object (function-level import, as in
        # the original code)
        from donor import SpliceDonor
    else:
        message = "'splicetype' (%s) not in [donor,acceptor]" % splicetype
        # call-style raise is valid in both Python 2 and Python 3
        raise InproperlyAppliedArgument(message)

    # initialize Pssm (scoring) class for the canonical sites
    canssPssm = Pssm(
        ic=PSSM_MATRIX,
        ignore_unambiguity=ignore_unambiguity,
        relativescore=relativescore,
    )

    # Non-canonical sites exist for donors only, so the GC donor matrix is
    # loaded only when it can actually be used.
    noncanonical = []
    noncanssPssm = None
    if allow_non_canonical and splicetype == 'donor':
        # obtain PSSM_IC for non-canonical (GC) donors
        IC_NCGC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE)
        noncanonical = ["GC"]
        noncanssPssm = Pssm(
            ic=IC_NCGC_DONOR,
            ignore_unambiguity=ignore_unambiguity,
            relativescore=relativescore,
        )

    # hmm... somebody knows what he or she is doing ;-)
    if override_pattern_offset:
        pattern_offset = override_pattern_offset

    pssmlength = len(PSSM_MATRIX)
    # Absolute slice bounds of the splice site inside a window. A negative
    # end index (seqpart[a:-b]) breaks for b == 0, because seqpart[a:-0] is
    # the empty string; an absolute end index has no such corner case.
    site_start = pattern_offset[0]
    site_end = pssmlength - pattern_offset[1]

    sites = []
    for offset in range(0, len(seq) - pssmlength + 1):
        # get sequence slice of pattern and actual splice site
        seqpart = seq[offset:offset + pssmlength].upper()
        splicesite = seqpart[site_start:site_end]

        if splicesite == canonical:
            # score this canonical splice site and apply its threshold
            score = canssPssm.score(seqpart)
            if min_pssm_score is not None and score < min_pssm_score:
                continue
        elif splicesite in noncanonical:
            # score non-canonical donor site and apply its threshold
            score = noncanssPssm.score(seqpart)
            if (non_canonical_min_pssm_score is not None
                    and score < non_canonical_min_pssm_score):
                continue
        else:
            # neither canonical nor a requested non-canonical site
            continue

        if splicetype == 'acceptor':
            site = SpliceAcceptor(offset, seqpart, acceptor=splicesite,
                                  pssm_score=score)
        else:
            site = SpliceDonor(offset, seqpart, donor=splicesite,
                               pssm_score=score)
        sites.append(site)

    # donor sites are returned in reversed order
    if splicetype == 'donor':
        sites.reverse()

    # and return
    return sites
UnexpectedSpliceSitePhase, IncompatibleSpliceSitePhases, ) from pssm import parse_ic_file, Pssm, pssmscore # Import Global variables from settings.splicesites import ( IC_DONOR_PATTERN_OFFSET, IC_DONOR_DATA_FILE, IC_DONOR_NCGC_DATA_FILE, IC_ACCEPTOR_PATTERN_OFFSET, IC_ACCEPTOR_DATA_FILE, ) # parse IC PSSM files of cannonical sites IC_ACCEPTOR = parse_ic_file(IC_ACCEPTOR_DATA_FILE) IC_DONOR = parse_ic_file(IC_DONOR_DATA_FILE) IC_NC_GC_DONOR = parse_ic_file(IC_DONOR_NCGC_DATA_FILE) class SpliceSiteBase(BasicGFF): """ """ def __init__(self, start, phase=None, strand='+', pattern=None, pattern_offset=(0, 0), pssm_score=None, gff={}): """
# Python Imports
from re import finditer, compile
from copy import deepcopy

# Import Global variables
from settings.translationalstartsites import (
    TSS_MIN_PSSM_SCORE,
    TSS_ALLOW_NON_CANONICAL,
    TSS_NON_CANONICAL_MIN_PSSM_SCORE,
    IC_TSS_DATA_FILE,
    IC_TSS_PATTERN_OFFSET,
    )

# parse IC PSSM file of TSS
IC_TSS = parse_ic_file(IC_TSS_DATA_FILE)


class StartCodon(BasicGFF):
    def __init__(self, pos, gff={}):
        """
        Create a start codon object located at a given coordinate

        @type  pos: number
        @param pos: coordinate of the first position of the start codon;
                    stored as both `pos` and `start`, `end` is pos+3

        @type  gff: dict
        @param gff: key/value pairs merged into the object's `_gff`
                    dictionary; the passed dictionary itself is only read
        """
        BasicGFF.__init__(self)
        self._gff.update(gff)
        # coordinates: a start codon spans 3 positions from `pos` onwards
        self.pos = pos
        self.start = pos
        self.end = pos + 3
        # a start codon is always in phase 0
        self.phase = 0
        # default, dummy value
        self.pssm_score = 1.0

    # end of function __init__