def get_phosphosequence(self):
    """
    Build the phosphomimetic form of the sequence.

    Every position listed in ``self.phosphosites`` (0-based indices into
    ``self.seq``) is written out as 'E'; every other residue is copied
    through unchanged. If no phosphosites are defined a warning is issued
    and the returned string equals the unmodified sequence.

    Raises SequenceException if a listed position does not hold S, T or Y.
    """
    if len(self.phosphosites) == 0:
        warning_message(
            "No phosphosites defined - phosphosequence will be equivalent to the unphosphorylated sequence")

    converted = []
    for position, residue in enumerate(self.seq):
        # positions that are not phosphosites pass straight through
        if position not in self.phosphosites:
            converted.append(residue)
            continue

        # extra sanity check - only S/T/Y may be swapped for the GLU mimic
        if residue not in ("S", "Y", "T"):
            raise SequenceException(
                "In get_phosphosequence - trying to replace non-phsophrylatable residue with GLU")
        converted.append("E")

    return "".join(converted)
def setPhosPhoSites(self, listOfPsites):
    """
    Register one or more phosphorylatable positions on the sequence.

    Positions are supplied using 1-based indexing (in "KKKYKKK" the Y is
    at position 4) and are stored in ``self.phosphosites`` as 0-based
    indices. All validation of the proposed sites is done here.

    listOfPsites: either a single integer position or an iterable of
        values that can each be converted with int().

    For every proposed site:
      * a position outside the sequence is skipped with a warning
      * a position that is not S, T or Y is skipped with a warning
      * a position that is already registered is silently ignored
      * anything else is appended to ``self.phosphosites``

    Raises whatever int() raises if a site cannot be converted to an
    integer.
    """
    # a bare integer is shorthand for a one-element list
    if isinstance(listOfPsites, int):
        listOfPsites = [listOfPsites]

    for site in listOfPsites:
        # check we can convert to an integer!
        site = int(site)

        # python indexes from 0 but humans from 1
        idx = site - 1

        # Out-of-range sites must really be skipped: indexing would raise
        # for idx >= len(seq), and a negative idx would silently wrap
        # around to the end of the sequence.
        if idx < 0 or idx >= len(self.seq):
            warning_message("Proposed phosphosite (" + str(site) +
                            ") is outside sequence range. Skipping...")
            continue

        res = self.seq[idx]
        status_message("Setting " + res + str(site))

        # skip residues that cannot be phosphorylated
        if res not in ("S", "T", "Y"):
            warning_message(
                'Position ' + str(site) + ' in sequence is a non phosphorylatable residue [' + str(res) + ']')
            continue

        # don't add the same residue twice, but no need to warn about it
        if idx not in self.phosphosites:
            self.phosphosites.append(idx)
def __validSeq(self, sequence):
    """
    Internal helper which reduces a [region of a] raw sequence to its
    residue characters.

    Characters that are keys of ONE_TO_THREE are kept, and so is '*'
    (it can be a stop codon; the original note says it is validated later,
    which is not visible from this function). Spaces are dropped silently
    and digits are dropped with a warning, which is handy for copy/pasted
    FASTA text. Any other character raises SequenceFileParserException.

    Returns the cleaned-up sequence string.
    """
    kept = []

    for char in sequence:
        # recognised residues, plus the provisional '*' marker, are kept
        if char in ONE_TO_THREE.keys() or char == "*":
            kept.append(char)

        # spaces are skipped without comment
        elif char == " ":
            continue

        # numbers are stripped, but we tell the user about it
        elif char in "1234567890":
            warning_message(
                "Found '" + char + "' in sequence, stripping out and ignoring...")

        # anything else means this is not a sequence we understand
        else:
            raise SequenceFileParserException(
                "\n\nERROR: Invalid sequence file, found [" + char + "] in sequence region\n\n" + sequence + "\n\n")

    return "".join(kept)
def kappa(self):
    """
    Return the kappa value (delta divided by deltaMax), as defined in
    REF 1.

    Returns -1, after issuing a warning, when deltaMax is 0; the warning
    text attributes this to a sequence without charged residues. A ratio
    strictly between 1.0 and 1.1 is clamped to 1.0; any other ratio is
    returned unchanged.
    """
    # evaluate deltaMax once and reuse it for both the check and the ratio
    delta_max = self.deltaMax()

    if delta_max == 0:
        warning_message(
            "The sequence has no charged residues - kappa is not a valid/relevant parameter")
        return -1

    kappa_val = self.delta() / delta_max

    # The heuristics for kappa are good BUT may under-estimate deltaMax in
    # some cases. A small overshoot is clamped to 1.0, because the sequence
    # with the highest delta is probably a sequence-isomer of the one we
    # have. A larger overshoot is returned as-is since it may point to a
    # bug that should be investigated.
    if 1.0 < kappa_val < 1.1:
        return 1.0
    return kappa_val
def validateSequence(self, seq):
    """
    Validate a protein sequence and return it with whitespace removed.

    Each character of ``seq`` must either be a key of
    ``data.aminoacids.ONE_TO_THREE`` or be whitespace. Whitespace is
    dropped (a status message is emitted once, however much whitespace is
    found). A warning is issued if more than 15% of the remaining
    residues are proline.

    Raises SequenceException if an unexpected character is found (the
    reported position is 1-based and counts whitespace), or if no
    residues remain after whitespace has been removed.
    """
    valid_residues = data.aminoacids.ONE_TO_THREE.keys()
    kept = []
    whitespace_reported = False

    for pos, residue in enumerate(seq, 1):
        if residue in valid_residues:
            kept.append(residue)
        elif residue.isspace():
            # only tell the user once that whitespace is being removed
            if not whitespace_reported:
                status_message("Removing whitespace from sequence")
                whitespace_reported = True
        else:
            # unexpected residue/character - bail
            raise SequenceException(
                "Invalid amino acid [" + str(residue) + "] found at position " + str(pos))

    processed = "".join(kept)

    # An empty sequence would otherwise cause a ZeroDivisionError in the
    # proline-content calculation below, so reject it explicitly.
    if not processed:
        raise SequenceException(
            "Empty sequence - no amino acid residues found")

    # determine proline content and warn if over 15%
    proline_content = float(processed.count("P")) / float(len(processed))
    if proline_content > 0.15:
        warning_message(
            "This sequence has a proline content of greater than 15%.\nThis may render some analyses [notably kappa and phase diagram predictions] incorrect")

    return processed
def __validSeq(self, sequence):
    """
    Internal helper which checks that a [region of a] sequence is a valid
    protein sequence and returns the cleaned-up version.

    Characters that are keys of ONE_TO_THREE are kept. Spaces are skipped
    silently and digits are skipped with a warning (useful for copy/pasted
    FASTA formats). Any other character raises
    SequenceFileParserException.
    """
    cleaned = ""

    for symbol in sequence:
        # a recognised residue is appended to the growing sequence
        if symbol in ONE_TO_THREE.keys():
            cleaned = cleaned + symbol
            continue

        # skip spaces
        if symbol == " ":
            continue

        # strip out numbers, letting the user know
        if symbol in "1234567890":
            warning_message(
                "Found '" + symbol + "' in sequence, stripping out and ignoring...")
            continue

        # nothing else is acceptable inside a sequence region
        raise SequenceFileParserException(
            "\n\nERROR: Invalid sequence file, found [" + symbol + "] in sequence region\n\n" + sequence + "\n\n")

    return cleaned
def parse_keyfile(self, filename):
    """
    Parse a keyfile into ``self.KEYWORDS`` and validate the parsed values.

    Phase 1 (parsing): the file is read line by line. Blank lines and
    lines starting with '#' are ignored, and everything after an inline
    '#' is discarded. Each remaining line is split on whitespace and is
    expected to look like ``KEYWORD value``. Keywords that are not
    already keys of ``self.KEYWORDS`` are ignored with a warning.

    Phase 2 (validation): every keyword in KEYWORD_LIST is checked.
      * SEQFILE     - must name an existing file; it is parsed with
                      SequenceFileParser.parseSeqFile and the result is
                      stored under the "SEQUENCE" key
      * OUTDIR      - must be set; the directory is created if missing
                      and a warning is issued if it exists and is not
                      empty
      * FREEZE_FILE - optional; if set it must name an existing file
      * WL_TYPE     - falls back to DEFAULT_VALS when empty, otherwise
                      must be one of WL_TYPES
      * BIN_MIN, BIN_MAX, NUMBER_OF_BINS, FLATCHECK_FREQ, CONVERGENCE,
        FLATNESS_CRITERION - handed to self.__set_numeric

    Raises KeyFileException when a known keyword is not followed by
    exactly one value, when a required value is missing or invalid, or
    when KEYWORD_LIST contains a keyword not handled here. An OSError
    raised while creating the output directory is re-raised after an
    explanatory message has been printed.
    """
    status_message("Parsing keyfile...")
    status_message("---------------------------------------")

    # create a sequence file parsing object
    seq_file_parser = SequenceFileParser()

    # read file to end
    with open(filename) as filehandle:
        content = filehandle.readlines()

    # [PHASE 1 START]
    # PARSE THE KEYFILE
    for line in content:
        # drop any comment (whole-line or inline) plus surrounding
        # whitespace; whatever is left is the payload of the line
        line = line.split("#", 1)[0].strip()
        if not line:
            continue

        # Split on ANY run of whitespace. Splitting on a single space
        # meant tab-separated lines were not recognised as keywords and
        # lines with repeated spaces were rejected.
        line_list = line.split()
        name = line_list[0]

        if name not in self.KEYWORDS:
            warning_message(
                "Found unexpected keyword [" + str(name) + "] - ignorning...")
            continue

        # we expect exactly one value after the keyword
        if len(line_list) != 2:
            raise KeyFileException("Error: Found keyword " + str(name) +
                                   " but unable to parse associated value")

        self.KEYWORDS[name] = line_list[1]

    # Now add default for the sequence, which will hopefully be set in the
    # next section by parsing the sequence file
    self.KEYWORDS["SEQUENCE"] = ""
    # [PHASE 1 END]

    status_message("---------------------------------------")
    status_message("Keyfile parsed!\n")
    status_message("Validating keyfile contents")
    status_message("---------------------------------------")

    numeric_keywords = ("BIN_MIN", "BIN_MAX", "NUMBER_OF_BINS",
                        "FLATCHECK_FREQ", "CONVERGENCE",
                        "FLATNESS_CRITERION")

    # Having parsed the keyfile we now validate it so we don't have to
    # worry about validation later on
    # [PHASE 2 START]
    for keyword in KEYWORD_LIST:
        # extract the value associated with each keyword in turn
        value = self.KEYWORDS[keyword]

        ##
        # SEQUENCE FILE VALIDATION AND PARSING
        ##
        if keyword == "SEQFILE":
            if value == "":
                raise KeyFileException(
                    "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

            if not os.path.isfile(value):
                raise KeyFileException(
                    "Expected " + str(value) + " to be file")

            # if its a file lets try and extract a sequence from it!
            self.KEYWORDS["SEQUENCE"] = seq_file_parser.parseSeqFile(value)

            # if we get here we *should* now have a sequence...
            if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS["SEQUENCE"] is None:
                raise KeyFileException(
                    "ERROR: No sequence was parsed from the sequence file...")

        ##
        # OUTPUT DIRECTORY VALIDATION
        ##
        elif keyword == "OUTDIR":
            if value == "":
                raise KeyFileException(
                    "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR])")

            if not os.path.exists(value):
                # creates the output directory if it doesn't already exist
                status_message("Creating output directory " + str(value))
                try:
                    os.makedirs(value)
                except OSError:
                    # single-argument print(...) behaves the same under
                    # Python 2 and Python 3
                    print("----------------------------")
                    print("")
                    print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                    print("")
                    print("----------------------------")
                    # bare raise keeps the original traceback
                    raise
            elif len(os.listdir(value)) > 0:
                # it exists and has content - raise a quick warning
                warning_message(
                    "Output directory exists already and is not empty [RISK OF OVERWRITING!]")
            # else: empty directory already exists - brilliant!

        ##
        # FREEZE FILE VALIDATION
        ##
        elif keyword == "FREEZE_FILE":
            # no freeze file, no problem
            if value != "":
                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")
                status_message("Using freeze file")

        ##
        # WL TYPE
        ##
        elif keyword == "WL_TYPE":
            if value == "":
                # The default must not be overwritten afterwards with the
                # (empty) keyfile value.
                self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                status_message(
                    "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
            elif value in WL_TYPES:
                self.KEYWORDS[keyword] = value
                status_message(
                    "Setting WL type to keyfile defined [" + str(value) + "]")
            else:
                raise KeyFileException(
                    "Unexpected WL algorithm type selected " + str(value) + " ")

        ##
        # SET NUMERIC VALUES
        ##
        elif keyword in numeric_keywords:
            self.__set_numeric(keyword, value)

        else:
            raise KeyFileException("SHOULD NOT BE GETTING HERE...")