Ejemplo n.º 1
0
    def setPhosPhoSites(self, listOfPsites):
        """
        Register one or more positions in the sequence as phosphosites.

        Positions are given 1-indexed (bioinformatics convention), so in
        "KKKYKKK" the Y is at position 4. They are stored 0-indexed in
        self.phosphosites.

        listOfPsites may be a single int or an iterable of values which int()
        can convert. All validation of the phosphosite list happens here:

          * positions outside the sequence are skipped with a warning
          * positions which are not S, T or Y are skipped with a warning
          * positions already registered are silently ignored

        An exception from int() (ValueError/TypeError) propagates if a value
        cannot be interpreted as an integer.
        """

        # a bare integer is treated as a list of length 1
        if isinstance(listOfPsites, int):
            listOfPsites = [listOfPsites]

        for site in listOfPsites:

            # check we can convert to an integer!
            site = int(site)

            # python indexes from 0 but humans from 1
            idx = site - 1

            # Outside the sequence: skip this site. Without the `continue` a
            # negative idx (e.g. site 0) would silently wrap around to the
            # end of the sequence and a too-large idx would raise IndexError.
            if idx < 0 or idx >= len(self.seq):
                warning_message("Proposed phosphosite (" + str(site) +
                                ") is outside sequence range. Skipping...")
                continue

            # grab the residue letter from the sequence
            res = self.seq[idx]

            status_message("Setting " + res + str(site))

            if res not in ("S", "T", "Y"):
                # we skip it if it seems like an unphosphorylatable residue
                warning_message(
                    'Position ' +
                    str(site) +
                    ' in sequence is a non phosphorylatable residue [' +
                    str(res) +
                    ']')
                continue

            # don't add the same residue twice, but no need to warn about it
            if idx not in self.phosphosites:
                self.phosphosites.append(idx)
Ejemplo n.º 2
0
    def parseSeqFile(self, filename, silent=False):
        """
        Read a sequence file and return its contents as one unbroken string.

        The file is read in full, every line is stripped, and blank lines are
        ignored. At most one header line (a line starting with ">") is
        allowed. Every other line is passed through self.__validSeq and the
        results are concatenated; the concatenated string is then passed
        through self.__final_validation.

        INPUT:
        filename  | Name of the file to parse (string)
        silent    | If False, a status message with the residue count and the
                    sequence is emitted once parsing has finished

        OUTPUT:
        The value returned by self.__final_validation for the joined sequence

        RAISES:
        SequenceFileParserException if a second header line is found
        """
        with open(filename) as handle:
            raw_lines = handle.readlines()

        seen_header = False
        seq = ""

        for raw in raw_lines:
            entry = raw.strip()

            # nothing to do for blank lines
            if not entry:
                continue

            if entry[0] != ">":
                # sequence data: validate (the helper is expected to raise on
                # bad input) and extend the running sequence
                seq += self.__validSeq(entry)
                continue

            # header line - a second one means more than one record
            if seen_header:
                raise SequenceFileParserException(
                    "\n\nERROR: During parsing of sequence file found a second header section. Sequence files must be a single file")
            seen_header = True

        seq = self.__final_validation(seq)

        if silent:
            return seq

        status_message(
            "Parsed sequence [" + str(len(seq)) + " residues]:\n" + seq)
        return seq
Ejemplo n.º 3
0
    def parseSeqFile(self, filename, silent=False):
        """
        Read a sequence file and return its contents as one unbroken string.

        The file is read in full, every line is stripped, and blank lines are
        ignored. At most one header line (a line starting with ">") is
        allowed. Every other line is passed through self.__validSeq and the
        results are concatenated.

        INPUT:
        filename  | Name of the file to parse (string)
        silent    | If False, a status message with the residue count and the
                    sequence is emitted once parsing has finished

        OUTPUT:
        The concatenated, validated sequence as a string

        RAISES:
        SequenceFileParserException if a second header line is found
        """
        with open(filename) as handle:
            raw_lines = handle.readlines()

        seen_header = False
        seq = ""

        for raw in raw_lines:
            entry = raw.strip()

            # nothing to do for blank lines
            if not entry:
                continue

            if entry[0] != ">":
                # sequence data: validate (the helper is expected to raise on
                # bad input) and extend the running sequence
                seq += self.__validSeq(entry)
                continue

            # header line - a second one means more than one record
            if seen_header:
                raise SequenceFileParserException(
                    "\n\nERROR: During parsing of sequence file found a second header section. Sequence files must be a single file")
            seen_header = True

        if silent:
            return seq

        status_message(
            "Parsed sequence [" + str(len(seq)) + " residues]:\n" + seq)
        return seq
Ejemplo n.º 4
0
    def validateSequence(self, seq):
        """
        Return seq with whitespace removed, after checking every remaining
        character is a known one-letter amino acid code.

        Valid residues are the keys of data.aminoacids.ONE_TO_THREE.
        Whitespace characters are dropped (a single status message is emitted
        the first time one is seen). Any other character raises
        SequenceException, reporting the character and its 1-indexed position
        in the original input.

        If the cleaned sequence is non-empty and more than 15% of it is
        proline, a warning is emitted. An empty (or whitespace-only) input
        returns "" without a proline check, rather than failing with a
        ZeroDivisionError.
        """
        # set membership is O(1); keys() is a list on Python 2
        valid_residues = set(data.aminoacids.ONE_TO_THREE.keys())

        kept = []
        whitespace_reported = False

        # for each residue in your protein sequence (positions count from 1)
        for pos, residue in enumerate(seq, 1):
            if residue in valid_residues:
                kept.append(residue)
            elif residue.isspace():
                # only warn once...
                if not whitespace_reported:
                    status_message("Removing whitespace from sequence")
                    whitespace_reported = True
            else:
                # unexpected residue/character - bail
                raise SequenceException(
                    "Invalid amino acid [" + str(residue) + "] found at position " + str(pos))

        processed = "".join(kept)

        # nothing left to analyse - avoid dividing by a zero length below
        if not processed:
            return processed

        # determine proline content and warn if over 15%
        prolineContent = float(processed.count("P")) / float(len(processed))
        if prolineContent > 0.15:
            warning_message(
                "This sequence has a proline content of greater than 15%.\nThis may render some analyses [notably kappa and phase diagram predictions] incorrect")

        return processed
Ejemplo n.º 5
0
 def __set_numeric(self, keyword, value):
     """
     Store a numeric keyfile setting under self.KEYWORDS[keyword].

     An empty string means the keyword was not set, so the entry from
     DEFAULT_VALS is stored instead. Any other value must be accepted by
     float(); it is then stored exactly as given (it is not converted).
     A value float() rejects with ValueError is reported as a
     KeyFileException rather than being replaced by the default - a bad
     value in the keyfile should fail loudly, not silently.
     """
     if value != "":
         try:
             # float() is only a validity probe - its result is discarded
             float(value)
             status_message(
                 "Setting " + keyword + " to keyfile defined [" + str(value) + "]")
             self.KEYWORDS[keyword] = value
         except ValueError:
             raise KeyFileException(
                 "\n\nERROR: Invalid value for " + keyword +
                 " - unable to convert [" + value + "] into a number\n")
         return

     # keyword missing from the keyfile: fall back to the default
     status_message(
         "Setting " + keyword + " to default [" + str(DEFAULT_VALS[keyword]) + "]")
     self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
Ejemplo n.º 6
0
 def __set_numeric(self, keyword, value):
     """
     Store a numeric keyfile setting under self.KEYWORDS[keyword].

     An empty string means the keyword was not set, so the entry from
     DEFAULT_VALS is stored instead. Any other value must be accepted by
     float(); it is then stored exactly as given (it is not converted).
     A value float() rejects with ValueError is reported as a
     KeyFileException rather than being replaced by the default - a bad
     value in the keyfile should fail loudly, not silently.
     """
     if value != "":
         try:
             # float() is only a validity probe - its result is discarded
             float(value)
             status_message(
                 "Setting " + keyword + " to keyfile defined [" + str(value) + "]")
             self.KEYWORDS[keyword] = value
         except ValueError:
             raise KeyFileException(
                 "\n\nERROR: Invalid value for " + keyword +
                 " - unable to convert [" + value + "] into a number\n")
         return

     # keyword missing from the keyfile: fall back to the default
     status_message(
         "Setting " + keyword + " to default [" + str(DEFAULT_VALS[keyword]) + "]")
     self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
Ejemplo n.º 7
0
    def swapRandChargeRes(self, frozen=None):
        """
        Swap two randomly chosen residues that belong to different charge
        classes (positive, negative, neutral).

        Residues are classified by the sign of their entry in
        self.chargePattern. Indices listed in `frozen` are never selected.

        frozen : iterable of residue indices to exclude, or None (the
                 default) for no exclusions.

        Returns the result of self.swapRes(i, j) for the chosen indices. If
        fewer than two charge classes have an unfrozen residue, no swap is
        attempted; a status message is emitted and self is returned.
        """
        # None instead of a shared mutable default; accept any iterable
        frozen = set() if frozen is None else set(frozen)

        # get a random number generator
        rand = rng.Random()
        rand.seed(time.time())

        # indices available for swapping, per charge class
        # (1 = positive, 2 = negative, 3 = neutral), minus the frozen ones
        pattern = self.chargePattern
        candidates = {
            1: set(np.where(pattern > 0)[0]) - frozen,
            2: set(np.where(pattern < 0)[0]) - frozen,
            3: set(np.where(pattern == 0)[0]) - frozen,
        }
        available = [kind for kind in (1, 2, 3) if candidates[kind]]

        if len(available) < 2:
            status_message(
                'swap will not change kappa, only one charge type in sequence')
            return self

        if len(available) == 3:
            # all three classes present - pick two distinct classes at random
            chargeType = rand.sample(available, 2)
        else:
            # exactly two classes present - those are the only option
            chargeType = available

        # Draw from a sorted list rather than the set itself: sampling
        # directly from a set is not supported on Python 3.11+.
        first = rand.choice(sorted(candidates[chargeType[0]]))
        second = rand.choice(sorted(candidates[chargeType[1]]))
        return self.swapRes(first, second)
Ejemplo n.º 8
0
 def print_progress(count, total):
     """Emit a "Done <count> of <total>" status message on every 50th count."""
     # only counts that are exact multiples of 50 are reported
     if (count % 50) != 0:
         return
     status_message("Done " + str(count) + " of " + str(total))
Ejemplo n.º 9
0
    def parse_keyfile(self, filename):
        """
        Parse a keyfile into self.KEYWORDS and validate what was read.

        PHASE 1 - parsing
        Blank lines and lines starting with "#" are skipped, and anything
        after an inline "#" is discarded. Each remaining line must consist of
        a keyword already present in self.KEYWORDS followed by exactly one
        value, separated by any amount of whitespace (spaces or tabs).
        Unknown keywords produce a warning and are ignored.
        self.KEYWORDS["SEQUENCE"] is then reset to "".

        PHASE 2 - validation (for every keyword in KEYWORD_LIST)
        SEQFILE      must be set and be an existing file; it is parsed with a
                     SequenceFileParser and the result stored under SEQUENCE
        OUTDIR       must be set; the directory is created if missing, and a
                     warning is issued if it exists and is not empty
        FREEZE_FILE  optional; if set it must be an existing file
        WL_TYPE      optional; defaults to DEFAULT_VALS, otherwise must be
                     one of WL_TYPES
        numeric keys handled by self.__set_numeric

        NOTE(review): phase 2 reads self.KEYWORDS[keyword] for every entry of
        KEYWORD_LIST, so it assumes each of those keys already exists (with
        "" meaning "not set") before this method is called - confirm against
        the constructor.

        Raises KeyFileException for any invalid or missing required value,
        and re-raises the OSError if the output directory cannot be created.
        """

        status_message("Parsing keyfile...")
        status_message("---------------------------------------")

        SeqFileParser = SequenceFileParser()  # create a sequence file parsing object

        # read file to end
        with open(filename) as filehandle:
            content = filehandle.readlines()

        # [PHASE 1 START]
        # PARSE THE KEYFILE
        for line in content:
            line = line.strip()

            # empty lines and full-line comments
            if not line or line[0] == "#":
                continue

            # if inline comment kill everything after the comment character
            line = line.split("#", 1)[0]

            # split on any run of whitespace so tabs and repeated spaces
            # between keyword and value are accepted
            line_list = line.split()
            key = line_list[0]

            if key not in self.KEYWORDS:
                warning_message(
                    "Found unexpected keyword [" + str(key) + "] - ignorning...")
                continue

            # we expect exactly one value after the keyword; anything else
            # (no value, or several) cannot be interpreted
            if len(line_list) != 2:
                raise KeyFileException("Error: Found keyword " +
                                       str(key) +
                                       " but unable to parse associated value")

            self.KEYWORDS[key] = line_list[1]

        # Now add default for the sequence, which will hopefully be set in
        # the next section by parsing the sequence file
        self.KEYWORDS["SEQUENCE"] = ""
        # [PHASE 1 END]

        status_message("---------------------------------------")
        status_message("Keyfile parsed!\n")
        status_message("Validating keyfile contents")
        status_message("---------------------------------------")

        # Having parsed the keyfile we now validate it so we don't have to
        # worry about validation later on
        # [PHASE 2 START]
        numeric_keywords = ("BIN_MIN", "BIN_MAX", "NUMBER_OF_BINS",
                            "FLATCHECK_FREQ", "CONVERGENCE",
                            "FLATNESS_CRITERION")

        for keyword in KEYWORD_LIST:
            # extract the value associated with each keyword in turn
            value = self.KEYWORDS[keyword]

            ##
            # SEQUENCE FILE VALIDATION AND PARSING
            ##
            if keyword == "SEQFILE":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")

                # if its a file lets try and extract a sequence from it!
                self.KEYWORDS["SEQUENCE"] = SeqFileParser.parseSeqFile(value)

                # if we get here we *should* now have a sequence...
                if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS[
                        "SEQUENCE"] is None:
                    raise KeyFileException(
                        "ERROR: No sequence was parsed from the sequence file...")

            ##
            # OUTPUT DIRECTORY VALIDATION
            ##
            elif keyword == "OUTDIR":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR]")

                # creates the output directory if it doesn't already exist
                if not os.path.exists(value):
                    status_message(
                        "Creating output directory " + str(value))
                    try:
                        os.makedirs(value)
                    except OSError:
                        # single-argument print(...) behaves identically
                        # under the Python 2 print statement and the
                        # Python 3 print function
                        print("----------------------------")
                        print("")
                        print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                        print("")
                        print("----------------------------")
                        # bare raise keeps the original traceback
                        raise
                # if it does exist and is not empty raise a quick warning
                elif len(os.listdir(value)) > 0:
                    warning_message(
                        "Output directory exists already and is not empty [RISK OF OVERWRITING!]")

            ##
            # FREEZE FILE VALIDATION
            ##
            elif keyword == "FREEZE_FILE":
                # no freeze file, no problem
                if value != "":
                    if not os.path.isfile(value):
                        raise KeyFileException(
                            "Expected " + str(value) + " to be file")
                    status_message("Using freeze file")
                    self.KEYWORDS[keyword] = value

            ##
            # WL TYPE
            ##
            elif keyword == "WL_TYPE":
                if value == "":
                    # NB: nothing may overwrite this afterwards - the stored
                    # value must remain the default, not the empty string
                    self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                    status_message(
                        "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
                elif value in WL_TYPES:
                    self.KEYWORDS[keyword] = value
                    status_message(
                        "Setting WL type to keyfile defined [" + str(value) + "]")
                else:
                    raise KeyFileException(
                        "Unexpected WL algorithm type selected " + str(value) + " ")

            ##
            # SET NUMERIC VALUES
            ##
            elif keyword in numeric_keywords:
                self.__set_numeric(keyword, value)

            else:
                raise KeyFileException("SHOULD NOT BE GETTING HERE...")
Ejemplo n.º 10
0
    def parse_keyfile(self, filename):
        """
        Parse a keyfile into self.KEYWORDS and validate what was read.

        PHASE 1 - parsing
        Blank lines and lines starting with "#" are skipped, and anything
        after an inline "#" is discarded. Each remaining line must consist of
        a keyword already present in self.KEYWORDS followed by exactly one
        value, separated by any amount of whitespace (spaces or tabs).
        Unknown keywords produce a warning and are ignored.
        self.KEYWORDS["SEQUENCE"] is then reset to "".

        PHASE 2 - validation (for every keyword in KEYWORD_LIST)
        SEQFILE      must be set and be an existing file; it is parsed with a
                     SequenceFileParser and the result stored under SEQUENCE
        OUTDIR       must be set; the directory is created if missing, and a
                     warning is issued if it exists and is not empty
        FREEZE_FILE  optional; if set it must be an existing file
        WL_TYPE      optional; defaults to DEFAULT_VALS, otherwise must be
                     one of WL_TYPES
        numeric keys handled by self.__set_numeric

        NOTE(review): phase 2 reads self.KEYWORDS[keyword] for every entry of
        KEYWORD_LIST, so it assumes each of those keys already exists (with
        "" meaning "not set") before this method is called - confirm against
        the constructor.

        Raises KeyFileException for any invalid or missing required value,
        and re-raises the OSError if the output directory cannot be created.
        """

        status_message("Parsing keyfile...")
        status_message("---------------------------------------")

        SeqFileParser = SequenceFileParser()  # create a sequence file parsing object

        # read file to end
        with open(filename) as filehandle:
            content = filehandle.readlines()

        # [PHASE 1 START]
        # PARSE THE KEYFILE
        for line in content:
            line = line.strip()

            # empty lines and full-line comments
            if not line or line[0] == "#":
                continue

            # if inline comment kill everything after the comment character
            line = line.split("#", 1)[0]

            # split on any run of whitespace so tabs and repeated spaces
            # between keyword and value are accepted
            line_list = line.split()
            key = line_list[0]

            if key not in self.KEYWORDS:
                warning_message(
                    "Found unexpected keyword [" + str(key) + "] - ignorning...")
                continue

            # we expect exactly one value after the keyword; anything else
            # (no value, or several) cannot be interpreted
            if len(line_list) != 2:
                raise KeyFileException("Error: Found keyword " +
                                       str(key) +
                                       " but unable to parse associated value")

            self.KEYWORDS[key] = line_list[1]

        # Now add default for the sequence, which will hopefully be set in
        # the next section by parsing the sequence file
        self.KEYWORDS["SEQUENCE"] = ""
        # [PHASE 1 END]

        status_message("---------------------------------------")
        status_message("Keyfile parsed!\n")
        status_message("Validating keyfile contents")
        status_message("---------------------------------------")

        # Having parsed the keyfile we now validate it so we don't have to
        # worry about validation later on
        # [PHASE 2 START]
        numeric_keywords = ("BIN_MIN", "BIN_MAX", "NUMBER_OF_BINS",
                            "FLATCHECK_FREQ", "CONVERGENCE",
                            "FLATNESS_CRITERION")

        for keyword in KEYWORD_LIST:
            # extract the value associated with each keyword in turn
            value = self.KEYWORDS[keyword]

            ##
            # SEQUENCE FILE VALIDATION AND PARSING
            ##
            if keyword == "SEQFILE":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No sequence file provided in keyfile (expecting keyword [SEQFILE])")

                if not os.path.isfile(value):
                    raise KeyFileException(
                        "Expected " + str(value) + " to be file")

                # if its a file lets try and extract a sequence from it!
                self.KEYWORDS["SEQUENCE"] = SeqFileParser.parseSeqFile(value)

                # if we get here we *should* now have a sequence...
                if self.KEYWORDS["SEQUENCE"] == "" or self.KEYWORDS[
                        "SEQUENCE"] is None:
                    raise KeyFileException(
                        "ERROR: No sequence was parsed from the sequence file...")

            ##
            # OUTPUT DIRECTORY VALIDATION
            ##
            elif keyword == "OUTDIR":
                if value == "":
                    raise KeyFileException(
                        "ERROR: No output directory provided in keyfile (expecting keyword [OUTDIR]")

                # creates the output directory if it doesn't already exist
                if not os.path.exists(value):
                    status_message(
                        "Creating output directory " + str(value))
                    try:
                        os.makedirs(value)
                    except OSError:
                        # single-argument print(...) behaves identically
                        # under the Python 2 print statement and the
                        # Python 3 print function
                        print("----------------------------")
                        print("")
                        print("ERROR Creating output directory - do you have permission to create the directory [" + str(value) + "]")
                        print("")
                        print("----------------------------")
                        # bare raise keeps the original traceback
                        raise
                # if it does exist and is not empty raise a quick warning
                elif len(os.listdir(value)) > 0:
                    warning_message(
                        "Output directory exists already and is not empty [RISK OF OVERWRITING!]")

            ##
            # FREEZE FILE VALIDATION
            ##
            elif keyword == "FREEZE_FILE":
                # no freeze file, no problem
                if value != "":
                    if not os.path.isfile(value):
                        raise KeyFileException(
                            "Expected " + str(value) + " to be file")
                    status_message("Using freeze file")
                    self.KEYWORDS[keyword] = value

            ##
            # WL TYPE
            ##
            elif keyword == "WL_TYPE":
                if value == "":
                    # NB: nothing may overwrite this afterwards - the stored
                    # value must remain the default, not the empty string
                    self.KEYWORDS[keyword] = DEFAULT_VALS[keyword]
                    status_message(
                        "Setting WL type to default [" + str(DEFAULT_VALS[keyword]) + "]")
                elif value in WL_TYPES:
                    self.KEYWORDS[keyword] = value
                    status_message(
                        "Setting WL type to keyfile defined [" + str(value) + "]")
                else:
                    raise KeyFileException(
                        "Unexpected WL algorithm type selected " + str(value) + " ")

            ##
            # SET NUMERIC VALUES
            ##
            elif keyword in numeric_keywords:
                self.__set_numeric(keyword, value)

            else:
                raise KeyFileException("SHOULD NOT BE GETTING HERE...")