def __formatList(aminoAcids):
    """
    Normalises 'aminoAcids' into a new list of AminoAcid objects.

    Accepted inputs:
    - None: gives an empty list
    - an AminoAcid: gives a one-item list built through the copy constructor
    - a string: when 'isupper()' holds, every character is converted on its
      own (short name mode); otherwise the whole string names one AminoAcid
    - a list: every item is passed through the AminoAcid constructor

    Raises TypeError for any other type.
    """
    # Nothing given: nothing to convert
    if aminoAcids is None:
        return []

    # A lone AminoAcid is duplicated rather than shared
    if isinstance(aminoAcids, AminoAcid):
        return [AminoAcid(aminoAcids)]

    if isinstance(aminoAcids, str):
        if not aminoAcids.isupper():
            # One amino acid, whatever the name mode
            return [AminoAcid(aminoAcids)]
        # Several amino acids written as consecutive short names
        return [AminoAcid(letter) for letter in aminoAcids]

    # Lists are copied item by item
    if isinstance(aminoAcids, list):
        return [AminoAcid(item) for item in aminoAcids]

    # Anything else is unsupported
    raise TypeError(
        "aminoAcids must be a Sequence, list, AminoAcid object, string or None"
    )
def getFrequencies(groupValues, groupSizes):
    """
    Evaluates the frequencies of AminoAcids within columns of groups in 'groupValues'.
    Frequencies are weighted according to group sizes in 'groupSizes'.

    'groupValues' is indexed as groupValues[group][column]; each column is
    iterated with '.items()' as (AminoAcid, count) pairs.
    'groupSizes' gives, for each group, the divisor turning a count into a
    frequency within that group.

    Returns two dictionaries and a number:
    -'freqPairs' maps pairs of AminoAcids to their frequencies
    -'freqSingle' maps single AminoAcids to their frequencies
    -'freqSum' is the sum of all frequencies

    When 'groupValues' is empty, no pair is counted: 'freqPairs' is empty,
    every entry of 'freqSingle' is 0 and 'freqSum' is 0.
    """
    # Size of the Sequences (no groups means no columns to walk through)
    seqSize = len(groupValues[0]) if len(groupValues) > 0 else 0
    groupCount = len(groupSizes)  # Number of groups

    freqPairs = {}  # Frequencies of amino acid pairs (fAB)
    # Frequencies of single amino acids (fA)
    freqSingle = {AminoAcid(aa): 0 for aa in AminoAcid.getAllNames()}
    freqSum = 0  # Sum of frequencies sum(fAB)

    for col in range(seqSize):  # Each column
        for groupAIndex in range(groupCount - 1):  # Each groupA
            columnA = groupValues[groupAIndex][col]
            groupASize = groupSizes[groupAIndex]

            # Each further groupB, so every unordered pair of groups is seen once
            for groupBIndex in range(groupAIndex + 1, groupCount):
                columnB = groupValues[groupBIndex][col]
                groupBSize = groupSizes[groupBIndex]

                for aaA, aaACount in columnA.items():  # Each AA from groupA
                    aaAFreq = aaACount / groupASize  # Its frequency within groupA

                    for aaB, aaBCount in columnB.items():  # Each AA from groupB
                        aaBFreq = aaBCount / groupBSize  # Its frequency within groupB
                        aaPairFreq = aaAFreq * aaBFreq  # Pair frequency

                        freqSum += aaPairFreq
                        # Each member of the pair gets half of the pair frequency
                        freqSingle[aaA] += aaPairFreq / 2
                        freqSingle[aaB] += aaPairFreq / 2

                        # Order the pair so (A, B) and (B, A) share one key
                        pairIndex = (aaA, aaB) if aaA > aaB else (aaB, aaA)
                        # Explicit default instead of a bare 'except', which
                        # would also hide unrelated errors
                        freqPairs[pairIndex] = freqPairs.get(pairIndex, 0) + aaPairFreq

    return freqPairs, freqSingle, freqSum
def __init__(self, path="", description="", ignore=None):
    """
    Creates a Score object.

    If 'path' is provided, loads the Score values from an iij file:
    lines starting with '#' are comments, the first remaining line lists the
    amino acids in matrix order, and every following line is one matrix row
    of integers. Blank lines are skipped.
    Otherwise, creates a Score for all possible AminoAcids with values 0,
    leaving out the amino acids listed in 'ignore'. Rows grow by one value
    per amino acid (row i holds i + 1 values).

    'description' is stored as given.
    """
    self._description = description
    self._ignore = Sequence(ignore)
    self._matrix = []
    self._aaOrder = {}
    self._aaSequence = Sequence()

    # If path is provided, load directly from iij file
    if path != "":
        with open(path, 'r') as scoreFile:
            # Have we found the line with the amino acid values and order yet?
            foundAAOrder = False
            for line in scoreFile:
                fields = line.split()
                # Blank lines carry no data: taken as the header they would
                # yield an empty amino acid order, taken as a row they would
                # add an empty matrix line. Comments are skipped as well.
                if not fields or fields[0].startswith("#"):
                    continue

                if not foundAAOrder:
                    # Read aa values and order
                    for aa in fields:
                        self._aaSequence.extend(aa)
                    self._aaOrder = {
                        aa: index
                        for index, aa in enumerate(self._aaSequence)
                    }
                    foundAAOrder = True
                else:
                    # Read matrix values
                    self._matrix.append([int(v) for v in fields])

    # Otherwise initialize matrix with 0
    else:
        for aa in AminoAcid.getAllNames():
            if AminoAcid(aa) not in self._ignore:
                self._aaSequence.extend(aa)
                # The new amino acid takes the next free row index
                rowIndex = len(self._matrix)
                self._aaOrder[self._aaSequence[-1]] = rowIndex
                self._matrix.append([0] * (rowIndex + 1))
def __init__(self):
    """
    Sets up empty statistics and counters.

    Every counter is keyed on the structure letters in 'self.structures' and,
    where relevant, on the names returned by AminoAcid.getAllNames(); all of
    them start at 0 (or as empty lists for 'self.scores').
    """
    self.structures = "HETC"
    aaNames = tuple(AminoAcid.getAllNames())

    # Evaluation bookkeeping
    # NOTE(review): TP/TN/FP/FN presumably stand for true/false positives/negatives - confirm
    self.roc = {}
    for s in self.structures:
        for outcome in ("TP", "TN", "FP", "FN"):
            self.roc[(s, outcome)] = 0
    self.minScore = 0
    self.maxScore = 0
    # One independent list per (classifier, real structure) combination
    self.scores = {}
    for classifier in self.structures:
        for realS in self.structures:
            self.scores[(classifier, realS)] = []
    self.correctPred = 0
    self.totalPred = 0

    # Training bookkeeping
    self.neighbourOffset = 8
    self.trainings = 0  # Number of trainings (one per AA)
    self.strucCount = dict.fromkeys(self.structures, 0)
    self.pairCount = {(s, a): 0 for s in self.structures for a in aaNames}
    # Keys are (structure, AA, neighbour AA)
    self.tripletCount = {
        (s, a, na): 0
        for s in self.structures
        for a in aaNames
        for na in aaNames
    }
def test_sliceSingleItemIsAminoAcid(self):
    """Indexing Sequence("ABC") at 0 gives an item equal to AminoAcid("A")."""
    prot = Sequence("ABC")
    aa = prot[0]
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12
    self.assertEqual(aa, AminoAcid("A"))
def test_count(self):
    """count() reports both occurrences of AminoAcid("A") in Sequence("ABCAX")."""
    prot = Sequence("ABCAX")
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12
    self.assertEqual(prot.count(AminoAcid("A")), 2)
def test_remove(self):
    """remove() drops AminoAcid("A") from Sequence("TRANKIL"), leaving "TRNKIL"."""
    prot = Sequence("TRANKIL")
    prot.remove(AminoAcid("A"))
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12
    self.assertEqual(prot, Sequence("TRNKIL"))
def __setitem__(self, key, value):
    """
    Sets the item(s) at 'key', converting the new content to AminoAcid objects.

    - Slice 'key': 'value' is iterated and every element is converted on its
      own, so a string contributes one AminoAcid per character.
    - Any other 'key': 'value' is passed to the AminoAcid constructor and the
      result is stored at that position.
    """
    if isinstance(key, slice):
        # list slice assignment requires an iterable; handing it a single
        # AminoAcid (as for an index) would be rejected
        list.__setitem__(self, key, [AminoAcid(item) for item in value])
    else:
        list.__setitem__(self, key, AminoAcid(value))
def canCreateAminoAcid(self, name):
    """
    Helper assertion: fails the current test when AminoAcid(name) raises
    ValueError. The failure message names the rejected 'name' so a failure
    inside a loop over many names can be traced.
    """
    try:
        AminoAcid(name)
    except ValueError:
        self.fail("AminoAcid({!r}) raised ValueError".format(name))
def test_hashValuesAreEqual(self):
    """AminoAcid("ala") and AminoAcid("A") hash to the same value."""
    aa1 = AminoAcid("ala")
    aa2 = AminoAcid("A")
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12
    self.assertEqual(hash(aa1), hash(aa2))
def test_isTermination(self):
    """An AminoAcid built from "term" reports isTermination() as true."""
    self.assertTrue(AminoAcid("term").isTermination())
def test_isGap(self):
    """An AminoAcid built from "gap" reports isGap() as true."""
    self.assertTrue(AminoAcid("gap").isGap())
def test_getName(self):
    """getName() without arguments returns "A" for AminoAcid("ala")."""
    aa = AminoAcid("ala")
    # assertEqual: the assertEquals alias is deprecated and gone in Python 3.12
    self.assertEqual("A", aa.getName())
def test_getAllNames(self):
    """
    Every name returned by AminoAcid.getAllNames, for each of the "short",
    "medium" and "long" name modes, is accepted by the AminoAcid constructor.
    """
    nameModes = ("short", "medium", "long")
    for mode in nameModes:
        names = AminoAcid.getAllNames(mode)
        for candidate in names:
            self.canCreateAminoAcid(candidate)
def getScore(self, aa1, aa2): """ Get the score assigned to AminoAcids 'aa1', 'aa2'. """ id1 = self._aaOrder[aa1] id2 = self._aaOrder[aa2] if id1 > id2: return self._matrix[id1][id2] else: return self._matrix[id2][id1] # AA frequencies for complete UniProt database # from http://web.expasy.org/docs/relnotes/relstat.html, "AMINO ACID COMPOSITION" uniprob = { AminoAcid("Ala"): .0826, AminoAcid("Gln"): .0393, AminoAcid("Leu"): .0965, AminoAcid("Ser"): .0660, AminoAcid("Arg"): .0553, AminoAcid("Glu"): .0674, AminoAcid("Lys"): .0582, AminoAcid("Thr"): .0535, AminoAcid("Asn"): .0406, AminoAcid("Gly"): .0708, AminoAcid("Met"): .0241, AminoAcid("Trp"): .0109, AminoAcid("Asp"): .0546, AminoAcid("His"): .0227, AminoAcid("Phe"): .0386, AminoAcid("Tyr"): .0292,