def test_randomSequence(self):
    """randomSequence: 99% of new frequencies should be within 3*SD"""
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    r = random([r_num, c_num])
    p = Profile(r, alpha[:c_num])
    p.normalizePositions()
    d = p.Data
    n = 1000

    #Test only works on normalized profile, b/c of 1-d below
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3

    a = Alignment([p.randomSequence() for x in range(n)])

    def absoluteProfile(alignment, char_order):
        """Returns position x character matrix of the column frequencies.

        alignment: object providing columnFrequencies()
        char_order: sequence giving the column index of each character
        """
        #use the argument rather than the alignment from the enclosing
        #scope, so the helper really depends on what it is given
        f = alignment.columnFrequencies()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                col = char_order.index(i)
                res[row, col] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    failure = abs(ap - means) > three_stds
    #force float division: with integer division the ratio would be
    #floored to 0 and the assertion could never fail
    assert float(sum(sum(failure))) / num_elements <= 0.01
def test_toOddsMatrix(self):
    """toOddsMatrix: should give expected odds or raise ProfileError"""
    observed = Profile(
        array([[.1, .3, .5, .1],
               [.25, .25, .25, .25],
               [.05, .8, .05, .1],
               [.7, .1, .1, .1],
               [.6, .15, .05, .2]]),
        Alphabet="ACTG")
    expected = Profile(
        array([[.4, 1.2, 2, .4],
               [1, 1, 1, 1],
               [.2, 3.2, .2, .4],
               [2.8, .4, .4, .4],
               [2.4, .6, .2, .8]]),
        Alphabet="ACTG")

    #no-argument call and explicit uniform background give the same odds
    self.assertEqual(observed.toOddsMatrix().Data, expected.Data)
    assert observed.Alphabet is observed.toOddsMatrix().Alphabet
    self.assertEqual(
        observed.toOddsMatrix([.25, .25, .25, .25]).Data, expected.Data)

    #symbol_freqs of the wrong length is rejected
    self.assertRaises(
        ProfileError, observed.toOddsMatrix, [.25, .25, .25, .25, .25, .25])
    self.assertRaises(
        ProfileError, self.zero_entry.toOddsMatrix, [.1, .2, .3])

    #empty profile stays empty
    self.assertEqual(self.empty.toOddsMatrix().Data.tolist(), [[]])

    #profile with zero entries, default and explicit background
    self.assertEqual(
        self.zero_entry.toOddsMatrix().Data,
        array([[1.2, .8, 0, 2], [0, 0, 3.2, .8]]))
    self.assertFloatEqual(
        self.zero_entry.toOddsMatrix([.1, .2, .3, .4]).Data,
        array([[3, 1, 0, 1.25], [0, 0, 2.667, .5]]),
        1e-3)

    #a background frequency of 0 is rejected
    self.assertRaises(
        ProfileError, self.zero_entry.toOddsMatrix, [.1, .2, .3, 0])
def test_columnUncertainty(self):
    """columnUncertainty: should work on filled and empty profiles"""
    filled = Profile(array([[.25, .5], [.25, .5], [.25, 0], [.25, 0]]), "AB")
    self.assertEqual(filled.columnUncertainty(), [2, 1])

    #profiles without any columns give an empty result
    self.assertEqual(self.empty.columnUncertainty().tolist(), [])
    no_columns = Profile(array([[], [], []]), "")
    self.assertEqual(no_columns.columnUncertainty().tolist(), [])

    #a 1D data array is rejected
    self.assertRaises(ProfileError, self.oned.columnUncertainty)
def mVOR(alignment,n=1000,order=DNA_ORDER):
    """Returns sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size (=number of random profiles to be generated)
    order: specifies the order of the characters found in the alignment,
        used to build the sequence and random profiles.

    mVOR is a modification of the VOR method. Rather than drawing discrete
    random sequences, it draws random profiles. This samples more equally
    from the sequence space and prevents a random sequence from being
    equidistant to multiple sequences in the alignment.

    Random generalized sequences (profiles filled with random numbers):
    Sequences that are equidistant to multiple sequences in the alignment
    can be a problem in small datasets. For longer sequences the likelihood
    of this event is negligible. Generating 'random generalized sequences'
    is a solution, because we are then sampling from continuous sequence
    space.

    Implementation notes:
    A random profile is a two-dimensional array (one axis for the positions
    in the alignment, the other for the characters in order) filled with
    independent random numbers sampled from the standard exponential
    distribution (lambda=1, and thus the mean=1), which is then normalized
    with normalizePositions. Each random profile is compared to the profile
    of every single sequence (ones for the single character observed at
    each position). The distance between two profiles is simply the
    Euclidean distance.
    """
    vote_totals = zeros(len(alignment),Float64)

    #one profile per sequence, keyed by sequence id
    profile_by_id = dict([(seq_id, SeqToProfile(seq, alphabet=order))
        for seq_id, seq in alignment.items()])

    for _ in range(n):
        #draw a random profile and normalize it
        draws = exponential(1, [alignment.SeqLen, len(order)])
        random_profile = Profile(Data=draws, Alphabet=order)
        random_profile.normalizePositions()

        #distance from the random profile to each sequence profile,
        #in the row order of the alignment
        distances = []
        for seq_id in alignment.RowOrder:
            distances.append(profile_by_id[seq_id].distance(random_profile))
        vote_totals += row_to_vote(array(distances))

    result = Weights(dict(zip(alignment.RowOrder, vote_totals)))
    result.normalize()
    return result
def test_rowUncertainty(self):
    """rowUncertainty: should work on filled and empty profiles"""
    filled = Profile(array([[.25, .25, .25, .25], [.5, .5, 0, 0]]), "ABCD")
    self.assertEqual(filled.rowUncertainty(), [2, 1])

    #profiles without any data give an empty result
    self.assertEqual(self.empty.rowUncertainty().tolist(), [])
    no_data = Profile(array([[], [], []]), "")
    self.assertEqual(no_data.rowUncertainty().tolist(), [])

    #a 1D data array is rejected
    self.assertRaises(ProfileError, self.oned.rowUncertainty)
def test__div_(self):
    """__div__ and __truediv__: should divide or raise on zero divisor"""
    numerator = Profile(array([[2, 3], [4, 5]]), "AB")
    int_zero = Profile(array([[1, 0], [4, 5]]), "AB")  #contains Int 0
    float_zero = Profile(array([[1, 0.0], [4, 5]]), "AB")  #contains Float 0.0
    nonzero = Profile(array([[1, 2], [8.0, 5]]), "AB")  #no zeros

    self.assertRaises(ProfileError, numerator.__truediv__, int_zero)
    #infinity in result data
    self.assertRaises(ProfileError, numerator.__div__, float_zero)

    quotient = numerator.__div__(nonzero)
    self.assertFloatEqual(quotient.Data, array([[2, 1.5], [0.5, 1]]))
def test_toConsensus_include_all(self):
    """toConsensus: include_all=True should give the expected consensus"""
    first = Profile(
        array([[.2, 0, .8, 0],
               [0, .1, .2, .7],
               [0, 0, 0, 1],
               [.2, .3, .4, .1],
               [.5, .5, 0, 0]]),
        Alphabet=DnaAlphabet, CharOrder="TCAG")
    self.assertEqual(
        first.toConsensus(cutoff=0.4, include_all=True), "AGGAY")

    second = Profile(
        array([[.25, 0.25, .25, 0.25],
               [0.1, .1, .1, 0],
               [.4, 0, .4, 0],
               [0, .2, 0.2, 0.3]]),
        Alphabet=DnaAlphabet, CharOrder="TCAG")
    self.assertEqual(
        second.toConsensus(cutoff=0.4, include_all=True), "NHWV")
def test_toLogOddsMatrix(self):
    """toLogOddsMatrix: should give expected log odds"""
    #Kept short on purpose: toOddsMatrix is tested thoroughly on its own
    observed = Profile(
        array([[.1, .3, .5, .1],
               [.25, .25, .25, .25],
               [.05, .8, .05, .1],
               [.7, .1, .1, .1],
               [.6, .15, .05, .2]]),
        Alphabet="ACTG")
    expected = Profile(
        array([[-1.322, 0.263, 1., -1.322],
               [0., 0., 0., 0.],
               [-2.322, 1.678, -2.322, -1.322],
               [1.485, -1.322, -1.322, -1.322],
               [1.263, -0.737, -2.322, -0.322]]),
        Alphabet="ACTG")
    self.assertFloatEqual(
        observed.toLogOddsMatrix().Data, expected.Data, eps=1e-3)

    #empty profile stays empty
    self.assertEqual(self.empty.toLogOddsMatrix().Data.tolist(), [[]])
def test_normalizeSequences(self):
    """normalizeSequences: should scale columns to one or raise error"""
    scaled = self.full.copy()
    scaled.normalizeSequences()
    self.assertEqual(
        scaled.Data, array([[2/9, 4/17], [3/9, 5/17], [4/9, 8/17]]))
    self.assertEqual(sum(scaled.Data), [1, 1])

    #columns that already add up to one are left as they are
    unchanged = self.empty_row.copy()
    unchanged.normalizeSequences()
    self.assertEqual(unchanged.Data, array([[1, 1], [0, 0]]))

    #a column adding up to zero can't be normalized
    zero_column = self.empty_col.copy()
    self.assertRaises(ProfileError, zero_column.normalizeSequences)
    all_zero = Profile(array([[0.0], [0.0]]), "AB")
    self.assertRaises(ProfileError, all_zero.normalizeSequences)

    #negative numbers: only the column totals matter
    sums_to_one = Profile(array([[3, 4], [-2, -3]]), "AB")
    sums_to_one.normalizeSequences()
    self.assertEqual(sums_to_one.Data, array([[3, 4], [-2, -3]]))
    sums_to_zero = Profile(array([[3, 4], [-3, -3]]), "AB")
    self.assertRaises(ProfileError, sums_to_zero.normalizeSequences)
def test_normalizePositions(self):
    """normalizePositions: should scale rows to one or raise error"""
    scaled = self.full.copy()
    scaled.normalizePositions()
    self.assertEqual(
        scaled.Data, array([[2/6, 4/6], [3/8, 5/8], [4/12, 8/12]]))
    self.assertEqual(sum(scaled.Data, 1), [1, 1, 1])

    #rows that already add up to one are left as they are
    unchanged = self.empty_col.copy()
    unchanged.normalizePositions()
    self.assertEqual(unchanged.Data, array([[0, 1], [0, 1]]))

    #a row adding up to zero can't be normalized
    zero_row = self.empty_row.copy()
    self.assertRaises(ProfileError, zero_row.normalizePositions)
    all_zero = Profile(array([[0.0, 0.0]]), "AB")
    self.assertRaises(ProfileError, all_zero.normalizePositions)

    #negative numbers: only the row totals matter
    sums_to_one = Profile(array([[3, -2], [4, -3]]), "AB")
    sums_to_one.normalizePositions()
    self.assertEqual(sums_to_one.Data, array([[3, -2], [4, -3]]))
    sums_to_zero = Profile(array([[3, -3], [4, -3]]), "AB")
    self.assertRaises(ProfileError, sums_to_zero.normalizePositions)
def test_reduce_operators(self):
    """reduce: should combine profiles with each supported operator"""
    #keyword combination used by most of the calls below
    norm_in_only = dict(normalize_input=True, normalize_output=False)

    #different operators, normalize input, don't normalize output
    left = Profile(array([[1, 0, 0], [0, 1, 0]]), Alphabet="ABC")
    right = Profile(array([[1, 0, 0], [0, 0, 1]]), Alphabet="ABC")
    self.assertEqual(
        left.reduce(right).Data, array([[1, 0, 0], [0, .5, .5]]))
    self.assertEqual(
        left.reduce(right, add, **norm_in_only).Data,
        array([[2, 0, 0], [0, 1, 1]]))
    self.assertEqual(
        left.reduce(right, subtract, **norm_in_only).Data,
        array([[0, 0, 0], [0, 1, -1]]))
    self.assertEqual(
        left.reduce(right, multiply, **norm_in_only).Data,
        array([[1, 0, 0], [0, 0, 0]]))
    self.assertRaises(
        ProfileError, left.reduce, right, divide, **norm_in_only)

    #don't normalize and normalize only input
    counts_a = Profile(array([[1, 2], [3, 4]]), Alphabet="AB")
    counts_b = Profile(array([[4, 3], [2, 1]]), Alphabet="AB")
    self.assertEqual(
        counts_a.reduce(counts_b, add, normalize_input=False,
            normalize_output=False).Data,
        array([[5, 5], [5, 5]]))
    self.assertFloatEqual(
        counts_a.reduce(counts_b, add, **norm_in_only).Data,
        array([[19/21, 23/21], [23/21, 19/21]]))

    #normalize input and output
    wide_a = Profile(array([[1, 1, 0, 0], [1, 1, 1, 1]]), Alphabet="ABCD")
    wide_b = Profile(array([[1, 0, 0, 0], [1, 0, 0, 1]]), Alphabet="ABCD")
    self.assertEqual(
        wide_a.reduce(wide_b, add, normalize_input=True,
            normalize_output=True).Data,
        array([[.75, .25, 0, 0], [.375, .125, .125, .375]]))

    #it can collapse empty profiles when normalizing is turned off
    self.assertEqual(
        self.empty.reduce(self.empty, normalize_input=False,
            normalize_output=False).Data.tolist(),
        [[]])
def test_isValid(self):
    """isValid: should be True only for valid data and valid attributes"""
    #everything valid
    all_good = Profile(
        array([[.3, .7], [.8, .2]]), Alphabet="AB", CharOrder="AB")
    #invalid data, valid attributes
    bad_data = Profile(
        array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder="BA")
    #invalid attributes, valid data
    bad_attributes = Profile(
        array([[.3, .7], [.8, .2]]), Alphabet="ABCD", CharOrder="AF")

    self.assertEqual(all_good.isValid(), True)
    self.assertEqual(bad_data.isValid(), False)
    self.assertEqual(bad_attributes.isValid(), False)
def test_hasValidAttributes(self):
    """hasValidAttributes: should check CharOrder against data and Alphabet
    """
    cases = [
        ("BAC", False),  #self.Data doesn't match len(CharOrder)
        ("AX", False),   #not all chars in CharOrder in Alphabet
        ("CB", True),    #should be fine
    ]
    for char_order, expected in cases:
        profile = Profile(
            array([[1, 2], [3, 4]]), Alphabet="ABCD", CharOrder=char_order)
        self.assertEqual(profile.hasValidAttributes(), expected)
def setUp(self):
    """Creates the Profile objects shared by the Profile tests"""
    #small profiles over the alphabet "AB"
    self.full = Profile(array([[2, 4], [3, 5], [4, 8]]), "AB")
    self.empty = Profile(array([[]]), "AB")
    #second row adds up to zero
    self.empty_row = Profile(array([[1, 1], [0, 0]]), "AB")
    #first column adds up to zero
    self.empty_col = Profile(array([[0, 1], [0, 1]]), "AB")

    #profiles over DnaAlphabet with an explicit character order
    self.consensus = Profile(
        array([[.2, 0, .8, 0],
               [0, .1, .2, .7],
               [0, 0, 0, 1],
               [.2, .3, .4, .1],
               [.5, .5, 0, 0]]),
        Alphabet=DnaAlphabet, CharOrder="TCAG")
    self.not_same_value = Profile(
        array([[.3, .5, .1, .1],
               [.4, .6, 0, .7],
               [.3, .2, 0, 0],
               [0, 0, 4, 0]]),
        Alphabet=DnaAlphabet, CharOrder="TCAG")

    self.zero_entry = Profile(
        array([[.3, .2, 0, .5], [0, 0, .8, .2]]),
        Alphabet="UCAG")

    #profiles used for scoring
    self.score1 = Profile(
        Data=array([[-1, 0, 1, 2], [-2, 2, 0, 0], [-3, 5, 1, 0]]),
        Alphabet=DnaAlphabet, CharOrder="ATGC")
    self.score2 = Profile(
        array([[.2, .4, .4, 0], [.1, 0, .9, 0], [.1, .2, .3, .4]]),
        Alphabet="TCAG")

    #one-dimensional data
    self.oned = Profile(array([.25, .25, .25, .25]), "ABCD")
def test_distance(self):
    """distance: should give expected distance or raise on shape mismatch
    """
    base = Profile(array([[2, 4], [3, 1]]), "AB")
    shifted = Profile(array([[4, 6], [5, 3]]), "AB")
    extra_row = Profile(array([[4, 6], [5, 3], [1, 1]]), "AB")
    flat_two = Profile(array([2, 2]), "AB")
    flat_three = Profile(array([2, 2, 2]), "AB")
    nothing = Profile(array([[]]), "AB")

    #same result in both directions
    self.assertEqual(base.distance(shifted), 4)
    self.assertEqual(shifted.distance(base), 4)
    self.assertEqual(base.distance(flat_two), sqrt(6))
    self.assertEqual(nothing.distance(nothing), 0)

    #Raises error when frames are not aligned
    self.assertRaises(ProfileError, base.distance, extra_row)
    self.assertRaises(ProfileError, base.distance, flat_three)
def test_copy(self):
    """copy: should act as expected while rebinding/modifying attributes
    """
    p = Profile(array([[1, 1], [.7, .3]]),
        {'A': 'A', 'G': 'G', 'R': 'AG'}, "AG")
    p_copy = p.copy()
    #the copy refers to the very same objects as the original
    assert p.Data is p_copy.Data
    assert p.Alphabet is p_copy.Alphabet
    assert p.CharOrder is p_copy.CharOrder

    #modifying p.Data modifies p_copy.Data
    p.Data[1, 1] = 100
    #check the data itself: the write has to show up through the copy
    assert p.Data is p_copy.Data
    self.assertEqual(p_copy.Data[1, 1], 100)
    assert p.Alphabet is p_copy.Alphabet

    #normalizing p.Data rebinds it, so p_copy.Data is unchanged
    p.normalizePositions()
    assert p.Data is not p_copy.Data

    #Adding something to the alphabet changes both p and p_copy
    p.Alphabet['Y'] = 'TC'
    assert p.Alphabet is p_copy.Alphabet

    #Rebinding the CharOrder does only change the original
    p.CharOrder = 'XX'
    assert p.CharOrder is not p_copy.CharOrder
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,
    weights=None):
    """Generates a Profile object from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars, but if you
        want to split degenerate symbols, the alphabet must have a
        Degenerates property. Default is the alphabet of the first seq in
        the alignment.
    char_order (optional): order of the characters in the profile. Default
        is list(alphabet)
    split_degenerates (optional): Whether you want the counts for the
        degenerate symbols to be divided over the non-degenerate symbols they
        code for.
    weights (optional): dictionary of seq_id: weight. If not entered all seqs
        are weighted equally

    Returns the normalized Profile. Raises ValueError when the profile
    can't be normalized.

    A Profile is a position x character matrix describing which characters
    occur at each position of an alignment. The Profile is always normalized,
    so it gives the probabilities of each character at each position.

    Ignoring chars: you can ignore characters in the alignment by not putting
    the char in the CharOrder. If you ignore all characters at a particular
    position, an error will be raised, because the profile can't be
    normalized.

    Splitting degenerates: you can split degenerate characters over the
    non-degenerate characters they code for. For example: R = A or G. So,
    an R at a position counts for 0.5 A and 0.5 G.

    Example:
    seq1    TCAG    weight: 0.5
    seq2    TAR-    weight: 0.25
    seq3    YAG-    weight: 0.25
    AlnToProfile(aln,alphabet=DnaAlphabet,char_order="TACG",weights=w,
        split_degenerates=True)
    Profile:
         T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """
    if alphabet is None:
        alphabet = aln.values()[0].Alphabet
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        #1.0 keeps this a float division even under integer division
        #rules, where 1/len(aln) would give every sequence weight 0
        weights = dict.fromkeys(aln.keys(), 1.0/len(aln))

    char_meaning = CharMeaningProfile(alphabet, char_order,
        split_degenerates)

    #one weighted position x character array per sequence
    profiles = []
    for k, v in aln.items():
        profiles.append(take(char_meaning.Data, asarray(v.upper(), UInt8))
            * weights[k])
    s = reduce(add, profiles)

    result = Profile(s, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception:
        #not a bare except: KeyboardInterrupt and SystemExit must not be
        #turned into a ValueError
        raise ValueError(
            "Probably one of the rows in your profile adds up to zero,\n "+
            "because you are ignoring all of the characters in the "+
            "corresponding\n column in the alignment")
    return result
class ProfileTests(TestCase): """Tests for Profile object""" def setUp(self): """setUp method for all Profile tests""" self.full = Profile(array([[2,4],[3,5],[4,8]]),"AB") self.empty = Profile(array([[]]),"AB") self.empty_row = Profile(array([[1,1],[0,0]]), "AB") self.empty_col = Profile(array([[0,1],[0,1]]), "AB") self.consensus = Profile(array([[.2,0,.8,0],[0,.1,.2,.7],[0,0,0,1],\ [.2,.3,.4,.1],[.5,.5,0,0]]),\ Alphabet=DnaAlphabet, CharOrder="TCAG") self.not_same_value = Profile(array([[.3,.5,.1,.1],[.4,.6,0,.7],\ [.3,.2,0,0],[0,0,4,0]]),Alphabet=DnaAlphabet, CharOrder="TCAG") self.zero_entry = Profile(array([[.3,.2,0,.5],[0,0,.8,.2]]),\ Alphabet="UCAG") self.score1 = Profile(Data=array([[-1,0,1,2],[-2,2,0,0],[-3,5,1,0]]),\ Alphabet=DnaAlphabet, CharOrder="ATGC") self.score2 = Profile(array([[.2,.4,.4,0],[.1,0,.9,0],[.1,.2,.3,.4]]),\ Alphabet="TCAG") self.oned = Profile(array([.25,.25,.25,.25]),"ABCD") def test_init(self): """__init__: should set all attributed correctly""" self.assertRaises(TypeError, Profile) self.assertRaises(TypeError, Profile, array([[2,3]])) #only alphabet p = Profile(array([[.2,.8],[.7,.3]]),"AB") self.assertEqual(p.Data, [[.2,.8],[.7,.3]]) self.assertEqual(p.Alphabet, "AB") self.assertEqual(p.CharOrder, list("AB")) self.assertEqual(translate("ABBA",p._translation_table), "\x00\x01\x01\x00") #alphabet and char order p = Profile(array([[.1,.2],[.4,.3]]),Alphabet=DnaAlphabet, CharOrder="AG") self.assertEqual(p.CharOrder,"AG") assert p.Alphabet is DnaAlphabet #non-character alphabet p = Profile(array([[.1,.2],[.4,.3]]),Alphabet=[7,3], CharOrder=[3,7]) self.assertEqual(p.CharOrder,[3,7]) self.assertEqual(p.Alphabet, [7,3]) self.assertEqual(p.Data, [[.1,.2],[.4,.3]]) def test_str(self): """__str__: should return string representation of data in profile """ self.assertEqual(str(self.empty_row),str(array([[1,1],[0,0]]))) def test_make_translation_table(self): """_make_translation_table: should return correct table from char order """ p = 
Profile(array([[.2,.8],[.7,.3]]),"ABCDE","AB") self.assertEqual(translate("ABBA",p._translation_table), "\x00\x01\x01\x00") def test_hasValidData(self): """hasValidData: should work on full and empty profiles""" full = self.full.copy() full.normalizePositions() self.assertEqual(full.hasValidData(),True) self.assertEqual(self.empty_row.hasValidData(),False) self.assertEqual(self.empty.hasValidData(),False) def test_hasValidAttributes(self): """hasValidAttributes: should work for different alphabets/char orders """ p = Profile(array([[1,2],[3,4]]),Alphabet="ABCD", CharOrder="BAC") #self.Data doesn't match len(CharOrder) self.assertEqual(p.hasValidAttributes(),False) p = Profile(array([[1,2],[3,4]]),Alphabet="ABCD", CharOrder="AX") #not all chars in CharOrder in Alphabet self.assertEqual(p.hasValidAttributes(),False) p = Profile(array([[1,2],[3,4]]),Alphabet="ABCD", CharOrder="CB") #should be fine self.assertEqual(p.hasValidAttributes(),True) def test_isValid(self): """isValid: should work as expected""" #everything valid p1 = Profile(array([[.3,.7],[.8,.2]]),Alphabet="AB",CharOrder="AB") #invalid data, valid attributes p2 = Profile(array([[1,2],[3,4]]),Alphabet="ABCD", CharOrder="BA") #invalid attributes, valid data p3 = Profile(array([[.3,.7],[.8,.2]]),Alphabet="ABCD",CharOrder="AF") self.assertEqual(p1.isValid(),True) self.assertEqual(p2.isValid(),False) self.assertEqual(p3.isValid(),False) def test_copy(self): """copy: should act as expected while rebinding/modifying attributes """ p = Profile(array([[1,1],[.7,.3]]),{'A':'A','G':'G','R':'AG'},"AG") p_copy = p.copy() assert p.Data is p_copy.Data assert p.Alphabet is p_copy.Alphabet assert p.CharOrder is p_copy.CharOrder #modifying p.Data modifies p_copy.Data p.Data[1,1] = 100 assert p.Alphabet is p_copy.Alphabet #normalizing p.Data rebinds it, so p_copy.Data is unchanged p.normalizePositions() assert not p.Data is p_copy.Data #Adding something to the alphabet changes both p and p_copy p.Alphabet['Y']='TC' assert 
p.Alphabet is p_copy.Alphabet #Rebinding the CharOrder does only change the original p.CharOrder='XX' assert not p.CharOrder is p_copy.CharOrder def test_normalizePositions(self): """normalizePositions: should normalize or raise appropriate error """ p = self.full.copy() p.normalizePositions() self.assertEqual(p.Data,array([[2/6,4/6],[3/8,5/8],[4/12,8/12]])) self.assertEqual(sum(p.Data,1),[1,1,1]) p = self.empty_col.copy() p.normalizePositions() self.assertEqual(p.Data,array([[0,1],[0,1]])) p = self.empty_row.copy() self.assertRaises(ProfileError,p.normalizePositions) p = Profile(array([[0.0,0.0]]),"AB") self.assertRaises(ProfileError,p.normalizePositions) #negative numbers!!!!!! p1 = Profile(array([[3,-2],[4,-3]]),"AB") p1.normalizePositions() self.assertEqual(p1.Data,array([[3,-2],[4,-3]])) p2 = Profile(array([[3,-3],[4,-3]]),"AB") self.assertRaises(ProfileError,p2.normalizePositions) def test_normalizeSequences(self): """normalizeSequences: should normalize or raise appropriate error """ p = self.full.copy() p.normalizeSequences() self.assertEqual(p.Data,array([[2/9,4/17],[3/9,5/17],[4/9,8/17]])) self.assertEqual(sum(p.Data),[1,1]) p = self.empty_row.copy() p.normalizeSequences() self.assertEqual(p.Data,array([[1,1],[0,0]])) p = self.empty_col.copy() self.assertRaises(ProfileError,p.normalizeSequences) p = Profile(array([[0.0],[0.0]]),"AB") self.assertRaises(ProfileError,p.normalizeSequences) #negative numbers!!!!!! 
p1 = Profile(array([[3,4],[-2,-3]]),"AB") p1.normalizeSequences() self.assertEqual(p1.Data,array([[3,4],[-2,-3]])) p2 = Profile(array([[3,4],[-3,-3]]),"AB") self.assertRaises(ProfileError,p2.normalizeSequences) def test_prettyPrint_without_parameters(self): """prettyPrint: should work without parameters passed in""" p = self.full self.assertEqual(p.prettyPrint(),"2\t4\n3\t5\n4\t8") self.assertEqual(p.prettyPrint(include_header=True),\ "A\tB\n2\t4\n3\t5\n4\t8") self.assertEqual(p.prettyPrint(transpose_data=True),\ "2\t3\t4\n4\t5\t8") self.assertEqual(p.prettyPrint(include_header=True,\ transpose_data=True),"A\t2\t3\t4\nB\t4\t5\t8") #empty self.assertEqual(self.empty.prettyPrint(),"") self.assertEqual(self.empty.prettyPrint(transpose_data=True),"") #it will still print with invalid data (e.g if len(CharOrder) #doesn't match the data p = self.full.copy() p.CharOrder="ABC" self.assertEqual(p.prettyPrint(include_header=True),\ "A\tB\tC\n2\t4\n3\t5\n4\t8") #it will truncate the CharOrder if data is transposed #and CharOrder is longer then the number of rows in the #transposed data self.assertEqual(p.prettyPrint(include_header=True,\ transpose_data=True),"A\t2\t3\t4\nB\t4\t5\t8") def test_reduce_wrong_size(self): """reduce: should fail when profiles have different sizes""" p1 = Profile(array([[1,0],[0,1]]),Alphabet="AB") p2 = Profile(array([[1,0,0],[1,0,0]]),Alphabet="ABC") self.assertRaises(ProfileError,p1.reduce,p2) def test_reduce_normalization_error(self): """reduce: fails when input or output can't be normalized""" #Will raise errors when input data can't be normalized self.assertRaises(ProfileError,self.empty.reduce,self.empty,add) self.assertRaises(ProfileError,self.full.reduce,self.empty_row,add) #don't normalize input, but do normalize output #fails when one row adds up to zero p1 = Profile(array([[3,3],[4,4]]),"AB") p2 = Profile(array([[3,3],[-4,-4]]),"AB") self.assertRaises(ProfileError,p1.reduce,p2,add,False,True) def test_reduce_operators(self): """reduce: 
should work fine with different operators """ #different operators, normalize input, don't normalize output p1 = Profile(array([[1,0,0],[0,1,0]]),Alphabet="ABC") p2 = Profile(array([[1,0,0],[0,0,1]]),Alphabet="ABC") self.assertEqual(p1.reduce(p2).Data,array([[1,0,0],[0,.5,.5]])) self.assertEqual(p1.reduce(p2,add,normalize_input=True,\ normalize_output=False).Data,array([[2,0,0],[0,1,1]])) self.assertEqual(p1.reduce(p2,subtract,normalize_input=True,\ normalize_output=False).Data,array([[0,0,0],[0,1,-1]])) self.assertEqual(p1.reduce(p2,multiply,normalize_input=True,\ normalize_output=False).Data,array([[1,0,0],[0,0,0]])) self.assertRaises(ProfileError,p1.reduce,p2,divide,\ normalize_input=True,normalize_output=False) #don't normalize and normalize only input p3 = Profile(array([[1,2],[3,4]]),Alphabet="AB") p4 = Profile(array([[4,3],[2,1]]),Alphabet="AB") self.assertEqual(p3.reduce(p4,add,normalize_input=False,\ normalize_output=False).Data,array([[5,5],[5,5]])) self.assertFloatEqual(p3.reduce(p4,add,normalize_input=True,\ normalize_output=False).Data,array([[19/21,23/21],[23/21,19/21]])) #normalize input and output p5 = Profile(array([[1,1,0,0],[1,1,1,1]]),Alphabet="ABCD") p6 = Profile(array([[1,0,0,0],[1,0,0,1]]),Alphabet="ABCD") self.assertEqual(p5.reduce(p6,add,normalize_input=True,\ normalize_output=True).Data,array([[.75,.25,0,0],\ [.375,.125,.125,.375]])) #it can collapse empty profiles when normalizing is turned off self.assertEqual(self.empty.reduce(self.empty,\ normalize_input=False,normalize_output=False).Data.tolist(),[[]]) #more specific tests of the operators will be in the #separate functions def test__add_(self): """__add__: should not normalize input or output, just add""" p1 = Profile(array([[.3,.4,.1,0],[.1,.1,.1,.7]]),Alphabet="ABCD") p2 = Profile(array([[1,0,0,0],[1,0,0,1]]),Alphabet="ABCD") self.assertEqual((p1+p2).Data, array([[1.3,.4,.1,0],[1.1,.1,.1,1.7]])) self.assertRaises(ProfileError,self.empty.__add__, p1) self.assertEqual((self.empty + 
self.empty).Data.tolist(),[[]]) def test__sub_(self): """__sub__: should subtract two profiles, no normalization""" p1 = Profile(array([[.3,.4,.1,0],[.1,.1,.1,.7]]),Alphabet="ABCD") p2 = Profile(array([[1,0,0,0],[1,0,0,1]]),Alphabet="ABCD") self.assertFloatEqual((p1-p2).Data, array([[-.7,.4,.1,0],\ [-.9,.1,.1,-.3]])) def test__mul_(self): """__mul__: should multiply two profiles, no normalization""" p1 = Profile(array([[1,-2,3,0],[1,1,1,.5]]),Alphabet="ABCD") p2 = Profile(array([[1,0,0,0],[1,0,3,2]]),Alphabet="ABCD") self.assertEqual((p1*p2).Data, array([[1,0,0,0],\ [1,0,3,1]])) def test__div_(self): """__div__ and __truediv__: always true division b/c __future__.division """ p1 = Profile(array([[2,3],[4,5]]),"AB") p2 = Profile(array([[1,0],[4,5]]),"AB") #Int 0 p3 = Profile(array([[1,0.0],[4,5]]),"AB") #Float 0.0 p4 = Profile(array([[1,2],[8.0,5]]),"AB") #Float 0.0 self.assertRaises(ProfileError, p1.__truediv__,p2) #infinity in result data self.assertRaises(ProfileError, p1.__div__, p3) self.assertFloatEqual((p1.__div__(p4)).Data, array([[2,1.5],[0.5,1]])) def test_distance(self): """distance: should return correct distance between the profiles """ p1 = Profile(array([[2,4],[3,1]]), "AB") p2 = Profile(array([[4,6],[5,3]]), "AB") p3 = Profile(array([[4,6],[5,3],[1,1]]), "AB") p4 = Profile(array([2,2]),"AB") p5 = Profile(array([2,2,2]),"AB") p6 = Profile(array([[]]),"AB") self.assertEqual(p1.distance(p2),4) self.assertEqual(p2.distance(p1),4) self.assertEqual(p1.distance(p4),sqrt(6)) self.assertEqual(p6.distance(p6),0) #Raises error when frames are not aligned self.assertRaises(ProfileError, p1.distance,p3) self.assertRaises(ProfileError,p1.distance,p5) def test_toOddsMatrix(self): """toOddsMatrix: should work on valid data or raise an error """ p = Profile(array([[.1,.3,.5,.1],[.25,.25,.25,.25],\ [.05,.8,.05,.1],[.7,.1,.1,.1],[.6,.15,.05,.2]]),\ Alphabet="ACTG") p_exp = Profile(array([[.4, 1.2, 2, .4],[1,1,1,1],[.2,3.2,.2,.4],\ 
[2.8,.4,.4,.4],[2.4,.6,.2,.8]]),Alphabet="ACTG") self.assertEqual(p.toOddsMatrix().Data,p_exp.Data) assert p.Alphabet is p.toOddsMatrix().Alphabet self.assertEqual(p.toOddsMatrix([.25,.25,.25,.25]).Data,p_exp.Data) #fails if symbol_freqs has wrong size self.assertRaises(ProfileError, p.toOddsMatrix,\ [.25,.25,.25,.25,.25,.25]) self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix,\ [.1,.2,.3]) #works on empty profile self.assertEqual(self.empty.toOddsMatrix().Data.tolist(),[[]]) #works with different input self.assertEqual(self.zero_entry.toOddsMatrix().Data,\ array([[1.2,.8,0,2],[0,0,3.2,.8]])) self.assertFloatEqual(self.zero_entry.toOddsMatrix([.1,.2,.3,.4]).Data,\ array([[3,1,0,1.25],[0,0,2.667,.5]]),1e-3) #fails when one of the background frequencies is 0 self.assertRaises(ProfileError, self.zero_entry.toOddsMatrix,\ [.1,.2,.3,0]) def test_toLogOddsMatrix(self): """toLogOddsMatrix: should work as expected""" #This test can be short, because it mainly depends on toOddsMatrix #for which everything has been tested p = Profile(array([[.1,.3,.5,.1],[.25,.25,.25,.25],\ [.05,.8,.05,.1],[.7,.1,.1,.1],[.6,.15,.05,.2]]),\ Alphabet="ACTG") p_exp = Profile(array(\ [[-1.322, 0.263, 1., -1.322],\ [ 0., 0., 0., 0.],\ [-2.322, 1.678, -2.322, -1.322],\ [ 1.485, -1.322, -1.322, -1.322],\ [ 1.263, -0.737, -2.322, -0.322]]),\ Alphabet="ACTG") self.assertFloatEqual(p.toLogOddsMatrix().Data,p_exp.Data,eps=1e-3) #works on empty matrix self.assertEqual(self.empty.toLogOddsMatrix().Data.tolist(),[[]]) def test__score_indices(self): """_score_indices: should work on valid input""" self.assertEqual(self.score1._score_indices(array([0,1,1,3,0,3]),\ offset=0),[6,2,-3,0]) self.assertFloatEqual(self.score2._score_indices(\ array([3,1,2,0,2,2,3]), offset=0),[.3,1.4,.8,1.4,1.7]) self.assertFloatEqual(self.score2._score_indices(\ array([3,1,2,0,2,2,3]), offset=3),[1.4,1.7]) #Errors will be raised on invalid input. Errors are not handled #in this method. 
Validation of the input is done elsewhere self.assertRaises(IndexError,self.score2._score_indices,\ array([3,1,63,0,4,2,3]), offset=3) def test__score_profile(self): """_score_profile: should work on valid input""" p1 = Profile(array([[1,0,0,0],[0,1,0,0],[0,0,.5,.5],[0,0,0,1],\ [.25,.25,.25,.25]]),"TCAG") p2 = Profile(array([[0,1,0,0],[.2,0,.8,0],[0,0,.5,.5],[1/3,1/3,0,1/3],\ [.25,.25,.25,.25]]),"TCAG") self.assertFloatEqual(self.score2._score_profile(p1,offset=0),\ [.55,1.25,.45]) self.assertFloatEqual(self.score2._score_profile(p1,offset=2),\ [.45]) self.assertFloatEqual(self.score2._score_profile(p2,offset=0),\ [1.49,1.043,.483],1e-3) #Errors will be raised on invalid input. Errors are not handled #in this method. Validation of the input is done elsewhere #In this case you don't get an error, but for sure an unexpected #result self.assertFloatEqual(self.score2._score_profile(p1,offset=3).tolist(),\ []) def test_score_sequence(self): """score: should work correctly for Sequence as input """ #works on normal valid data self.assertEqual(self.score1.score("ATTCAC",offset=0),\ [6,2,-3,0]) self.assertFloatEqual(self.score2.score("TCAAGT",offset=0), [.5,1.6,1.7,0.5]) #works with different offset self.assertFloatEqual(self.score2.score("TCAAGT",offset=2), [1.7,0.5]) self.assertFloatEqual(self.score2.score("TCAAGT",offset=3), [0.5]) #raises error on invalid offset self.assertRaises(ProfileError,self.score2.score,\ "TCAAGT",offset=4) #works on seq of minimal length self.assertFloatEqual(self.score2.score("AGT",offset=0), [0.5]) #raises error when sequence is too short self.assertRaises(ProfileError, self.score2.score,"",offset=0) #raises error on empty profile self.assertRaises(ProfileError,self.empty.score,"ACGT") #raises error when sequence contains characters that #are not in the characterorder self.assertRaises(ProfileError,self.score2.score,"ACBRT") def test_score_profile(self): """score: should work correctly for Profile as input """ p1 = 
Profile(array([[1,0,0,0],[0,1,0,0],[0,0,.5,.5],[0,0,0,1],\ [.25,.25,.25,.25]]),"TCAG") p2 = Profile(array([[0,1,0,0],[.2,0,.8,0],[0,0,.5,.5],[1/3,1/3,0,1/3],\ [.25,.25,.25,.25]]),"TCAG") p3 = Profile(array([[1,0,0,0],[0,1,0,0],[0,0,0,1]]),"TCAG") p4 = Profile(array([[1,0,0,0],[0,1,0,0]]),"TCAG") p5 = Profile(array([[1,0,0,0],[0,1,0,0],[0,0,0,1]]),"AGTC") #works on normal valid data self.assertFloatEqual(self.score2.score(p1,offset=0),\ [.55,1.25,.45]) self.assertFloatEqual(self.score2.score(p2,offset=0), [1.49,1.043,.483],1e-3) #works with different offset self.assertFloatEqual(self.score2.score(p1,offset=1), [1.25,0.45]) self.assertFloatEqual(self.score2.score(p1,offset=2), [0.45]) #raises error on invalid offset self.assertRaises(ProfileError,self.score2.score,\ p1,offset=3) #works on profile of minimal length self.assertFloatEqual(self.score2.score(p3,offset=0), [0.6]) #raises error when profile is too short self.assertRaises(ProfileError, self.score2.score,p4,offset=0) #raises error on empty profile self.assertRaises(ProfileError,self.empty.score,p1) #raises error when character order doesn't match self.assertRaises(ProfileError,self.score2.score,p5) def test_rowUncertainty(self): """rowUncertainty: should handle full and empty profiles """ p = Profile(array([[.25,.25,.25,.25],[.5,.5,0,0]]),"ABCD") self.assertEqual(p.rowUncertainty(),[2,1]) #for empty rows 0 is returned as the uncertainty self.assertEqual(self.empty.rowUncertainty().tolist(),[]) p = Profile(array([[],[],[]]),"") self.assertEqual(p.rowUncertainty().tolist(),[]) #doesn't work on 1D array self.assertRaises(ProfileError,self.oned.rowUncertainty) def test_columnUncertainty(self): """columnUncertainty: should handle full and empty profiles """ p = Profile(array([[.25,.5],[.25,.5],[.25,0],[.25,0]]),"AB") self.assertEqual(p.columnUncertainty(),[2,1]) #for empty cols nothing is returned as the uncertainty self.assertEqual(self.empty.columnUncertainty().tolist(),[]) p = Profile(array([[],[],[]]),"") 
self.assertEqual(p.columnUncertainty().tolist(),[]) #doesn't work on 1D array self.assertRaises(ProfileError,self.oned.columnUncertainty) def test_rowDegeneracy(self): """rowDegneracy: should work as expected""" p1 = self.consensus p2 = self.not_same_value self.assertEqual(p1.rowDegeneracy(),[1,1,1,2,1]) self.assertEqual(p1.rowDegeneracy(cutoff=.5),[1,1,1,2,1]) self.assertEqual(p1.rowDegeneracy(cutoff=.75),[1,2,1,3,2]) #when a row seems to add up to the cutoff value, it's not #always found because of floating point error. E.g. second row #in this example self.assertEqual(p1.rowDegeneracy(cutoff=1),[2,4,1,4,2]) #when the cutoff can't be found, the number of columns in the #profile is returned (for each row) self.assertEqual(p1.rowDegeneracy(cutoff=1.5),[4,4,4,4,4]) self.assertEqual(p2.rowDegeneracy(cutoff=.95),[4,2,4,1]) self.assertEqual(p2.rowDegeneracy(cutoff=1.4),[4,3,4,1]) self.assertEqual(self.empty.rowDegeneracy(),[]) def test_columnDegeneracy(self): """columnDegeneracy: shoudl work as expected""" p1 = self.consensus p1.Data = transpose(p1.Data) p2 = self.not_same_value p2.Data = transpose(p2.Data) self.assertEqual(p1.columnDegeneracy(),[1,1,1,2,1]) self.assertEqual(p1.columnDegeneracy(cutoff=.5),[1,1,1,2,1]) self.assertEqual(p1.columnDegeneracy(cutoff=.75),[1,2,1,3,2]) #when a row seems to add up to the cutoff value, it's not #always found because of floating point error. E.g. 
second row #in this example self.assertEqual(p1.columnDegeneracy(cutoff=1),[2,4,1,4,2]) #when the cutoff can't be found, the number of rows in the #profile is returned (for each column) self.assertEqual(p1.columnDegeneracy(cutoff=1.5),[4,4,4,4,4]) self.assertEqual(p2.columnDegeneracy(cutoff=.95),[4,2,4,1]) self.assertEqual(p2.columnDegeneracy(cutoff=1.4),[4,3,4,1]) self.assertEqual(self.empty.columnDegeneracy(),[]) def test_rowMax(self): """rowMax should return max value in each row""" p1 = self.consensus obs = p1.rowMax() self.assertEqual(obs, array([.8, .7, 1, .4, .5])) def test_toConsensus(self): """toConsensus: should work with all the different options """ p = self.consensus self.assertEqual(p.toConsensus(fully_degenerate=False),"AGGAT") self.assertEqual(p.toConsensus(fully_degenerate=True),"WVGNY") self.assertEqual(p.toConsensus(cutoff=0.75),"ARGHY") self.assertEqual(p.toConsensus(cutoff=0.95),"WVGNY") self.assertEqual(p.toConsensus(cutoff=2),"WVGNY") p = self.not_same_value self.assertEqual(p.toConsensus(fully_degenerate=False),"CGTA") self.assertEqual(p.toConsensus(fully_degenerate=True),"NBYA") self.assertEqual(p.toConsensus(cutoff=0.75),"YSYA") self.assertEqual(p.toConsensus(cutoff=2),"NBYA") self.assertEqual(p.toConsensus(cutoff=5),"NBYA") #when you specify both fully_generate and a cutoff value #the cutoff takes priority and is used in the calculation self.assertEqual(p.toConsensus(cutoff=0.75,fully_degenerate=True),\ "YSYA") #raises AttributeError when Alphabet doens't have Degenerates p = Profile(array([[.2,.8],[.7,.3]]),"AB") self.assertRaises(AttributeError,p.toConsensus,cutoff=.5) def test_toConsensus_include_all(self): """toConsensus: Should include all possibilities when include_all=True """ p1 = Profile(array([[.2,0,.8,0],[0,.1,.2,.7],[0,0,0,1],\ [.2,.3,.4,.1],[.5,.5,0,0]]),\ Alphabet=DnaAlphabet, CharOrder="TCAG") self.assertEqual(p1.toConsensus(cutoff=0.4, include_all=True),\ "AGGAY") p2 = Profile(array([[.25,0.25,.25,0.25],[0.1,.1,.1,0],\ 
[.4,0,.4,0],[0,.2,0.2,0.3]]),\ Alphabet=DnaAlphabet, CharOrder="TCAG") self.assertEqual(p2.toConsensus(cutoff=0.4,\ include_all=True), "NHWV") def test_randomIndices(self): """randomIndices: 99% of new frequencies should be within 3*SD """ r_num, c_num = 100,20 num_elements = r_num*c_num r = random([r_num,c_num]) p = Profile(r,"A"*c_num) p.normalizePositions() d = p.Data n = 1000 #Test only works on normalized profile, b/c of 1-d below means = n*d three_stds = sqrt(d*(1-d)*n)*3 a = Alignment([p.randomIndices() for x in range(n)]) def absoluteProfile(alignment,char_order): f = a.columnFrequencies() res = zeros([len(f),len(char_order)]) for row, freq in enumerate(f): for i in freq: res[row, i] = freq[i] return res ap = absoluteProfile(a,p.CharOrder) failure = abs(ap-means) > three_stds assert sum(sum(failure))/num_elements <= 0.01 def test_randomSequence(self): """randomSequence: 99% of new frequencies should be within 3*SD""" r_num, c_num = 100,20 num_elements = r_num*c_num alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" r = random([r_num,c_num]) p = Profile(r,alpha[:c_num]) p.normalizePositions() d = p.Data n = 1000 #Test only works on normalized profile, b/c of 1-d below means = n*d three_stds = sqrt(d*(1-d)*n)*3 a = Alignment([p.randomSequence() for x in range(n)]) def absoluteProfile(alignment,char_order): f = a.columnFrequencies() res = zeros([len(f),len(char_order)]) for row, freq in enumerate(f): for i in freq: col = char_order.index(i) res[row, col] = freq[i] return res ap = absoluteProfile(a,p.CharOrder) failure = abs(ap-means) > three_stds assert sum(sum(failure))/num_elements <= 0.01