def test_randomIndices(self):
    """randomIndices: 99% of new frequencies should be within 3*SD """
    # Build a random profile: 100 positions x 20 characters, then
    # normalize so each row (position) sums to 1.
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    r = random([r_num, c_num])
    p = Profile(r, "A" * c_num)
    p.normalizePositions()
    d = p.Data
    n = 1000
    # Test only works on normalized profile, b/c of 1-d below
    # Expected count per cell over n draws, and a 3-sigma tolerance band;
    # d*(1-d)*n is the binomial variance of each cell's count.
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3
    # Sample n times; transpose so each row of the alignment is one sample.
    result = [p.randomIndices() for x in range(n)]
    a = Alignment(transpose(result))

    def absoluteProfile(alignment, char_order):
        # Tally the observed symbol counts per alignment column.
        # NOTE(review): this reads the closure variable `a` rather than the
        # `alignment` parameter (harmless here since it is called with `a`),
        # and indexes result columns with ord(i) instead of
        # char_order.index(i) -- assumes the sampled symbols' ordinals fit
        # within len(char_order) columns; TODO confirm against randomIndices.
        f = a.columnFreqs()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                res[row, ord(i)] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    # With a 3-SD band, ~99.7% of cells should fall inside; tolerate up to
    # 1% outliers to keep the test robust against sampling noise.
    failure = abs(ap - means) > three_stds
    assert sum(sum(failure)) / num_elements <= 0.01
def test_randomSequence(self):
    """randomSequence: 99% of new frequencies should be within 3*SD"""
    # Build a random profile: 100 positions x 20 characters (A..T), then
    # normalize so each row (position) sums to 1.
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    r = random([r_num, c_num])
    p = Profile(r, alpha[:c_num])
    p.normalizePositions()
    d = p.Data
    n = 1000
    # Test only works on normalized profile, b/c of 1-d below
    # Expected count per cell over n draws, and a 3-sigma tolerance band;
    # d*(1-d)*n is the binomial variance of each cell's count.
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3
    # Draw n random sequences from the profile and align them.
    a = Alignment([p.randomSequence() for x in range(n)])

    def absoluteProfile(alignment, char_order):
        # Tally the observed character counts per alignment column, with
        # result columns ordered by char_order.
        # NOTE(review): reads the closure variable `a`, not the `alignment`
        # parameter -- harmless here since it is called with `a`.
        f = a.columnFreqs()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                col = char_order.index(i)
                res[row, col] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    # With a 3-SD band, ~99.7% of cells should fall inside; tolerate up to
    # 1% outliers to keep the test robust against sampling noise.
    failure = abs(ap - means) > three_stds
    assert sum(sum(failure)) / num_elements <= 0.01
def test_randomSequence(self):
    """randomSequence: 99% of new frequencies should be within 3*SD"""
    # random, row-normalized 100 x 20 profile over the first 20 letters
    n_rows, n_cols = 100, 20
    total_cells = n_rows * n_cols
    alpha = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    data = random([n_rows, n_cols])
    prof = Profile(data, alpha[:n_cols])
    prof.normalizePositions()
    freqs = prof.Data
    trials = 1000
    # the (1 - freqs) term below is only valid on a normalized profile
    expected = trials * freqs
    tolerance = 3 * sqrt(freqs * (1 - freqs) * trials)
    aln = Alignment([prof.randomSequence() for _ in range(trials)])

    def observed_counts(alignment, char_order):
        # observed per-column character counts, columns ordered by char_order
        col_freqs = aln.columnFreqs()
        counts = zeros([len(col_freqs), len(char_order)])
        for pos, freq in enumerate(col_freqs):
            for ch in freq:
                counts[pos, char_order.index(ch)] = freq[ch]
        return counts

    observed = observed_counts(aln, prof.CharOrder)
    # at most 1% of cells may fall outside the 3-SD band
    outliers = abs(observed - expected) > tolerance
    assert sum(sum(outliers)) / total_cells <= 0.01
def test_randomIndices(self):
    """randomIndices: 99% of new frequencies should be within 3*SD """
    # Build a random profile: 100 positions x 20 characters, then
    # normalize so each row (position) sums to 1.
    r_num, c_num = 100, 20
    num_elements = r_num * c_num
    r = random([r_num, c_num])
    p = Profile(r, "A" * c_num)
    p.normalizePositions()
    d = p.Data
    n = 1000
    #Test only works on normalized profile, b/c of 1-d below
    # Expected count per cell over n draws and a 3-sigma band;
    # d*(1-d)*n is the binomial variance of each cell's count.
    means = n * d
    three_stds = sqrt(d * (1 - d) * n) * 3
    # Sample n times; transpose so each row of the alignment is one sample.
    result = [p.randomIndices() for x in range(n)]
    a = Alignment(transpose(result))

    def absoluteProfile(alignment, char_order):
        # Tally observed symbol counts per alignment column.
        # NOTE(review): reads the closure variable `a`, not the `alignment`
        # parameter (harmless here), and indexes columns with ord(i) rather
        # than char_order.index(i) -- assumes the sampled symbols' ordinals
        # fit within len(char_order) columns; TODO confirm.
        f = a.columnFreqs()
        res = zeros([len(f), len(char_order)])
        for row, freq in enumerate(f):
            for i in freq:
                res[row, ord(i)] = freq[i]
        return res

    ap = absoluteProfile(a, p.CharOrder)
    # With a 3-SD band, ~99.7% of cells should be inside; allow 1% outliers.
    failure = abs(ap - means) > three_stds
    assert sum(sum(failure)) / num_elements <= 0.01
def test_copy(self):
    """copy: should act as expected while rebinding/modifying attributes """
    original = Profile(
        array([[1, 1], [.7, .3]]),
        {'A': 'A', 'G': 'G', 'R': 'AG'},
        "AG",
    )
    clone = original.copy()
    # a fresh copy shares all three attributes by identity
    assert original.Data is clone.Data
    assert original.Alphabet is clone.Alphabet
    assert original.CharOrder is clone.CharOrder
    # in-place mutation of the shared array is visible through the copy
    original.Data[1, 1] = 100
    assert original.Alphabet is clone.Alphabet
    # normalizePositions rebinds Data, so the copy keeps the old array
    original.normalizePositions()
    assert original.Data is not clone.Data
    # the Alphabet mapping is shared, so additions show up in both
    original.Alphabet['Y'] = 'TC'
    assert original.Alphabet is clone.Alphabet
    # rebinding CharOrder affects only the object it is set on
    original.CharOrder = 'XX'
    assert original.CharOrder is not clone.CharOrder
def mVOR(alignment, n=1000, order=DNA_ORDER):
    """Returns sequence weights according to the modified Voronoi method.

    alignment: Alignment object
    n: sample size (=number of random profiles to be generated)
    order: specifies the order of the characters found in the alignment,
        used to build the sequence and random profiles.

    mVOR is a modification of the VOR method. Instead of generating
    discrete random sequences, it generates random profiles, to sample
    more equally from the sequence space and to prevent random sequences
    to be equidistant to multiple sequences in the alignment.

    See the Implementation notes to see how the random profiles are
    generated and compared to the 'sequence profiles' from the alignment.

    Random generalized sequences (or a profile filled with random numbers):
    Sequences that are equidistant to multiple sequences in the alignment
    can form a problem in small datasets. For longer sequences the
    likelihood of this event is negligable. Generating 'random generalized
    sequences' is a solution, because we're then sampling from continuous
    sequence space. Each column of a random profile is generated by
    normalizing a set of independent, exponentially distributed random
    numbers. In other words, a random profile is a two-dimensional array
    (rows are chars in the alphabet, columns are positions in the
    alignment) filled with a random numbers, sampled from the standard
    exponential distribution (lambda=1, and thus the mean=1), where each
    column is normalized to one. These random profiles are compared to the
    special profiles of just one sequence (ones for the single character
    observed at that position). The distance between the two profiles is
    simply the Euclidean distance.
    """
    weights = zeros(len(alignment.Names), Float64)
    # get seq profiles (no need to materialize .items() into a list first)
    seq_profiles = {}
    for k, v in alignment.items():
        seq_profiles[k] = SeqToProfile(v, alphabet=order)
    for count in range(n):
        # generate a random profile
        exp = exponential(1, [alignment.SeqLen, len(order)])
        r = Profile(Data=exp, Alphabet=order)
        r.normalizePositions()
        # distance between the random profile and each sequence profile,
        # in alignment.Names order
        temp = [seq_profiles[key].distance(r) for key in alignment.Names]
        votes = row_to_vote(array(temp))
        weights += votes
    # dict(zip(...)) consumes the zip iterator directly
    weight_dict = Weights(dict(zip(alignment.Names, weights)))
    weight_dict.normalize()
    return weight_dict
def mVOR(alignment, n=1000, order=DNA_ORDER):
    """Return normalized sequence weights via the modified Voronoi method.

    alignment: Alignment object
    n: sample size (number of random profiles to generate)
    order: character order used to build both the sequence profiles and
        the random profiles

    Unlike plain VOR, which draws discrete random sequences, mVOR draws
    random *profiles*: each column holds independent draws from the
    standard exponential distribution (lambda=1, mean=1), normalized to
    sum to one. This samples continuous sequence space, so a random point
    being equidistant to several alignment sequences (a real risk for
    short alignments) becomes vanishingly unlikely. Each random profile
    is compared to every per-sequence profile by Euclidean distance; the
    distance row is converted into votes by row_to_vote and accumulated,
    and the accumulated votes are normalized into the returned Weights.
    """
    # one profile per aligned sequence, keyed by sequence name
    seq_profiles = dict(
        (name, SeqToProfile(seq, alphabet=order))
        for name, seq in alignment.items())
    vote_totals = zeros(len(alignment.Names), Float64)
    for _ in range(n):
        # one random generalized sequence: exponential draws, then
        # position-normalized
        rand_profile = Profile(
            Data=exponential(1, [alignment.SeqLen, len(order)]),
            Alphabet=order)
        rand_profile.normalizePositions()
        dists = [seq_profiles[name].distance(rand_profile)
                 for name in alignment.Names]
        vote_totals += row_to_vote(array(dists))
    result = Weights(dict(zip(alignment.Names, vote_totals)))
    result.normalize()
    return result
def test_normalizePositions(self):
    """normalizePositions: should normalize or raise appropriate error """
    # Normal case: every row is divided by its own sum, so rows sum to 1.
    p = self.full.copy()
    p.normalizePositions()
    self.assertEqual(p.Data, array([[2 / 6, 4 / 6], [3 / 8, 5 / 8], [4 / 12, 8 / 12]]))
    self.assertEqual(sum(p.Data, 1), [1, 1, 1])
    # A zero column is fine as long as each row still has a nonzero sum.
    p = self.empty_col.copy()
    p.normalizePositions()
    self.assertEqual(p.Data, array([[0, 1], [0, 1]]))
    # An all-zero row cannot be normalized -> ProfileError.
    p = self.empty_row.copy()
    self.assertRaises(ProfileError, p.normalizePositions)
    p = Profile(array([[0.0, 0.0]]), "AB")
    self.assertRaises(ProfileError, p.normalizePositions)
    # negative numbers!!!!!!
    # Rows [3,-2] and [4,-3] each sum to 1, so dividing by the row sum
    # leaves the data unchanged (negatives are tolerated).
    p1 = Profile(array([[3, -2], [4, -3]]), "AB")
    p1.normalizePositions()
    self.assertEqual(p1.Data, array([[3, -2], [4, -3]]))
    # ...but a row summing to zero ([3,-3]) must raise.
    p2 = Profile(array([[3, -3], [4, -3]]), "AB")
    self.assertRaises(ProfileError, p2.normalizePositions)
def test_normalizePositions(self):
    """normalizePositions: should normalize or raise appropriate error """
    # normal case: each row is scaled by its own sum, so rows sum to 1
    prof = self.full.copy()
    prof.normalizePositions()
    self.assertEqual(
        prof.Data, array([[2 / 6, 4 / 6], [3 / 8, 5 / 8], [4 / 12, 8 / 12]]))
    self.assertEqual(sum(prof.Data, 1), [1, 1, 1])
    # a zero column is fine as long as every row has a nonzero sum
    prof = self.empty_col.copy()
    prof.normalizePositions()
    self.assertEqual(prof.Data, array([[0, 1], [0, 1]]))
    # an all-zero row cannot be normalized
    prof = self.empty_row.copy()
    self.assertRaises(ProfileError, prof.normalizePositions)
    zero_prof = Profile(array([[0.0, 0.0]]), "AB")
    self.assertRaises(ProfileError, zero_prof.normalizePositions)
    # negative entries are tolerated when the row total is nonzero;
    # [3,-2] and [4,-3] each sum to 1, so the data comes back unchanged
    neg_ok = Profile(array([[3, -2], [4, -3]]), "AB")
    neg_ok.normalizePositions()
    self.assertEqual(neg_ok.Data, array([[3, -2], [4, -3]]))
    # ...but a row summing to zero ([3,-3]) must raise
    neg_bad = Profile(array([[3, -3], [4, -3]]), "AB")
    self.assertRaises(ProfileError, neg_bad.normalizePositions)
def test_copy(self):
    """copy: should act as expected while rebinding/modifying attributes """
    p = Profile(array([[1, 1], [0.7, 0.3]]), {"A": "A", "G": "G", "R": "AG"}, "AG")
    p_copy = p.copy()
    # a fresh copy shares Data, Alphabet and CharOrder by identity
    assert p.Data is p_copy.Data
    assert p.Alphabet is p_copy.Alphabet
    assert p.CharOrder is p_copy.CharOrder
    # modifying p.Data modifies p_copy.Data
    p.Data[1, 1] = 100
    assert p.Alphabet is p_copy.Alphabet
    # normalizing p.Data rebinds it, so p_copy.Data is unchanged
    p.normalizePositions()
    assert not p.Data is p_copy.Data
    # Adding something to the alphabet changes both p and p_copy
    p.Alphabet["Y"] = "TC"
    assert p.Alphabet is p_copy.Alphabet
    # Rebinding the CharOrder does only change the original
    p.CharOrder = "XX"
    assert not p.CharOrder is p_copy.CharOrder
def test_copy(self):
    """copy: should act as expected while rebinding/modifying attributes """
    src = Profile(array([[1, 1], [.7, .3]]), {'A': 'A', 'G': 'G', 'R': 'AG'}, "AG")
    dup = src.copy()
    # a fresh copy shares all three attributes by identity
    for attr in ('Data', 'Alphabet', 'CharOrder'):
        assert getattr(src, attr) is getattr(dup, attr)
    # shared array: in-place writes are visible from both objects
    src.Data[1, 1] = 100
    assert src.Alphabet is dup.Alphabet
    # normalizePositions rebinds src.Data to a new array
    src.normalizePositions()
    assert src.Data is not dup.Data
    # the alphabet mapping itself is shared
    src.Alphabet['Y'] = 'TC'
    assert src.Alphabet is dup.Alphabet
    # rebinding an attribute only affects the object it is set on
    src.CharOrder = 'XX'
    assert src.CharOrder is not dup.CharOrder
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,
                 weights=None):
    """Generates a Profile object from an Alignment.

    aln: Alignment object
    alphabet (optional): an Alphabet object (or list of chars, but if you
        want to split degenerate symbols, the alphabet must have a
        Degenerates property. Default is the alphabet of the first seq in
        the alignment.
    char_order (optional): order of the characters in the profile. Default
        is list(alphabet)
    split_degenerates (optional): Whether you want the counts for the
        degenerate symbols to be divided over the non-degenerate symbols
        they code for.
    weights (optional): dictionary of seq_id: weight. If not entered all
        seqs are weighted equally

    A Profile is a position x character matrix describing which characters
    occur at each position of an alignment. The Profile is always
    normalized, so it gives the probabilities of each character at each
    position.

    Ignoring chars: you can ignore characters in the alignment by not
    putting the char in the CharOrder. If you ignore all characters at a
    particular position, an error will be raised, because the profile
    can't be normalized.

    Splitting degenerates: you can split degenerate characters over the
    non-degenerate characters they code for. For example: R = A or G. So,
    an R at a position counts for 0.5 A and 0.5 G.

    Example:
    seq1 TCAG    weight: 0.5
    seq2 TAR-    weight: 0.25
    seq3 YAG-    weight: 0.25
    Profile(aln, alphabet=DNA, char_order="TACG", weights=w,
        split_degenerates=True)
    Profile:
       T      A      C      G
    [[ 0.875  0.     0.125  0.   ]
     [ 0.     0.5    0.5    0.   ]
     [ 0.     0.625  0.     0.375]
     [ 0.     0.     0.     1.   ]]
    """
    # Py3 fixes vs. the original: dict views aren't subscriptable, the
    # py2 `except Exception, e` / `raise ValueError, e` forms are syntax
    # errors, and the function fell off the end without returning result.
    if alphabet is None:
        alphabet = list(aln.values())[0].MolType
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        # equal weighting by default (true division, so weights sum to 1)
        weights = dict.fromkeys(list(aln.keys()), 1 / len(aln))
    # per-character contribution rows, indexed by the char's byte value
    char_meaning = CharMeaningProfile(alphabet, char_order,
                                      split_degenerates)
    profiles = []
    for k, v in aln.items():
        # map each sequence to byte indices, then to weighted count rows
        idxs = array(str(v).upper(), 'c').view(UInt8)
        profiles.append(char_meaning.Data[idxs] * weights[k])
    s = reduce(add, profiles)
    result = Profile(s, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception as e:
        # typically: a position summed to zero because every character in
        # that column was ignored (absent from char_order)
        raise ValueError(e)
    return result
def AlnToProfile(aln, alphabet=None, char_order=None, split_degenerates=False,
                 weights=None):
    """Build a normalized position x character Profile from an Alignment.

    aln: Alignment object
    alphabet (optional): Alphabet object or list of chars; must have a
        Degenerates property if split_degenerates is used. Defaults to the
        MolType of the first sequence in the alignment.
    char_order (optional): column order of the profile; defaults to
        list(alphabet). Characters left out of char_order are ignored --
        if every char at some position is ignored, normalization fails.
    split_degenerates (optional): divide counts for degenerate symbols
        over the non-degenerate symbols they code for (e.g. R = A or G,
        so an R contributes 0.5 A and 0.5 G).
    weights (optional): dict of seq_id -> weight; defaults to weighting
        all sequences equally.

    The result is always normalized, so each position gives the
    probability of each character. Raises ValueError when a position
    cannot be normalized (e.g. all its characters were ignored).
    """
    if alphabet is None:
        alphabet = list(aln.values())[0].MolType
    if char_order is None:
        char_order = list(alphabet)
    if weights is None:
        # equal weighting by default
        weights = dict.fromkeys(list(aln.keys()), 1 / len(aln))
    # lookup table mapping each character (by byte value) to its row of
    # per-column contributions
    char_meaning = CharMeaningProfile(alphabet, char_order,
                                      split_degenerates)
    weighted_counts = []
    for name, seq in aln.items():
        char_idxs = array(str(seq).upper(), 'c').view(UInt8)
        weighted_counts.append(char_meaning.Data[char_idxs] * weights[name])
    summed = reduce(add, weighted_counts)
    result = Profile(summed, alphabet, char_order)
    try:
        result.normalizePositions()
    except Exception as e:
        # typically: a position summed to zero because every character in
        # the corresponding alignment column was ignored
        raise ValueError(e)
    return result