def test_hamming_distance_diff_length(self):
    """hamming_distance: truncates at shortest sequence"""
    # 'ABC' is compared with longer sequences; the expected distances
    # only account for the positions that both sequences share.
    reference = 'ABC'
    cases = (
        ('ABBDDD', 1),
        ('ABCDDD', 0),
        ('DDDDDD', 3),
    )
    for longer, expected in cases:
        observed = hamming_distance(array(reference, 'c'),
                                    array(longer, 'c'))
        self.assertEqual(observed, expected)
def test_hamming_distance_same_length(self):
    """hamming_distance: should return # of chars different"""
    # Equal-length comparisons against 'ABC': one mismatch, no mismatch,
    # and a mismatch at every position.
    reference = 'ABC'
    cases = (
        ('ABB', 1),
        ('ABC', 0),
        ('DDD', 3),
    )
    for other, expected in cases:
        observed = hamming_distance(array(reference, 'c'),
                                    array(other, 'c'))
        self.assertEqual(observed, expected)
def test_hamming_distance_diff_length(self):
    """hamming_distance: truncates at shortest sequence"""
    # The short sequence 'ABC' is paired with six-character sequences;
    # the expected values count differences over the shared prefix only.
    short_seq = 'ABC'
    expectations = [('ABBDDD', 1), ('ABCDDD', 0), ('DDDDDD', 3)]
    for long_seq, n_diff in expectations:
        result = hamming_distance(array(short_seq, 'c'),
                                  array(long_seq, 'c'))
        self.assertEqual(result, n_diff)
def test_hamming_distance_same_length(self):
    """hamming_distance: should return # of chars different"""
    # Three same-length partners for 'ABC' with 1, 0 and 3 differing
    # characters respectively.
    base_seq = 'ABC'
    expectations = [('ABB', 1), ('ABC', 0), ('DDD', 3)]
    for partner, n_diff in expectations:
        result = hamming_distance(array(base_seq, 'c'),
                                  array(partner, 'c'))
        self.assertEqual(result, n_diff)
def VOR(alignment, n=1000, force_monte_carlo=False, mc_threshold=1000):
    """Returns sequence weights according to the Voronoi weighting method.

    alignment: Alignment object
    n: sampling size (only relevant when monte carlo sampling is used)
    force_monte_carlo: always generate the pseudo seqs by monte carlo
        sampling, even if there is only a small number of possible unique
        pseudo seqs
    mc_threshold: if the number of possible pseudo seqs is larger than
        this threshold, monte carlo sampling is used; otherwise all
        possible pseudo seqs are used.

    Returns a normalized Weights object keyed by the sequence names.

    VOR differs from VA in the set of sequences against which all the
    sequences in the alignment are compared. In addition to the sequences
    in the alignment itself, it uses a set of pseudo sequences.

    Generating discrete random sequences:
    A discrete random sequence is generated by choosing, with equal
    likelihood at each position, one of the residues observed at that
    position in the alignment. A single occurrence in the alignment column
    is sufficient to make the residue type an option. Note: the choice is
    made with equal likelihood among the observed residues (independent of
    their frequency at that position). In earlier versions of the algorithm
    the characters were chosen either at the frequency with which they
    occur at a position or at the frequency with which they occur in the
    database. Both trials were unsuccessful, because they deviate from
    random sampling (see Sibbald & Argos 1990).

    Depending on the number of possible pseudo sequences, either all of
    them are used or a random sample is taken (monte carlo).

    Example: Alignment: AA, AA, BB

            AA      AA      BB
    AA      0 (.5)  0 (.5)  2
    AB      1 (1/3) 1 (1/3) 1 (1/3)
    BA      1 (1/3) 1 (1/3) 1 (1/3)
    BB      2       2       0 (1)
    -----------------------------
    total   7/6     7/6     10/6
    norm    .291    .291    .418

    For a bigger example with more pseudo sequences, see Henikoff 1994.

    The described optimization (pre-calculating the distance to the
    closest sequence) was tried; it doesn't have an advantage over the
    original method.
    """
    # Pick the pseudo-sequence generator: monte carlo when forced or when
    # the number of possible pseudo seqs exceeds the threshold.
    use_monte_carlo = (force_monte_carlo
                       or number_of_pseudo_seqs(alignment) > mc_threshold)
    sampling_method = (pseudo_seqs_monte_carlo if use_monte_carlo
                       else pseudo_seqs_exact)

    # Work on a dense, byte-based copy of the alignment.
    aln_array = DenseAlignment(alignment, MolType=BYTES)
    names = aln_array.Names
    weights = zeros(len(names), Float64)

    # Character arrays of the aligned sequences, built once up front.
    rows = [array(str(aligned), 'c') for aligned in aln_array.Seqs]

    # Every pseudo sequence casts votes based on its distance to each row;
    # the votes are accumulated into the running weights.
    for pseudo in sampling_method(aln_array, n=n):
        pseudo_chars = array(pseudo, 'c')
        distances = array([hamming_distance(row, pseudo_chars)
                           for row in rows])
        weights += row_to_vote(distances)

    weight_dict = Weights(dict(zip(names, weights)))
    weight_dict.normalize()
    return weight_dict
def VOR(alignment, n=1000, force_monte_carlo=False, mc_threshold=1000):
    """Returns sequence weights according to the Voronoi weighting method.

    alignment: Alignment object
    n: sampling size (only relevant when monte carlo sampling is used)
    force_monte_carlo: always generate the pseudo seqs by monte carlo
        sampling, even if there is only a small number of possible unique
        pseudo seqs
    mc_threshold: if the number of possible pseudo seqs is larger than
        this threshold, monte carlo sampling is used; otherwise all
        possible pseudo seqs are used.

    Returns a normalized Weights object keyed by the sequence names.

    VOR differs from VA in the set of sequences against which all the
    sequences in the alignment are compared. In addition to the sequences
    in the alignment itself, it uses a set of pseudo sequences.

    Generating discrete random sequences:
    A discrete random sequence is generated by choosing, with equal
    likelihood at each position, one of the residues observed at that
    position in the alignment. A single occurrence in the alignment column
    is sufficient to make the residue type an option. Note: the choice is
    made with equal likelihood among the observed residues (independent of
    their frequency at that position). In earlier versions of the algorithm
    the characters were chosen either at the frequency with which they
    occur at a position or at the frequency with which they occur in the
    database. Both trials were unsuccessful, because they deviate from
    random sampling (see Sibbald & Argos 1990).

    Depending on the number of possible pseudo sequences, either all of
    them are used or a random sample is taken (monte carlo).

    Example: Alignment: AA, AA, BB

            AA      AA      BB
    AA      0 (.5)  0 (.5)  2
    AB      1 (1/3) 1 (1/3) 1 (1/3)
    BA      1 (1/3) 1 (1/3) 1 (1/3)
    BB      2       2       0 (1)
    -----------------------------
    total   7/6     7/6     10/6
    norm    .291    .291    .418

    For a bigger example with more pseudo sequences, see Henikoff 1994.

    The described optimization (pre-calculating the distance to the
    closest sequence) was tried; it doesn't have an advantage over the
    original method.
    """
    # Decide how pseudo sequences are produced: sampled (monte carlo) when
    # forced or when there are more possibilities than the threshold,
    # exhaustively enumerated otherwise.
    if force_monte_carlo or number_of_pseudo_seqs(alignment) > mc_threshold:
        make_pseudo_seqs = pseudo_seqs_monte_carlo
    else:
        make_pseudo_seqs = pseudo_seqs_exact

    # Dense, byte-based representation of the alignment.
    dense_aln = DenseAlignment(alignment, MolType=BYTES)
    seq_names = dense_aln.Names
    totals = zeros(len(seq_names), Float64)

    # One character array per aligned sequence, reused for every pseudo seq.
    seq_rows = [array(str(member), 'c') for member in dense_aln.Seqs]

    for pseudo_seq in make_pseudo_seqs(dense_aln, n=n):
        candidate = array(pseudo_seq, 'c')
        # Distance from this pseudo seq to every sequence in the alignment.
        dists = array([hamming_distance(seq_row, candidate)
                       for seq_row in seq_rows])
        # Convert the distances into votes and add them to the tally.
        totals += row_to_vote(dists)

    result = Weights(dict(zip(seq_names, totals)))
    result.normalize()
    return result