def lowess(x, y, f=2. / 3., iter=3):
    """lowess(x, y, f=2./3., iter=3) -> yest

    Lowess smoother: Robust locally weighted regression.
    The lowess function fits a nonparametric regression curve to a
    scatterplot. The arrays x and y contain an equal number of elements;
    each pair (x[i], y[i]) defines a data point in the scatterplot. The
    function returns the estimated (smooth) values of y.

    The smoothing span is given by f. A larger value for f will result in
    a smoother curve. The number of robustifying iterations is given by
    iter. The function will run faster with a smaller number of
    iterations.

    x and y should be numpy float arrays of equal length.  The return
    value is also a numpy float array of that length.

    If the median absolute residual becomes exactly zero, the remaining
    robustifying iterations are skipped and the current fit is returned.

    e.g.
    >>> import numpy
    >>> x = numpy.array([4,  4,  7,  7,  8,  9, 10, 10, 10, 11, 11, 12, 12, 12,
    ...                 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15, 15, 16, 16,
    ...                 17, 17, 17, 18, 18, 18, 18, 19, 19, 19, 20, 20, 20, 20,
    ...                 20, 22, 23, 24, 24, 24, 24, 25], float)
    >>> y = numpy.array([2, 10,  4, 22, 16, 10, 18, 26, 34, 17, 28, 14, 20, 24,
    ...                 28, 26, 34, 34, 46, 26, 36, 60, 80, 20, 26, 54, 32, 40,
    ...                 32, 40, 50, 42, 56, 76, 84, 36, 46, 68, 32, 48, 52, 56,
    ...                 64, 66, 54, 70, 92, 93, 120, 85], float)
    >>> result = lowess(x, y)
    >>> len(result)
    50
    >>> print("[%0.2f, ..., %0.2f]" % (result[0], result[-1]))
    [4.85, ..., 84.98]
    """
    n = len(x)
    # Index (into the sorted distances) of the neighbour that sets the
    # local bandwidth.  Clamp it so a span f close to 1 cannot index past
    # the last element.
    r = min(int(numpy.ceil(f * n)), n - 1)
    h = [numpy.sort(abs(x - x[i]))[r] for i in range(n)]
    # Tricube kernel weights: w[j, i] = (1 - |(x[j] - x[i]) / h[i]|^3)^3
    w = numpy.clip(abs(([x] - numpy.transpose([x])) / h), 0.0, 1.0)
    w = 1 - w * w * w
    w = w * w * w
    yest = numpy.zeros(n)
    delta = numpy.ones(n)
    for iteration in range(iter):
        for i in range(n):
            # Weighted least-squares line through the neighbourhood of x[i],
            # solved in closed form from the 2x2 normal equations.
            weights = delta * w[:, i]
            weights_mul_x = weights * x
            b1 = numpy.dot(weights, y)
            b2 = numpy.dot(weights_mul_x, y)
            A11 = sum(weights)
            A12 = sum(weights_mul_x)
            A21 = A12
            A22 = numpy.dot(weights_mul_x, x)
            determinant = A11 * A22 - A12 * A21
            beta1 = (A22 * b1 - A12 * b2) / determinant
            beta2 = (A11 * b2 - A21 * b1) / determinant
            yest[i] = beta1 + beta2 * x[i]
        residuals = y - yest
        s = numpy.median(abs(residuals))
        if s == 0:
            # Dividing by a zero scale would turn the robustness weights
            # into NaN and poison the next pass; keep the current fit.
            break
        # Bisquare robustness weights for the next pass.
        delta[:] = numpy.clip(residuals / (6 * s), -1, 1)
        delta[:] = 1 - delta * delta
        delta[:] = delta * delta
    return yest
def make_instances_from_counts(self):
    """Build "fake" instances for a motif defined by a count matrix.

    The number of instances equals the sum of the counts in the first
    column.  A column whose counts sum to less than that is padded with
    letters taken cyclically from the alphabet (a warning is printed).

    Resets self.instances, sets self.has_instances and returns
    self.instances after adding every generated instance.
    """
    padding_letters = "".join(self.alphabet.letters)
    self.has_instances = True
    self.instances = []
    n_instances = sum(self.counts[nuc][0] for nuc in self.alphabet.letters)

    # columns[pos] is one column of the (virtual) alignment of instances
    columns = []
    for pos in range(self.length):
        column = ""
        for letter in self.alphabet.letters:
            column = column + letter * (self.counts[letter][pos])
        if len(column) < n_instances:
            print("WARNING, column too short %i %i" % (len(column), n_instances))
            missing = n_instances - len(column)
            column += (padding_letters * n_instances)[:missing]
        columns.append(column)

    # read the alignment row by row to obtain the instances
    for row in range(n_instances):
        residues = "".join(columns[pos][row] for pos in range(self.length))
        self.add_instance(Seq(residues, self.alphabet))
    return self.instances
def forward_algorithm(self):
    """Calculate sequence probability using the forward algorithm.

    This implements the forward algorithm, as described on p57-58 of
    Durbin et al.

    Returns:

    o A dictionary containing the forward variables. This has keys of the
    form (state letter, position in the training sequence), and values
    containing the calculated forward variable.

    o The calculated probability of the sequence.
    """
    states = self._seq.states.alphabet.letters
    begin_state = states[0]

    # Initialisation.  Positions run from 0 to L - 1 here (not 1 to L as
    # in Durbin et al.), so the initial column is stored at index -1:
    # f_0 = 1 for the first state and 0 for every other state.
    forward_var = {(begin_state, -1): 1}
    for idx in range(1, len(states)):
        forward_var[(states[idx], -1)] = 0

    # Recursion over every position and every state; a None result from
    # the recursion helper is not stored.
    for pos in range(len(self._seq.emissions)):
        for state in states:
            value = self._forward_recursion(state, pos, forward_var)
            if value is not None:
                forward_var[(state, pos)] = value

    # Termination: sum over k of f_k(L) * a_k0
    seq_prob = 0
    for state in states:
        final_value = forward_var[(state, len(self._seq.emissions) - 1)]
        to_begin = self._mm.transition_prob[(state, begin_state)]
        seq_prob += final_value * to_begin

    return forward_var, seq_prob
def forward_algorithm(self):
    """Calculate sequence probability using the forward algorithm.

    This implements the forward algorithm, as described on p57-58 of
    Durbin et al.

    Returns:

    o A dictionary containing the forward variables, keyed by
    (state letter, position in the training sequence).

    o The calculated probability of the sequence.
    """
    letters = self._seq.states.alphabet.letters

    # -- initialisation
    # Indices are one less than in Durbin et al. because the sequence is
    # indexed 0 .. L - 1, so the starting column sits at position -1.
    forward_var = {}
    for rank in range(len(letters)):
        # f_0(0) = 1 for the first state, f_k(0) = 0 for all others
        forward_var[(letters[rank], -1)] = 1 if rank == 0 else 0

    # -- recursion
    for position in range(len(self._seq.emissions)):
        for letter in letters:
            result = self._forward_recursion(letter, position, forward_var)
            if result is None:
                continue
            forward_var[(letter, position)] = result

    # -- termination: accumulate f_k(L) * a_k0 over all states
    start_letter = letters[0]
    seq_prob = 0
    for letter in letters:
        f_last = forward_var[(letter, len(self._seq.emissions) - 1)]
        a_to_start = self._mm.transition_prob[(letter, start_letter)]
        seq_prob += f_last * a_to_start

    return forward_var, seq_prob
def calculate(self, sequence):
    """Return the PWM score for a given sequence for all positions.

    - the sequence can only be a DNA sequence
    - the search is performed only on one strand
    - if the sequence and the motif have the same length, a single
      number is returned
    - otherwise, the result is a one-dimensional list or numpy array

    Raises ValueError when the motif's or the sequence's alphabet is not
    an IUPAC.IUPACUnambiguousDNA instance.
    """
    # TODO - Code itself tolerates ambiguous bases (as NaN).
    if not isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA):
        raise ValueError(
            "PSSM has wrong alphabet: %s - Use only with DNA motifs"
            % self.alphabet)
    if not isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA):
        raise ValueError(
            "Sequence has wrong alphabet: %r - Use only with DNA sequences"
            % sequence.alphabet)

    # TODO - Force uppercase here and optimise switch statement in C
    # by assuming upper case?
    sequence = str(sequence)
    width = self.length
    seq_len = len(sequence)
    scores = []
    try:
        # prefer the fast C implementation when it can be imported
        import _pwm
    except ImportError:
        # Pure Python fallback.  The C code handles mixed case, so the
        # Python version must too.
        sequence = sequence.upper()
        for start in range(seq_len - width + 1):
            window_score = 0.0
            for offset in range(width):
                base = sequence[start + offset]
                try:
                    window_score += self[base][offset]
                except KeyError:
                    # unknown letter: the whole window scores NaN
                    window_score = _nan
                    break
            scores.append(window_score)
    else:
        # one row per position, holding the log-odds in A, C, G, T order
        logodds = [[self[base][pos] for base in "ACGT"]
                   for pos in range(width)]
        scores = _pwm.calculate(sequence, logodds)

    return scores[0] if len(scores) == 1 else scores
def calculate(self, sequence):
    """Score a DNA sequence against this matrix at every start position.

    - the sequence can only be a DNA sequence
    - the search is performed only on one strand
    - if the sequence and the motif have the same length, a single
      number is returned
    - otherwise, the result is a one-dimensional list or numpy array

    A ValueError is raised if self.alphabet or sequence.alphabet is not
    an instance of IUPAC.IUPACUnambiguousDNA.
    """
    # TODO - Code itself tolerates ambiguous bases (as NaN).
    motif_ok = isinstance(self.alphabet, IUPAC.IUPACUnambiguousDNA)
    if not motif_ok:
        message = "PSSM has wrong alphabet: %s - Use only with DNA motifs"
        raise ValueError(message % self.alphabet)
    sequence_ok = isinstance(sequence.alphabet, IUPAC.IUPACUnambiguousDNA)
    if not sequence_ok:
        message = "Sequence has wrong alphabet: %r - Use only with DNA sequences"
        raise ValueError(message % sequence.alphabet)

    # TODO - Force uppercase here and optimise switch statement in C
    # by assuming upper case?
    sequence = str(sequence)
    m = self.length
    n = len(sequence)
    scores = []
    # check if the fast C code can be used
    try:
        import _pwm
    except ImportError:
        # use the slower Python code otherwise; the C code handles mixed
        # case so the Python version must too
        sequence = sequence.upper()
        first = 0
        while first < n - m + 1:
            total = 0.0
            for column in range(m):
                symbol = sequence[first + column]
                try:
                    total += self[symbol][column]
                except KeyError:
                    total = _nan
                    break
            scores.append(total)
            first += 1
    else:
        # get the log-odds matrix into a proper shape
        # (each row contains sorted (ACGT) log-odds values)
        logodds = []
        for column in range(m):
            logodds.append([self[symbol][column] for symbol in "ACGT"])
        scores = _pwm.calculate(sequence, logodds)

    if len(scores) != 1:
        return scores
    return scores[0]
def matches_schema(pattern, schema, ambiguity_character='*'):
    """Determine whether or not the given pattern matches the schema.

    Arguments:

    o pattern - A string representing the pattern we want to check for
    matching. This pattern can contain ambiguity characters (which are
    assumed to be the same as those in the schema).

    o schema - A string schema with ambiguity characters.

    o ambiguity_character - The character used for ambiguity in the schema.

    Returns 1 for a match and 0 otherwise; strings of different length
    never match.
    """
    if len(pattern) != len(schema):
        return 0

    for pattern_char, schema_char in zip(pattern, schema):
        # an ambiguous character on either side matches anything
        if schema_char == ambiguity_character:
            continue
        if pattern_char == ambiguity_character:
            continue
        if pattern_char != schema_char:
            return 0

    return 1
def pwm(self,laplace=True):
    """Return the PWM computed for the set of instances.

    If laplace=True (default), pseudocounts equal to self.background
    multiplied by self.beta are added to all positions.

    The result is cached in self._pwm: while self._pwm_is_current is true
    the cached list is returned unchanged, whatever the value of laplace.
    """
    if self._pwm_is_current:
        return self._pwm

    # the cache is stale, so rebuild it one column at a time
    self._pwm = []
    for pos in range(self.length):
        # start every letter at its pseudocount (or at zero)
        column = {}
        for letter in self.alphabet.letters:
            column[letter] = self.beta*self.background[letter] if laplace else 0.0

        if self.has_counts:
            # take the raw counts
            for letter in self.alphabet.letters:
                column[letter] += self.counts[letter][pos]
        elif self.has_instances:
            # count the occurrences of letters in the instances,
            # ignoring letters that are not in the alphabet
            for seq in self.instances:
                symbol = seq[pos]
                if symbol in column:
                    column[symbol] += 1

        self._pwm.append(FreqTable.FreqTable(column, FreqTable.COUNT, self.alphabet))

    self._pwm_is_current=1
    return self._pwm
def calculate_pseudocounts(motif):
    """Return a dict of per-letter pseudocounts for the given motif.

    Each pseudocount is the normalised background frequency of the letter
    multiplied by the square root of the average column sum of
    motif.counts.  A uniform background is used when motif.background is
    empty or None.  The motif's own background is not modified.
    """
    letters = motif.alphabet.letters
    background = motif.background

    # Column sums may differ, so work with the average number of instances.
    count_sum = 0
    for column in range(motif.length):
        count_sum += sum(float(motif.counts[letter][column]) for letter in letters)
    scale = math.sqrt(count_sum / motif.length)

    if background:
        frequencies = dict(background)
    else:
        frequencies = dict.fromkeys(sorted(letters), 1.0)
    norm = sum(frequencies.values())

    pseudocounts = {}
    for letter in letters:
        frequency = frequencies[letter] / norm
        pseudocounts[letter] = scale * frequency
    return pseudocounts
def normalize(self, pseudocounts=None):
    """Create and return a position-weight matrix from the counts matrix.

    If pseudocounts is None (default), no pseudocounts are added to the
    counts.
    If pseudocounts is a number, it is added to the counts before
    calculating the position-weight matrix.
    Alternatively, the pseudocounts can be a dictionary with a key for
    each letter in the alphabet associated with the motif.
    """
    letters = self.alphabet.letters

    # start every row at its pseudocount ...
    counts = {}
    for letter in letters:
        if pseudocounts is None:
            initial = 0.0
        elif isinstance(pseudocounts, dict):
            initial = float(pseudocounts[letter])
        else:
            initial = float(pseudocounts)
        counts[letter] = [initial] * self.length

    # ... then add the observed counts on top
    for position in range(self.length):
        for letter in letters:
            row = counts[letter]
            row[position] = row[position] + self[letter][position]

    # Actual normalization is done in the PositionWeightMatrix initializer
    return PositionWeightMatrix(self.alphabet, counts)
def std(self, background=None):
    """Standard deviation of the score of a motif.

    background maps letters to (unnormalised) frequencies; a uniform
    background is used when it is None.  Log-odds values that are NaN or
    negative infinity are skipped.
    """
    letters = self._letters
    if background is None:
        frequencies = dict.fromkeys(letters, 1.0)
    else:
        frequencies = dict(background)
    norm = sum(frequencies.values())
    for letter in letters:
        frequencies[letter] /= norm

    variance = 0.0
    for column in range(self.length):
        first_moment = 0.0
        second_moment = 0.0
        for letter in letters:
            logodds = self[letter, column]
            if _isnan(logodds) or (_isinf(logodds) and logodds < 0):
                continue
            # probability of the letter under the motif: b * 2**logodds
            weight = frequencies[letter] * math.pow(2, logodds)
            first_moment += weight * logodds
            second_moment += weight * logodds * logodds
        second_moment -= first_moment * first_moment
        variance += second_moment

    # clamp at zero to avoid roundoff problems
    return math.sqrt(max(variance, 0))
def representation(self, sequence):
    """Represent the given input sequence as a bunch of motif counts.

    Arguments:

    o sequence - A Bio.Seq object we are going to represent as schemas.

    This takes the sequence, searches for the motifs within it, and then
    returns counts specifying the relative number of times each motifs was
    found. The frequencies are in the order the original motifs were passed
    into the initializer.

    Returns a list with one entry per schema.  When at least one schema
    matched, every entry is the match count divided by the largest count
    (a float between zero and one); otherwise the raw (all zero) counts
    are returned.  An empty list is returned when there are no schemas.
    """
    text = str(sequence)
    schema_counts = [self._converter.num_matches(schema, text)
                     for schema in self._schemas]

    # max() of an empty list raises ValueError, so handle "no schemas" here
    if not schema_counts:
        return schema_counts

    # only normalize if we've actually found something, otherwise
    # we'll just return 0 for everything
    max_count = max(schema_counts)
    if max_count > 0:
        largest = float(max_count)
        schema_counts = [float(count) / largest for count in schema_counts]

    return schema_counts
def _crossover( self, x, no, locs ):
    """Generalized Crossover Function:

    arguments:
        x (int)        - genome number [0|1]
        no (organism,organism)
                       - new organisms
        locs (int list, int list)
                       - lists of locations,
                         [0, +n points+, bound]
                         for each genome (sync'd with x)

    return type: sequence (to replace no[x])
    """
    # the result starts with the head of genome x, up to its first point
    child = no[x].genome[:locs[x][1]]
    for segment in range(1, self._npoints):
        # alternate between genome_0 and genome_1
        source = (x + segment) % 2
        # locs[source] is [0, +n points+, bound], so consecutive entries
        # delimit the segment to copy
        begin = locs[source][segment]
        end = locs[source][segment + 1]
        piece = no[source].genome[begin:end]
        child = child + piece if child else piece
    return child
def representation(self, sequence):
    """Represent the given input sequence as a bunch of motif counts.

    Arguments:

    o sequence - A Bio.Seq object we are going to represent as schemas.

    This takes the sequence, searches for the motifs within it, and then
    returns counts specifying the relative number of times each motifs was
    found. The frequencies are in the order the original motifs were passed
    into the initializer.

    The returned list has one entry per schema: the number of matches
    divided by the highest number of matches when anything matched at all,
    or the unchanged zero counts when nothing matched.  With no schemas
    the result is an empty list.
    """
    sequence_text = str(sequence)
    schema_counts = []
    for schema in self._schemas:
        schema_counts.append(
            self._converter.num_matches(schema, sequence_text))

    # Without schemas there is nothing to normalize, and max() would
    # raise ValueError on the empty list.
    if len(schema_counts) == 0:
        return schema_counts

    max_count = max(schema_counts)
    # only normalize if we've actually found something, otherwise
    # we'll just return 0 for everything
    if max_count > 0:
        denominator = float(max_count)
        for index, count in enumerate(schema_counts):
            schema_counts[index] = float(count) / denominator

    return schema_counts
def std(self, background=None):
    """Standard deviation of the score of a motif.

    A uniform background over self._letters is assumed when background is
    None; otherwise a copy of the given mapping is normalised to sum to
    one.  NaN and negative-infinity log-odds entries contribute nothing.
    """
    if background is not None:
        background = dict(background)
    else:
        background = dict.fromkeys(self._letters, 1.0)
    total = sum(background.values())
    for letter in self._letters:
        background[letter] /= total

    variance = 0.0
    for position in range(self.length):
        # per-column E[x] and E[x^2] of the log-odds score
        expected = 0.0
        expected_square = 0.0
        for letter in self._letters:
            logodds = self[letter, position]
            if _isnan(logodds):
                continue
            if _isinf(logodds) and logodds < 0:
                continue
            probability = background[letter] * math.pow(2, logodds)
            expected += probability * logodds
            expected_square += probability * logodds * logodds
        expected_square -= expected * expected
        variance += expected_square

    # a slightly negative variance can only come from roundoff
    variance = max(variance, 0)
    return math.sqrt(variance)
def intermediate_points(start, end, graph_data):
    """intermediate_points(start, end, graph_data)

    o graph_data    Sequence of (position, value) pairs

    o start         Lower boundary of the first bin

    o end           Upper boundary of the last bin

    Returns a list of (start, end, value) tuples describing the passed
    graph data as 'bins' between position midpoints.

    The first bin starts at `start` and the last bin stops at `end`;
    every other boundary is the midpoint between two neighbouring
    positions.  A single data point yields one bin spanning start to
    end, and empty graph_data yields an empty list.
    """
    n_points = len(graph_data)
    xvals = [graph_data[index][0] for index in range(n_points)]
    yvals = [graph_data[index][1] for index in range(n_points)]

    # Bin edges: the outer limits plus the midpoint of each neighbouring
    # pair of positions.  Building the edges first means the last bin no
    # longer depends on variables left over from a loop that may not run.
    edges = [start]
    for index in range(n_points - 1):
        edges.append(xvals[index] + (xvals[index + 1] - xvals[index]) / 2.)
    edges.append(end)

    # data in form (X0, X1, val)
    return [(edges[index], edges[index + 1], yvals[index])
            for index in range(n_points)]
def calculate_pseudocounts(motif):
    """Compute pseudocounts for a motif from its counts and background.

    Returns a dict mapping each letter of motif.alphabet to
    sqrt(average column sum of motif.counts) times the letter's
    normalised background frequency.  If motif.background is empty or
    None, all letters get the same background weight.
    """
    alphabet = motif.alphabet
    background = motif.background

    # It is possible to have unequal column sums so use the average
    # number of instances.
    grand_total = 0
    position = 0
    while position < motif.length:
        column_total = sum(
            float(motif.counts[letter][position]) for letter in alphabet.letters)
        grand_total += column_total
        position += 1
    root_of_average = math.sqrt(grand_total / motif.length)

    # work on a private copy so the motif's background stays untouched
    weights = dict(background) if background else dict.fromkeys(
        sorted(alphabet.letters), 1.0)
    weight_sum = sum(weights.values())

    pseudocounts = {}
    for letter in alphabet.letters:
        weights[letter] = weights[letter] / weight_sum
        pseudocounts[letter] = root_of_average * weights[letter]
    return pseudocounts
def __init__(self, alphabet, counts):
    """Initialise the matrix and normalise every column.

    After GenericPositionMatrix.__init__ has run, each column is divided
    by the sum of its values over alphabet.letters, and each letter's row
    is then replaced by a tuple.
    """
    GenericPositionMatrix.__init__(self, alphabet, counts)
    letters = alphabet.letters
    for column in range(self.length):
        column_sum = sum(float(self[letter][column]) for letter in letters)
        for letter in letters:
            row = self[letter]
            row[column] = row[column] / column_sum
    # store the finished rows as tuples
    for letter in letters:
        self[letter] = tuple(self[letter])
def _gen_random_array(n):
    """Return a list of n random numbers whose elements sum to 1.0.

    The numbers are drawn with random.random() and divided by their sum.
    """
    draws = []
    for _ in range(n):
        draws.append(random.random())
    scale = sum(draws)
    return [value / scale for value in draws]
def dist_product_at(self, other, offset):
    """Return the average per-position product score of two motifs.

    For every position i in range(max(self.length, offset + other.length))
    the entries self[i] and other[i - offset] are looked up and the sum
    over the letters of self.background of
    background[letter] * self[i][letter] * other[i - offset][letter]
    is accumulated.  The total is divided by the number of positions
    visited.

    Arguments:

    o other - The motif to compare with; it is indexed the same way as
    self (assumes out-of-range positions are handled by its indexing --
    TODO confirm).

    o offset - The shift applied to `other` relative to self.

    Returns 0.0 when there are no positions to visit.
    """
    n_positions = max(self.length, offset + other.length)
    if n_positions <= 0:
        return 0.0
    s = 0
    for i in range(n_positions):
        f1 = self[i]
        f2 = other[i - offset]
        for n, b in self.background.items():
            s += b * f1[n] * f2[n]
    # Divide by the number of positions, not by the last loop index:
    # the index is one too small and is zero for a single position.
    return s / n_positions
def _pwm_calculate(self, sequence):
    """Score every window of the sequence against the log-odds matrix.

    self.log_odds() is expected to return one mapping (letter -> score)
    per motif position.  The result has one entry per window start: the
    sum of the scores of the window's letters, or None if any letter of
    the window is missing from the corresponding mapping.
    """
    matrix = self.log_odds()
    width = len(matrix)
    n_windows = len(sequence) - width + 1
    scores = [None] * n_windows
    for start in range(n_windows):
        total = 0.0
        complete = True
        for offset in range(width):
            value = matrix[offset].get(sequence[start + offset])
            if value is None:
                # unknown letter: leave this window's entry as None
                complete = False
                break
            total += value
        if complete:
            scores[start] = total
    return scores
def _schema_from_motif(self, motif, motif_list, num_ambiguous):
    """Create a schema from a given starting motif.

    Arguments:

    o motif - A motif with the pattern we will start from.

    o motif_list - The total motifs we have to match to.

    o num_ambiguous - The number of ambiguous characters that should
    be present in the schema.

    Returns:

    o A string representing the newly generated schema.

    o A list of all of the motifs in motif_list that match the schema.

    Raises:

    o ValueError - If num_ambiguous is larger than the number of
    positions in the motif that are not already ambiguous.
    """
    assert motif in motif_list, \
        "Expected starting motif present in remaining motifs."

    # convert the motif into a list of characters so we can manipulate it
    new_schema_list = list(motif)

    # positions that can still be turned into ambiguity characters
    candidates = [pos for pos, char in enumerate(new_schema_list)
                  if char != self._ambiguity_symbol]
    if num_ambiguous > len(candidates):
        # retrying random positions could never succeed in this case
        raise ValueError("Cannot add %i ambiguous positions to a motif "
                         "with only %i unambiguous positions"
                         % (num_ambiguous, len(candidates)))

    # convert random, distinct positions to ambiguous characters
    if num_ambiguous > 0:
        for ambig_pos in random.sample(candidates, num_ambiguous):
            new_schema_list[ambig_pos] = self._ambiguity_symbol

    # convert the schema back to a string
    new_schema = ''.join(new_schema_list)

    # get the motifs that the schema matches
    matched_motifs = [candidate for candidate in motif_list
                      if matches_schema(candidate, new_schema,
                                        self._ambiguity_symbol)]

    return new_schema, matched_motifs
def search(self, sequence):
    """Generate the positions where a motif instance occurs in a sequence.

    Yields (position, instance) tuples.  At most one hit is reported per
    position: the first instance (in iteration order of self) whose
    string form equals the sequence window at that position.
    """
    last_start = len(sequence) - self.length
    for start in range(0, last_start + 1):
        for candidate in self:
            window = sequence[start:start + self.length]
            if str(candidate) != str(window):
                continue
            yield (start, candidate)
            # no other instance will fit (we don't want to return
            # multiple hits)
            break
def __getitem__(self, index):
    """Return the probability distribution over symbols at a position.

    The distribution is taken from self.pwm() when index lies in
    range(self.length).  For any other index the motif is treated as
    padded with background, so self.background is returned.
    """
    if index not in range(self.length):
        return self.background
    return self.pwm()[index]
def search(self, sequence):
    """A generator function, returning found positions of motif instances
    in a given sequence.

    Each yielded item is a (position, instance) tuple; a position is
    reported once, with the first matching instance found when iterating
    over self.
    """
    pos = 0
    while pos < len(sequence) - self.length + 1:
        for instance in self:
            matched = str(instance) == str(sequence[pos:pos + self.length])
            if matched:
                yield (pos, instance)
                # no other instance will fit (we don't want to return
                # multiple hits)
                break
        pos += 1
def min(self):
    """Minimal possible score for this motif.

    Returns the score computed for the anticonsensus sequence, i.e. the
    sum over all positions of the lowest value found in that column.
    """
    alphabet_letters = self._letters
    total = 0.0
    for column in range(0, self.length):
        column_values = [self[letter][column] for letter in alphabet_letters]
        total += min(column_values)
    return total
def __str__(self):
    """Return the matrix as text, one line per letter.

    The first line lists the position indices (each formatted with
    "%6d"); every following line starts with the letter and a colon and
    lists that letter's values (each formatted with "%6.2f").  Every
    line, including the last, ends with a newline.
    """
    # The row label "%c: " is three characters wide, so the header needs a
    # three-character indent for the indices to line up with the values.
    header = "   " + " ".join("%6d" % i for i in range(self.length))
    lines = [header]
    for letter in self._letters:
        values = " ".join("%6.2f" % value for value in self[letter])
        lines.append("%c: " % letter + values)
    return "\n".join(lines) + "\n"
def search_instances(self, sequence):
    """Generate the positions where instances of the motif occur.

    Yields (position, instance) tuples, at most one per position (the
    first instance in self.instances whose string form equals the
    sequence window).  Raises ValueError, once iteration starts, if the
    motif has no instances.
    """
    if not self.has_instances:
        raise ValueError ("This motif has no instances")
    n_starts = len(sequence) - self.length + 1
    for start in range(0, n_starts):
        for candidate in self.instances:
            window = sequence[start:start + self.length]
            if str(candidate) != str(window):
                continue
            yield (start, candidate)
            # no other instance will fit (we don't want to return
            # multiple hits)
            break
def ic(self):
    """Method returning the information content of a motif.

    For every position, 2 is added together with the sum of
    p * log2(p) over the letters with a non-zero frequency p in
    self.pwm().
    """
    matrix = self.pwm()
    information = 0
    for position in range(self.length):
        information += 2
        column = matrix[position]
        for letter in self.alphabet.letters:
            frequency = column[letter]
            if frequency == 0:
                # zero frequencies contribute nothing to the sum
                continue
            information += frequency * math.log(frequency, 2)
    return information
def _to_horizontal_matrix(self,letters=None,normalized=True):
    """Return string representation of the motif as a matrix.

    One tab-separated, newline-terminated line is produced per letter
    (self.alphabet.letters unless letters is given), holding that
    letter's value at every position.  With normalized=True the values
    come from self.pwm(laplace=False), otherwise from self.counts.
    """
    if letters is None:
        letters = self.alphabet.letters
    rows = []
    if not normalized:
        # output counts, building them from the instances if necessary
        if not self.has_counts:
            self.make_counts_from_instances()
        mat = self.counts
        for a in letters:
            cells = [str(mat[a][i]) for i in range(self.length)]
            rows.append("\t".join(cells) + "\n")
    else:
        # output PWM; the cache flag is cleared before asking for the
        # matrix without pseudocounts
        self._pwm_is_current=False
        mat=self.pwm(laplace=False)
        for a in letters:
            cells = [str(mat[i][a]) for i in range(self.length)]
            rows.append("\t".join(cells) + "\n")
    return "".join(rows)
def anticonsensus(self):
    """Return the least probable pattern to be generated from this motif.

    For every position the letter with the smallest value is chosen; the
    letters are examined in sorted order and the first one wins a tie.
    "X" is used when no value is below 10.0.
    """
    chosen = []
    for position in range(self.length):
        lowest_value = 10.0
        lowest_letter = "X"
        for letter in sorted(self[position]):
            value = self[position][letter]
            if value < lowest_value:
                lowest_value = value
                lowest_letter = letter
        chosen.append(lowest_letter)
    return Seq("".join(chosen), self.alphabet)
def __str__(self,masked=False):
    """String representation of a motif.

    Returns the instances of the motif, one per line.  If masked is
    true, an additional line follows with one character per position:
    "*" where self.mask is set and a space elsewhere.
    """
    # The result must not be stored in a local called "str": that would
    # hide the builtin from the generator expression below and make
    # str(inst) fail with a NameError.
    text = "".join(str(inst) + "\n" for inst in self.instances)
    if masked:
        text += "".join("*" if self.mask[i] else " "
                        for i in range(self.length))
        text += "\n"
    return text
def gc_content(self):
    """Compute the fraction GC content.

    Returns the sum of the C and G entries of the matrix divided by the
    sum of all entries.
    """
    letters = self.alphabet.letters
    gc_sum = 0.0
    overall_sum = 0.0
    for position in range(self.length):
        for letter in letters:
            value = self[letter][position]
            if letter in 'CG':
                gc_sum += value
            overall_sum += value
    return gc_sum / overall_sum
def consensus(self):
    """Return the consensus sequence of a motif.

    For every position the letter with the largest value is chosen; the
    letters are examined in sorted order and the first one wins a tie.
    "X" is used when no value is greater than 0.
    """
    chosen = []
    for position in range(self.length):
        highest_value = 0
        highest_letter = "X"
        for letter in sorted(self[position]):
            value = self[position][letter]
            if value > highest_value:
                highest_value = value
                highest_letter = letter
        chosen.append(highest_letter)
    return Seq("".join(chosen), self.alphabet)
def _to_vertical_matrix(self,letters=None):
    """Return string representation of the motif as a matrix.

    One tab-separated, newline-terminated line is produced per position,
    holding the value of every letter (self.alphabet.letters unless
    letters is given) taken from self.pwm(laplace=False).
    """
    if letters is None:
        letters = self.alphabet.letters
    # the cache flag is cleared before asking for the matrix without
    # pseudocounts
    self._pwm_is_current=False
    pwm = self.pwm(laplace=False)
    rows = []
    for position in range(self.length):
        cells = [str(pwm[position][letter]) for letter in letters]
        rows.append("\t".join(cells) + "\n")
    return "".join(rows)
def dist_product(self, other):
    """A similarity measure taking into account a product probability
    of generating overlapping instances of two motifs.

    Every relative offset of the two motifs from -self.length + 1 up to
    other.length - 1 is scored with dist_product_at, and the largest
    score is kept.

    Returns a tuple (distance, offset): distance is one minus the best
    score divided by self.dist_product_at(self, 0), and offset is the
    negated loop offset at which the best score was found.  If no offset
    gives a score above zero, the distance is 1 and the offset is 0.
    """
    max_p = 0.0
    # Default offset, so the return below is defined even when no score
    # exceeds zero (or when there is no offset to try at all).
    max_o = 0
    for offset in range(-self.length + 1, other.length):
        if offset < 0:
            p = self.dist_product_at(other, -offset)
        else:  # offset >= 0
            p = other.dist_product_at(self, offset)
        if max_p < p:
            max_p = p
            max_o = -offset
    return 1 - max_p / self.dist_product_at(self, 0), max_o
def backward_algorithm(self):
    """Calculate sequence probability using the backward algorithm.

    This implements the backward algorithm, as described on p58-59 of
    Durbin et al.

    Returns:

    o A dictionary containing the backwards variables. This has keys
    of the form (state letter, position in the training sequence),
    and values containing the calculated backward variable.
    """
    states = self._seq.states.alphabet.letters
    begin_state = states[0]

    # Initialisation.  Positions run from 0 to L - 1 here (not 1 to L as
    # in Durbin et al.): b_k(L) = a_k0 for all k.
    final_pos = len(self._seq.emissions) - 1
    backward_var = {}
    for state in states:
        backward_var[(state, final_pos)] = \
            self._mm.transition_prob[(state, begin_state)]

    # Recursion, walking the training sequence backwards from the
    # second-to-last position down to position 0.  A None result from the
    # recursion helper is not stored.
    for pos in range(final_pos - 1, -1, -1):
        for state in states:
            value = self._backward_recursion(state, pos, backward_var)
            if value is not None:
                backward_var[(state, pos)] = value

    # The termination step is skipped to avoid recalculations -- sequence
    # probabilities should be obtained with the forward algorithm.
    return backward_var