def _add_transition(self, hmm, rank, tran, probability):
    """
    Create the transition labelled C{tran} for layer C{rank} and append it
    to the transitions of its source state.

    Recognised labels are C{M->M}, C{M->I}, C{M->D}, C{I->M}, C{I->I},
    C{D->M} and C{D->D}. Labels starting with C{Neff} are silently ignored.
    The "next match" target is the Match state of layer C{rank + 1}, or
    C{hmm.end} when C{rank} is not below C{hmm.layers.last_index}.

    @param hmm: the HMM being populated
    @param rank: rank of the layer owning the source state
    @param tran: transition label, e.g. C{'M->M'}
    @param probability: value stored on the new transition

    @raise NotImplementedError: when C{tran} is neither a known transition
                                label nor a C{Neff*} column
    """

    def connect(source, target):
        # Build the transition and register it on its source state
        source.transitions.append(Transition(source, target, probability))

    def new_deletion(at_rank):
        # Create a Deletion state and attach it to the layer at at_rank
        state = State(States.Deletion)
        state.rank = at_rank
        hmm.layers[at_rank].append(state)
        return state

    match = hmm.layers[rank][States.Match]
    # The successor of the last layer is the End state
    if rank < hmm.layers.last_index:
        next_match = hmm.layers[rank + 1][States.Match]
    else:
        next_match = hmm.end

    if tran == 'M->M':
        connect(match, next_match)
        return

    if tran == 'M->I':
        connect(match, hmm.layers[rank][States.Insertion])
        return

    if tran == 'M->D':
        # The Deletion state of the next layer is always created here
        connect(match, new_deletion(rank + 1))
        return

    if tran == 'I->M':
        connect(hmm.layers[rank][States.Insertion], next_match)
        return

    if tran == 'I->I':
        insertion = hmm.layers[rank][States.Insertion]
        connect(insertion, insertion)
        return

    if tran == 'D->M':
        connect(hmm.layers[rank][States.Deletion], next_match)
        return

    if tran == 'D->D':
        deletion = hmm.layers[rank][States.Deletion]
        if States.Deletion in hmm.layers[rank + 1]:
            next_deletion = hmm.layers[rank + 1][States.Deletion]
            # An existing D[rank + 1] must be the one M[rank] already points to
            assert match.transitions[
                States.Deletion].successor == next_deletion
        else:
            next_deletion = new_deletion(rank + 1)
        connect(deletion, next_deletion)
        return

    if not tran.startswith('Neff'):
        raise NotImplementedError('Unknown transition: {0}'.format(tran))
def build_hmm():
    """
    Assemble a small, fully hard-coded profile HMM in probability units.

    The model has two layers (ALA at rank 1, CYS at rank 2), each with a
    Match, an Insertion and a Deletion state, plus Start, End and a start
    insertion state. Transition probabilities and the descriptive
    attributes (name, id, family, version, length, effective matches) are
    fixed constants.

    @return: the populated HMM
    @rtype: L{ProfileHMM}
    """

    def link(source, target, probability):
        # Register a transition on its source state
        source.transitions.append(Transition(source, target, probability))

    hmm = ProfileHMM(units=ScoreUnits.Probability)
    factory = StateFactory()

    background = OrderedDict([
        (ProteinAlphabet.ALA, 0.02457563),
        (ProteinAlphabet.CYS, 0.00325358),
        (ProteinAlphabet.GLU, 0.01718016),
    ])
    # Emission of the i-th background residue (1-based) is 1 / i
    emission = {aa: 1.0 / i for i, aa in enumerate(background, start=1)}

    # --- States ---
    # Start / End
    hmm.start = State(States.Start)
    hmm.start_insertion = factory.create_insertion(background)
    hmm.end = State(States.End)

    # One Match / Insertion / Deletion triple per layer, in rank order
    triples = []
    for rank, residue_type in ((1, ProteinAlphabet.ALA),
                               (2, ProteinAlphabet.CYS)):
        match = factory.create_match(emission, background)
        insertion = factory.create_insertion(background)
        deletion = factory.create_deletion()
        states = {
            States.Match: match,
            States.Insertion: insertion,
            States.Deletion: deletion
        }
        layer = HMMLayer(rank, ProteinResidue(rank, residue_type), states)
        hmm.layers.append(layer)
        triples.append((match, insertion, deletion))

    (match1, insertion1, deletion1), (match2, insertion2, deletion2) = triples

    # --- Transitions ---
    # start
    link(hmm.start, match1, 0.8)
    link(hmm.start, deletion1, 0.1)
    link(hmm.start, hmm.start_insertion, 0.1)
    link(hmm.start_insertion, hmm.start_insertion, 0.5)
    link(hmm.start_insertion, match1, 0.5)

    # layer 1
    link(match1, match2, 0.8)
    link(match1, insertion1, 0.1)
    link(match1, deletion2, 0.1)
    link(insertion1, insertion1, 0.5)
    link(insertion1, match2, 0.5)
    link(deletion1, deletion2, 0.5)
    link(deletion1, match2, 0.5)

    # layer 2
    link(match2, hmm.end, 0.9)
    link(match2, insertion2, 0.1)
    link(insertion2, insertion2, 0.5)
    link(insertion2, hmm.end, 0.5)
    link(deletion2, hmm.end, 1.0)

    # --- Descriptive attributes ---
    hmm.effective_matches = 10
    hmm.version = 1.5
    hmm.name = 'name'
    hmm.id = 'id'
    hmm.family = 'fam'
    hmm.length = ProfileLength(2, 2)

    return hmm
def _parse_profile(self, hmm, units=ScoreUnits.LogScales):
    """
    Parse the HMM profile.

    Reads the lines of C{self._profile} and populates C{hmm} in three
    passes: (1) one layer with a Match state per emission row, (2) the
    transitions leaving the Start state, (3) all remaining transitions,
    creating Insertion and Deletion states on demand.

    @param hmm: the hmm object being constructed
    @type hmm: L{ProfileHMM}
    @param units: score units for the parsed values. With
                  C{ScoreUnits.LogScales} the numbers are stored as read;
                  otherwise each number is passed through C{hmm._convert}
                  together with C{hmm.scale} and C{hmm.logbase}

    @return: the updated hmm
    @rtype: L{ProfileHMM}

    @raise NotImplementedError: when an unknown transition string is
                                encountered
    @raise HHProfileFormatError: when the profile is malformed (missing or
                                 truncated NULL header, wrong number of
                                 fields, gap residue, out-of-order layer,
                                 undefined Start > Match[1] transition, or
                                 a transition which cannot be attached)
    """
    assert self._chopped

    # 0. Prepare start and end states
    hmm.start = State(States.Start)
    hmm.end = State(States.End)

    residues = None
    background = {}
    tran_types = None
    tran_lines = []
    start_probs = None

    lines = iter(self._profile)
    # Raw string: '\-' and '\s' are regex escapes, not valid string escapes
    pattern = re.compile(r'^[A-Z\-]\s[0-9]+\s+')

    # '*' marks an undefined value and is always mapped to None
    if units == ScoreUnits.LogScales:

        def parse_probability(v):
            if v.strip() == '*':
                return None
            return float(v)
    else:

        def parse_probability(v):
            if v.strip() == '*':
                return None
            return hmm._convert(units, float(v), hmm.scale, hmm.logbase)

    # 1. Create all layers (profile columns), create and attach their match states
    for line in lines:

        if line.startswith('NULL'):
            # Header: background probabilities, residue names, transition
            # column labels and the Start transition row, one line each
            try:
                backprobs = tuple(map(parse_probability, line.split()[1:]))

                line = next(lines)
                residues = [
                    Enum.parse(ProteinAlphabet, aa) for aa in line.split()[1:]
                ]
                for pos, aa in enumerate(residues):
                    background[aa] = backprobs[pos]

                line = next(lines)
                tran_types = line.split()

                line = next(lines)
                start_probs = list(map(parse_probability, line.split()))
            except StopIteration:
                break

        elif pattern.match(line):
            if residues is None:
                # Without the header the emission columns can't be mapped
                raise HHProfileFormatError(
                    "Emission row found before the NULL header")

            emrow = line
            try:
                # The row right after an emission row holds its transitions
                tran_lines.append(next(lines))
            except StopIteration:
                break

            emprobs = emrow.split()
            if len(emprobs) != 23:
                raise HHProfileFormatError(
                    "Unexpected number of data fields: {0}".format(
                        len(emprobs)))

            rank = int(emprobs[1])
            residue = structure.ProteinResidue(rank=rank,
                                               type=emprobs[0],
                                               sequence_number=rank,
                                               insertion_code=None)
            if residue.type == ProteinAlphabet.GAP:
                raise HHProfileFormatError(
                    "Layer {0} can't be represented by a gap".format(rank))

            new_layer = hmm.layers.append(HMMLayer(rank, residue))
            if new_layer != rank:
                raise HHProfileFormatError(
                    'Layer {0} defined as {1}'.format(new_layer, rank))

            match = State(States.Match, emit=Enum.members(ProteinAlphabet))
            match.rank = rank
            match.background.set(background)

            # Emission values start at the third field of the row
            for col, aa in enumerate(residues):
                prob = parse_probability(emprobs[col + 2])
                match.emission.append(aa, prob)

            hmm.layers[new_layer].append(match)
            assert hmm.layers.last_index == match.rank

    if start_probs is None:
        # The header is absent or was cut short before the Start row
        raise HHProfileFormatError("Missing or incomplete NULL header")

    # 2. Append starting transitions: S -> M[1] and optionally S -> D[1] and S -> I[0].
    #    States D[1] and I[0] will be created if needed
    #    Note that [0] is not a real layer, I[0] is simply an insertion at the level of Start
    if len(hmm.layers) > 0:
        first_layer = hmm.layers[hmm.layers.start_index]

        if start_probs[0] is None:
            raise HHProfileFormatError(
                "Transition Start > Match[1] is undefined")

        start_tran = Transition(hmm.start, first_layer[States.Match],
                                start_probs[0])
        hmm.start.transitions.append(start_tran)

        if start_probs[1] is not None and start_probs[3] is not None:
            # Start -> I[0] -> M[1]
            start_ins = State(States.Insertion,
                              emit=Enum.members(ProteinAlphabet))
            start_ins.rank = 0
            start_ins.background.set(background)
            start_ins.emission = start_ins.background
            hmm.start_insertion = start_ins

            # Start -> I[0]
            hmm.start.transitions.append(
                Transition(hmm.start, hmm.start_insertion, start_probs[1]))
            # I[0] -> M[1]
            hmm.start_insertion.transitions.append(
                Transition(hmm.start_insertion, first_layer[States.Match],
                           start_probs[3]))
            # I[0] -> I[0]; a defined value of 0 is legitimate, so only
            # an undefined ('*') value is skipped
            if start_probs[4] is not None:
                hmm.start_insertion.transitions.append(
                    Transition(hmm.start_insertion, hmm.start_insertion,
                               start_probs[4]))

        if start_probs[2] is None and start_probs[6] is not None:
            # M->D is corrupt (*) at the Start layer, using D->D instead
            start_probs[2] = start_probs[6]

        if start_probs[2] is not None:
            # Start -> D[1]
            start_del = State(States.Deletion)
            start_del.rank = 1
            hmm.layers[1].append(start_del)
            start_tran = Transition(hmm.start, first_layer[States.Deletion],
                                    start_probs[2])
            hmm.start.transitions.append(start_tran)
    else:
        # No layers at all: connect Start directly to End
        start_tran = Transition(hmm.start, hmm.end, start_probs[0])
        hmm.start.transitions.append(start_tran)

    # 3. Append remaining transitions. I and D states will be created on demand.
    for rank, fields in enumerate(tran_lines, start=hmm.layers.start_index):
        assert hmm.layers[rank][States.Match].rank == rank

        ofields = fields.split()
        fields = tuple(map(parse_probability, ofields))

        # 3a. Parse all Neff values and create I[i] and D[i] states if NeffX[i] is not None
        for col, neff in enumerate(tran_types[7:10], start=7):
            if fields[col] is None:
                continue

            # Neff values are taken from the raw field, not the converted one
            neff_value = float(ofields[col]) / abs(hmm.scale)

            if neff == 'Neff':
                hmm.layers[rank].effective_matches = neff_value

            elif neff == 'Neff_I':
                hmm.layers[rank].effective_insertions = neff_value

                if States.Insertion not in hmm.layers[rank]:
                    insertion = State(States.Insertion,
                                      emit=Enum.members(ProteinAlphabet))
                    insertion.background.set(background)
                    insertion.emission.set(background)
                    insertion.rank = rank
                    hmm.layers[rank].append(insertion)

            elif neff == 'Neff_D':
                hmm.layers[rank].effective_deletions = neff_value

                if States.Deletion not in hmm.layers[
                        rank] and neff_value > 0:
                    deletion = State(States.Deletion)
                    deletion.rank = rank
                    hmm.layers[rank].append(deletion)

        # 3b. Starting from the first layer, parse all transitions and build the HMM graph stepwise
        for col, tran in enumerate(tran_types):
            probability = fields[col]

            if probability is not None:
                try:
                    self._add_transition(hmm, rank, tran, probability)
                except (CollectionIndexError, ItemNotFoundError) as ex:
                    msg = "Can't add transition {0} at {1}: {2.__class__.__name__}, {2!s}"
                    raise HHProfileFormatError(msg.format(tran, rank, ex))

    return hmm
def add_transition_pseudocounts(self,
                                gapb=1.,
                                gapd=0.15,
                                gape=1.0,
                                gapf=0.6,
                                gapg=0.6,
                                gapi=0.6):
    """
    Add pseudocounts to the transitions. A port from hhsearch
    -gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6

    The HMM is first converted to probability units if necessary. If
    pseudocounts were already added, nothing else happens. Otherwise all
    missing Insertion and Deletion states and all missing transitions
    (with probability 0.0) are created, every group of outgoing
    transitions is mixed with its pseudocount term and renormalized to
    sum to one, and C{self.hmm.transition_pseudocounts} is set to True.

    @param gapb: weight of the pseudocount term in every mixture
    @param gapd: scales the pseudocount for M->I and M->D
    @param gape: determines the pseudocount for I->I and D->D
    @param gapf: not used by this implementation
    @param gapg: not used by this implementation
    @param gapi: not used by this implementation
    """
    from numpy import array

    def new_insertion(rank):
        # Insertion state emitting the background distribution
        insertion = State(States.Insertion,
                          emit=csb.core.Enum.members(
                              sequence.SequenceAlphabets.Protein))
        insertion.background.set(background)
        insertion.emission.set(background)
        insertion.rank = rank
        return insertion

    def add_zero(source, target):
        # Register a placeholder transition with zero probability
        source.transitions.append(Transition(source, target, 0.0))

    def normalized(weights):
        # Scale the weights so that they sum to one
        t = array(weights)
        t /= t.sum()
        return t

    if not self.hmm._score_units == ScoreUnits.Probability:
        self.hmm.convert_scores(units=ScoreUnits.Probability)

    if self.hmm.pseudocounts or self.hmm.transition_pseudocounts:
        return

    # We need a fully populated HMM so first add all missing states
    background = self.hmm.layers[1][States.Match].background

    for layer in self.hmm.layers:
        if States.Insertion not in layer:
            # Add a new Insertion state
            layer.append(new_insertion(layer.rank))
        if States.Deletion not in layer:
            # Add a new Deletion state
            deletion = State(States.Deletion)
            deletion.rank = layer.rank
            layer.append(deletion)

    if not self.hmm.start_insertion:
        self.hmm.start_insertion = new_insertion(0)

    # Make the HMM completely connected: every layer except the last one
    for i in range(1, self.hmm.layers.length):
        layer = self.hmm.layers[i]

        # Start with the match state
        state = layer[States.Match]
        if States.Insertion not in state.transitions:
            add_zero(state, layer[States.Insertion])
        if States.Deletion not in state.transitions:
            add_zero(state, self.hmm.layers[i + 1][States.Deletion])

        state = layer[States.Insertion]
        if States.Insertion not in state.transitions:
            add_zero(state, layer[States.Insertion])
        if States.Match not in state.transitions:
            add_zero(state, self.hmm.layers[i + 1][States.Match])

        state = layer[States.Deletion]
        if States.Deletion not in state.transitions:
            add_zero(state, self.hmm.layers[i + 1][States.Deletion])
        if States.Match not in state.transitions:
            add_zero(state, self.hmm.layers[i + 1][States.Match])

    # Start layer
    state = self.hmm.start
    if States.Insertion not in state.transitions:
        add_zero(state, self.hmm.start_insertion)
    if States.Deletion not in state.transitions:
        add_zero(state, self.hmm.layers[1][States.Deletion])

    state = self.hmm.start_insertion
    if States.Insertion not in state.transitions:
        add_zero(state, self.hmm.start_insertion)
    if States.Match not in state.transitions:
        add_zero(state, self.hmm.layers[1][States.Match])

    # Last layer: everything leads to End, there is no next deletion
    last = self.hmm.layers[-1]

    state = last[States.Match]
    if States.Insertion not in state.transitions:
        add_zero(state, last[States.Insertion])

    state = last[States.Insertion]
    if States.Insertion not in state.transitions:
        add_zero(state, last[States.Insertion])
    if States.End not in state.transitions:
        add_zero(state, self.hmm.end)

    state = last[States.Deletion]
    if States.End not in state.transitions:
        add_zero(state, self.hmm.end)

    # Now we have a fully connected HMM; next, add the pseudocounts.
    # The constants below are taken over from the hhsearch implementation.
    pc_MD = pc_MI = 0.0286 * gapd
    pc_MM = 1. - 2 * pc_MD
    pc_DD = pc_II = gape / (gape - 1 + 1 / 0.75)
    pc_DM = pc_IM = 1. - pc_II

    # Transitions from the Start state
    trans = self.hmm.start.transitions
    n_eff = self.hmm.effective_matches
    t = normalized([
        (n_eff - 1) * trans[States.Match].probability + gapb * pc_MM,
        (n_eff - 1) * trans[States.Insertion].probability + gapb * pc_MI,
        (n_eff - 1) * trans[States.Deletion].probability + gapb * pc_MD
    ])
    trans[States.Match].probability = t[0]
    trans[States.Insertion].probability = t[1]
    trans[States.Deletion].probability = t[2]

    # Transitions from the start insertion (observed values are not weighted)
    trans = self.hmm.start_insertion.transitions
    t = normalized([
        trans[States.Match].probability + gapb * pc_IM,
        trans[States.Insertion].probability + gapb * pc_II
    ])
    trans[States.Match].probability = t[0]
    trans[States.Insertion].probability = t[1]

    # And now for all layers but the last one
    for layer in self.hmm.layers[:-1]:
        # Transitions from the match state
        trans = layer[States.Match].transitions
        n_eff = layer.effective_matches
        t = normalized([
            (n_eff - 1) * trans[States.Match].probability + gapb * pc_MM,
            (n_eff - 1) * trans[States.Insertion].probability + gapb * pc_MI,
            (n_eff - 1) * trans[States.Deletion].probability + gapb * pc_MD
        ])
        trans[States.Match].probability = t[0]
        trans[States.Insertion].probability = t[1]
        trans[States.Deletion].probability = t[2]

        # Transitions from the insertion state. The I->I term is weighted
        # by the observed I->I probability, mirroring the D->D case below.
        trans = layer[States.Insertion].transitions
        n_eff = layer.effective_insertions
        t = normalized([
            trans[States.Match].probability * n_eff + gapb * pc_IM,
            trans[States.Insertion].probability * n_eff + gapb * pc_II
        ])
        trans[States.Match].probability = t[0]
        trans[States.Insertion].probability = t[1]

        # Transitions from the deletion state
        trans = layer[States.Deletion].transitions
        n_eff = layer.effective_deletions
        t = normalized([
            trans[States.Match].probability * n_eff + gapb * pc_DM,
            trans[States.Deletion].probability * n_eff + gapb * pc_DD
        ])
        trans[States.Match].probability = t[0]
        trans[States.Deletion].probability = t[1]

    # Last layer: M -> End takes the place of M -> M, and there is no M -> D
    layer = self.hmm.layers[-1]

    trans = layer[States.Match].transitions
    n_eff = layer.effective_matches
    t = normalized([
        (n_eff - 1) * trans[States.End].probability + gapb * pc_MM,
        (n_eff - 1) * trans[States.Insertion].probability + gapb * pc_MI
    ])
    trans[States.End].probability = t[0]
    trans[States.Insertion].probability = t[1]

    # Transitions from the insertion state; as above, the I->I term uses
    # the observed I->I probability
    trans = layer[States.Insertion].transitions
    n_eff = layer.effective_insertions
    t = normalized([
        trans[States.End].probability * n_eff + gapb * pc_IM,
        trans[States.Insertion].probability * n_eff + gapb * pc_II
    ])
    trans[States.End].probability = t[0]
    trans[States.Insertion].probability = t[1]

    # The only way out of the last deletion state is End
    layer[States.Deletion].transitions[States.End].probability = 1.

    self.hmm.transition_pseudocounts = True
    return