Exemple #1
0
    def _add_transition(self, hmm, rank, tran, probability):
        """
        Attach one transition of type C{tran} to the layer at C{rank}.

        The "next match" target is the match state of layer C{rank + 1},
        or C{hmm.end} when C{rank} is the last layer. A deletion state at
        C{rank + 1} is created here for 'M->D' (always) and for 'D->D'
        (only when that layer has none yet). Labels starting with 'Neff'
        are ignored.

        @param hmm: the hmm object being constructed
        @param rank: rank of the layer the transition originates from
        @param tran: transition label ('M->M', 'M->I', 'M->D', 'I->M',
                     'I->I', 'D->M' or 'D->D')
        @param probability: value stored in the new L{Transition}

        @raise NotImplementedError: when C{tran} is neither one of the
                                    labels above nor a 'Neff*' label
        """
        layer = hmm.layers[rank]
        source_match = layer[States.Match]

        if rank < hmm.layers.last_index:
            target_match = hmm.layers[rank + 1][States.Match]
        else:
            # Past the last layer every "next match" is the End state
            target_match = hmm.end

        if tran == 'M->M':
            source_match.transitions.append(
                Transition(source_match, target_match, probability))
            return

        if tran == 'M->I':
            target_ins = layer[States.Insertion]
            source_match.transitions.append(
                Transition(source_match, target_ins, probability))
            return

        if tran == 'M->D':
            # The deletion state of the following layer is created on demand
            target_del = State(States.Deletion)
            target_del.rank = rank + 1
            hmm.layers[rank + 1].append(target_del)
            source_match.transitions.append(
                Transition(source_match, target_del, probability))
            return

        if tran == 'I->M':
            source_ins = layer[States.Insertion]
            source_ins.transitions.append(
                Transition(source_ins, target_match, probability))
            return

        if tran == 'I->I':
            source_ins = layer[States.Insertion]
            source_ins.transitions.append(
                Transition(source_ins, source_ins, probability))
            return

        if tran == 'D->M':
            source_del = layer[States.Deletion]
            source_del.transitions.append(
                Transition(source_del, target_match, probability))
            return

        if tran == 'D->D':
            source_del = layer[States.Deletion]
            following = hmm.layers[rank + 1]

            if States.Deletion in following:
                target_del = following[States.Deletion]
                # An existing D[rank + 1] must be the one M[rank] points to
                assert source_match.transitions[
                    States.Deletion].successor == target_del
            else:
                target_del = State(States.Deletion)
                target_del.rank = rank + 1
                following.append(target_del)

            source_del.transitions.append(
                Transition(source_del, target_del, probability))
            return

        # Neff* columns share the transition row but are not transitions
        if tran.startswith('Neff'):
            return

        raise NotImplementedError('Unknown transition: {0}'.format(tran))
def build_hmm():
    """
    Build a small, hard-coded two-layer ProfileHMM in probability units.

    Each layer gets a match, an insertion and a deletion state; a start
    insertion state is attached as well. All transition probabilities and
    the profile metadata (name, id, family, version, length) are fixed
    literals.

    @return: the assembled hmm
    @rtype: L{ProfileHMM}
    """
    model = ProfileHMM(units=ScoreUnits.Probability)
    state_factory = StateFactory()

    background = OrderedDict([
        (ProteinAlphabet.ALA, 0.02457563),
        (ProteinAlphabet.CYS, 0.00325358),
        (ProteinAlphabet.GLU, 0.01718016),
    ])
    # Emission of the i-th background residue (1-based) is 1 / i
    emission = {aa: 1.0 / i for i, aa in enumerate(background, start=1)}

    def new_layer(rank, residue_type):
        # Create M, I, D for one layer, append the layer, return the states
        m_state = state_factory.create_match(emission, background)
        i_state = state_factory.create_insertion(background)
        d_state = state_factory.create_deletion()
        members = {
            States.Match: m_state,
            States.Insertion: i_state,
            States.Deletion: d_state
        }
        model.layers.append(
            HMMLayer(rank, ProteinResidue(rank, residue_type), members))
        return m_state, i_state, d_state

    def link(source, target, probability):
        # Append a source -> target transition to the source state
        source.transitions.append(Transition(source, target, probability))

    # States: Start / End, then layers 1 and 2
    model.start = State(States.Start)
    model.start_insertion = state_factory.create_insertion(background)
    model.end = State(States.End)

    match1, insertion1, deletion1 = new_layer(1, ProteinAlphabet.ALA)
    match2, insertion2, deletion2 = new_layer(2, ProteinAlphabet.CYS)

    # Transitions leaving Start and the start insertion
    link(model.start, match1, 0.8)
    link(model.start, deletion1, 0.1)
    link(model.start, model.start_insertion, 0.1)
    link(model.start_insertion, model.start_insertion, 0.5)
    link(model.start_insertion, match1, 0.5)

    # Transitions leaving layer 1
    link(match1, match2, 0.8)
    link(match1, insertion1, 0.1)
    link(match1, deletion2, 0.1)
    link(insertion1, insertion1, 0.5)
    link(insertion1, match2, 0.5)
    link(deletion1, deletion2, 0.5)
    link(deletion1, match2, 0.5)

    # Transitions leaving layer 2 (everything ends in End)
    link(match2, model.end, 0.9)
    link(match2, insertion2, 0.1)
    link(insertion2, insertion2, 0.5)
    link(insertion2, model.end, 0.5)
    link(deletion2, model.end, 1.0)

    # Profile metadata
    model.effective_matches = 10
    model.version = 1.5
    model.name = 'name'
    model.id = 'id'
    model.family = 'fam'
    model.length = ProfileLength(2, 2)

    return model
Exemple #3
0
    def _parse_profile(self, hmm, units=ScoreUnits.LogScales):
        """
        Parse the HMM profile.

        Iterates over C{self._profile}: reads the NULL (background) header
        with the residue and transition column names and the start
        transition row, then one emission row plus one transition row per
        layer. Layers, states and transitions are created on C{hmm} as
        they are encountered.

        @param hmm: the hmm object being constructed
        @type hmm: L{ProfileHMM}
        @param units: target score units; for any value other than
                      C{ScoreUnits.LogScales} every parsed number is passed
                      through C{hmm._convert}
        @return: the updated hmm
        @rtype: L{ProfileHMM}

        @raise NotImplementedError: when an unknown transition string is
                                    encountered
        @raise HHProfileFormatError: when the NULL header is missing or
                                     incomplete, an emission row has an
                                     unexpected number of fields, a layer
                                     is out of order or represented by a
                                     gap, or a transition cannot be added
        """
        assert self._chopped

        # 0. Prepare start and end states
        hmm.start = State(States.Start)
        hmm.end = State(States.End)

        residues = None
        background = {}
        tran_types = None
        tran_lines = []
        start_probs = None

        lines = iter(self._profile)
        # Raw string: '\-' and '\s' are regex escapes, not string escapes
        pattern = re.compile(r'^[A-Z\-]\s[0-9]+\s+')

        # '*' marks an undefined value and is parsed as None in both modes
        if units == ScoreUnits.LogScales:

            def parse_probability(v):
                if v.strip() == '*':
                    return None
                else:
                    return float(v)
        else:

            def parse_probability(v):
                if v.strip() == '*':
                    return None
                else:
                    return hmm._convert(units, float(v), hmm.scale,
                                        hmm.logbase)

        # 1. Create all layers (profile columns), create and attach their match states

        # The same iterator is advanced with next() inside the loop body to
        # consume the rows that follow a NULL line or an emission row
        for line in lines:

            if line.startswith('NULL'):
                try:
                    backprobs = tuple(map(parse_probability, line.split()[1:]))

                    line = next(lines)
                    residues = line.split()[1:]
                    residues = [
                        Enum.parse(ProteinAlphabet, aa) for aa in residues
                    ]

                    for pos, aa in enumerate(residues):
                        background[aa] = backprobs[pos]

                    line = next(lines)
                    tran_types = line.split()

                    line = next(lines)
                    start_probs = list(map(parse_probability, line.split()))
                except StopIteration:
                    break

            elif pattern.match(line):
                emrow = line
                try:
                    # The row right after an emission row holds its transitions
                    tran_lines.append(next(lines))
                except StopIteration:
                    break

                emprobs = emrow.split()
                if len(emprobs) != 23:
                    raise HHProfileFormatError(
                        "Unexpected number of data fields: {0}".format(
                            len(emprobs)))

                rank = int(emprobs[1])
                residue = structure.ProteinResidue(rank=rank,
                                                   type=emprobs[0],
                                                   sequence_number=rank,
                                                   insertion_code=None)
                if residue.type == ProteinAlphabet.GAP:
                    raise HHProfileFormatError(
                        "Layer {0} can't be represented by a gap".format(rank))

                new_layer = hmm.layers.append(HMMLayer(rank, residue))
                if new_layer != rank:
                    raise HHProfileFormatError(
                        'Layer {0} defined as {1}'.format(new_layer, rank))

                match = State(States.Match, emit=Enum.members(ProteinAlphabet))

                match.rank = rank
                match.background.set(background)

                if residues is None:
                    # Emission row seen before the NULL header named the columns
                    raise HHProfileFormatError(
                        "Layer {0} precedes the NULL header".format(rank))

                for col, aa in enumerate(residues):
                    prob = parse_probability(emprobs[col + 2])
                    match.emission.append(aa, prob)

                hmm.layers[new_layer].append(match)
                assert hmm.layers.last_index == match.rank

        if start_probs is None:
            # Without the start row none of the steps below can proceed
            raise HHProfileFormatError(
                "NULL header is missing or incomplete")

        # 2. Append starting transitions: S -> M[1] and optionally S -> D[1] and S -> I[0].
        #    States D[1] and I[0] will be created if needed
        #    Note that [0] is not a real layer, I[0] is simply an insertion at the level of Start
        if len(hmm.layers) > 0:

            first_match = hmm.layers[hmm.layers.start_index]

            if start_probs[0] is None:
                raise HHProfileFormatError(
                    "Transition Start > Match[1] is undefined")

            start_tran = Transition(hmm.start, first_match[States.Match],
                                    start_probs[0])
            hmm.start.transitions.append(start_tran)

            if start_probs[1] is not None and start_probs[
                    3] is not None:  # Start -> I[0] -> M[1]
                start_ins = State(States.Insertion,
                                  emit=Enum.members(ProteinAlphabet))
                start_ins.rank = 0
                start_ins.background.set(background)
                start_ins.emission = start_ins.background

                hmm.start_insertion = start_ins
                # Start -> I[0]
                hmm.start.transitions.append(
                    Transition(hmm.start, hmm.start_insertion, start_probs[1]))
                # I[0] -> M[1]
                hmm.start_insertion.transitions.append(
                    Transition(hmm.start_insertion, first_match[States.Match],
                               start_probs[3]))
                # I[0] -> I[0]
                if start_probs[4]:
                    hmm.start_insertion.transitions.append(
                        Transition(hmm.start_insertion, hmm.start_insertion,
                                   start_probs[4]))

            if start_probs[2] is None and start_probs[6] is not None:
                # M->D is corrupt (*) at the Start layer, using D->D instead
                start_probs[2] = start_probs[6]

            if start_probs[2] is not None:  # Start -> D[1]
                start_del = State(States.Deletion)
                start_del.rank = 1
                hmm.layers[1].append(start_del)
                start_tran = Transition(hmm.start,
                                        first_match[States.Deletion],
                                        start_probs[2])
                hmm.start.transitions.append(start_tran)
        else:
            # No layers at all: connect Start directly to End
            start_tran = Transition(hmm.start, hmm.end, start_probs[0])
            hmm.start.transitions.append(start_tran)

        # 3. Append remaining transitions. I and D states will be created on demand.

        for rank, fields in enumerate(tran_lines,
                                      start=hmm.layers.start_index):
            assert hmm.layers[rank][States.Match].rank == rank

            ofields = fields.split()
            fields = tuple(map(parse_probability, ofields))

            # 3a. Parse all Neff values and create I[i] and D[i] states if NeffX[i] is not None
            for col, neff in enumerate(tran_types[7:10], start=7):

                if fields[col] is not None:
                    # Neff values are taken from the raw field, not the
                    # unit-converted one, and rescaled by |hmm.scale|
                    neff_value = float(ofields[col]) / abs(hmm.scale)

                    if neff == 'Neff':
                        hmm.layers[rank].effective_matches = neff_value

                    elif neff == 'Neff_I':
                        hmm.layers[rank].effective_insertions = neff_value

                        if States.Insertion not in hmm.layers[rank]:
                            insertion = State(
                                States.Insertion,
                                emit=Enum.members(ProteinAlphabet))
                            insertion.background.set(background)
                            insertion.emission.set(background)
                            insertion.rank = rank
                            hmm.layers[rank].append(insertion)

                    elif neff == 'Neff_D':
                        hmm.layers[rank].effective_deletions = neff_value

                        if States.Deletion not in hmm.layers[
                                rank] and neff_value > 0:
                            deletion = State(States.Deletion)
                            deletion.rank = rank
                            hmm.layers[rank].append(deletion)

            # 3b. Starting from the first layer, parse all transitions and build the HMM graph stepwise
            for col, tran in enumerate(tran_types):
                probability = fields[col]

                if probability is not None:
                    try:
                        self._add_transition(hmm, rank, tran, probability)
                    except (CollectionIndexError, ItemNotFoundError) as ex:
                        msg = "Can't add transition {0} at {1}: {2.__class__.__name__}, {2!s}"
                        raise HHProfileFormatError(msg.format(tran, rank, ex))

        return hmm
Exemple #4
0
    def add_transition_pseudocounts(self,
                                    gapb=1.,
                                    gapd=0.15,
                                    gape=1.0,
                                    gapf=0.6,
                                    gapg=0.6,
                                    gapi=0.6):
        """
        Add pseudocounts to the transitions. A port from hhsearch
        -gapb 1.0 -gapd 0.15 -gape 1.0 -gapf 0.6 -gapg 0.6 -gapi 0.6

        The HMM is first converted to probability units if necessary. If
        C{self.hmm.pseudocounts} or C{self.hmm.transition_pseudocounts} is
        already set, nothing else is done. Otherwise missing insertion and
        deletion states and missing transitions (with probability 0.0) are
        added so that every layer is fully connected, then every group of
        outgoing transitions is mixed with the pseudocount probabilities
        and renormalized to sum to one. Finally
        C{self.hmm.transition_pseudocounts} is set to True.

        @param gapb: weight of the pseudocount term in every mixture
        @param gapd: scales the M->D and M->I pseudocount probabilities
        @param gape: determines the D->D and I->I pseudocount probabilities
        @param gapf: accepted for hhsearch compatibility; not used here
        @param gapg: accepted for hhsearch compatibility; not used here
        @param gapi: accepted for hhsearch compatibility; not used here
        """

        from numpy import array

        if not self.hmm._score_units == ScoreUnits.Probability:
            self.hmm.convert_scores(units=ScoreUnits.Probability)

        if self.hmm.pseudocounts or self.hmm.transition_pseudocounts:
            return

        # We need a fully populated HMM so first add all missing states
        states = [States.Match, States.Insertion, States.Deletion]
        background = self.hmm.layers[1][States.Match].background
        for layer in self.hmm.layers:
            rank = layer.rank
            for state in states:
                if state not in layer:

                    if state is States.Deletion:
                        # Add a new Deletion state
                        deletion = State(States.Deletion)
                        deletion.rank = rank
                        layer.append(deletion)

                    elif state is States.Insertion:
                        # Add a new Insertion state emitting the background
                        insertion = State(
                            States.Insertion,
                            emit=csb.core.Enum.members(
                                sequence.SequenceAlphabets.Protein))
                        insertion.background.set(background)
                        insertion.emission.set(background)
                        insertion.rank = rank
                        layer.append(insertion)

        if not self.hmm.start_insertion:
            insertion = State(States.Insertion,
                              emit=csb.core.Enum.members(
                                  sequence.SequenceAlphabets.Protein))
            insertion.background.set(background)
            insertion.emission.set(background)
            insertion.rank = 0
            self.hmm.start_insertion = insertion

        # Make the HMM completely connected: every missing transition is
        # added with probability 0.0. The last layer is handled separately
        # below because its successors are End rather than layer i + 1.
        for i in range(1, self.hmm.layers.length):
            layer = self.hmm.layers[i]
            # Start with match state
            state = layer[States.Match]
            if States.Insertion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i][States.Insertion],
                               0.0))
            if States.Deletion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Deletion],
                               0.0))
            state = layer[States.Insertion]
            if States.Insertion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i][States.Insertion],
                               0.0))
            if States.Match not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Match],
                               0.0))
            state = layer[States.Deletion]
            if States.Deletion not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Deletion],
                               0.0))
            if States.Match not in state.transitions:
                state.transitions.append(
                    Transition(state, self.hmm.layers[i + 1][States.Match],
                               0.0))
        # start layer
        state = self.hmm.start
        if States.Insertion not in self.hmm.start.transitions:
            state.transitions.append(
                Transition(self.hmm.start, self.hmm.start_insertion, 0.0))
        if States.Deletion not in self.hmm.start.transitions:
            state.transitions.append(
                Transition(self.hmm.start, self.hmm.layers[1][States.Deletion],
                           0.0))

        state = self.hmm.start_insertion
        if States.Insertion not in self.hmm.start_insertion.transitions:
            state.transitions.append(
                Transition(self.hmm.start_insertion, self.hmm.start_insertion,
                           0.0))
        if States.Match not in self.hmm.start_insertion.transitions:
            state.transitions.append(
                Transition(self.hmm.start_insertion,
                           self.hmm.layers[1][States.Match], 0.0))

        # last layer
        state = self.hmm.layers[-1][States.Match]
        if States.Insertion not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.layers[-1][States.Insertion], 0.0))
        state = self.hmm.layers[-1][States.Insertion]
        if States.Insertion not in state.transitions:
            state.transitions.append(
                Transition(state, self.hmm.layers[-1][States.Insertion], 0.0))

        if States.End not in state.transitions:
            state.transitions.append(Transition(state, self.hmm.end, 0.0))
        state = self.hmm.layers[-1][States.Deletion]
        if States.End not in state.transitions:
            state.transitions.append(Transition(state, self.hmm.end, 0.0))

        # Now the HMM is fully connected; compute the pseudocount
        # transition probabilities. The constants 0.0286 and 0.75 are
        # presumably carried over from hhsearch; no derivation is given here.
        pc_MD = pc_MI = 0.0286 * gapd
        pc_MM = 1. - 2 * pc_MD
        pc_DD = pc_II = gape / (gape - 1 + 1 / 0.75)
        pc_DM = pc_IM = 1. - pc_II

        # Get current transition probabilities of the Start state
        t_mm = self.hmm.start.transitions[States.Match].probability
        t_mi = self.hmm.start.transitions[States.Insertion].probability
        t_md = self.hmm.start.transitions[States.Deletion].probability

        # Transitions from Start use the profile-wide effective match count
        n_eff = self.hmm.effective_matches

        t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                   (n_eff - 1) * t_mi + gapb * pc_MI,
                   (n_eff - 1) * t_md + gapb * pc_MD])
        # normalize to one
        t /= t.sum()
        # Set
        self.hmm.start.transitions[States.Match].probability = t[0]
        self.hmm.start.transitions[States.Insertion].probability = t[1]
        self.hmm.start.transitions[States.Deletion].probability = t[2]

        # Same for the start insertion (no effective-count weighting here)
        t_im = self.hmm.start_insertion.transitions[States.Match].probability
        t_ii = self.hmm.start_insertion.transitions[
            States.Insertion].probability

        t = array([t_im + gapb * pc_IM, t_ii + gapb * pc_II])
        t /= t.sum()

        self.hmm.start_insertion.transitions[States.Match].probability = t[0]
        self.hmm.start_insertion.transitions[
            States.Insertion].probability = t[1]

        # And now for all layers but the last
        for layer in self.hmm.layers[:-1]:
            # Transitions from match state
            t_mm = layer[States.Match].transitions[States.Match].probability
            t_mi = layer[States.Match].transitions[
                States.Insertion].probability
            t_md = layer[States.Match].transitions[States.Deletion].probability
            n_eff = layer.effective_matches
            t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                       (n_eff - 1) * t_mi + gapb * pc_MI,
                       (n_eff - 1) * t_md + gapb * pc_MD])
            # normalize to one
            t /= t.sum()
            layer[States.Match].transitions[States.Match].probability = t[0]
            layer[States.Match].transitions[
                States.Insertion].probability = t[1]
            layer[States.Match].transitions[States.Deletion].probability = t[2]

            # Transitions from insert state
            t_im = layer[States.Insertion].transitions[
                States.Match].probability
            t_ii = layer[States.Insertion].transitions[
                States.Insertion].probability
            n_eff = layer.effective_insertions
            # The I->I term must be weighted by t_ii (not t_im), mirroring
            # the D->M / D->D mixture below
            t = array(
                [t_im * n_eff + gapb * pc_IM, t_ii * n_eff + gapb * pc_II])
            # normalize to one
            t /= t.sum()
            layer[States.Insertion].transitions[
                States.Match].probability = t[0]
            layer[States.Insertion].transitions[
                States.Insertion].probability = t[1]

            # Transitions from deletion state
            t_dm = layer[States.Deletion].transitions[States.Match].probability
            t_dd = layer[States.Deletion].transitions[
                States.Deletion].probability
            n_eff = layer.effective_deletions
            t = array(
                [t_dm * n_eff + gapb * pc_DM, t_dd * n_eff + gapb * pc_DD])
            # normalize to one
            t /= t.sum()
            layer[States.Deletion].transitions[States.Match].probability = t[0]
            layer[States.Deletion].transitions[
                States.Deletion].probability = t[1]

        # Last layer: successors are End instead of the next layer's states

        layer = self.hmm.layers[-1]
        t_mm = layer[States.Match].transitions[States.End].probability
        t_mi = layer[States.Match].transitions[States.Insertion].probability
        n_eff = layer.effective_matches
        # No deletion
        t = array([(n_eff - 1) * t_mm + gapb * pc_MM,
                   (n_eff - 1) * t_mi + gapb * pc_MI])
        # normalize to one
        t /= t.sum()
        layer[States.Match].transitions[States.End].probability = t[0]
        layer[States.Match].transitions[States.Insertion].probability = t[1]

        # Transitions from insert state
        t_im = layer[States.Insertion].transitions[States.End].probability
        t_ii = layer[States.Insertion].transitions[
            States.Insertion].probability
        n_eff = layer.effective_insertions
        # As above, the I->I term is weighted by t_ii
        t = array([t_im * n_eff + gapb * pc_IM, t_ii * n_eff + gapb * pc_II])
        # normalize to one
        t /= t.sum()
        layer[States.Insertion].transitions[States.End].probability = t[0]
        layer[States.Insertion].transitions[
            States.Insertion].probability = t[1]

        # The only way out of the last deletion state is End
        layer[States.Deletion].transitions[States.End].probability = 1.

        self.hmm.transition_pseudocounts = True