Example #1
def create_0order_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a 0-order HMM initialized from MEME result

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME
    :type first_letters: dict of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    # The first state is random
    emissions = [[0.25, 0.25, 0.25, 0.25]]
    # Complete the emissions with the actual motif frequencies
    if motif.instances:
        # The motif.counts is computed directly when creating the motif from
        # instances
        nb_hits = len(motif.instances)
    else:
        nb_hits = nb_seq
    for position in xrange(len(motif)):
        frequencies = []
        for letter in "ACGT":
            freq = (motif.counts[letter][position] + 1.) / (nb_hits + 4.)
            frequencies.append(freq)
        emissions.append(frequencies)

    # Background transitions
    transitions = []
    background_to_background = 1. - float(nb_seq) / nb_residues
    background_to_foreground = 1. - background_to_background
    transitions.append(
        [background_to_background, background_to_foreground] + [0.] *
        (len(motif) - 1))
    # Core transitions
    for position in xrange(1, len(motif)):
        transitions.append(
            [0.] * (position + 1) + [1.] + [0.] * (len(motif) - position - 1))
    # Final transitions now
    transitions.append([1.] + [0.] * len(motif))

    # Starting probabilities
    initials = [1.] + [0.] * len(motif)
    return ghmm.HMMFromMatrices(ghmm.Alphabet(ALPHABET),
                                ghmm.DiscreteDistribution(
                                    ghmm.Alphabet(ALPHABET)),
                                transitions, emissions, initials)
Example #2
    def train(self):
        # This tells GHMM every possible value that it will be seeing
        alphabet = ghmm.Alphabet(list(set(self.events)))
        alphaLen = len(alphabet)

        # Initialize the probabilities of transitioning from each state to
        # each other state. There is probably a better way to do this, but
        # this is nice and simple.
        trans_prob = 1.0 / (alphaLen)
        trans = [[trans_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Initialize the probabilities of seeing each output from each state.
        # Again, there is probably a better way to do this, but this is simple.
        emiss_prob = 1.0 / (alphaLen)
        emiss = [[emiss_prob for row in range(alphaLen)]
                 for col in range(alphaLen)]

        # Some grease to get GHMM to work
        pi = [1.0 / alphaLen] * alphaLen

        # The sequence of musical events gathered from the music
        train_seq = ghmm.EmissionSequence(alphabet, self.events)

        # Generate the model of the data
        m = ghmm.HMMFromMatrices(alphabet, ghmm.DiscreteDistribution(alphabet),
                                 trans, emiss, pi)

        # Train the model based on the training sequence
        m.baumWelch(train_seq)

        return (m, alphabet)
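A hedged driving sketch: assuming an instance (here called player, a hypothetical name) whose events attribute holds the list of musical events, the returned model can score new sequences.

# Illustrative only; 'player' and its events are assumptions.
model, alphabet = player.train()
test_seq = ghmm.EmissionSequence(alphabet, player.events[:20])
print model.loglikelihood(test_seq)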
Example #3
    def _get_posterior_proba(self, sequence_split):
        """
        Get the posterior probabilities at each nucleotide position given the
        TFFM.

        :arg sequence_split: The sequence split into subsequences so that
            non-ACGT nucleotides are not considered.
        :type sequence_split: list

        :returns: The posterior probabilities at each position of the sequence.
        :rtype: list of list

        :note: One example of a sequence_split is ["ACT", "N", "ATC"].

        """

        ghmm_extended_alphabet = ghmm.Alphabet(EXTENDED_ALPHABET)
        posterior_proba = []
        # Null probabilities for non-ACGT nucleotides.
        null_proba = [0.] * self.N
        for sequence in sequence_split:
            if re.match("[ACGT]", sequence):
                emission_sequence = ghmm.SequenceSet(ghmm_extended_alphabet,
                                                     [sequence])[0]
                posterior_proba.extend(self.posterior(emission_sequence))
            else:
                for __ in xrange(len(sequence)):
                    posterior_proba.append(null_proba)
        return posterior_proba
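One way to build the sequence_split argument (a sketch; the splitting code is not part of this snippet): keep runs of ACGT and runs of anything else as separate chunks, so that non-ACGT stretches receive the null posteriors.

import re

sequence_split = re.findall("[ACGT]+|[^ACGT]+", "ACTNATC")
# -> ['ACT', 'N', 'ATC'], matching the docstring example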
Example #4
    def train(self, training_file, epsilon=0.0001, max_iter=500):
        """
        Train the TFFM using the fasta sequences to learn emission and
        transition probabilities.

        :note: The underlying HMM is trained using the Baum-Welch algorithm.

        :arg training_file: The fasta file of the sequences to train the TFFM
            on.
        :type training_file: str
        :arg epsilon: The minimal relative improvement in likelihood between
            two iterations of the Baum-Welch algorithm below which training
            stops (default: 0.0001).
        :type epsilon: float
        :arg max_iter: The maximum number of iterations of the Baum-Welch
            algorithm used to re-estimate the probabilities (default: 500).
        :type max_iter: int

        """

        assert(os.path.isfile(training_file))
        # Only upper case is allowed in the ALPHABET, need to convert
        sequences = []
        for record in SeqIO.parse(training_file, "fasta"):
            sequence = record.seq.upper()
            # Only considering sequences with ACGTs
            if not re.search("[^AGCT]", str(sequence)):
                sequences.append(sequence)
        training_sequences = ghmm.SequenceSet(ghmm.Alphabet(ALPHABET),
                                              sequences)
        # Need to give the same weight to all the sequences since it does not
        # seem to be done by default by ghmm.
        utils.set_sequences_weight(training_sequences, 1.0)
        self.baumWelch(training_sequences, max_iter, epsilon)
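A quick sketch of the [^AGCT] filter above in isolation (made-up sequences), showing which records would survive the training-set selection:

import re

for seq in ["ACGTACGT", "ACGTNCGT", "acgtacgt"]:
    kept = not re.search("[^AGCT]", seq.upper())
    print seq, "kept" if kept else "dropped"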
Example #5
 def __init__(self, A, B, Pi, observations):
     if len(A) == len(Pi):
         self.states = range(len(A))
         self.sigma = ghmm.Alphabet(observations) # The "alphabet" comprising action indices
         self.initA = A
         self.initB = B
         self.initPi = Pi
         self.ghmmModel = ghmm.HMMFromMatrices(self.sigma, ghmm.DiscreteDistribution(self.sigma), self.initA, self.initB, self.initPi)
     else:
         prettyPrint("Unable to initialize model. Unequal number of states", "error")
         return
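A hedged instantiation sketch with a toy two-state model (all values made up; the enclosing class is not shown in the snippet, so HMMWrapper below is a hypothetical name):

A = [[0.9, 0.1],
     [0.2, 0.8]]
B = [[0.7, 0.3],
     [0.1, 0.9]]
Pi = [0.5, 0.5]
model = HMMWrapper(A, B, Pi, observations=[0, 1])  # hypothetical class name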
Example #6
def ghmm_from_discrete_hmm(hmm):
    hmm = deepcopy(hmm)
    domain = ghmm.Alphabet(range(hmm.alphabetSize))
    trans = hmm.transitionMatrix
    init = hmm.initialProbabilities
    emissions = [d.probabilities for d in hmm.emissionDistributions]
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.DiscreteDistribution(domain),
                                A=trans,
                                B=emissions,
                                pi=init)
Example #7
    def __init__(self, n_hid=10, **kwargs):
        """@todo: to be defined1.

        :param **kwargs: @todo

        """
        RPSPlayer.__init__(self, **kwargs)
        self._n_hid = n_hid
        self._n_sym = len(self._rules)
        sym_pairs = [(i, j) for i in range(self._n_sym)
                     for j in range(self._n_sym)]
        self._alphab = gh.Alphabet(sym_pairs)
        self._conversion_array = np.asarray(
            [[self._alphab.internal((i, j)) for j in range(self._n_sym)]
             for i in range(self._n_sym)])
        self._prediction_mode = False
Example #8
    def test_viterbi_against_hmm(self):
        from kerehmm.test.util import ghmm_from_discrete_hmm
        import ghmm

        hmm = self.new_hmm()
        hmm.setup_strict_left_to_right(set_emissions=True)
        domain = ghmm.Alphabet(range(hmm.alphabetSize))
        hmm_reference = ghmm_from_discrete_hmm(hmm)
        seq = list(range(self.nSymbols))
        print "True path and emission: {}".format(seq)
        true_path = seq
        reference_path, reference_prob = hmm_reference.viterbi(
            ghmm.SequenceSet(domain, [seq]))
        path, prob = hmm.viterbi_path(seq)
        print "Reference path: {}".format(reference_path)
        print "Calculated path: {}".format(path)
        print "Reference prob: {}, Calculated prob: {}".format(
            reference_prob, prob)
        assert np.all(np.equal(true_path, reference_path))
        assert np.all(np.equal(true_path, path))
        assert np.isclose(prob, reference_prob)
Example #9
def train_model(songs_data):
    """Input: list of data on several songs (could be a single song)
       Output: a list of models, one for each bar type. """
    note_models = {}
    notes = get_notes(songs_data)

    # This tells GHMM every possible value that it will be seeing
    note_alphabet = ghmm.Alphabet(list(set(notes)))
    note_alpha_len = len(note_alphabet)

    # Initialize the probabilities of transitioning from each state to each other
    # state. There is probably a better way to do this, but this is nice and simple.
    note_trans_prob = 1.0 / (note_alpha_len)
    trans = [[note_trans_prob for row in range(note_alpha_len)]
             for col in range(note_alpha_len)]

    # Initialize the probabilities of seeing each output from each state.
    # Again, there is probably a better way to do this, but this is simple.
    note_emiss_prob = 1.0 / (note_alpha_len)
    emiss = [[note_emiss_prob for row in range(note_alpha_len)]
             for col in range(note_alpha_len)]

    # Some grease to get GHMM to work
    pi = [1.0 / note_alpha_len] * note_alpha_len

    # The sequence of notes gathered from the music
    note_train_seq = ghmm.EmissionSequence(note_alphabet, notes)

    bars = BarLearner.get_bars(songs_data)
    for bar in set(bars):
        # Generate the model of the data
        note_models[bar] = ghmm.HMMFromMatrices(
            note_alphabet, ghmm.DiscreteDistribution(note_alphabet), trans,
            emiss, pi)

    for bar in bars:
        # Train the model based on the training sequence
        note_models[bar].baumWelch(note_train_seq)

    return (note_models, note_alphabet)
Example #10
    def finalize(self):
        cmodel = self.HMM.finalize()

        if (self.modeltype & ghmmwrapper.kContinuousHMM):
            return ghmm.ContinuousMixtureHMM(
                ghmm.Float(), ghmm.ContinuousMixtureDistribution(ghmm.Float()),
                cmodel)

        elif ((self.modeltype & ghmmwrapper.kDiscreteHMM)
              and not (self.modeltype & ghmmwrapper.kTransitionClasses)
              and not (self.modeltype & ghmmwrapper.kPairHMM)):
            emission_domain = ghmm.Alphabet([], cmodel.alphabet)
            if (self.modeltype & ghmmwrapper.kLabeledStates):
                labelDomain = ghmm.LabelDomain([], cmodel.label_alphabet)
                return ghmm.StateLabelHMM(
                    emission_domain,
                    ghmm.DiscreteDistribution(emission_domain), labelDomain,
                    cmodel)

            else:
                return ghmm.DiscreteEmissionHMM(
                    emission_domain,
                    ghmm.DiscreteDistribution(emission_domain), cmodel)
Example #11
    def __init__(self, allele):
        ''' Initialize the predictor. For now, only H2-IAb is supported. '''

        if (allele != 'H2-IAb'):
            raise ValueError('Unsupported allele')

        self.allele = allele

        # range of allowed peptide lengths (percentile rank calibration was
        # done for this range)
        self.minPeptideLength = 12
        self.maxPeptideLength = 24

        # define the HMM emission alphabet, including a dedicated "stop"
        # emission (a GHMM-specific tweak)
        self.alphabet = ghmm.Alphabet(ghmm.AminoAcids.listOfCharacters + ['Z'])

        # a value to replace inf in the predictions. Same value as used in
        # percentile rank calibration
        self.infSubstitute = 1000

        # GHMM has an internal limitation of 1,500,000 sequences in a set
        self.sequencesPerBlock = 1500000

        # load the HMM model and the corresponding percentile rank calibration data
        from pkg_resources import resource_filename
        hmmFile = resource_filename(__name__,
                                    'models/hmm-h2iab-iedb2018-binders.xml')
        prCalibrationFile = resource_filename(
            __name__, 'models/precent-rank-model-h2iab.npz')

        self.hmm = ghmm.HMMOpen(hmmFile)

        prCalibration = np.load(prCalibrationFile)
        self.prCalibrationCdf = prCalibration['cdf']
        self.prCalibrationBinEdges = prCalibration['bin_edges']
Example #12
def ghmm_from_multivariate_continuous_hmm(hmm):
    hmm = deepcopy(hmm)
    domain = ghmm.Float()
    trans = hmm.transitionMatrix.tolist()
    init = hmm.initialProbabilities.tolist()
    emissions = [[d.mean.tolist(), d.variance.flatten().tolist()] for d in hmm.emissionDistributions]
    # print init
    # print trans
    # print emissions
    return ghmm.HMMFromMatrices(emissionDomain=domain,
                                distribution=ghmm.MultivariateGaussianDistribution(domain),
                                A=trans,
                                B=emissions,
                                pi=init)


if __name__ == "__main__":
    from discrete_hmm_test import DiscreteHMMTest

    test = DiscreteHMMTest()
    hmm = test.new_hmm()
    domain = ghmm.Alphabet(range(hmm.alphabetSize))
    hmm_reference = ghmm_from_discrete_hmm(hmm)
    seq = ghmm.EmissionSequence(hmm_reference.emissionDomain, [0, 0, 0])
    print hmm_reference
    print hmm_reference.forward(seq)[0]
    print hmm_reference.backward(seq)[0]
Example #13
def extractHMMFeatures(sourceFiles):
    """ Extracts HMM-similarity features from all files in a given directory """
    allTraces = [] # List to store all traces for the HMM-similarity extraction     
    try:
        for targetFile in sourceFiles:
            if os.path.exists(targetFile.replace(".c", ".seq")):
                instructionAlphaSequence = open(targetFile.replace(".c", ".seq")).read()
                allTraces.append( (instructionAlphaSequence, targetFile.replace(".c", ".hmm"), loadLabelFromFile(targetFile.replace(".c", ".metadata"))[0])) #TODO: Append a tuple of (trace, filename, cluster) for each data sample
        if len(allTraces) < 1:
            prettyPrint("No traces to process for HMM-similarity feature extraction. Skipping", "warning")
        else:
            allClusters = []
            # Retrieve list of clusters
            prettyPrint("Retrieving clusters")
            for trace in allTraces:
                if not trace[2] in allClusters:
                    allClusters.append(trace[2])
            # Gather traces belonging to different clusters
            clusterTraces = []
            for cluster in allClusters:
                currentCluster = []
                for trace in allTraces:
                    if trace[2] == cluster:
                        currentCluster.append(trace[0])
                clusterTraces.append(currentCluster)
                prettyPrint("Retrieved %s instances for cluster %s" % (len(currentCluster), cluster))
            # Should wind up with list of lists each of which depict traces of a cluster
            allHMMs = []
            for cluster in allClusters:
                # Build HMM for each cluster and use it to calculate likelihoods for all instances
                prettyPrint("Building HMM for cluster \"%s\"" % cluster)
                trainingSequences =  clusterTraces[ allClusters.index(cluster) ]
                # Retrieve number of observations
                observations = []
                for sequence in trainingSequences:
                    for o in sequence:
                        if o not in observations:
                            observations.append(o)
                # Prepare matrices for HMM
                A = numpy.random.random((len(allClusters), len(allClusters))).tolist()
                B = numpy.random.random((len(allClusters), len(observations))).tolist()
                Pi = numpy.random.random((len(allClusters),)).tolist()
                sigma = ghmm.Alphabet(observations)
                # Build HMM and train it using Baum-Welch algorithm
                clusterHMM = ghmm.HMMFromMatrices(sigma, ghmm.DiscreteDistribution(sigma), A, B, Pi)
                clusterHMM.baumWelch(ghmm.SequenceSet(clusterHMM.emissionDomain, trainingSequences))
                # Add that to list of all HMM's
                allHMMs.append((clusterHMM, observations))
            # Finally, for every trace, calculate the feature vectors
            prettyPrint("Calculating similarity features for traces")
            for trace in allTraces:
                featureVector = []
                for hmm in allHMMs:
                    # Make sure sequences contains observations supported by the current HMM
                    sequence = []
                    for obs in trace[0]:
                        if obs in hmm[1]:
                            sequence.append(obs)
                    # Calculate the likelihood
                    sequence = ghmm.EmissionSequence(ghmm.Alphabet(hmm[1]), sequence)
                    featureVector.append(hmm[0].loglikelihood(sequence))
                    featureFile = open(trace[1], "w")
                    featureFile.write(str(featureVector))
                    featureFile.close()
        #############################################################################

    except Exception as e:
        prettyPrint("Error encoutered: %s" % e, "error")
        return False
        
    return True 
Example #14
                    action='store',
                    help='output FASTA file of shuffled sequences')
parser.add_argument('--prefix',
                    default='',
                    action='store',
                    help='add this prefix to the sequence ids')
options = parser.parse_args()

fin = open(options.infasta, 'r')

if (options.outfasta != '-'):
    fout = open(options.outfasta, 'w')
else:
    fout = sys.stdout

sigma = ghmm.Alphabet(['A', 'C', 'G', 'T'])

#import ipdb;  ipdb.set_trace()

#  HMM with transition probabilities learned from the dinucleotide frequencies
#
#    A->A, A->C, A->G, A->T
#    C->A, C->C, C->G, C->T
#    G->A, G->C, G->G, G->T
#    T->A, T->C, T->G, T->T
#
#  emission probabilities
#
#    A->A (1.0), A->C (0.0), A->G (0.0), A->T (0.0)
#    C->A (0.0), C->C (1.0), C->G (0.0), C->T (0.0)
#    G->A (0.0), G->C (0.0), G->G (1.0), G->T (0.0)
Example #15
def convert2ghmm(h):
    """Convert an HMM object to a GHMM object."""
    alphabets = ghmm.Alphabet("ACDEFGHIKLMNPQRSTVWY")
    g = ghmm.HMMFromMatrices(alphabets, ghmm.DiscreteDistribution(alphabets),
            h._t, h._e.T, h._i)
    return g
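Sketch of the expected input, inferred from the call above (assumptions: h._t is the transition matrix, h._e an emission matrix with one column per state, h._i the initial vector):

import numpy

class ToyHMM(object):  # hypothetical stand-in for the HMM class
    _t = [[0.9, 0.1], [0.2, 0.8]]
    _e = numpy.full((20, 2), 0.05)  # 20 amino acids x 2 states, uniform
    _i = [0.5, 0.5]

# Note: ghmm may expect plain lists; this sketch assumes arrays are accepted.
g = convert2ghmm(ToyHMM())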
Example #16
def create_detailed_hmm(nb_seq, nb_residues, first_letters, motif):
    """
    Create a detailed HMM initialized from MEME result

    :arg nb_seq: Number of sequences used by MEME
    :type nb_seq: int
    :arg nb_residues: Number of residues used by MEME
    :type nb_residues: int
    :arg first_letters: Number of occurrences of ACGT at the beginning of
        sequences used by MEME
    :type first_letters: dict of str->int
    :arg motif: PFM as a Biopython motif to be used to initialize the TFFM
    :type motif: :class:`Bio.motifs`

    :returns: The constructed HMM
    :rtype: :class:`ghmm.DiscreteEmissionHMM`

    """

    # Starting probabilities based on the first nucleotides of the sequences,
    # with a '1' pseudocount added
    initials = [(first_letters['A'] + 1.) / (nb_seq + 4.),
                (first_letters['C'] + 1.) / (nb_seq + 4.),
                (first_letters['G'] + 1.) / (nb_seq + 4.),
                (first_letters['T'] + 1.) / (nb_seq + 4.),
                1. / nb_seq, 1. / nb_seq, 1. / nb_seq, 1. / nb_seq]
    initials += [0.] * 4 * (len(motif) - 1)

    # Emission probabilities
    emissions = [[1., 0., 0., 0.], [0., 1., 0., 0.], [0., 0., 1., 0.],
                 [0., 0., 0., 1.]] * (len(motif) + 1)

    # Background transition probabilities
    if motif.instances:
        # The motif.counts is computed directly when creating the motif from
        # instances
        nb_hits = len(motif.instances)
    else:
        nb_hits = nb_seq
    background_to_background = 1. - float(nb_seq) / nb_residues
    background_to_foreground = 1. - background_to_background
    background_to_background /= 4.
    transi = {}
    for letter in "ACGT":
        freq = (motif.counts[letter][0] + 1.) / (nb_hits + 4.)
        transi[letter] = freq * background_to_foreground
    transitions = []
    for __ in xrange(4):
        transitions.append([background_to_background,
                            background_to_background,
                            background_to_background,
                            background_to_background,
                            transi['A'], transi['C'],
                            transi['G'], transi['T']]
                           + [0.] * 4 * (len(motif) - 1))
    pfm = [(v + 1.) / (nb_hits + 4.) for letter in 'ACGT' for v in motif.counts[letter]]
    for position in xrange(len(motif) * 4):
        transitions.append([0.] * 4 * (len(motif) + 1))
    for position in xrange(1, len(motif)):
        for line in xrange(4 * (position - 1) + 1, 4 * (position - 1) + 5):
            for column in xrange(4 * position + 1, 4 * position + 5):
                index = (column - (4 * position + 1)) * len(motif) + position
                transitions[line + 3][column + 3] = pfm[index]
    for index in xrange(4):
        state = len(motif) * 4 + index
        transitions[state][0] = 0.25
        transitions[state][1] = 0.25
        transitions[state][2] = 0.25
        transitions[state][3] = 0.25
    return ghmm.HMMFromMatrices(ghmm.Alphabet(ALPHABET),
                                ghmm.DiscreteDistribution(
                                    ghmm.Alphabet(ALPHABET)),
                                transitions, emissions, initials)
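To sanity-check the construction, the resulting model can be inspected; a sketch assuming GHMM's asMatrices() accessor and reusing the arguments of a prior call:

hmm = create_detailed_hmm(nb_seq, nb_residues, first_letters, motif)
A, B, pi = hmm.asMatrices()
# 4 background states plus 4 states per motif position
assert len(pi) == 4 * (len(motif) + 1)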
Example #17
#author: Anas Elghafari
#building a toy profile HMM in two libraries, GHMM and YAHMM,
#with the purpose of comparing the viterbi calculations in both libraries

from yahmm import *
import ghmm
random.seed(0)  #needed for yahmm ??

#ghmm model:
alphabet = ghmm.Alphabet(['A', 'C', 'G', 'T'])
initial_probs = ([1] + [0] * 13)
start_em = end_em = [0.25, 0.25, 0.25, 0.25]
m1_em = [0.8, 0.1, 0.05, 0.05]
m2_em = [0.8, 0.1, 0.05, 0.05]
m3_em = [0.1, 0.8, 0.05, 0.05]
m4_em = [0.1, 0.8, 0.05, 0.05]
i1_em = i2_em = i3_em = i4_em = [0.25, 0.25, 0.25, 0.25]
d1_em = d2_em = d3_em = d4_em = [0, 0, 0, 0]
#transitions:
m1_trans = [0, 0, 0.2, 0.2, 0.6] + ([0] * 9)
m2_trans = [0] * 5 + [0.2, 0.2, 0.6] + [0] * 6
m3_trans = [0] * 8 + [0.2, 0.2, 0.6] + [0] * 3
m4_trans = [0] * 11 + [0.2, 0.2, 0.6]

i1_trans = [0, 0, 0.2, 0, 0.8] + [0] * 9
i2_trans = [0] * 5 + [0.2, 0, 0.8] + [0] * 6
i3_trans = [0] * 8 + [0.2, 0, 0.8] + [0] * 3
i4_trans = [0] * 11 + [0.2, 0, 0.8]

d1_trans = [0] * 2 + [0.1, 0, 0.9] + [0] * 9
d2_trans = [0] * 5 + [0.1, 0, 0.9] + [0] * 6