Exemple #1
0
 def resample_changer(self,data,numiter):
     '''
     metropolis-(hastings) / simulated annealing version

     Resamples self.r and self.p. With empty data both are drawn from the
     prior (r from self.discrete, shifted by one; p from a
     Beta(self.alpha, self.beta)). Otherwise numiter proposals are tried and
     each one replaces the current (r,p) with probability min(1, exp(delta)),
     where delta is the change in log prior plus log likelihood.

     data: array of observations, every entry >= 1 (asserted)
     numiter: number of proposals to try when data is non-empty
     '''
     # TODO make another version that exploits gamma/poisson construction of
     # negbin distribution
     if len(data) == 0:
         self.r = sample_discrete(self.discrete) + 1
         self.p = stats.beta.rvs(self.alpha,self.beta)
         return

     assert np.min(data) >= 1
     # got this general idea from web.mit.edu/~wingated/www/introductions/mcmc-gibbs-intro.pdf

     # quantities of the p-proposal that do not change across iterations
     num_data = float(len(data))
     excess_total = np.sum(data-1.)

     # get posterior value of current (r,p)
     current_log_prior_value = stats.beta.logpdf(self.p,self.alpha,self.beta) + np.log(self.discrete[self.r-1])
     current_log_likelihood_value = np.sum(self.log_pmf(data))
     for _ in xrange(numiter):
         # generate proposals: r comes from its prior, p from a Beta whose
         # parameters are updated with the data given the proposed r
         proposal_r = sample_discrete(self.discrete)+1
         proposal_p = stats.beta.rvs(self.alpha + proposal_r * num_data, self.beta + excess_total)
         # posterior value for the proposal; the prior term on r must be
         # evaluated at the proposed r, not at the current self.r
         proposal_log_prior_value = stats.beta.logpdf(proposal_p,self.alpha,self.beta) + np.log(self.discrete[proposal_r-1])
         proposal_log_likelihood_value = np.sum(self.log_pmf(x=data,r=proposal_r,p=proposal_p))
         # accept proposal with some probability (computed in the log domain
         # and capped at log(1) = 0 before exponentiating)
         log_ratio = (proposal_log_prior_value - current_log_prior_value
                      + proposal_log_likelihood_value - current_log_likelihood_value)
         accept_probability = np.exp(min(0.,log_ratio))
         # sample_discrete over (reject, accept) returns 1 on acceptance
         if sample_discrete(np.array((1.-accept_probability, accept_probability))):
             self.r, self.p = proposal_r, proposal_p
             current_log_prior_value = proposal_log_prior_value
             current_log_likelihood_value = proposal_log_likelihood_value
Exemple #2
0
 def resample(self,data=np.array([]),numiter=10):
     '''
     Resample self.wait and the wrapped distribution self.distn.

     With empty data, the wait is drawn from the prior self.discrete (shifted
     by self.MIN) and self.distn is resampled without data. Otherwise the two
     are updated alternately for numiter*10 sweeps.

     data: 1-d array of observations (asserted when non-empty)
     numiter: forwarded to self.distn.resample; also scales the sweep count
     '''
     if data.size == 0:
         # sample from prior
         self.wait = sample_discrete(self.discrete) + self.MIN
         self.distn.resample()
         return

     assert data.ndim == 1
     num_waits = len(self.discrete)
     # this is a pretty simplistic method
     for _ in xrange(numiter*10):
         # resample posterior wait, given fixed distn: row i holds the data
         # shifted by the i-th candidate wait, i + self.MIN
         shifted = np.vstack([data - (wait+self.MIN) for wait in xrange(num_waits)])
         log_probs = np.sum(self.distn.log_pmf(shifted),axis=1)
         log_probs -= np.amax(log_probs)
         # the sampled value is an index into the candidate grid, so it needs
         # the same self.MIN offset used to build the grid and in the prior
         # branch above
         self.wait = sample_discrete( self.discrete * np.exp(log_probs) ) + self.MIN
         # resample fixed distn given wait
         self.distn.resample(data - self.wait,numiter=numiter)
Exemple #3
0
    def generate_states(self):
        """Sample a state sequence of length ``self.T`` and store it.

        The first state is drawn from ``self.pi_0`` and later ones from the
        row of ``self.trans_matrix`` belonging to the previous state. Each
        state is held for a duration drawn from ``self.dur_distns[state]``.
        The result is written to ``self.stateseq``.

        Raises:
            NotImplementedError: if ``self.left_censoring`` is set.
        """
        if self.left_censoring:
            raise NotImplementedError

        pos = 0
        state_distn = self.pi_0
        trans = self.trans_matrix
        seq = np.empty(self.T, dtype=np.int32)

        while pos < self.T:
            label = sample_discrete(state_distn)
            length = self.dur_distns[label].rvs()
            # the slice end may exceed the array length; the slice assignment
            # simply stops at the end of the array
            seq[pos:pos + length] = label
            # the next state is drawn from this state's transition row
            state_distn = trans[label, ]
            pos += length

        self.stateseq = seq
Exemple #4
0
    def test(cls):
        """Ad-hoc visual check of concentration resampling.

        Builds a ``truth`` and an ``infer`` instance of ``cls``, repeatedly
        simulates die-roll counts using ``truth.concentration``, feeds them to
        ``infer.resample`` and collects ``infer.concentration``. Prints the
        true value and the median of the collected values and draws a
        histogram of them.
        """
        from matplotlib import pyplot as plt

        truth = cls(1.,1.)
        # parenthesised single-argument form prints the same on Python 2 and 3
        print(truth.concentration)

        infer = cls(1.,1.)

        num_die = 1
        num_sides = 6
        # integer dtype: these values are passed as a sample count (size=...)
        num_samples = 50*np.ones(num_die,dtype=int)

        foo = []
        for itr in range(50):
            # normalised gamma draws give one probability vector per die
            dice = stats.gamma.rvs(truth.concentration * np.ones((num_die,num_sides))/num_sides)
            dice /= dice.sum(1)[:,na]

            # get some samples
            counts = np.zeros((num_die,num_sides),dtype=np.int32)
            for idx, (num, die) in enumerate(zip(num_samples,dice)):
                counts[idx] = np.bincount(sample_discrete(die,size=num),minlength=num_sides)

            infer.resample(counts)
            foo.append(infer.concentration)

        print(np.median(foo))
        # NOTE(review): recent Matplotlib releases replaced `normed` with
        # `density`; switch once the minimum supported version allows it.
        plt.hist(foo,bins=25,normed=True)
Exemple #5
0
    def _resample_a_word(self, hsmm_states):
        """Pick a word (tuple of letters) for the given letter-level states.

        Every entry of ``hsmm_states`` contributes its ``stateseq_norep`` as a
        candidate word. Candidate ``i`` is scored by its own
        ``log_likelihood()`` plus the last element of
        ``likelihood_block_word(candidate)`` for every *other* state, and one
        candidate is sampled with weight proportional to ``exp(score)``.

        Returns:
            ``self.generate_word()`` when there are no states, the single
            candidate when all states agree, otherwise the sampled candidate.
        """
        candidates = [
            tuple(letter_state.stateseq_norep) for letter_state in hsmm_states
        ]
        if len(candidates) == 0:
            return self.generate_word()

        unique_candidates = list(set(candidates))
        if len(unique_candidates) == 1:
            return unique_candidates[0]

        # position of each candidate inside unique_candidates (dict lookup
        # instead of a linear list.index per candidate)
        index_of = {candi: pos for pos, candi in enumerate(unique_candidates)}
        ref_array = np.array([index_of[candi] for candi in candidates])

        # cache_score[u, j]: score of unique candidate u under state j
        cache_score = np.zeros((len(unique_candidates), len(candidates)))
        likelihoods = np.array(
            [letter_state.log_likelihood() for letter_state in hsmm_states])
        range_tmp = list(range(len(candidates)))

        for candi_idx, candi in enumerate(unique_candidates):
            owners = np.where(ref_array == candi_idx)[0]
            # a candidate proposed by exactly one state is never scored
            # against that state (its own likelihood is added below instead)
            skip = owners[0] if len(owners) == 1 else None
            for tmp_idx in range_tmp:
                if tmp_idx == skip:
                    continue
                cache_score[candi_idx, tmp_idx] = hsmm_states[
                    tmp_idx].likelihood_block_word(candi)[-1]

        # expand to one row per candidate and drop each state's own term
        cache_scores_matrix = cache_score[ref_array]
        for i in range_tmp:
            cache_scores_matrix[i, i] = 0.0
        scores = cache_scores_matrix.sum(axis=1) + likelihoods

        # The scores are sums of log-likelihoods, so exponentiating them
        # directly can underflow to all-zero weights. Shifting by the maximum
        # leaves the sampling proportions unchanged.
        peak = np.amax(scores)
        weights = np.exp(scores - peak) if np.isfinite(peak) else np.exp(scores)

        assert (weights >= 0).all(), cache_scores_matrix
        sampled_candi_idx = sample_discrete(weights)
        return candidates[sampled_candi_idx]
Exemple #6
0
    def generate_states(self):
        """Sample states and durations for ``self.T`` time steps.

        Writes three attributes: ``self.stateseq`` (one label per time step),
        ``self.stateseq_norep`` (one label per segment) and ``self.durations``
        (one sampled duration per segment).
        """
        pos = 0
        state_distn = self.initial_distn.pi_0
        trans = self.transition_distn.A

        # start from -1 everywhere so the final assertion can detect gaps
        seq = np.empty(self.T,dtype=np.int32)
        seq.fill(-1)
        labels = []
        lengths = []

        while pos < self.T:
            label = sample_discrete(state_distn)
            length = self.dur_distns[label].rvs()

            labels.append(label)
            lengths.append(length)
            # the slice may extend past the end of the array; that is fine
            seq[pos:pos+length] = label

            state_distn = trans[label,]
            pos += length

        self.stateseq_norep = np.array(labels,dtype=np.int32)
        self.durations = np.array(lengths,dtype=np.int32)
        self.stateseq = seq

        # NOTE self.durations.sum() >= self.T since self.T is the censored
        # length

        assert len(self.stateseq_norep) == len(self.durations)
        assert (self.stateseq >= 0).all()
Exemple #7
0
 def generate_word(self, word_size):
     """Sample a word of ``word_size`` letters and return it as a tuple.

     The first letter is drawn from ``self.init_state_distn.pi_0``; each later
     letter is drawn from the row of ``self.trans_distn.trans_matrix`` that
     belongs to the previous letter.
     """
     letter_distn = self.init_state_distn.pi_0
     trans = self.trans_distn.trans_matrix
     letters = []
     for _ in range(word_size):
         letter = sample_discrete(letter_distn)
         letters.append(letter)
         letter_distn = trans[letter]
     return tuple(letters)
Exemple #8
0
    def resample_letter_params(self):
        """Resample the letter-level HSMM and the word assigned to each state.

        Steps, in order:
          1. reset ``letterseq`` of every entry in ``self.states_list`` to -1;
          2. rebuild ``self.letter_hsmm.states_list`` from the data segments
             of each word state, grouped by state;
          3. resample the letter state sequences;
          4. for every word state, sample one of its segments' letter
             sequences as the new word (or call ``self.generate_word()`` when
             the state owns no segment) and copy the letter labels back;
          5. resample the letter HSMM parameters and the length distribution.
        """
        # states_index[k]:states_index[k+1] is the slice of hsmm.states_list
        # that belongs to word state k
        states_index = [0]
        hsmm = self.letter_hsmm
        hsmm.states_list = []
        for s in self.states_list:
            s.letterseq = np.ones(len(s.data), dtype=np.int64) * -1

        for state in range(self.state_dim):
            for s in self.states_list:
                for state2, (start, stop) in s.state_ranges:
                    if state == state2:
                        hsmm.add_data_parallel(s.data[start:stop])
                        # slice of s.letterseq, so the write-back below lands
                        # in the parent sequence
                        hsmm.states_list[-1].letterseq = s.letterseq[
                            start:stop]

            states_index.append(len(hsmm.states_list))

        hsmm.resample_states_parallel()
        likelihoods = hsmm.likelihoods()

        for state, bound in enumerate(zip(states_index[:-1],
                                          states_index[1:])):
            staff = range(*bound)
            if len(staff) == 0:
                self.word_list[state] = self.generate_word()
                continue

            candidates = []
            scores = []
            for idx in staff:
                rest = set(staff) - set([idx])
                word = hsmm.states_list[idx].stateseq_norep
                # score of this segment's word: its own likelihood plus the
                # block likelihood of the word under every other segment
                score = np.sum([
                    hsmm.states_list[other].likelihood_block_word(
                        0, len(hsmm.states_list[other].data), word)
                    for other in rest
                ]) + likelihoods[idx]
                scores.append(score)
                candidates.append(tuple(word))

            resample_state_flag = len(set(candidates)) > 1
            if resample_state_flag:
                # Scores are log-domain sums; shift by the maximum before
                # exponentiating so the weights cannot all underflow to zero.
                # The sampling proportions are unchanged.
                scores = np.asarray(scores)
                peak = np.amax(scores)
                if np.isfinite(peak):
                    weights = np.exp(scores - peak)
                else:
                    weights = np.exp(scores)
                word_idx = sample_discrete(weights)
                sampleseq = candidates[word_idx]
            else:
                sampleseq = candidates[0]

            self.word_list[state] = tuple(sampleseq)
            for idx in staff:
                s = hsmm.states_list[idx]
                s.letterseq[:] = s.stateseq

        hsmm.resample_trans_distn()
        hsmm.resample_init_state_distn()
        hsmm.resample_dur_distns()
        hsmm.resample_obs_distns()
        self.resample_length_dist()
Exemple #9
0
    def _sample_forwards_log(betal,trans_matrix,init_state_distn,log_likelihoods):
        """Sample a state sequence forwards using log-domain backward messages.

        At each step the sampling weights are the previous state's transition
        row (``init_state_distn`` for the first step) times
        ``exp(betal[t] + log_likelihoods[t])``, shifted by the maximum before
        exponentiating. States with zero prior weight are masked out. When no
        finite log weight remains, the prior row alone is used.

        Returns:
            int32 array with one state per row of ``log_likelihoods``.
        """
        num_steps = log_likelihoods.shape[0]
        sampled = np.empty(num_steps,dtype=np.int32)

        prior = init_state_distn
        for t in range(num_steps):
            log_weights = betal[t] + log_likelihoods[t]
            log_weights[prior == 0] = -np.inf
            if not np.any(np.isfinite(log_weights)):
                weights = prior
            else:
                weights = prior * np.exp(log_weights - np.amax(log_weights))
            sampled[t] = sample_discrete(weights)
            prior = trans_matrix[sampled[t]]

        return sampled
Exemple #10
0
    def generate_word(self, size):
        """Return a tuple of ``size`` letters sampled as a Markov chain.

        The chain starts from ``self.init_state_distn.pi_0`` and moves along
        the rows of ``self.trans_distn.A``.
        """
        distn = self.init_state_distn.pi_0
        letters = [None] * size
        for pos in range(size):
            letters[pos] = sample_discrete(distn)
            distn = self.trans_distn.A[letters[pos]]

        return tuple(letters)
Exemple #11
0
    def _sample_forwards_log(betal, trans_matrix, init_state_distn, log_likelihoods):
        """Draw a state sequence forwards from log-domain backward messages.

        For every time step the unnormalised weights are the current prior
        row (initially ``init_state_distn``, afterwards the transition row of
        the previously sampled state) multiplied by
        ``exp(betal[t] + log_likelihoods[t])`` after a max-shift. If every log
        weight is non-finite the prior row is sampled directly.

        Returns:
            int32 array of sampled states, one per row of ``log_likelihoods``.
        """
        T = log_likelihoods.shape[0]
        states = np.empty(T, dtype=np.int32)

        prior_row = init_state_distn
        for t in xrange(T):
            log_w = betal[t] + log_likelihoods[t]
            # states the prior rules out get zero weight
            log_w[prior_row == 0] = -np.inf
            has_finite = np.any(np.isfinite(log_w))
            if has_finite:
                states[t] = sample_discrete(prior_row * np.exp(log_w - np.amax(log_w)))
            else:
                states[t] = sample_discrete(prior_row)
            prior_row = trans_matrix[states[t]]

        return states
    def _generate(self,T):
        """Generate a state sequence covering at least ``T`` time steps.

        States are appended segment by segment: a label is sampled, a duration
        for it is drawn from ``self.dur`` (after resampling ``self.dur`` on
        the durations already recorded for that label in the model and in this
        sequence), and the label is repeated for that duration. Sets
        ``self.stateseq`` and ``self.T`` (the actual length, which may exceed
        the requested ``T``).
        """
        alpha = self.alpha_0
        betavec = self.beta.betavec
        model = self.model
        self.stateseq = np.array([])

        def draw_duration(label):
            # condition the duration distribution on every duration seen for
            # this label so far, then draw a new duration from it
            seen = combinedata((model._durs_withlabel(label),self._durs_withlabel(label)))
            self.dur.resample(seen)
            return self.dur.rvs()

        # first state: one of the occupied labels or (last slot) a new label
        ks = list(model._occupied()) + [None]
        firststateidx = sample_discrete(np.arange(len(ks)))
        new_label_slot = len(ks)-1
        firststate = self._new_label(ks) if firststateidx == new_label_slot else ks[firststateidx]

        firststate_dur = draw_duration(firststate)

        self.stateseq = np.ones(firststate_dur,dtype=int)*firststate
        t = firststate_dur

        # run a family-CRF (CRF with durations) forwards
        while t < T:
            ks = list(model._occupied() | self._occupied())
            betarest = 1-sum(betavec[k] for k in ks)
            fromto_counts = np.array([model._counts_fromto(self.stateseq[t-1],k)
                                            + self._counts_fromto(self.stateseq[t-1],k)
                                            for k in ks])
            # existing labels: prior mass plus transition counts, except that
            # the previous label itself gets zero (no self-transitions)
            weights = []
            for k,ft in zip(ks,fromto_counts):
                if k != self.stateseq[t-1]:
                    weights.append(alpha*betavec[k] + ft)
                else:
                    weights.append(0)
            # final slot: weight of opening a brand-new label
            weights.append(alpha*(1-betavec[self.stateseq[t-1]])*betarest)
            scores = np.array(weights)

            nextstateidx = sample_discrete(scores)
            if nextstateidx == scores.shape[0]-1:
                nextstate = self._new_label(ks)
            else:
                nextstate = ks[nextstateidx]

            # now get the duration of nextstate!
            nextstate_dur = draw_duration(nextstate)

            segment = np.ones(nextstate_dur,dtype=int)*nextstate
            self.stateseq = np.concatenate((self.stateseq,segment))

            t += nextstate_dur

        self.T = len(self.stateseq)
Exemple #13
0
    def generate(self):
        """Sample a word and return it as a tuple of letters.

        The length comes from ``self.letter_dur.rvs()`` (a falsy draw such as
        0 is replaced by 1). Letters follow a Markov chain started from
        ``self.init_dist.pi_0`` with transition rows ``self.letter_trans.A``.
        """
        length = self.letter_dur.rvs() or 1
        distn = self.init_dist.pi_0
        letters = []

        for _ in range(length):
            letter = sample_discrete(distn)
            letters.append(letter)
            distn = self.letter_trans.A[letter]

        return tuple(letters)
Exemple #14
0
    def generate_states(self):
        """Sample a Markov chain of ``self.T`` states.

        The chain starts from ``self.pi_0`` and follows the rows of
        ``self.trans_matrix``. The sequence is stored in ``self.stateseq`` and
        also returned as an int32 array.
        """
        T = self.T
        distn = self.pi_0
        trans = self.trans_matrix

        seq = np.zeros(T, dtype=np.int32)
        for t in xrange(T):
            seq[t] = sample_discrete(distn)
            # the next draw uses the row of the state just sampled
            distn = trans[seq[t]]

        self.stateseq = seq
        return seq
Exemple #15
0
    def _sample_backwards_normalized(alphan, trans_matrix_transpose):
        """Sample a state sequence backwards from forward messages.

        Starting at the last time step, each state is drawn with weights
        ``alphan[t]`` times the row of ``trans_matrix_transpose`` selected by
        the state sampled at ``t + 1`` (all ones for the last step).

        Returns:
            int32 array with one state per row of ``alphan``.
        """
        num_steps = alphan.shape[0]
        states = np.empty(num_steps, dtype=np.int32)

        potential = np.ones(trans_matrix_transpose.shape[0])
        for t in reversed(xrange(num_steps)):
            states[t] = sample_discrete(potential * alphan[t])
            potential = trans_matrix_transpose[states[t]]

        return states
Exemple #16
0
    def generate_states(self):
        """Draw ``self.T`` states from the Markov chain prior.

        Uses ``self.pi_0`` for the first state and the matching row of
        ``self.trans_matrix`` for each subsequent one. Stores the int32
        sequence in ``self.stateseq`` and returns it.
        """
        T = self.T
        current_distn = self.pi_0
        A = self.trans_matrix

        states = np.zeros(T, dtype=np.int32)
        for step in xrange(T):
            states[step] = sample_discrete(current_distn)
            current_distn = A[states[step]]

        self.stateseq = states
        return states
Exemple #17
0
def hsmm_sample_forwards_log(
    trans_potentials,
    initial_state_potential,
    cumulative_obs_potentials,
    dur_potentials,
    dur_survival_potentails,
    betal,
    betastarl,
    left_censoring=False,
    right_censoring=True,
):
    """Sample states and durations forwards from log-domain HSMM messages.

    Args:
        trans_potentials: callable; ``trans_potentials(t)[state]`` is the
            log-domain potential over the next state when leaving ``state``.
        initial_state_potential: log-domain potential over the first state.
        cumulative_obs_potentials: callable returning ``(obs, offset)`` for a
            start time ``t``; ``obs[:, state]`` is indexed by duration.
        dur_potentials: callable; ``dur_potentials(t)[:, state]`` is indexed
            by duration.
        dur_survival_potentails: accepted but not used by this function.
        betal, betastarl: backward messages, indexed ``[t, state]``.
        left_censoring: must be false.
        right_censoring: accepted but not used by this function.

    Returns:
        ``(stateseq, durations)``: an int32 array with one state per time
        step and a list with one duration per sampled segment.

    Raises:
        NotImplementedError: if ``left_censoring`` is true.
    """

    T, _ = betal.shape
    stateseq = np.empty(T, dtype=np.int32)
    durations = []

    t = 0

    if left_censoring:
        raise NotImplementedError
    else:
        nextstate_unsmoothed = initial_state_potential

    while t < T:
        ## sample the state
        nextstate_distn_log = nextstate_unsmoothed + betastarl[t]
        nextstate_distn = np.exp(nextstate_distn_log -
                                 logsumexp(nextstate_distn_log))
        assert nextstate_distn.sum() > 0
        state = sample_discrete(nextstate_distn)

        ## sample the duration
        dur_logpmf = dur_potentials(t)[:, state]
        obs, offset = cumulative_obs_potentials(t)
        obs, offset = obs[:, state], offset  # [state]
        durprob = np.random.random()

        # inverse-CDF sampling: subtract the probability of each duration
        # from a uniform draw until it is used up (or time/support runs out)
        dur = 0  # NOTE: always incremented at least once
        while durprob > 0 and dur < dur_logpmf.shape[0] and t + dur < T:
            p_d = np.exp(dur_logpmf[dur] + obs[dur] - offset +
                         betal[t + dur, state] - betastarl[t, state])

            assert not np.isnan(p_d)
            durprob -= p_d
            dur += 1

        stateseq[t:t + dur] = state
        durations.append(dur)

        t += dur
        # The next state must be drawn from the transition potential of the
        # state just left. Assigning it to any other name would leave every
        # segment sampled from the initial-state potential.
        if t < T:
            nextstate_unsmoothed = trans_potentials(t)[state]

    return stateseq, durations
Exemple #18
0
    def generate_states(self):
        """Sample ``self.T`` states from the model's Markov chain.

        The initial distribution is ``self.initial_distn.pi_0`` and the
        transition rows come from ``self.transition_distn.A``. The int32
        sequence is saved as ``self.stateseq`` and returned.
        """
        T = self.T
        seq = np.zeros(T,dtype=np.int32)
        row = self.initial_distn.pi_0
        trans = self.transition_distn.A

        for t in xrange(T):
            seq[t] = sample_discrete(row)
            row = trans[seq[t]]

        self.stateseq = seq
        return seq
Exemple #19
0
    def _sample_backwards_normalized(alphan, trans_matrix_transpose):
        """Draw a state sequence from last time step to first.

        The state at step ``t`` is sampled with weights ``alphan[t]``
        multiplied by the row of ``trans_matrix_transpose`` for the state
        already chosen at ``t + 1``; the final step uses a vector of ones.

        Returns:
            int32 array of length ``alphan.shape[0]``.
        """
        T = alphan.shape[0]
        sampled = np.empty(T, dtype=np.int32)

        weight_from_future = np.ones(trans_matrix_transpose.shape[0])
        t = T - 1
        while t > -1:
            sampled[t] = sample_discrete(weight_from_future * alphan[t])
            weight_from_future = trans_matrix_transpose[sampled[t]]
            t -= 1

        return sampled
Exemple #20
0
    def generate_states(self):
        """Sample a block-level state sequence over the changepoint segments.

        One entry of ``self.blockstateseq`` is produced per element of
        ``self.changepoints``. A state is sampled, then the number of
        consecutive blocks it covers is sampled with weights given by the
        state's duration pmf evaluated at the cumulative lengths of the
        remaining segments. Cached ``_stateseq_norep`` and
        ``_durations_censored`` are reset to ``None``.

        Raises:
            NotImplementedError: if ``self.left_censoring`` is set.
        """
        if self.left_censoring:
            raise NotImplementedError
        Tblock = len(self.changepoints)
        blockstateseq = self.blockstateseq = np.zeros(Tblock,dtype=np.int32)

        tblock = 0
        nextstate_distr = self.pi_0
        A = self.trans_matrix

        while tblock < Tblock:
            # sample the state
            state = sample_discrete(nextstate_distr)

            # compute possible duration info (indep. of state)
            possible_durations = self.segmentlens[tblock:].cumsum()

            # compute the pmf over those steps
            durprobs = self.dur_distns[state].pmf(possible_durations)
            # TODO censoring: the last possible duration isn't quite right
            durprobssum = durprobs.sum()

            if durprobssum == 0:
                # If no duration is possible, then pick the first duration.
                # Checked before normalising so we never compute 0/0.
                durprobs[0] = 1.0
                durprobs[1:] = 0.0
            else:
                durprobs /= durprobssum

            # sample it (index i means i+1 blocks)
            blockdur = sample_discrete(durprobs) + 1

            # set block sequence
            blockstateseq[tblock:tblock+blockdur] = state

            # set up next iteration
            tblock += blockdur
            nextstate_distr = A[state]

        self._stateseq_norep = None
        self._durations_censored = None
Exemple #21
0
    def generate_states(self):
        """Sample block states, one per changepoint segment.

        Fills ``self.blockstateseq`` (length ``len(self.changepoints)``).
        For each sampled state, the count of consecutive blocks it spans is
        drawn with weights from the state's duration pmf at the cumulative
        lengths of the remaining segments. Also clears the cached
        ``_stateseq_norep`` and ``_durations_censored``.

        Raises:
            NotImplementedError: if ``self.left_censoring`` is set.
        """
        if self.left_censoring:
            raise NotImplementedError
        Tblock = len(self.changepoints)
        blockstateseq = self.blockstateseq = np.zeros(Tblock,dtype=np.int32)

        tblock = 0
        nextstate_distr = self.pi_0
        A = self.trans_matrix

        while tblock < Tblock:
            # sample the state
            state = sample_discrete(nextstate_distr)

            # compute possible duration info (indep. of state)
            possible_durations = self.segmentlens[tblock:].cumsum()

            # compute the pmf over those steps
            durprobs = self.dur_distns[state].pmf(possible_durations)
            # TODO censoring: the last possible duration isn't quite right
            durprobssum = durprobs.sum()

            # Normalise only when there is mass to normalise; dividing first
            # would produce 0/0 = NaN (and a runtime warning) that then had
            # to be overwritten.
            if durprobssum == 0:
                # If no duration is possible, then pick the first duration
                durprobs[0] = 1.0
                durprobs[1:] = 0.0
            else:
                durprobs /= durprobssum

            # sample it (index i means i+1 blocks)
            blockdur = sample_discrete(durprobs) + 1

            # set block sequence
            blockstateseq[tblock:tblock+blockdur] = state

            # set up next iteration
            tblock += blockdur
            nextstate_distr = A[state]

        self._stateseq_norep = None
        self._durations_censored = None
    def _generate(self,T):
        """Generate a state sequence of length ``T`` and store it.

        Sets ``self.T`` and ``self.stateseq``. While sampling step ``t``,
        ``self.stateseq`` is kept equal to the prefix ``stateseq[:t]`` so the
        count/occupancy helpers called here see only states sampled so far.

        Each step scores every occupied label ``k`` with
        ``alpha*betavec[k] + counts(prev -> k)``, plus ``kappa`` when ``k``
        equals the previous state (self-transition bonus), and scores a new
        label with ``alpha * betarest``.
        """
        self.T = T
        alpha, kappa = self.alpha_0, self.kappa
        betavec = self.beta.betavec
        # builtin int is the dtype the removed np.int alias referred to
        stateseq = np.zeros(T,dtype=int)
        model = self.model
        self.stateseq = stateseq[:0]

        # NOTE: we have a choice of what state to start in; it's just a
        # definition choice that isn't specified in the HDP-HMM
        # Here, we choose just to sample from beta. Note that if this is the
        # first chain being sampled in this model, this will always sample
        # zero, since no states will be occupied.
        ks = list(model._occupied()) + [None]
        firststate = sample_discrete(np.arange(len(ks)))
        if firststate == len(ks)-1:
            stateseq[0] = self._new_label(ks)
        else:
            stateseq[0] = ks[firststate]

        # runs a CRF with fixed weights beta forwards
        for t in range(1,T):
            self.stateseq = stateseq[:t]
            prevstate = stateseq[t-1]
            ks = list(model._occupied() | self._occupied())
            betarest = 1-sum(betavec[k] for k in ks)
            # get the counts of new states coming out of our current state
            # going to all other states
            fromto_counts = np.array([model._counts_fromto(prevstate,k)
                                            + self._counts_fromto(prevstate,k)
                                            for k in ks])
            # for those states plus a new one, sample proportional to the
            # scores below. The kappa bonus goes to the *previous* state;
            # stateseq[t+1] has not been sampled yet and is out of bounds at
            # t == T-1.
            scores = np.array([(alpha*betavec[k] + (kappa if k == prevstate else 0) + ft)
                    for k,ft in zip(ks,fromto_counts)] + [alpha*betarest])
            nextstateidx = sample_discrete(scores)
            if nextstateidx == scores.shape[0]-1:
                stateseq[t] = self._new_label(ks)
            else:
                stateseq[t] = ks[nextstateidx]
        self.stateseq = stateseq
Exemple #23
0
    def sample_forwards(self,aBl,betal):
        """Sample ``self.stateseq`` forwards from log-domain messages.

        For each row ``t`` of ``aBl``, a state is drawn with weights equal to
        the current prior row (``self.initial_distn.pi_0`` first, then the row
        of ``self.transition_distn.A`` for the previous state) times
        ``exp(betal[t] + aBl[t])`` after subtracting the maximum.
        """
        num_steps = aBl.shape[0]
        sampled = np.zeros(num_steps,dtype=np.int32)
        prior_row = self.initial_distn.pi_0
        trans = self.transition_distn.A

        for t in xrange(num_steps):
            log_weights = betal[t] + aBl[t]
            # to enforce constraints in the trans matrix
            log_weights[prior_row == 0] = -np.inf
            shifted = log_weights - np.amax(log_weights)
            sampled[t] = sample_discrete(prior_row * np.exp(shifted))
            prior_row = trans[sampled[t]]

        self.stateseq = sampled
Exemple #24
0
def hsmm_sample_forwards_log(
    trans_potentials,
    initial_state_potential,
    cumulative_obs_potentials,
    dur_potentials,
    dur_survival_potentails,
    betal,
    betastarl,
    left_censoring=False,
    right_censoring=True,
):
    """Sample an HSMM state sequence and its segment durations forwards.

    Args:
        trans_potentials: callable; ``trans_potentials(t)[state]`` gives the
            log-domain potential over the state that follows ``state``.
        initial_state_potential: log-domain potential over the first state.
        cumulative_obs_potentials: callable returning ``(obs, offset)`` for
            start time ``t``; both are indexed by state (``obs[:, state]``,
            ``offset[state]``).
        dur_potentials: callable; ``dur_potentials(t)[:, state]`` is indexed
            by duration.
        dur_survival_potentails: accepted but not used by this function.
        betal, betastarl: backward messages, indexed ``[t, state]``.
        left_censoring: must be false.
        right_censoring: accepted but not used by this function.

    Returns:
        ``(stateseq, durations)``: int32 array of per-step states and a list
        of per-segment durations.

    Raises:
        NotImplementedError: if ``left_censoring`` is true.
    """

    T, _ = betal.shape
    stateseq = np.empty(T, dtype=np.int32)
    durations = []

    t = 0

    if left_censoring:
        raise NotImplementedError
    else:
        nextstate_unsmoothed = initial_state_potential

    while t < T:
        ## sample the state
        nextstate_distn_log = nextstate_unsmoothed + betastarl[t]
        nextstate_distn = np.exp(nextstate_distn_log - logsumexp(nextstate_distn_log))
        assert nextstate_distn.sum() > 0
        state = sample_discrete(nextstate_distn)

        ## sample the duration
        dur_logpmf = dur_potentials(t)[:, state]
        obs, offset = cumulative_obs_potentials(t)
        obs, offset = obs[:, state], offset[state]
        durprob = np.random.random()

        # walk the duration pmf, subtracting each probability from a uniform
        # draw until it is exhausted or we run out of support / time steps
        dur = 0  # NOTE: always incremented at least once
        while durprob > 0 and dur < dur_logpmf.shape[0] and t + dur < T:
            p_d = np.exp(dur_logpmf[dur] + obs[dur] - offset + betal[t + dur, state] - betastarl[t, state])

            assert not np.isnan(p_d)
            durprob -= p_d
            dur += 1

        stateseq[t : t + dur] = state
        durations.append(dur)

        t += dur
        # Feed the transition potential of the state just left into the next
        # state draw; otherwise every segment would be sampled from the
        # initial-state potential.
        if t < T:
            nextstate_unsmoothed = trans_potentials(t)[state]

    return stateseq, durations
Exemple #25
0
    def _sample_forwards_normalized(betan,trans_matrix,init_state_distn,log_likelihoods):
        """Sample a state sequence forwards using normalized backward messages.

        At step ``idx`` the weights are the prior row (``init_state_distn``
        first, then the transition row of the previous state) times
        ``betan[idx]`` times ``exp(log_likelihoods[idx])`` after a max-shift.
        ``log_likelihoods`` is not modified.

        Assumes ``betan`` has one row per time step, like ``log_likelihoods``
        -- TODO confirm against the backward-message routine.

        Returns:
            int32 array with one state per row of ``log_likelihoods``.
        """
        A = trans_matrix
        aBl = log_likelihoods
        T = aBl.shape[0]

        stateseq = np.empty(T,dtype=np.int32)

        nextstate_unsmoothed = init_state_distn
        for idx in range(T):
            # copy: aBl[idx] is a view, and masking it in place would write
            # -inf into the caller's log-likelihood array
            logdomain = aBl[idx].copy()
            logdomain[nextstate_unsmoothed == 0] = -np.inf
            # use this step's message only; the whole betan array would
            # broadcast the weights to two dimensions
            stateseq[idx] = sample_discrete(nextstate_unsmoothed * betan[idx] * np.exp(logdomain - np.amax(logdomain)))
            nextstate_unsmoothed = A[stateseq[idx]]

        return stateseq
Exemple #26
0
    def _sample_forwards_normalized(betan, trans_matrix, init_state_distn, log_likelihoods):
        """Sample a state sequence forwards using normalized backward messages.

        For every time step the sampling weights are the prior row (initially
        ``init_state_distn``, afterwards the transition row of the previously
        sampled state) times ``betan[idx]`` times the max-shifted
        ``exp(log_likelihoods[idx])``. The input arrays are left untouched.

        Assumes ``betan`` is indexed by time step along its first axis, like
        ``log_likelihoods`` -- TODO confirm against the caller.

        Returns:
            int32 array with one state per row of ``log_likelihoods``.
        """
        A = trans_matrix
        aBl = log_likelihoods
        T = aBl.shape[0]

        stateseq = np.empty(T, dtype=np.int32)

        nextstate_unsmoothed = init_state_distn
        for idx in xrange(T):
            # work on a copy so the -inf mask is not written back into the
            # caller's log_likelihoods (aBl[idx] alone is a view)
            logdomain = aBl[idx].copy()
            logdomain[nextstate_unsmoothed == 0] = -np.inf
            # betan must be indexed by the current step; passing the whole
            # array would make the weights two-dimensional
            stateseq[idx] = sample_discrete(nextstate_unsmoothed * betan[idx] * np.exp(logdomain - np.amax(logdomain)))
            nextstate_unsmoothed = A[stateseq[idx]]

        return stateseq
Exemple #27
0
    def sample_forwards(self, betal, betastarl):
        """Sample word states and their durations forwards in time.

        Resets and fills ``self.stateseq`` (one state per time step) and
        ``self.state_ranges`` (a list of ``(state, (start, stop))`` tuples,
        one per sampled segment).

        betal, betastarl: backward messages, indexed ``[t, state]``.
        """
        T = self.T
        A = self.A
        aD = self.aD
        stateseq = self.stateseq = np.zeros(T, dtype=np.int32)
        state_ranges = self.state_ranges = []
        idx = 0
        nextstate_unsmoothed = self.model.init_dist.pi_0
        while idx < T:
            # state weights: max-shifted exp(betastarl[idx]) times the prior row
            logdomain = betastarl[idx] - np.amax(betastarl[idx])
            nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
            # if the prior row zeroes out every state, fall back to the
            # message term alone
            if (nextstate_dist == 0.).all():
                nextstate_dist = np.exp(logdomain)

            state = sample_discrete(nextstate_dist)
            durprob = np.random.random()
            word = self.model.word_list[state]
            # start the search at index len(word) - 1; after the first
            # increment below, dur is at least len(word)
            dur = len(word) - 1

            # subtract duration probabilities from the uniform draw until it
            # is used up or the end of the sequence is reached
            while durprob > 0:
                p_d_prior = aD[dur, state] if dur < T else 1.
                assert not np.isnan(p_d_prior)
                assert p_d_prior >= 0

                # durations with zero prior weight are skipped without
                # evaluating the block likelihood
                if p_d_prior == 0:
                    dur += 1
                    continue

                if idx + dur < T:
                    loglikelihood = self.likelihood_block_word(
                        idx, idx + dur + 1, word)
                    mess_term = np.exp(loglikelihood +
                                       betal[idx + dur, state] -
                                       betastarl[idx, state])
                    p_d = mess_term * p_d_prior
                    assert not np.isnan(p_d)
                    durprob -= p_d
                    dur += 1
                else:
                    # segment would run past the end of the data: stop here
                    dur += 1
                    break

            assert dur > 0
            assert dur >= len(word)
            # the slice may extend past T; numpy truncates it
            stateseq[idx:idx + dur] = state
            state_ranges.append((state, (idx, idx + dur)))
            nextstate_unsmoothed = A[state]
            idx += dur
Exemple #28
0
    def sample_forwards(self, betal, betastarl):
        """Sample word states and censored durations forwards in time.

        Overwrites ``self._stateseq`` in place (via a slice) and sets
        ``self._stateseq_norep`` and ``self._durations_censored`` to int32
        arrays with one entry per sampled segment. ``self._letter_stateseq``
        is reset to -1.

        betal, betastarl: backward messages, indexed ``[t, state]``.
        """
        T = self.T
        aD = np.exp(self.aDl)
        log_trans_matrix = self.log_trans_matrix
        stateseq = self._stateseq[:]
        stateseq[:] = -1
        letter_stateseq = self._letter_stateseq[:]
        letter_stateseq[:] = -1
        stateseq_norep = []
        durations_censored = []
        t = 0
        nextstate_unsmoothed = self.pi_0
        while t < T:
            # state weights: max-shifted exp(betastarl[t]) times the prior row
            logdomain = betastarl[t] - betastarl[t].max()
            nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
            # fall back to the message term when the prior row rules out
            # every state
            if (nextstate_dist == 0.).all():
                nextstate_dist = np.exp(logdomain)

            state = sample_discrete(nextstate_dist)
            durprob = np.random.random()
            # per-duration message terms for this state, computed once
            cache_mess_term = np.exp(
                self.likelihood_block_word(t, T, self.model.word_list[state]) +
                betal[t:T, state] - betastarl[t, state])

            dur = 0
            while durprob > 0 and t + dur < T:
                p_d_prior = aD[dur, state]
                assert not np.isnan(p_d_prior)
                assert p_d_prior >= 0

                p_d = cache_mess_term[dur] * p_d_prior
                assert not np.isnan(p_d)
                durprob -= p_d
                dur += 1

            assert dur > 0
            assert dur >= len(self.model.word_list[state])
            stateseq[t:t + dur] = state
            # The prior for the next state is the transition row of the state
            # just left, in the probability domain to match self.pi_0 above.
            # nextstate_dist[state] would be a single scalar and would drop
            # the transition structure.
            # NOTE(review): assumes self.log_trans_matrix is the elementwise
            # log of the transition matrix -- confirm.
            nextstate_unsmoothed = np.exp(log_trans_matrix[state])
            t += dur

            stateseq_norep.append(state)
            durations_censored.append(dur)
        self._stateseq_norep = np.array(stateseq_norep, dtype=np.int32)
        self._durations_censored = np.array(durations_censored, dtype=np.int32)
Exemple #29
0
    def test(cls):
        """Ad-hoc visual check of concentration resampling from distinct counts.

        For 200 rounds, draws normalised gamma weights using
        ``truth.concentration``, samples from them, and passes the sample
        sizes and the number of distinct sampled values to ``infer.resample``.
        Prints the true concentration and the median of the collected
        ``infer.concentration`` values, then draws their histogram.
        """
        from matplotlib import pyplot as plt
        truth = cls(1.,1.)

        infer = cls(1.,1.)
        # parenthesised single-argument form prints the same on Python 2 and 3
        print(truth.concentration)
        blah = []
        sizes = [20]
        for itr in range(200):
            alldata = []
            for size in sizes:
                weights = stats.gamma.rvs(truth.concentration/50,size=50) # 50 \approx inf when #draws=20
                weights /= weights.sum()
                alldata.append(sample_discrete(weights,size=size))
            infer.resample(sample_numbers=np.array(sizes),total_num_distinct=len(set(np.concatenate(alldata))))
            blah.append(infer.concentration)

        print(np.median(blah))
        # NOTE(review): recent Matplotlib releases replaced `normed` with
        # `density`; switch once the minimum supported version allows it.
        plt.hist(blah,bins=25,normed=True)
Exemple #30
0
def hlm_sample_forwards_log(likelihood_block_word_func, trans_matrix, pi_0,
                            aDl, word_list, betal, betastarl, stateseq,
                            stateseq_norep, durations_censored):
    """Forward-sample a state sequence and its segment durations.

    Starting at ``t = 0`` the function repeatedly samples a state from
    ``exp(betastarl[t]) * nextstate_unsmoothed`` and then a duration for that
    state, until ``T = betal.shape[0]`` time steps are covered.

    Parameters
    ----------
    likelihood_block_word_func : callable
        Called as ``f(t, T, word_list[state])``; its result is added to
        ``betal[t:T, state]`` (presumably per-duration log-likelihoods --
        TODO confirm against the caller).
    trans_matrix : array
        Row ``trans_matrix[state]`` weights the next state.
    pi_0 : array
        Weights of the first state.
    aDl : array
        Exponentiated and indexed as ``[dur, state]`` (presumably log duration
        probabilities -- TODO confirm).
    word_list : sequence
        ``word_list[state]`` is passed to ``likelihood_block_word_func``; its
        length is a lower bound asserted on the sampled duration.
    betal, betastarl : array
        Messages indexed ``[t, state]``; used in the log domain.
    stateseq : array
        Overwritten in place (first filled with -1).
    stateseq_norep, durations_censored : list
        Appended to in place, one entry per sampled segment.

    Returns
    -------
    tuple
        ``(stateseq, stateseq_norep, durations_censored)`` where the last two
        are new ``int32`` arrays built from the (mutated) input lists.
    """
    stateseq[:] = -1
    T = betal.shape[0]
    t = 0
    aD = np.exp(aDl)
    nextstate_unsmoothed = pi_0
    while t < T:
        logdomain = betastarl[t] - betastarl[t].max()
        nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
        if (nextstate_dist == 0.).all():
            # numerical issue: every weight vanished, so there is nothing
            # meaningful to sample from; fall back to the messages alone
            nextstate_dist = np.exp(logdomain)

        state = sample_discrete(nextstate_dist)

        durprob = np.random.random()
        cache_mess_term = np.exp(
            likelihood_block_word_func(t, T, word_list[state]) +
            betal[t:T, state] - betastarl[t, state])

        # subtract per-duration probability mass from the uniform draw until
        # it is exhausted or the end of the sequence is reached
        dur = 0
        while durprob > 0 and t + dur < T:
            # p_d_prior = aD[dur, state] if t + dur < T else 1.
            p_d_prior = aD[dur, state]
            assert not np.isnan(p_d_prior)
            assert p_d_prior >= 0

            p_d = cache_mess_term[dur] * p_d_prior
            assert not np.isnan(p_d)
            durprob -= p_d
            dur += 1

        assert dur > 0
        assert dur >= len(word_list[state])
        stateseq[t:t + dur] = state
        nextstate_unsmoothed = trans_matrix[state]
        t += dur

        stateseq_norep.append(state)
        durations_censored.append(dur)
    stateseq_norep = np.array(stateseq_norep, dtype=np.int32)
    durations_censored = np.array(durations_censored, dtype=np.int32)
    return stateseq, stateseq_norep, durations_censored
Exemple #31
0
    def resample_words(self):
        """Resample every entry of ``self.word_list``, then merge duplicates.

        For each word index, the candidate words are the ``stateseq_norep``
        tuples of the letter-level states assigned to that word.  With no
        candidate a new word is generated; with a single distinct candidate it
        is taken directly; otherwise one candidate is sampled with weight
        ``exp(score)``, where the score of a candidate adds the owning state's
        ``log_likelihood()`` to the last element of
        ``likelihood_block_word(candidate)`` of every other state.

        Afterwards any word equal to an earlier word is merged into the
        earlier index in all ``self.states_list`` sequences and replaced by a
        newly generated word.
        """
        for word_idx in range(self.num_states):
            hsmm_states = [letter_state for letter_state in self.letter_hsmm.states_list if letter_state.word_idx == word_idx]
            candidates = [tuple(letter_state.stateseq_norep) for letter_state in hsmm_states]
            if not candidates:
                self._generate_word_and_set_at(word_idx)
                continue
            unique_candidates = list(set(candidates))
            if len(unique_candidates) == 1:
                self.word_list[word_idx] = unique_candidates[0]
                continue
            # position of each candidate inside unique_candidates
            candidate_pos = {candi: pos for pos, candi in enumerate(unique_candidates)}
            ref_array = np.array([candidate_pos[candi] for candi in candidates])
            cache_score = np.empty((len(unique_candidates), len(candidates)))
            likelihoods = np.array([letter_state.log_likelihood() for letter_state in hsmm_states])
            range_tmp = list(range(len(candidates)))

            for candi_idx, candi in enumerate(unique_candidates):
                tmp = range_tmp[:]
                if (ref_array == candi_idx).sum() == 1:
                    # candidate owned by exactly one state: that state's own
                    # entry is never read (the diagonal is zeroed below)
                    tmp.remove(np.where(ref_array == candi_idx)[0][0])
                for tmp_idx in tmp:
                    cache_score[candi_idx, tmp_idx] = hsmm_states[tmp_idx].likelihood_block_word(candi)[-1]
            cache_scores_matrix = cache_score[ref_array]
            for i in range_tmp:
                # a state's own contribution comes from `likelihoods` instead
                cache_scores_matrix[i, i] = 0.0
            scores = cache_scores_matrix.sum(axis=1) + likelihoods

            # Shift by the largest score before exponentiating so that very
            # negative log-scores do not all underflow to zero; the weights
            # are only used up to a common scale factor.
            top_score = scores.max()
            if np.isfinite(top_score):
                scores = scores - top_score
            weights = np.exp(scores)

            assert (weights >= 0).all(), cache_scores_matrix
            sampled_candi_idx = sample_discrete(weights)
            self.word_list[word_idx] = candidates[sampled_candi_idx]

        # Merge same letter seq which has different id.
        for i, word in enumerate(self.word_list):
            if word in self.word_list[:i]:
                existed_id = self.word_list[:i].index(word)
                for word_state in self.states_list:
                    stateseq, stateseq_norep = word_state.stateseq, word_state.stateseq_norep
                    word_state.stateseq[stateseq == i] = existed_id
                    word_state.stateseq_norep[stateseq_norep == i] = existed_id
                    # NOTE(review): the word is regenerated once per entry of
                    # states_list (and never when that list is empty) --
                    # confirm this is intended rather than once per duplicate.
                    self._generate_word_and_set_at(i)
Exemple #32
0
    def generate(self, limit_len=3):
        """Sample ``limit_len`` words and expand them into per-frame sequences.

        A chain of word states is drawn first (initial weights
        ``self.init_dist.pi_0``, then rows of ``self.trans_dists.A``).  Every
        letter of every drawn word then gets a duration from
        ``self.dur_distns[letter]`` (a falsy draw is replaced by 1) and that
        many observations from ``self.obs_distns[letter]``.

        Returns the result of mapping ``np.concatenate`` over the collected
        (word, letter, observation) segment lists, in that order.
        """
        current_dist = self.init_dist.pi_0
        transitions = self.trans_dists.A

        words = []
        for _ in range(limit_len):
            drawn = sample_discrete(current_dist)
            words.append(drawn)
            current_dist = transitions[drawn]

        word_segments, letter_segments, obs_segments = [], [], []
        for word in words:
            for letter in self.word_list[word]:
                length = self.dur_distns[letter].rvs() or 1
                obs_segments.append(self.obs_distns[letter].rvs(size=length))
                letter_segments.append([letter] * length)
                word_segments.append([word] * length)

        segments = (word_segments, letter_segments, obs_segments)
        return map(np.concatenate, segments)
Exemple #33
0
    def generate_states(self):
        """Sample a state sequence of length ``self.T`` into ``self.stateseq``.

        States are drawn from ``self.pi_0`` first and from the matching row of
        ``self.trans_matrix`` afterwards; each state is held for a duration
        drawn from ``self.dur_distns[state]``.

        Raises NotImplementedError when ``self.left_censoring`` is set.
        """
        if self.left_censoring:
            raise NotImplementedError

        pos = 0
        weights = self.pi_0
        transitions = self.trans_matrix
        seq = np.empty(self.T,dtype=np.int32)

        while pos < self.T:
            current = sample_discrete(weights)
            span = self.dur_distns[current].rvs()
            # the last segment may extend past T; slicing truncates it
            seq[pos:pos+span] = current
            weights = transitions[current,]
            pos += span

        self.stateseq = seq
Exemple #34
0
 def rvs_given_less_than(self,x,num):
     '''
     sample using the pmf evaluated on 1, ..., x-1 as weights; `num` is passed
     through to sample_discrete and the result is shifted up by one
     '''
     support = np.arange(1,x)
     weights = self.pmf(support)
     drawn = sample_discrete(weights,num)
     # support starts at 1 (presumably sample_discrete returns 0-based indices)
     return drawn+1
Exemple #35
0
 def rvs_given_less_than(self, x, num):
     '''
     Sample with weights self.pmf(1), ..., self.pmf(x - 1).

     `num` is forwarded to sample_discrete; one is added to its result because
     the evaluated support begins at 1.
     '''
     values_below_x = np.arange(1, x)
     return 1 + sample_discrete(self.pmf(values_below_x), num)
Exemple #36
0
 def rvs(self,size=[]):
     '''
     sample from self.distn via sample_discrete; `size` is forwarded unchanged
     '''
     weights = self.distn
     return sample_discrete(weights,size=size)
Exemple #37
0
    def sample_forwards(self,betal,betastarl):
        """Forward-sample states and durations given the backward messages.

        ``betal`` and ``betastarl`` are indexed ``[time, state]`` and used in
        the log domain.  Starting at time 0, a state is sampled from
        ``exp(betastarl[idx]) * nextstate_unsmoothed`` and then a duration is
        sampled by subtracting per-duration probabilities from a uniform draw.
        If the candidate duration reaches the end of the data, the remaining
        duration is sampled from that state's duration prior instead.

        Sets ``self.stateseq`` (per-time-step states), ``self.stateseq_norep``
        (one state per segment) and ``self.durations`` (one duration per
        segment, possibly running past ``self.T`` for the last segment).
        """
        stateseq = self.stateseq = np.zeros(self.T,dtype=np.int32)
        durations = []
        stateseq_norep = []

        idx = 0
        A = self.transition_distn.A
        nextstate_unsmoothed = self.initial_distn.pi_0

        # apmf[state, d-1] caches the prior pmf of duration d
        apmf = np.zeros((self.state_dim,self.T))
        arg = np.arange(1,self.T+1)
        for state_idx, dur_distn in enumerate(self.dur_distns):
            apmf[state_idx] = dur_distn.pmf(arg)

        while idx < self.T:
            logdomain = betastarl[idx] - np.amax(betastarl[idx])
            nextstate_distr = np.exp(logdomain) * nextstate_unsmoothed
            if (nextstate_distr == 0.).all():
                # this is a numerical issue; no good answer, so we'll just follow the messages.
                nextstate_distr = np.exp(logdomain)
            state = sample_discrete(nextstate_distr)
            assert len(stateseq_norep) == 0 or state != stateseq_norep[-1]

            durprob = random()
            dur = 0 # always incremented at least once
            while durprob > 0:
                assert dur < 2*self.T # hacky infinite loop check
                p_d_marg = apmf[state,dur] if dur < self.T else 1. # note funny indexing: dur variable is 1 less than actual dur we're considering
                assert not np.isnan(p_d_marg)
                assert p_d_marg >= 0
                if p_d_marg == 0:
                    dur += 1
                    continue

                if idx+dur >= self.T:
                    # we're out of data, so we need to sample a duration
                    # conditioned on having lasted at least this long. the
                    # likelihood contributes the same to all possibilities, so
                    # we can just sample from the prior (conditioned on it being
                    # at least this long).
                    tail_support = np.arange(dur+1,2*self.T) # 2*T is just a guessed upper bound, +1 because 'dur' is one less than the duration we're actually considering
                    # use the duration distribution of the sampled state, not
                    # the variable left over from the apmf loop above
                    remaining = self.dur_distns[state].pmf(tail_support)
                    # total duration is tail_support[k] == dur + 1 + k
                    dur += sample_discrete(remaining) + 1
                    break

                mess_term = np.exp(self.likelihood_block_state(idx,idx+dur+1,state) + betal[idx+dur,state] - betastarl[idx,state]) # TODO unnecessarily slow for subhmms
                p_d = mess_term * p_d_marg
                assert not np.isnan(p_d)
                durprob -= p_d
                dur += 1

            assert dur > 0

            stateseq[idx:idx+dur] = state
            stateseq_norep.append(state)
            assert len(stateseq_norep) < 2 or stateseq_norep[-1] != stateseq_norep[-2]
            durations.append(dur)

            nextstate_unsmoothed = A[state,:]

            idx += dur

        self.durations = np.array(durations,dtype=np.int32)
        self.stateseq_norep = np.array(stateseq_norep,dtype=np.int32)
Exemple #38
0
 def rvs(self,size=[]):
     '''
     sample from self.pi_0 via sample_discrete; `size` is forwarded unchanged
     '''
     initial_weights = self.pi_0
     return sample_discrete(initial_weights,size=size)