def __call__(self, *args, **kwargs):
    """Fit a seeded two-state GaussianHMM with this instance's options and check it.

    The positional and keyword arguments are accepted but not used.  The data
    ``X``, the reference parameters ``means``, ``vars`` and ``transmat`` and
    the random state ``rs`` are looked up in the enclosing scope.
    """
    options = dict(
        n_states=2,
        init_algo=self.init_algo,
        reversible_type=self.reversible_type,
        thresh=1e-4,
        n_iter=30,
        random_state=rs,
    )
    hmm = GaussianHMM(**options)
    hmm.fit(X)

    # Compare the fitted parameters against the reference ones.
    validate_timeseries(means, vars, transmat, hmm, 0.1, 0.05)

    # The last log-probability recorded during fitting must agree with a
    # fresh score of the same data.
    logprob_gap = abs(hmm.fit_logprob_[-1] - hmm.score(X))
    assert logprob_gap < 0.5
def __call__(self, *args, **kwargs):
    """Fit a two-state GaussianHMM with this instance's options and check it.

    The positional and keyword arguments are accepted but not used.  The data
    ``X`` and the reference parameters ``means``, ``vars`` and ``transmat``
    are looked up in the enclosing scope.  No random state is passed to the
    model.
    """
    options = dict(
        n_states=2,
        init_algo=self.init_algo,
        reversible_type=self.reversible_type,
        thresh=1e-4,
        n_iter=30,
    )
    hmm = GaussianHMM(**options)
    hmm.fit(X)

    # Compare the fitted parameters against the reference ones.
    validate_timeseries(means, vars, transmat, hmm, 0.1, 0.05)

    # The last log-probability recorded during fitting must agree with a
    # fresh score of the same data.
    logprob_gap = abs(hmm.fit_logprob_[-1] - hmm.score(X))
    assert logprob_gap < 0.5
def test_3():
    """Fit 3-state HMMs for every option combination and validate each fit.

    Twenty time series are generated from a fixed 3-state reference model;
    a GaussianHMM is then fitted for each (init_algo, reversible_type) pair
    and checked against the reference parameters.
    """
    transmat = np.array([[0.2, 0.3, 0.5],
                         [0.4, 0.4, 0.2],
                         [0.8, 0.2, 0.0]])
    means = np.array([[0.0], [10.0], [5.0]])
    variances = np.array([[1.0], [2.0], [0.3]])
    X = [create_timeseries(means, variances, transmat) for _ in range(20)]

    # Same iteration order as two nested loops: init_algo outer,
    # reversible_type inner.
    option_pairs = [
        (algo, rev_type)
        for algo in ('kmeans', 'GMM')
        for rev_type in ('mle', 'transpose')
    ]
    for algo, rev_type in option_pairs:
        hmm = GaussianHMM(
            n_states=3,
            init_algo=algo,
            reversible_type=rev_type,
            thresh=1e-4,
            n_iter=30,
        )
        hmm.fit(X)
        validate_timeseries(means, variances, transmat, hmm, 0.1, 0.1)
        logprob_gap = abs(hmm.fit_logprob_[-1] - hmm.score(X))
        assert logprob_gap < 0.5
def makeHMM(Trajectories, topology):
    """Fit an 8-state GaussianHMM on alpha-carbon superposition features.

    Parameters
    ----------
    Trajectories : str
        Glob pattern selecting the trajectory files; matches are processed
        in sorted order.
    topology : str
        Path handed to ``md.load_prmtop`` to obtain the topology.

    Returns
    -------
    The fitted GaussianHMM.  Its ``timescales_`` are printed before returning.
    """
    top = md.load_prmtop(topology)
    ca_indices = [atom.index for atom in top.atoms if atom.name == 'CA']

    paths = sorted(glob(Trajectories))
    # The first frame of the first file is the superposition reference.
    reference = md.load_frame(paths[0], 0, top=top)
    featurizer = SuperposeFeaturizer(ca_indices, reference)

    # Featurize every file in chunks of 100 frames; each chunk becomes one
    # entry of the dataset.
    dataset = [
        featurizer.partial_transform(chunk)
        for path in paths
        for chunk in md.iterload(path, chunk=100, top=top)
    ]

    model = GaussianHMM(n_states=8)
    model.fit(dataset)
    print(model.timescales_)
    return model
def test_pipeline():
    """Run a featurizer + GaussianHMM Pipeline end to end on alanine dipeptide.

    The test only checks that ``fit_predict`` and ``summarize`` run without
    raising; no values are asserted.
    """
    trajs = AlanineDipeptide().get_cached().trajectories
    backbone = trajs[0].topology.select('backbone')

    # The first frame of the first trajectory is the superposition reference.
    steps = [
        ('diheds', SuperposeFeaturizer(backbone, trajs[0][0])),
        ('hmm', GaussianHMM(n_states=4)),
    ]
    pipeline = Pipeline(steps)

    # The prediction itself is not inspected.
    pipeline.fit_predict(trajs)
    pipeline.named_steps['hmm'].summarize()
def hmm_arbitrary(ic_slice, n_dims, n_clusters, n_obs):
    """Fit one GHMM with a fixed number of states and score it.

    Parameters
    ----------
    ic_slice
        Data passed unchanged to ``GHMM.fit``.
    n_dims : int
        Number of dimensions, used in the penalty term of the score.
    n_clusters : int
        Number of hidden states of the model.
    n_obs : number
        Number of observations, used in the penalty term of the score.

    Returns
    -------
    (hmm, BIC_hmm)
        The fitted model and its BIC-like score:
        ``(1 + 2 * n_dims) * n_clusters**p * log(n_obs) - 2 * logprob``,
        with ``p = 2`` when ``n_obs > 10000`` and ``p = 1`` otherwise.
    """
    print("Trying {:d} clusters...".format(n_clusters))
    hmm = GHMM(n_states=n_clusters, reversible_type='transpose',
               thresh=1e-4, init_algo='GMM', timing=True)
    print("Running fit...")
    hmm.fit(ic_slice)

    # The penalty grows quadratically with the state count for large data
    # sets and linearly otherwise.
    exponent = 2 if n_obs > 10000 else 1
    penalty = ((1.0 + 2.0 * float(n_dims))
               * float(n_clusters) ** exponent
               * math.log(n_obs))

    # Score with the LAST entry of fit_logprob_, i.e. the log-likelihood the
    # fit ended on.  Entry 0 belongs to the first iteration and would score
    # a model that has barely been optimised.
    BIC_hmm = penalty - 2.0 * hmm.fit_logprob_[-1]
    return hmm, BIC_hmm
def test_ala2():
    """Smoke-test a 4-state GaussianHMM on the cached alanine dipeptide data.

    Nothing fancy: this mostly makes sure the code runs without erroring
    out, then checks the number and rough magnitude of the timescales.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology
    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)

    # The comparison must sit OUTSIDE len(): len(timescales_ == 3) is the
    # length of a boolean array and is truthy for any non-empty array, so
    # the old assertion could never fail on a wrong timescale count.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
def test_pickle():
    """Round-trip a fitted HMM through pickle and compare its predictions.

    Only the log-probability returned by ``predict`` is compared between the
    original and the restored model.
    """
    trajs = AlanineDipeptide().get_cached().trajectories
    atom_indices = trajs[0].topology.select('symbol C or symbol O or symbol N')
    sequences = SuperposeFeaturizer(atom_indices, trajs[0][0]).transform(trajs)

    fitted = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    fitted.fit(sequences)
    logprob, _ = fitted.predict(sequences)

    # Dump to a temporary file, rewind, and load the model back.
    with tempfile.TemporaryFile() as savefile:
        pickle.dump(fitted, savefile)
        savefile.seek(0, 0)
        restored = pickle.load(savefile)

    logprob2, _ = restored.predict(sequences)
    assert (logprob == logprob2)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~        HIDDEN MARKOV MODEL     ~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# For each state count and lag time, fit a GaussianHMM on strided copies of
# `sequences`, record the two leading timescales, then plot them against
# the lag time (one subplot per state count).
n_states = [3, 5]
lag_times = [1, 10, 20, 30, 40]
hmm_ts0 = {}
hmm_ts1 = {}

for n in n_states:
    hmm_ts0[n] = []
    hmm_ts1[n] = []
    for lag_time in lag_times:
        # One strided sub-sequence per starting offset, so every frame of
        # every sequence is used exactly once.
        strided_data = [
            s[i::lag_time]
            for s in sequences
            for i in range(lag_time)
        ]
        hmm = GaussianHMM(n_states=n, n_init=1).fit(strided_data)
        # Multiply the model's timescales by the stride to report them in
        # units of the original frames.
        timescales = hmm.timescales_ * lag_time
        hmm_ts0[n].append(timescales[0])
        hmm_ts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s'
              % (n, lag_time, timescales))

figure(figsize=(14, 3))
for i, n in enumerate(n_states):
    subplot(1, len(n_states), 1 + i)
    plot(lag_times, hmm_ts0[n])
    plot(lag_times, hmm_ts1[n])
    # Only the left-most panel gets a y-axis label.
    if i == 0:
        ylabel('Relaxation Timescale')
    xlabel('Lag Time')
def make_hmm_fullauto(ic_projs, equil_dists, n_comp, n_obs, resume):
    """Scan dimensionalities and state counts for the best-scoring GHMM.

    For each dimensionality ``n_dims`` (starting at ``equil_dists`` and going
    up to ``n_comp``) the first ``n_dims`` columns of every projection are
    kept and GHMMs with 1, 2, 3, ... states are fitted.  Each fit gets a
    BIC-like score

        (1 + 2 * n_dims) * n_clusters**p * log(n_obs) - 2 * fit_logprob_[-1]

    with ``p = 2`` when ``n_obs > 10000`` and ``p = 1`` otherwise.  The state
    scan stops once at least 10 state counts were tried and the score either
    failed to improve on the previous one, or on the best one, three times.
    The best model of each dimensionality is then compared with the current
    optimum through its slowest timescale; the scan over dimensionalities
    stops at ``n_comp`` or after too many non-improving dimensionalities.

    Parameters
    ----------
    ic_projs : sequence of 2-D arrays
        Projected data; sliced as ``proj[:, :n_dims]``.
    equil_dists : int
        First dimensionality to try.  Swapped with ``n_comp`` if larger.
    n_comp : int
        Last dimensionality to try.
    n_obs : number
        Number of observations used in the penalty term.
    resume : sequence or None
        If not None, ``(n_dims, n_clusters)`` to restart from; it only
        affects the first dimensionality that is evaluated.

    Returns
    -------
    all_hmms : list
        Best model found for each evaluated dimensionality.
    all_BICs : list of ndarray
        Scores of every state count tried, per dimensionality.
    all_BIC_mins : list of int
        Index of the best score inside the matching ``all_BICs`` entry.
    min_dim : int
        Dimensionality of the current optimum.
    pos : int
        Index of the current optimum inside ``all_hmms``.
    all_dims : ndarray
        ``arange(equil_dists, last evaluated dimensionality + 1)``.

    Raises
    ------
    Exception
        If ``min_dim`` is still 0 after the scan.
    """
    if equil_dists > n_comp:
        # Bounds given in the wrong order: swap them.  (The previous
        # temp-variable swap assigned n_comp from the already-overwritten
        # equil_dists, so the upper bound was silently lost.)
        equil_dists, n_comp = n_comp, equil_dists

    all_hmms = []
    all_BICs = []
    all_BIC_mins = []
    min_dim = 0
    pos = 0
    dim_strike = 1
    n_dims = equil_dists - 1

    while n_dims <= n_comp:
        n_dims += 1
        n_clusters = 0
        if resume is not None:
            print("Resuming with {:d} dimensions and {:d} clusters.".format(
                int(resume[0]), int(resume[1])))
            n_dims = int(resume[0])
            # The state scan pre-increments, so start one below.
            n_clusters = int(resume[1]) - 1
            resume = None

        print("Trying with {:d} dimensions...\n".format(n_dims))
        ic_slice = [proj[:, :n_dims] for proj in ic_projs]

        # Scores are collected in a plain list; the old np.zeros(1) buffer
        # used the value 0.0 as an "empty" marker, which a genuine score of
        # 0.0 would have been mistaken for.
        bics = []
        bic_min = 0
        best_hmm = None
        strike_max = 0
        strike_min = 0
        while True:
            n_clusters += 1
            print("Trying {:d} clusters...".format(n_clusters))
            hmm = GHMM(n_states=n_clusters, reversible_type='transpose',
                       thresh=1e-4, init_algo='GMM', timing=True)
            print("Running fit...")
            hmm.fit(ic_slice)
            print(hmm.fit_logprob_)

            exponent = 2 if n_obs > 10000 else 1
            BIC_hmm = ((1.0 + 2.0 * float(n_dims))
                       * float(n_clusters) ** exponent
                       * math.log(n_obs)
                       - 2.0 * hmm.fit_logprob_[-1])
            bics.append(BIC_hmm)

            if len(bics) == 1:
                best_hmm = deepcopy(hmm)
            else:
                # strike_max counts consecutive scores that did not improve
                # on the immediately preceding one.
                if bics[-1] >= bics[-2]:
                    print("Strike max")
                    print("Current BIC:")
                    print(BIC_hmm)
                    print("Current min:")
                    print(bics[bic_min])
                    strike_max += 1
                else:
                    print("Reset max")
                    strike_max = 0

                # strike_min counts scores that did not beat the best one,
                # except when the previous score was itself still improving
                # on its predecessor ("carry").
                if bics[-1] >= bics[bic_min]:
                    if len(bics) > 2 and bics[-2] <= bics[-3]:
                        print("Carry min")
                        print("Current BIC:")
                        print(BIC_hmm)
                        print("Current min:")
                        print(bics[bic_min])
                    else:
                        print("Strike min")
                        print("Current BIC:")
                        print(BIC_hmm)
                        print("Current min:")
                        print(bics[bic_min])
                        strike_min += 1
                else:
                    print("Reset min")
                    print("Current BIC:")
                    print(BIC_hmm)
                    strike_min = 0
                    bic_min = len(bics) - 1
                    best_hmm = deepcopy(hmm)

            if n_clusters >= 10 and (strike_max >= 3 or strike_min >= 3):
                print("Reached expected true minimum\n")
                break
            print("\n")

        print("Pos :\n{:d}".format(pos))
        is_first = not all_hmms
        all_hmms.append(deepcopy(best_hmm))
        all_BICs.append(np.asarray(bics, dtype=float))
        all_BIC_mins.append(bic_min)

        if is_first:
            # Nothing to compare against yet: the first dimensionality is
            # the provisional optimum.
            min_dim = n_dims
            continue

        newest_ts = all_hmms[-1].timescales_[0]
        optimum_ts = all_hmms[pos].timescales_[0]
        print("Current slowest timescale :")
        print(newest_ts)
        print("Current optimal slowest timescale :")
        print(optimum_ts)
        if newest_ts < optimum_ts:
            print("Strike on dimension.")
            dim_strike += 1
        elif newest_ts > optimum_ts:
            print("New optimum found at {:d} dimensions".format(n_dims))
            dim_strike = 1
            min_dim = n_dims
            pos = len(all_hmms) - 1

        if n_dims == n_comp:
            print("End of loop. Optimal model selected")
            break
        if dim_strike > 3:
            print("Maximal strike reached.")
            break

    # n_dims is the last dimensionality that was evaluated, whichever way the
    # loop ended.  Previously the upper bound was only set on the "maximal
    # strike" exit, so every other exit returned an empty all_dims.
    all_dims = np.arange(equil_dists, n_dims + 1)

    if min_dim == 0:
        raise Exception("Texte.")
    return all_hmms, all_BICs, all_BIC_mins, min_dim, pos, all_dims
def make_hmm(ic_slice, dimension, dims):
    """Scan the number of states of a GHMM and keep the best-scoring model.

    GHMMs with 1, 2, 3, ... states are fitted on ``ic_slice`` and scored with

        (1 + 2 * dims) * n_clusters**p * log(tot_pop) - 2 * fit_logprob_[-1]

    with ``p = 2`` when ``tot_pop > 10000`` and ``p = 1`` otherwise, where
    ``tot_pop`` comes from ``md_populations``.  The scan stops once at least
    10 state counts were tried and the score either failed to improve on the
    previous one, or on the best one, three times.

    Parameters
    ----------
    ic_slice
        Data passed to ``md_populations`` and to ``GHMM.fit``.
    dimension
        Passed through to ``md_populations`` and ``hmm_populations``.
    dims : number
        Number of dimensions used in the penalty term.

    Returns
    -------
    (best_best_hmm, ranges, populations, best_best_hmm_popl, BICs)
        The best model, the ``ranges`` and ``populations`` returned by
        ``md_populations``, the ``hmm_populations`` result of the best model,
        and an array with the score of every state count tried.
    """
    populations, ranges, min_pop, tot_pop = md_populations(ic_slice, dimension)

    # Number of fits attempted per state count; the best of them is kept.
    n_tries = 1

    n_clusters = 0
    best_best_hmm = None
    best_best_hmm_popl = None
    # Scores are collected in a plain list; the old np.zeros(1) buffer used
    # the value 0.0 as an "empty" marker, which a genuine score of 0.0 would
    # have been mistaken for.
    bics = []
    bic_min = 0
    strike_max = 0
    strike_min = 0

    while True:
        n_clusters += 1
        print("Trying {:d} clusters...".format(n_clusters))
        hmm = GHMM(n_states=n_clusters, reversible_type='transpose',
                   thresh=1e-4, init_algo='GMM', timing=True)

        # Penalty terms for exponents 1, 2 and 3 of the state count; all
        # three are printed for diagnostics, only the first two are used.
        penalties = [
            (1.0 + 2.0 * float(dims)) * float(n_clusters) ** power
            * math.log(tot_pop)
            for power in (1, 2, 3)
        ]
        penalty = penalties[1] if tot_pop > 10000.0 else penalties[0]

        best_hmm = None
        best_hmm_popl = None
        best_BIC = None
        for tries in range(n_tries):
            print("Run {:d}...".format(tries + 1))
            hmm.fit(ic_slice)
            populations_hmm = hmm_populations(hmm, dimension, ranges, min_pop)

            # Score with the LAST entry of fit_logprob_, i.e. the
            # log-likelihood the fit ended on, not the first iteration's.
            BIC_hmm = penalty - 2.0 * hmm.fit_logprob_[-1]

            if best_hmm is None or BIC_hmm < best_BIC:
                print("Better fit hmm found with {:d} clusters".format(
                    n_clusters))
                print("BIC :")
                print(BIC_hmm)
                print("Penalty :")
                for value in penalties:
                    print(value)
                if best_hmm is not None:
                    print("Previous minimum :")
                    print(best_BIC)
                best_hmm = deepcopy(hmm)
                best_hmm_popl = populations_hmm
                best_BIC = BIC_hmm

        bics.append(best_BIC)

        if len(bics) == 1:
            best_best_hmm = deepcopy(best_hmm)
            best_best_hmm_popl = best_hmm_popl
        else:
            # strike_max counts consecutive scores that did not improve on
            # the immediately preceding one.
            if bics[-1] >= bics[-2]:
                print("Strike max")
                strike_max += 1
            else:
                print("Reset max")
                strike_max = 0

            # strike_min counts scores that did not beat the best one,
            # except when the previous score was itself still improving on
            # its predecessor ("carry").
            if bics[-1] >= bics[bic_min]:
                if len(bics) > 2 and bics[-2] <= bics[-3]:
                    print("Carry min")
                else:
                    print("Strike min")
                    strike_min += 1
            else:
                print("Reset min")
                strike_min = 0
                bic_min = len(bics) - 1
                best_best_hmm = deepcopy(best_hmm)
                best_best_hmm_popl = best_hmm_popl

        if n_clusters >= 10 and (strike_max >= 3 or strike_min >= 3):
            print("Reached expected true minimum")
            break
        print("\n")

    BICs = np.asarray(bics, dtype=float)
    return best_best_hmm, ranges, populations, best_best_hmm_popl, BICs