Esempio n. 1
0
 def __call__(self, *args, **kwargs):
     """Fit a two-state ``GaussianHMM`` on ``X`` and sanity-check the result.

     The model is configured from ``self.init_algo`` and
     ``self.reversible_type`` and seeded with ``rs``.  ``X``, ``means``,
     ``vars``, ``transmat`` and ``rs`` are read from the enclosing scope.
     Positional and keyword arguments are accepted but ignored.
     """
     hmm_options = {
         'n_states': 2,
         'init_algo': self.init_algo,
         'reversible_type': self.reversible_type,
         'thresh': 1e-4,
         'n_iter': 30,
         'random_state': rs,
     }
     model = GaussianHMM(**hmm_options)
     model.fit(X)

     # The two trailing numbers are presumably tolerances -- confirm against
     # validate_timeseries.
     validate_timeseries(means, vars, transmat, model, 0.1, 0.05)

     # The last recorded fit log-probability must agree with a fresh score.
     final_logprob = model.fit_logprob_[-1]
     assert abs(final_logprob - model.score(X)) < 0.5
Esempio n. 2
0
 def __call__(self, *args, **kwargs):
     """Fit an unseeded two-state ``GaussianHMM`` on ``X`` and sanity-check it.

     The model is configured from ``self.init_algo`` and
     ``self.reversible_type``.  ``X``, ``means``, ``vars`` and ``transmat``
     are read from the enclosing scope.  Positional and keyword arguments
     are accepted but ignored.
     """
     hmm_options = {
         'n_states': 2,
         'init_algo': self.init_algo,
         'reversible_type': self.reversible_type,
         'thresh': 1e-4,
         'n_iter': 30,
     }
     model = GaussianHMM(**hmm_options)
     model.fit(X)

     # The two trailing numbers are presumably tolerances -- confirm against
     # validate_timeseries.
     validate_timeseries(means, vars, transmat, model, 0.1, 0.05)

     # The last recorded fit log-probability must agree with a fresh score.
     final_logprob = model.fit_logprob_[-1]
     assert abs(final_logprob - model.score(X)) < 0.5
Esempio n. 3
0
def test_3():
    """Fit 3-state HMMs on synthetic data for every option combination.

    Twenty time series are generated with ``create_timeseries`` from fixed
    reference parameters.  A model is then fitted for each pairing of
    ``init_algo`` and ``reversible_type`` and handed to
    ``validate_timeseries`` together with the reference parameters.
    """
    transmat = np.array([[0.2, 0.3, 0.5], [0.4, 0.4, 0.2], [0.8, 0.2, 0.0]])
    means = np.array([[0.0], [10.0], [5.0]])
    variances = np.array([[1.0], [2.0], [0.3]])
    X = [create_timeseries(means, variances, transmat) for _ in range(20)]

    init_algos = ('kmeans', 'GMM')
    reversible_types = ('mle', 'transpose')

    for init_algo in init_algos:
        for reversible_type in reversible_types:
            model = GaussianHMM(
                n_states=3,
                init_algo=init_algo,
                reversible_type=reversible_type,
                thresh=1e-4,
                n_iter=30,
            )
            model.fit(X)
            validate_timeseries(means, variances, transmat, model, 0.1, 0.1)

            # The last fit log-probability must agree with a fresh score.
            gap = model.fit_logprob_[-1] - model.score(X)
            assert abs(gap) < 0.5
Esempio n. 4
0
def test_3():
    """Check 3-state HMM fits on synthetic data across all fitting options.

    Reference parameters are fixed, twenty series are drawn with
    ``create_timeseries``, and one model per (``init_algo``,
    ``reversible_type``) pair is fitted and passed to
    ``validate_timeseries``.
    """
    transmat = np.array([[0.2, 0.3, 0.5], [0.4, 0.4, 0.2], [0.8, 0.2, 0.0]])
    means = np.array([[0.0], [10.0], [5.0]])
    variances = np.array([[1.0], [2.0], [0.3]])
    X = [create_timeseries(means, variances, transmat) for _ in range(20)]

    # Same visiting order as two nested loops: init_algo outer,
    # reversible_type inner.
    option_pairs = [
        (algo, rev_type)
        for algo in ('kmeans', 'GMM')
        for rev_type in ('mle', 'transpose')
    ]

    for algo, rev_type in option_pairs:
        model = GaussianHMM(n_states=3, init_algo=algo,
                            reversible_type=rev_type,
                            thresh=1e-4, n_iter=30)
        model.fit(X)
        validate_timeseries(means, variances, transmat, model, 0.1, 0.1)

        last_fit_logprob = model.fit_logprob_[-1]
        assert abs(last_fit_logprob - model.score(X)) < 0.5
Esempio n. 5
0
def makeHMM(Trajectories, topology):
    """Fit an 8-state ``GaussianHMM`` on superposed alpha-carbon features.

    Parameters
    ----------
    Trajectories : str
        Glob pattern; matching files are processed in sorted order.
    topology
        Passed to ``md.load_prmtop``.

    Returns
    -------
    The fitted ``GaussianHMM``; its ``timescales_`` are printed first.
    """
    prmtop = md.load_prmtop(topology)
    ca_indices = [atom.index for atom in prmtop.atoms if atom.name == 'CA']
    traj_files = sorted(glob(Trajectories))

    # Frame 0 of the first file is the reference given to the featurizer.
    reference = md.load_frame(traj_files[0], 0, top=prmtop)
    featurizer = SuperposeFeaturizer(ca_indices, reference)

    # Stream every file in 100-frame chunks; each chunk becomes one entry.
    features = [
        featurizer.partial_transform(chunk)
        for traj_file in traj_files
        for chunk in md.iterload(traj_file, chunk=100, top=prmtop)
    ]

    model = GaussianHMM(n_states=8)
    model.fit(features)
    print(model.timescales_)
    return model
Esempio n. 6
0
def makeHMM(Trajectories, topology):
    """Build an 8-state ``GaussianHMM`` from trajectories matching a glob.

    Alpha-carbon atoms (name ``'CA'``) are selected from the topology loaded
    with ``md.load_prmtop``.  Each trajectory file is read in 100-frame
    chunks and featurized with a ``SuperposeFeaturizer`` whose reference is
    frame 0 of the first (sorted) file.

    Returns the fitted model after printing its ``timescales_``.
    """
    top = md.load_prmtop(topology)
    ca_atoms = [atom.index for atom in top.atoms if atom.name == 'CA']
    paths = sorted(glob(Trajectories))
    ref_frame = md.load_frame(paths[0], 0, top=top)
    featurizer = SuperposeFeaturizer(ca_atoms, ref_frame)

    chunks = []
    for path in paths:
        chunks.extend(
            featurizer.partial_transform(piece)
            for piece in md.iterload(path, chunk=100, top=top)
        )

    fitted = GaussianHMM(n_states=8)
    fitted.fit(chunks)
    print(fitted.timescales_)
    return fitted
Esempio n. 7
0
def test_pipeline():
    """Run a featurizer + 4-state HMM ``Pipeline`` on the alanine dipeptide data.

    Only checks that ``fit_predict`` and ``summarize`` run; nothing is
    asserted about their results.
    """
    trajs = AlanineDipeptide().get_cached().trajectories
    first_traj = trajs[0]
    backbone = first_traj.topology.select('backbone')

    steps = [
        ('diheds', SuperposeFeaturizer(backbone, first_traj[0])),
        ('hmm', GaussianHMM(n_states=4)),
    ]
    pipeline = Pipeline(steps)

    pipeline.fit_predict(trajs)
    pipeline.named_steps['hmm'].summarize()
Esempio n. 8
0
def hmm_arbitrary(ic_slice, n_dims, n_clusters, n_obs):
    """Fit a Gaussian HMM with a fixed number of states and score it.

    Parameters
    ----------
    ic_slice
        Data handed to ``GHMM.fit``.
    n_dims : number
        Dimension count used in the penalty weight ``1 + 2 * n_dims``.
    n_clusters : int
        Number of hidden states.
    n_obs : number
        Observation count used in ``log(n_obs)``.  Above 10000 the penalty
        scales with the squared number of states instead of linearly.

    Returns
    -------
    tuple
        ``(hmm, BIC_hmm)``: the fitted model and its BIC-like score
        ``penalty - 2 * log-likelihood``.
    """
    print("Trying {:d} clusters...".format(n_clusters))

    hmm = GHMM(n_states=n_clusters, reversible_type='transpose', thresh=1e-4, init_algo='GMM', timing=True)

    print("Running fit...")

    hmm.fit(ic_slice)

    if n_obs > 10000:
        state_factor = float(n_clusters) ** 2
    else:
        state_factor = float(n_clusters)

    penalty = (1.0 + 2.0 * float(n_dims)) * state_factor * math.log(n_obs)

    # Score with the LAST entry of fit_logprob_, i.e. the log-likelihood of
    # the model as returned by fit.  The previous code used entry 0, which
    # does not describe the converged model.
    BIC_hmm = penalty - 2.0 * hmm.fit_logprob_[-1]

    return hmm, BIC_hmm
Esempio n. 9
0
def test_ala2():
    """Smoke-test a 4-state HMM on the alanine dipeptide (ALA2) data.

    Nothing fancy: mainly makes sure the code runs without erroring out,
    plus two sanity checks on the resulting timescales.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    topology = trajectories[0].topology

    indices = topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(indices, trajectories[0][0])

    sequences = featurizer.transform(trajectories)
    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)

    # The comparison belongs outside len().  The previous form,
    # len(hmm.timescales_ == 3), took the length of an element-wise
    # comparison and so passed for any non-empty result.
    assert len(hmm.timescales_) == 3
    assert np.any(hmm.timescales_ > 50)
Esempio n. 10
0
def test_pickle():
    """Test pickling an HMM.

    A fitted model is round-tripped through ``pickle`` via a temporary file
    and must report the same log-probability from ``predict`` as before.
    """
    trajectories = AlanineDipeptide().get_cached().trajectories
    first_traj = trajectories[0]
    atom_indices = first_traj.topology.select('symbol C or symbol O or symbol N')
    featurizer = SuperposeFeaturizer(atom_indices, first_traj[0])
    sequences = featurizer.transform(trajectories)

    hmm = GaussianHMM(n_states=4, n_init=3, random_state=rs)
    hmm.fit(sequences)
    logprob_before, _ = hmm.predict(sequences)

    with tempfile.TemporaryFile() as buffer:
        pickle.dump(hmm, buffer)
        buffer.seek(0)  # rewind before reading the pickle back
        restored = pickle.load(buffer)

    logprob_after, _ = restored.predict(sequences)
    assert logprob_before == logprob_after
Esempio n. 11
0
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~        HIDDEN MARKOV MODEL     ~~~~~~~~~~~~~~~~~~~~~~
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Fit HMMs over a grid of state counts and lag times, then plot the first
# two entries of each model's timescales_ against the lag time.
lag_times = [1, 10, 20, 30, 40]
n_states = [3, 5]

# Per state count: one value per lag time, for timescales_[0] and [1].
hmm_ts0 = {n: [] for n in n_states}
hmm_ts1 = {n: [] for n in n_states}

for n in n_states:
    for lag_time in lag_times:
        # Every sequence contributes one strided copy per starting offset.
        strided_data = [
            s[i::lag_time]
            for s in sequences
            for i in range(lag_time)
        ]
        model = GaussianHMM(n_states=n, n_init=1)
        hmm = model.fit(strided_data)

        # Rescale by the stride so values are in units of original frames.
        timescales = hmm.timescales_ * lag_time
        hmm_ts0[n].append(timescales[0])
        hmm_ts1[n].append(timescales[1])
        print('n_states=%d\tlag_time=%d\ttimescales=%s' %
              (n, lag_time, timescales))

figure(figsize=(14, 3))

n_panels = len(n_states)
for i, n in enumerate(n_states):
    subplot(1, n_panels, i + 1)
    for series in (hmm_ts0, hmm_ts1):
        plot(lag_times, series[n])
    if i == 0:
        # Only the left-most panel carries the y-axis label.
        ylabel('Relaxation Timescale')
    xlabel('Lag Time')
Esempio n. 12
0
def make_hmm_fullauto(ic_projs, equil_dists, n_comp, n_obs, resume):
    """Scan projection dimensions and state counts for the best Gaussian HMM.

    For each dimension count, starting at ``equil_dists``, HMMs with an
    increasing number of states are fitted on the first ``n_dims`` columns of
    every projection and scored with a BIC-like criterion.  The lowest-scoring
    model is kept for that dimension.  Dimensions are then compared through
    ``timescales_[0]`` of their best model; a larger value wins.  The scan
    stops at ``n_comp`` dimensions or after repeated non-improving dimensions.

    Parameters
    ----------
    ic_projs : sequence of 2-D arrays
        Projected trajectories; sliced as ``proj[:, :n_dims]``.
    equil_dists : int
        First dimension count tried.
    n_comp : int
        Last dimension count tried.  If it is smaller than ``equil_dists``
        the two values are swapped.
    n_obs : number
        Observation count used in ``log(n_obs)``.  Above 10000 the penalty
        scales with the squared number of states instead of linearly.
    resume : sequence or None
        When not ``None``, ``(n_dims, n_clusters)`` to restart the first
        iteration from.

    Returns
    -------
    tuple
        ``(all_hmms, all_BICs, all_BIC_mins, min_dim, pos, all_dims)``:
        the best model per dimension tried, the BIC array per dimension, the
        index of the minimum inside each BIC array, the dimension count and
        list position of the selected model, and
        ``np.arange(equil_dists, last_n_dims + 1)``.

    Raises
    ------
    Exception
        If ``min_dim`` is still 0 after the scan.
    """
    # Order the bounds.  The previous hand-written swap assigned
    # ``n_comp = equil_dists`` after ``equil_dists`` had already been
    # overwritten, leaving both equal to the old ``n_comp``.
    if equil_dists > n_comp:
        equil_dists, n_comp = n_comp, equil_dists

    all_hmms = []
    all_BICs = []
    all_BIC_mins = []
    min_dim = 0
    pos = 0
    dim_strike = 1

    n_dims = equil_dists - 1

    # NOTE(review): with ``<=`` a scan whose first iteration already sits at
    # ``n_comp`` (equal bounds, or a resume at n_comp) goes on to try
    # ``n_comp + 1`` dimensions.  Left unchanged; confirm whether ``<`` was
    # intended.
    while n_dims <= n_comp:
        n_dims += 1
        n_clusters = 0

        if resume is not None:
            print("Resuming with {:d} dimensions and {:d} clusters.".format(int(resume[0]), int(resume[1])))

            n_dims = int(resume[0])
            # One less because the state loop increments before fitting.
            n_clusters = int(resume[1]) - 1
            resume = None

        print("Trying with {:d} dimensions...\n".format(n_dims))

        ic_slice = [proj[:, :n_dims] for proj in ic_projs]

        best_hmm = None
        BICs = []
        BIC_min = 0  # index of the lowest BIC accepted so far
        strike_max = 0  # consecutive fits whose BIC did not decrease
        strike_min = 0  # decreasing fits that still failed to beat the minimum

        while True:
            n_clusters += 1

            print("Trying {:d} clusters...".format(n_clusters))

            hmm = GHMM(n_states=n_clusters, reversible_type='transpose', thresh=1e-4, init_algo='GMM', timing=True)

            print("Running fit...")

            hmm.fit(ic_slice)

            print(hmm.fit_logprob_)

            if n_obs > 10000:
                state_factor = float(n_clusters) ** 2
            else:
                state_factor = float(n_clusters)

            BIC_hmm = (1.0 + 2.0 * float(n_dims)) * state_factor * math.log(n_obs) - 2.0 * hmm.fit_logprob_[-1]
            BICs.append(BIC_hmm)

            if len(BICs) == 1:
                best_hmm = deepcopy(hmm)
            elif BICs[-1] >= BICs[-2]:
                print("Strike max")
                print("Current BIC:")
                print(BIC_hmm)
                print("Current min:")
                print(BICs[BIC_min])

                strike_max += 1
            else:
                print("Reset max")

                strike_max = 0

                if BICs[-1] >= BICs[BIC_min]:
                    if len(BICs) > 2 and BICs[-2] <= BICs[-3]:
                        # Second decrease in a row: no extra strike.
                        print("Carry min")
                        print("Current BIC:")
                        print(BIC_hmm)
                        print("Current min:")
                        print(BICs[BIC_min])
                    else:
                        print("Strike min")
                        print("Current BIC:")
                        print(BIC_hmm)
                        print("Current min:")
                        print(BICs[BIC_min])

                        strike_min += 1
                else:
                    print("Reset min")
                    print("Current BIC:")
                    print(BIC_hmm)

                    strike_min = 0
                    BIC_min = len(BICs) - 1
                    best_hmm = deepcopy(hmm)

            # Always try at least 10 state counts before giving up.
            if n_clusters >= 10 and (strike_max >= 3 or strike_min >= 3):
                print("Reached expected true minimum\n")
                break

            print("\n")

        print("Pos :\n{:d}".format(pos))

        is_first_dimension = not all_hmms

        all_hmms.append(deepcopy(best_hmm))
        all_BICs.append(np.array(BICs))
        all_BIC_mins.append(BIC_min)

        if is_first_dimension:
            # Nothing to compare against yet; the stop checks below only
            # apply from the second dimension onwards.
            min_dim = n_dims
            continue

        newest_timescale = all_hmms[-1].timescales_[0]
        best_timescale = all_hmms[pos].timescales_[0]

        print("Current slowest timescale :")
        print(newest_timescale)
        print("Current optimal slowest timescale :")
        print(best_timescale)

        if newest_timescale < best_timescale:
            print("Strike on dimension.")

            dim_strike += 1
        elif newest_timescale > best_timescale:
            print("New optimum found at {:d} dimensions".format(n_dims))

            dim_strike = 1
            min_dim = n_dims
            pos = len(all_hmms) - 1

        if n_dims == n_comp:
            print("End of loop. Optimal model selected")
            break

        if dim_strike > 3:
            print("Maximal strike reached.")
            break

    # Cover every dimension up to the last one tried.  Previously the upper
    # bound was only set on the strike exit, so a scan that ran to n_comp
    # returned an empty array.  After a resume the first entries may not
    # correspond to fitted models.
    all_dims = np.arange(equil_dists, n_dims + 1)

    if min_dim == 0:
        raise Exception("Texte.")

    return all_hmms, all_BICs, all_BIC_mins, min_dim, pos, all_dims
Esempio n. 13
0
def make_hmm(ic_slice, dimension, dims):
    """Choose the number of HMM states for ``ic_slice`` with a BIC-like score.

    HMMs with 1, 2, 3, ... states are fitted in turn.  The scan stops once at
    least 10 state counts were tried and the score has either failed to
    decrease three times in a row or failed to beat the running minimum
    three times.

    Parameters
    ----------
    ic_slice
        Data handed to ``md_populations`` and to ``GHMM.fit``.
    dimension
        Forwarded to ``md_populations`` and ``hmm_populations``.
    dims : number
        Dimension count used in the penalty weight ``1 + 2 * dims``.

    Returns
    -------
    tuple
        ``(best_hmm, ranges, populations, best_hmm_popl, BICs)``: the
        lowest-scoring model, the ``ranges`` and ``populations`` returned by
        ``md_populations``, the ``hmm_populations`` result for that model,
        and the array of scores, one per state count tried.
    """
    populations, ranges, min_pop, tot_pop = md_populations(ic_slice, dimension)

    n_clusters = 0

    best_best_hmm = None
    best_best_hmm_popl = None

    BICs = []
    BIC_min = 0  # index of the lowest BIC accepted so far

    strike_max = 0  # consecutive fits whose BIC did not decrease
    strike_min = 0  # decreasing fits that still failed to beat the minimum

    while True:
        n_clusters += 1

        print("Trying {:d} clusters...".format(n_clusters))

        hmm = GHMM(n_states=n_clusters, reversible_type='transpose', thresh=1e-4, init_algo='GMM', timing=True)

        best_hmm = None
        best_hmm_popl = None
        best_BIC = None

        # Single run per state count; the loop shape is kept so the count can
        # be raised later.
        for tries in range(1):
            print("Run {:d}...".format(tries + 1))

            hmm.fit(ic_slice)

            populations_hmm = hmm_populations(hmm, dimension, ranges, min_pop)

            # Penalties with linear, squared and cubed state count.
            weight = 1.0 + 2.0 * float(dims)
            log_pop = math.log(tot_pop)
            penalties = [weight * float(n_clusters) ** power * log_pop for power in (1, 2, 3)]

            penalty = penalties[1] if tot_pop > 10000.0 else penalties[0]

            # Score with the LAST entry of fit_logprob_, i.e. the
            # log-likelihood of the model as returned by fit.  The previous
            # code used entry 0.
            BIC_hmm = penalty - 2.0 * hmm.fit_logprob_[-1]

            if best_hmm is None or BIC_hmm < best_BIC:
                print("Better fit hmm found with {:d} clusters".format(n_clusters))
                print("BIC :")
                print(BIC_hmm)
                print("Penalty :")
                for value in penalties:
                    print(value)

                if best_hmm is not None:
                    print("Previous minimum :")
                    print(best_BIC)

                best_hmm = deepcopy(hmm)
                best_hmm_popl = populations_hmm
                best_BIC = BIC_hmm

        BICs.append(best_BIC)

        if len(BICs) == 1:
            best_best_hmm = deepcopy(best_hmm)
            best_best_hmm_popl = best_hmm_popl
        elif BICs[-1] >= BICs[-2]:
            print("Strike max")

            strike_max += 1
        else:
            print("Reset max")

            strike_max = 0

            if BICs[-1] >= BICs[BIC_min]:
                if len(BICs) > 2 and BICs[-2] <= BICs[-3]:
                    # Second decrease in a row: no extra strike.
                    print("Carry min")
                else:
                    print("Strike min")

                    strike_min += 1
            else:
                print("Reset min")

                strike_min = 0
                BIC_min = len(BICs) - 1

                best_best_hmm = deepcopy(best_hmm)
                best_best_hmm_popl = best_hmm_popl

        # Always try at least 10 state counts before giving up.
        if n_clusters >= 10 and (strike_max >= 3 or strike_min >= 3):
            print("Reached expected true minimum")
            break

        print("\n")

    return best_best_hmm, ranges, populations, best_best_hmm_popl, np.array(BICs)