def train(X, n_components):
    """Fit a diagonal-covariance GaussianHMM to ``X`` and decode it.

    The model is created with ``n_iter=2000`` and fitted on ``[X]`` (a
    one-element list of sequences).  The transition matrix and every
    state's mean and covariance diagonal are printed afterwards.

    Returns
    -------
    tuple
        ``(hidden_states, model)`` where ``hidden_states`` is the result of
        ``model.predict(X)``.
    """
    print("fitting to HMM and decoding ...")
    hmm = GaussianHMM(n_components, covariance_type="diag", n_iter=2000)
    hmm.fit([X])
    decoded = hmm.predict(X)
    print("done\n")

    # Report the trained parameters.
    print("Transition matrix")
    print(hmm.transmat_)
    print()
    print("means and vars of each hidden state")
    for state in range(n_components):
        print("%dth hidden state" % state)
        print("mean = ", hmm.means_[state])
        print("var = ", np.diag(hmm.covars_[state]))
        print()

    return decoded, hmm
def use_hmm(img_times, change_vals, fps=10, min_secs_for_train_to_pass=8):
    """Label frames with a 2-state Gaussian HMM and keep the long motions.

    ``change_vals`` is column-stacked and transposed to form the observation
    matrix, a diagonal-covariance 2-state model is fitted and decoded, and
    the trained parameters are printed.  The decoded labels are then passed
    through ``filter_out_short_motions``, plotted, and the selected frames
    are copied with ``utils.copy_image_subset``.

    Returns the output of ``filter_out_short_motions``.
    """
    from sklearn.hmm import GaussianHMM

    n_components = 2
    observations = np.column_stack(change_vals).T

    hmm = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)
    hmm.fit([observations])
    hidden_states = hmm.predict(observations)

    # Report the trained parameters.
    print("Transition matrix")
    print(hmm.transmat_)
    print()
    print("means and vars of each hidden state")
    for state in range(n_components):
        print("%dth hidden state" % state)
        print("mean = ", hmm.means_[state])
        print("var = ", np.diag(hmm.covars_[state]))
        print()

    # Assume most frames have no train: state 0 must be the one with the
    # smaller first mean component, so swap the labels when it is not.
    if hmm.means_[0][0] > hmm.means_[1][0]:
        hidden_states = 1 - hidden_states

    train_spotted = filter_out_short_motions(
        hidden_states, min_secs_for_train_to_pass, fps)
    plot_timeline(img_times, change_vals, hidden_states, train_spotted)

    spotted_indices = np.nonzero(train_spotted)[0]
    utils.copy_image_subset(config.experiment_data_frames,
                            config.experiment_output_frames_hmm,
                            spotted_indices)
    return train_spotted
def predictWithHMM(index, window = 252):
    """Predict the sign of the target at ``index`` from a trailing window.

    Reads the module-level ``X``, ``actual_y``, ``y``, ``perform_pca``,
    ``pca_components`` and ``n_components``.  A diagonal GaussianHMM is
    fitted on the ``window`` rows preceding ``index``; the row at ``index``
    is decoded on its own, and the mean of the training targets that share
    its hidden state decides the prediction.

    Returns
    -------
    tuple
        ``(int(mean > 0), y[index], prob)``; ``prob`` is always 0.
    """
    history = range(index - window, index)
    training_X = X[history, :]
    training_y = actual_y[history]
    n_features = training_X.shape[1]
    testing_X = X[index, :].reshape(1, n_features)
    testing_y = y[index]

    # Optionally project both sets with a PCA fitted on the training window.
    if perform_pca:
        projector = PCA(n_components=pca_components)
        projector.fit(training_X)
        training_X = projector.transform(training_X)
        testing_X = projector.transform(testing_X)

    hmm = GaussianHMM(n_components, "diag", n_iter=1000)
    hmm.fit([training_X])
    training_states = hmm.predict(training_X)
    test_state = hmm.predict(testing_X)

    # The probabilistic variant (predict_proba) is not used; prob stays 0.
    prob = 0
    same_state = (training_states == test_state)
    state_mean = np.mean(training_y[same_state])
    return int(state_mean > 0), testing_y, prob
def create_hmm_by_label(label):
    """Return a 3-state diagonal GaussianHMM fitted for ``label``.

    The data comes from ``get_sequences_by_label(label)`` and is passed to
    ``fit`` wrapped in a one-element list.
    """
    observations = get_sequences_by_label(label)
    model = GaussianHMM(3, covariance_type="diag", n_iter=1000)
    model.fit([observations])
    return model
def __init__(self, n_states, n_features):
    """Create the wrapped sklearn GaussianHMM with all parameters unset.

    ``n_features`` is accepted but not used here.
    """
    from sklearn.hmm import GaussianHMM

    self.impl = GaussianHMM(n_states, params='stmc')
    self._sequences = None
    # Model parameters start out unset.
    self.means_ = self.vars_ = None
    self.transmat_ = self.startprob_ = None
def fit_HMMs(self, apans=None, dpans=None):
    """Fit one "dead" and one "alive" GaussianHMM per available vital.

    When ``apans`` is given, ``self.get_observations(apans, dpans)`` is
    called first to gather data.  The model dictionaries and risk-vector
    lists are reset before training.
    """
    if apans is not None:
        self.get_observations(apans, dpans)

    self.HMMs_dead, self.HMMs_alive = {}, {}
    self.risk_vectors_dead, self.risk_vectors_alive = [], []

    print("Training HMM's")
    for vital in self.vitals_available:
        dead_hmm = GaussianHMM(self.nstates_dead[vital], self.covariance_type)
        self.HMMs_dead[vital] = dead_hmm.fit(self.observations_dead[vital])
        alive_hmm = GaussianHMM(self.nstates_alive[vital], self.covariance_type)
        self.HMMs_alive[vital] = alive_hmm.fit(self.observations_alive[vital])
class HMMGestureMonitor(GestureMonitor):
    """Gesture monitor that scores feature windows with a 5-state GaussianHMM."""

    def __init__(self, _train_ms_list, _gesture_name,
                 FeatureExtractor=AVFeatureExtractor):
        GestureMonitor.__init__(self, _train_ms_list, _gesture_name,
                                FeatureExtractor)

    def train(self, motion_sequences):
        """Fit the HMM, the score threshold and the window timespans."""
        frames = [seq.get_dataframe() for seq in motion_sequences]
        extracted = [self.feature_extractor.extract(frame) for frame in frames]
        # Drop every example whose sum is NaN.
        examples = [feat for feat in extracted
                    if not np.isnan(np.sum(feat))]

        self.hmm = GaussianHMM(n_components=5).fit(examples)
        self.score_threshold = GMScoreThreshold(self.hmm.score, examples)
        self.window_timespans = self.calculate_window_timespans(
            motion_sequences)

    def classify_window_df(self, window_df):
        """Classify the features of one window through the score threshold."""
        return self.score_threshold.classify(
            self.feature_extractor.extract(window_df))

    def get_current_reaction(self):
        """Return the best HMM score over the current windows, or None."""
        scores = []
        for window_df in self.get_window_dfs():
            scores.append(
                self.hmm.score(self.feature_extractor.extract(window_df)))
        if not scores:
            return None
        return np.max(scores)
def run(self, protos):
    """Train one GaussianHMM per ``(nstate, label, seq)`` prototype.

    Each model is initialised with the start/transition matrices returned by
    ``self.init_left_right_model(nstate)``.  The resulting list of
    ``{'id': label, 'model': clf}`` dicts is stored on ``self._models`` and
    returned.
    """
    trained = []
    for nstate, label, seq in protos:
        samples = self._training.run(seq)
        f1, f2 = self._feature.run(samples, True)
        # Observation rows pair column 1 of f1 with f2.
        observations = np.vstack((f1[:,1], f2)).T

        startprob, transmat = self.init_left_right_model(nstate)
        clf = GaussianHMM(n_components=nstate,
                          covariance_type=self._covar,
                          transmat=transmat,
                          startprob=startprob)
        clf.fit(np.array([observations]))
        trained.append({'id':label, 'model':clf})

    self._models = trained
    return trained
def create_hmm_by_labels(labels, dbs):
    """Train one 3-state full-covariance GaussianHMM on all given labels.

    The sequences returned by ``get_sequences_by_label_multi_dbs`` for every
    label are pooled into one flat list and used together for fitting.

    The previous implementation collected the per-label results in a list
    and then kept only element ``[0]`` of it, so every label after the first
    was silently ignored.

    NOTE(review): assumes ``get_sequences_by_label_multi_dbs`` returns an
    iterable of observation sequences -- confirm against that helper.

    Parameters
    ----------
    labels : iterable
        Labels whose sequences are pooled for training.
    dbs :
        Passed through unchanged to ``get_sequences_by_label_multi_dbs``.

    Returns
    -------
    The fitted ``GaussianHMM``.
    """
    seqs_all = []
    for label in labels:
        seqs_all.extend(get_sequences_by_label_multi_dbs(label, dbs))

    n_states = 3
    hmm = GaussianHMM(n_states, covariance_type="full", n_iter=1000)
    hmm.fit(seqs_all)
    return hmm
def get_trained_model(rootpath, condition, n_states, n_iterations, feature, cov_type):
    """Rebuild a GaussianHMM from parameter files saved as text.

    The mean, covariance and transition files are looked up under
    ``rootpath + condition + '/'``; their names encode ``condition``,
    ``feature``, ``n_states`` and ``n_iterations``.  Transition rows are
    rescaled to sum to one and the start distribution puts all mass on
    state 0.  The model is created with ``n_iter=0`` and then receives the
    loaded means and covariances.

    Returns the configured ``GaussianHMM``.
    """
    prefix = (condition + '-cond-' + feature + '-feat-' + str(n_states)
              + '-states-' + str(n_iterations))
    model_dir = rootpath + condition + '/'

    mean = np.loadtxt(model_dir + prefix + '-iter-mean.txt')

    # Rows whose entries are all <= 0.01 are only reported; the deletion
    # itself is disabled, so ``deleted_means`` always stays empty.
    deleted_means = []
    for i in reversed(range(n_states)):
        row = mean[i]
        if row[row > 0.01].shape[0] == 0:
            print('skipping deleting ith mean: %s %s' % (i, row))

    cov_path = model_dir + prefix + '-iter-cov.txt'
    if cov_type == 'full':
        cov = load_full(cov_path, n_states, 10)
    else:
        cov = np.loadtxt(cov_path)

    tmat = np.loadtxt(model_dir + prefix + '-iter-transtion.txt')

    # Drop the rows/columns of deleted states, highest index first.
    for di in sorted(deleted_means, reverse=True):
        tmat = np.delete(tmat, di, 1)
        tmat = np.delete(tmat, di, 0)

    smat = np.zeros(tmat.shape[0])
    smat[0] = 1.0

    # Rescale every transition row by the reciprocal of its sum.
    row_scale = 1.0 / np.sum(tmat, axis=1)
    tmat = tmat * row_scale[:, np.newaxis]

    if n_states != tmat.shape[0]:
        print('removed some states, n_states now corrected to:  %s was originaly %s'
              % (tmat.shape[0], n_states))
        n_states = tmat.shape[0]

    model = GaussianHMM(n_components=n_states, covariance_type=cov_type,
                        startprob=smat, transmat=tmat, n_iter=0,
                        init_params='mc')
    model.means_ = mean
    model.covars_ = cov
    return model
def run(self, protos):
    """Fit a GaussianHMM for every ``(nstate, label, seq)`` in ``protos``.

    Start and transition matrices come from
    ``self.init_left_right_model(nstate)``.  The list of
    ``{'id': label, 'model': clf}`` entries is saved as ``self._models`` and
    returned.
    """
    fitted = []
    for nstate, label, seq in protos:
        training_data = self._training.run(seq)
        f1, f2 = self._feature.run(training_data, True)
        # Stack column 1 of f1 on top of f2, then transpose to rows.
        obs = np.vstack((f1[:, 1], f2)).T

        start, trans = self.init_left_right_model(nstate)
        clf = GaussianHMM(
            n_components=nstate,
            covariance_type=self._covar,
            transmat=trans,
            startprob=start,
        )
        clf.fit(np.array([obs]))
        fitted.append({'id': label, 'model': clf})

    self._models = fitted
    return fitted
def train(self, motion_sequences):
    """Fit the 5-state HMM, the score threshold and the window timespans.

    Features are extracted from each motion sequence's dataframe; examples
    whose sum is NaN are discarded before fitting.
    """
    frames = [seq.get_dataframe() for seq in motion_sequences]
    extracted = [self.feature_extractor.extract(frame) for frame in frames]
    examples = [feat for feat in extracted if not np.isnan(np.sum(feat))]

    self.hmm = GaussianHMM(n_components=5).fit(examples)
    self.score_threshold = GMScoreThreshold(self.hmm.score, examples)
    self.window_timespans = self.calculate_window_timespans(motion_sequences)
def HMM(data, sid, means_prior=None):
    """Fit a diagonal GaussianHMM on the variation and volume of ``sid``.

    ``data`` is not an event frame but an array of the most recent trade
    events.  ``means_prior`` (for example the means of a previous model) is
    handed to the model with ``means_weight=0.5``.

    Returns the fitted model, or ``None`` when there are fewer samples than
    ``HIDDEN_STATES``.
    """
    hmm = GaussianHMM(HIDDEN_STATES,
                      covariance_type="diag",
                      n_iter=10,
                      means_prior=means_prior,
                      means_weight=0.5)

    # Two observation columns: variation and volume.
    variation = data.variation[sid].values
    traded_volume = data.volume[sid].values
    observations = np.column_stack([variation, traded_volume])

    if len(variation) < HIDDEN_STATES:
        return None

    hmm.fit([observations])
    return hmm
def get_hmms(self):
    """Fit and store one GaussianHMM per gesture type.

    For each gesture type the HMM representations of its gestures are
    collected, a model with ``self.num_hmm_states`` states is fitted and
    saved in ``self.hmms``, and the score and decoded sequence of every
    example are printed.
    """
    for gesture_type in self.gesture_types:
        print_status("Get_Hmms", "Fitting for gesture_type: " + gesture_type)

        # Step 1: gather the training examples.
        hmm_examples = [gesture.get_hmm_rep()
                        for gesture in self.gestures[gesture_type]]

        # Step 2: fit the model.
        model = GaussianHMM(self.num_hmm_states)
        model.fit(hmm_examples)

        # Step 3: store it.
        self.hmms[gesture_type] = model

        print_inner_status(gesture_type, "predicted the following sequences: (score: sequence)")
        for example in hmm_examples:
            print("  %s :  %s" % (model.score(example), model.predict(example)))
def predictWithHMM(index, window=252):
    """Predict the sign of the target at ``index`` from a trailing window.

    Uses the module-level ``X``, ``actual_y``, ``y``, ``perform_pca``,
    ``pca_components`` and ``n_components``.  A diagonal GaussianHMM is
    trained on the ``window`` rows before ``index``; the row at ``index`` is
    decoded separately and the mean training target among rows in the same
    hidden state determines the output.

    Returns
    -------
    tuple
        ``(int(mean > 0), y[index], prob)``; ``prob`` is always 0.
    """
    window_rows = range(index - window, index)
    training_X = X[window_rows, :]
    training_y = actual_y[window_rows]
    width = training_X.shape[1]
    testing_X = X[index, :].reshape(1, width)
    testing_y = y[index]

    # Optional PCA, fitted on the training window only.
    if perform_pca:
        reducer = PCA(n_components=pca_components)
        reducer.fit(training_X)
        training_X = reducer.transform(training_X)
        testing_X = reducer.transform(testing_X)

    model = GaussianHMM(n_components, "diag", n_iter=1000)
    model.fit([training_X])
    states_seen = model.predict(training_X)
    state_now = model.predict(testing_X)

    # predict_proba is not used; the returned probability is a constant 0.
    prob = 0
    matching = (states_seen == state_now)
    mean_target = np.mean(training_y[matching])
    return int(mean_target > 0), testing_y, prob
def get_hmms(self):
    """Train a GaussianHMM for each gesture type and keep it in ``self.hmms``.

    After fitting, the score and the predicted state sequence of each
    training example are printed.
    """
    for gesture_type in self.gesture_types:
        print_status("Get_Hmms", "Fitting for gesture_type: " + gesture_type)

        # Step 1: collect one HMM representation per gesture.
        examples = []
        for gesture in self.gestures[gesture_type]:
            examples.append(gesture.get_hmm_rep())

        # Step 2: fit the parameters.
        fitted = GaussianHMM(self.num_hmm_states)
        fitted.fit(examples)

        # Step 3: remember the model for this gesture type.
        self.hmms[gesture_type] = fitted

        print_inner_status(
            gesture_type,
            "predicted the following sequences: (score: sequence)")
        for example in examples:
            print("  %s :  %s" % (fitted.score(example),
                                  fitted.predict(example)))
def run(self, data):
    """Fit a diagonal GaussianHMM on close-value differences and volume.

    Only the first entry of ``self.sids`` is used.  Differencing the close
    values shortens the series by one, so the volume series drops its first
    element to stay aligned.  Results are stored on ``self`` (``dates``,
    ``close_v``, ``volume``, ``diff``, ``X``, ``model``, ``hidden_states``).
    """
    sid = self.sids[0]
    self.dates = data[sid]['price'].values
    self.close_v = data[sid]['close_v'].values
    self.volume = data[sid]['volume'].values[1:]

    previous_close = self.close_v[:-1]
    self.diff = self.close_v[1:] - previous_close

    # Two observation columns: close difference and volume.
    self.X = np.column_stack([self.diff, self.volume])

    self.model = GaussianHMM(self.n_components,
                             covariance_type="diag",
                             n_iter=self.n_iter)
    self.model.fit([self.X], n_iter=self.n_iter)

    self.hidden_states = self.model.predict(self.X)
def gaussian_hmm_model(stock_market_quote, n_components=5):
    """Fit a diagonal GaussianHMM to a quote and plot the decoded states.

    Observations are the closing-price differences paired with the volume
    series without its last element.  The trained parameters are printed
    and closing price is plotted against volume, one series per hidden
    state.
    """
    closing = np.asarray(stock_market_quote.get_closing_price())
    volume = np.asanyarray(stock_market_quote.get_volume())[:-1]
    diff = closing[1:] - closing[:-1]
    closing = closing[1:]

    observations = np.column_stack([diff, volume])
    model = GaussianHMM(n_components, covariance_type="diag")
    model.fit([observations])
    hidden_states = model.predict(observations)

    print("Transition matrix")
    print(model.transmat_)
    print("")
    print("means and vars of each hidden state")
    for state in range(n_components):
        print("%dth hidden state" % state)
        print("mean =  %s" % (model.means_[state],))
        print("var =  %s" % (np.diag(model.covars_[state]),))
        print("")

    # Visualization of closing price with respect to volume, clustered by
    # the hidden state of each sample.
    fig = mlp.figure()
    ax = fig.add_subplot(111)
    for state in range(n_components):
        in_state = (hidden_states == state)
        ax.plot(volume[in_state], closing[in_state], 'o',
                label="%dth hidden state" % state)
    ax.legend()
    ax.set_xlabel('Volume of Stock', fontsize=20)
    ax.set_ylabel('Closing Price of Stock', fontsize=20)
    ax.set_title("""Quote's Volume and closing volume change in different hidden states""")
    ax.grid(True)
    mlp.show()
def hmm(samples):
    """Fit a 3-state GaussianHMM and tabulate its per-frame log quantities.

    Missing values are dropped first.  When the remaining values are
    one-dimensional they are reshaped into a single-column array before
    fitting.

    Changes from the previous version: the leftover
    ``import pdb; pdb.set_trace()`` (which stopped every call in the
    debugger) and the never-used ``predict`` frame are gone, and the column
    labels are built as lists rather than ``map`` objects.

    Parameters
    ----------
    samples : pandas object providing ``dropna()``, ``index`` and ``values``.

    Returns
    -------
    tuple
        ``(model, frame)`` where ``frame`` joins the ``frame_<k>`` columns
        (output of ``_compute_log_likelihood``) with the ``forward_<k>``
        columns (lattice returned by ``_do_forward_pass``), indexed like the
        cleaned input.
    """
    model = GaussianHMM(n_components=3)
    samples = samples.dropna()
    idx = samples.index
    if samples.values.ndim < 2:
        # Turn a flat series into an (n_samples, 1) observation matrix.
        samples = samples.values.reshape(samples.values.shape[0], 1)
    model.fit([samples])

    framelogprob = model._compute_log_likelihood(samples)
    _, fwdlattice = model._do_forward_pass(framelogprob)

    n, _ = model.means_.shape
    frame = pd.DataFrame(
        framelogprob, index=idx,
        columns=["frame_" + str(k) for k in range(n)])
    forward = pd.DataFrame(
        fwdlattice, index=idx,
        columns=["forward_" + str(k) for k in range(n)])
    return model, frame.join(forward)
def main():
    """
    First ARG: list of training files
    Second ARG: save name for model

    Trains a full-covariance GaussianHMM whose transition matrix comes from
    ``calc_transmat``, prints the training time and the number of frames of
    ``audio.arff`` whose decoded label matches the reference label, pickles
    the model to the save name and shows the transition matrix.

    Changes from the previous version: both files are opened through
    ``with`` so they are always closed; only the trailing newline is
    stripped from each list entry (the old ``f[0:-1]`` removed a real
    character when the last line had no newline); the training features are
    loaded once instead of twice.
    """
    file1 = sys.argv[1]
    outname = sys.argv[2]

    with open(file1, 'r') as listing:
        file_list = [line.rstrip('\n') for line in listing]

    # Only the transition matrix is used below.
    # NOTE: ``priors`` is currently not passed on as ``startprob``.
    models, transitions, priors = calc_transmat(file_list)

    hmm = GaussianHMM(
        transitions.shape[0],
        "full",
        n_iter=500,
        transmat=transitions,
        init_params='mcs',
        params='mcs',
    )

    feat, _ = load_feats_labels(file_list)

    print('Fitting')
    start = timeit.default_timer()
    hmm.fit([np.transpose(feat)])
    stop = timeit.default_timer()
    print('Training Time: ' + str(stop - start))

    features, labels = load_feats_labels(['audio.arff'])
    _, seq = hmm.decode(np.transpose(features))

    # Number of frames whose decoded label equals the reference label.
    predicted = map(int2label, seq)
    print(sum(1 for x, y in zip(labels, predicted) if x == y))

    with open(outname, "wb") as out:
        pickle.dump(hmm, out)

    plt.imshow(transitions, interpolation='nearest')
    plt.show()
def main():
    """
    First ARG: list of training files
    Second ARG: save name for model

    Fits a full-covariance GaussianHMM using the transition matrix from
    ``calc_transmat``, reports the training time and how many frames of
    ``audio.arff`` are decoded to their reference label, then pickles the
    model and displays the transition matrix.

    Changes from the previous version: the list file and the pickle output
    are handled with ``with`` blocks so they are closed reliably; list
    entries lose only their trailing newline (``f[0:-1]`` truncated the last
    entry when the file did not end with a newline); the redundant second
    call to ``load_feats_labels(file_list)`` is removed.
    """
    file1 = sys.argv[1]
    outname = sys.argv[2]

    with open(file1, 'r') as list_file:
        file_list = [entry.rstrip('\n') for entry in list_file]

    # NOTE: ``models`` and ``priors`` are returned but unused here;
    # ``priors`` is not passed as ``startprob``.
    models, transitions, priors = calc_transmat(file_list)

    hmm = GaussianHMM(
        transitions.shape[0],
        "full",
        n_iter=500,
        transmat=transitions,
        init_params='mcs',
        params='mcs',
    )

    feat, _ = load_feats_labels(file_list)

    print('Fitting')
    start = timeit.default_timer()
    hmm.fit([np.transpose(feat)])
    stop = timeit.default_timer()
    print('Training Time: ' + str(stop - start))

    features, labels = load_feats_labels(['audio.arff'])
    _, seq = hmm.decode(np.transpose(features))

    # Count the frames where the decoded label agrees with the reference.
    matches = sum(1 for x, y in zip(labels, map(int2label, seq)) if x == y)
    print(matches)

    with open(outname, "wb") as model_file:
        pickle.dump(hmm, model_file)

    plt.imshow(transitions, interpolation='nearest')
    plt.show()
class _SklearnGaussianHMMCPUImpl(object):
    """E-step and Viterbi decoding delegated to sklearn's GaussianHMM.

    Callers assign ``_sequences``, ``means_``, ``vars_``, ``transmat_`` and
    ``startprob_`` before calling ``do_estep`` / ``do_viterbi``.
    """

    def __init__(self, n_states, n_features):
        from sklearn.hmm import GaussianHMM

        self.impl = GaussianHMM(n_states, params='stmc')
        self._sequences = None
        # Model parameters start out unset.
        self.means_ = self.vars_ = None
        self.transmat_ = self.startprob_ = None

    def do_estep(self):
        """Accumulate sufficient statistics over all sequences.

        The current parameters are copied to the wrapped model as doubles
        first.  Returns ``(total_log_probability, stats)``.
        """
        from sklearn.utils.extmath import logsumexp

        impl = self.impl
        impl.means_ = self.means_.astype(np.double)
        impl.covars_ = self.vars_.astype(np.double)
        impl.transmat_ = self.transmat_.astype(np.double)
        impl.startprob_ = self.startprob_.astype(np.double)

        stats = impl._initialize_sufficient_statistics()
        total_logprob = 0
        for raw_seq in self._sequences:
            seq = raw_seq.astype(np.double)
            framelogprob = impl._compute_log_likelihood(seq)
            lpr, fwdlattice = impl._do_forward_pass(framelogprob)
            bwdlattice = impl._do_backward_pass(framelogprob)

            # Normalise forward + backward per frame to get the posteriors.
            gamma = fwdlattice + bwdlattice
            posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T

            total_logprob += lpr
            impl._accumulate_sufficient_statistics(
                stats, seq, framelogprob, posteriors,
                fwdlattice, bwdlattice, impl.params)

        return total_logprob, stats

    def do_viterbi(self):
        """Viterbi-decode every sequence.

        Returns ``(summed_log_probability, list_of_state_sequences)``.
        """
        total_logprob = 0
        decoded = []
        for obs in self._sequences:
            lpr, states = self.impl._decode_viterbi(obs)
            total_logprob += lpr
            decoded.append(states)
        return total_logprob, decoded
def predict(self, obs):
    """Find most likely state sequence corresponding to `obs`.

    The start distribution is taken to be the stationary distribution of
    ``self.transmat_``: the left eigenvector belonging to the eigenvalue
    closest to 1, normalised to sum to one.  ``scipy.linalg.eig`` returns
    its eigenvalues in no guaranteed order and its eigenvectors as complex
    arrays, so the eigenvector is selected by eigenvalue (the previous code
    always used column 0) and only its real part is kept.

    Parameters
    ----------
    obs : np.ndarray, shape=(n_samples, n_features)
        Sequence of n_features-dimensional data points. Each row
        corresponds to a single point in the sequence.

    Returns
    -------
    hidden_states : np.ndarray
        Index of the most likely state for each observation, as returned by
        ``GaussianHMM.predict``.
    """
    eigvals, left_vecs = scipy.linalg.eig(self.transmat_, left=True,
                                          right=False)
    unit_index = np.argmin(np.abs(eigvals - 1.0))
    stationary = np.real(left_vecs[:, unit_index])
    startprob = stationary / np.sum(stationary)

    model = GaussianHMM(n_components=self.n_states, covariance_type='full')
    model.startprob_ = startprob
    model.transmat_ = self.transmat_
    model.means_ = self.means_
    model.covars_ = self.covars_
    return model.predict(obs)
def build_model(self):
    """Fit a diagonal GaussianHMM on ``self.X_hmm`` and decode it.

    Stores the model in ``self.model`` and the decoded states in
    ``self.hidden_states``.
    """
    n_states, observations = self.n_states, self.X_hmm
    self.model = GaussianHMM(n_states, covariance_type='diag', n_iter=1000)
    self.model.fit([observations])
    self.hidden_states = self.model.predict(observations)
class HMM(object):
    '''
    class for creating and manipulating HMM model
    '''

    def __init__(self, **kwargs):
        """Collect the data sources and build the observation matrix.

        Recognised keyword arguments are ``steam_obj`` and ``weather_obj``
        (defaulting to ``Steam()`` and ``Weather()``) and ``n_states``.
        When ``n_states`` is omitted, ``plot_elbow(3, 15)`` is shown instead
        and ``self.n_states`` is left unset.
        """
        self.steam_obj = kwargs['steam_obj'] if 'steam_obj' in kwargs else Steam()
        self.weather_obj = kwargs['weather_obj'] if 'weather_obj' in kwargs else Weather()

        # The column order of X_hmm must be kept: build_forecast_model
        # slices the fitted model by column position.
        self.df_hmm, self.X_hmm = self.gen_meta_data(self.steam_obj,
                                                     self.weather_obj)

        if 'n_states' not in kwargs:
            self.plot_elbow(3, 15)
        else:
            self.n_states = kwargs['n_states']

    def __len__(self):
        return len(self.X_hmm)

    def build_model(self):
        """Fit a diagonal GaussianHMM on ``X_hmm`` and decode the states."""
        X_hmm = self.X_hmm
        self.model = GaussianHMM(self.n_states, covariance_type='diag',
                                 n_iter=1000)
        self.model.fit([X_hmm])
        self.hidden_states = self.model.predict(X_hmm)

    def build_forecast_model(self):
        """Derive a copy of the model with the first feature column removed."""
        model = self.model
        model_forecast = copy.deepcopy(model)
        model_forecast.n_features = model.n_features - 1
        model_forecast._means_ = model.means_[:, 1:]
        model_forecast._covars_ = model._covars_[:, 1:]
        self.model_forecast = model_forecast

    def gen_meta_data(self, steam_obj=None, weather_obj=None):
        """Build the feature frame and observation matrix.

        With a steam object the columns are ``steam``, ``weather`` and
        ``hour_of_day``; without one they are ``weather`` and
        ``hour_of_day``.  ``hour_of_day`` is the fractional hour of each
        index entry.

        Returns ``(df_hmm, X_hmm)`` where ``X_hmm`` holds the columns in the
        order listed above.
        """
        # Identity check: ``!= None`` would defer to the object's own
        # comparison operator.
        if steam_obj is not None:
            index = steam_obj.ts.index
            hour_of_day = index.map(lambda x: x.hour + (x.minute / 60.0))
            df_hmm = pd.DataFrame({'steam': steam_obj.ts,
                                   'weather': weather_obj.ts,
                                   'hour_of_day': hour_of_day},
                                  index=index)
            columns = ['steam', 'weather', 'hour_of_day']
        else:
            index = weather_obj.ts.index
            hour_of_day = index.map(lambda x: x.hour + (x.minute / 60.0))
            df_hmm = pd.DataFrame({'weather': weather_obj.ts,
                                   'hour_of_day': hour_of_day},
                                  index=index)
            columns = ['weather', 'hour_of_day']

        # Column selection + .values replaces DataFrame.as_matrix, which no
        # longer exists in current pandas.
        X_hmm = df_hmm[columns].values
        return df_hmm, X_hmm

    def plot_model(self, x_ax=None, y_ax=None):
        """Scatter-plot the observations, one series per hidden state.

        ``x_ax`` defaults to the steam timestamps and ``y_ax`` to the first
        column of ``X_hmm``.  Only states 0..27 are drawn; the marker
        changes every seven states.
        """
        X_hmm = self.X_hmm
        steam_ts = self.steam_obj.ts
        # ``is None`` rather than ``== None``: comparing an array argument
        # with ``==`` is elementwise and cannot be used as a condition.
        if x_ax is None:
            x_ax = np.asarray([item.to_datetime() for item in steam_ts.index])
        if y_ax is None:
            y_ax = X_hmm[:, 0]

        hidden_states = self.hidden_states
        n_states = self.n_states
        markers = ('o', 'x', '+', '*')

        fig = plt.figure()
        ax = fig.add_subplot(111)
        for i in range(n_states):
            print(i)
            idx = (hidden_states == i)
            if i < 28:
                ax.plot(x_ax[idx], y_ax[idx], markers[i // 7],
                        label='%dth state' % i)

        ax.set_title('%d State HMM' % (n_states))
        ax.legend()
        ax.set_ylabel('Load (Mlb/Hr)')
        ax.set_xlabel('Time')
        ax.grid(True)
        plt.show()

    def plot_elbow(self, start, end):
        '''
        Fit GMM and plot elbow using AIC & BIC
        '''
        from sklearn.mixture import GMM, DPGMM

        obs = self.X_hmm
        aics = []
        bics = []
        for i in range(start, end + 1):
            # Retry up to ten times, adding 1000 iterations each time,
            # until the mixture reports convergence.
            n_iter = 1000
            for j in range(1, 11):
                g = GMM(n_components=i, n_iter=n_iter)
                g.fit(obs)
                print(i)
                converged = g.converged_
                if converged:
                    print('j:%d' % (j))
                    break
                n_iter += 1000
            aics.append(g.aic(obs))
            bics.append(g.bic(obs))
            if not converged:
                print('Not Converged!!')

        component_counts = range(start, end + 1)
        fig = plt.figure()
        ax = fig.add_subplot(111)
        ax.plot(component_counts, aics, label='AIC')
        ax.plot(component_counts, bics, label='BIC')
        ax.set_xlabel("No. of Clusters")
        ax.set_ylabel("Information Loss")
        ax.set_xticks(component_counts, minor=True)
        ax.legend()
        ax.grid(True, which='both')
        plt.show()
start_cov = EPS * np.ones(len(pre_mean)) else: start_cov = EPS * np.identity(len(pre_mean)) means = np.vstack(([start_mean], means)) covs = np.vstack(([start_cov], covs)) return means, covs if __name__ == "__main__": root = '../../lowres_features/' train_map = open(root + 'trainset.recs.updated.lowres.cleaned', 'r').readlines() train_map = [(line.split('/')[3], line.split('\t')[0], line.split('\t')[1]) for line in train_map] n_states = 10 means, covs = get_states(n_states - 3, 'sirs', 'deviation', end=True) tmat, smat = get_tmat_smat_with_end(n_states - 3) model = GaussianHMM(n_components=n_states, covariance_type="diag", startprob=smat, transmat=tmat, n_iter=2, init_params='mc') for condition, file_path, incident_time in train_map: if condition == 'sirs': #condition, file_path, incident_time = train_map[110] # a random patient file print condition, file_path, incident_time t, last_index = overlapped_samples(file_path, incident_reported_time=int(incident_time), overlap=5, window=10, with_end=2) if t is None: print file_path, 'is bad' else: model.means_ = means model.covars_ = covs print 'shape intial', np.shape(covs) ''' best_seq = model.decode(t) print 'intial,', best_seq
def test_2():
    """Compare the E-step statistics of the CPU impl with sklearn's.

    Random parameters are given to ``GaussianHMMCPUImpl`` (as float32) and
    to a plain ``GaussianHMM`` (as generated); the accumulated ``trans``,
    ``post``, ``obs`` and ``obs**2`` statistics must agree to 3 decimals.
    Each comparison is yielded as its own check.
    """
    n_features = 3
    length = 32
    for n_states in [4]:
        # The order of the random draws is kept fixed.
        sequence = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / np.sum(transmat, axis=1)[:, None]
        startprob = np.random.rand(n_states)
        startprob = startprob / np.sum(startprob)

        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [sequence]

        pyhmm = GaussianHMM(n_components=n_states, init_params='', params='',
                            covariance_type='diag')

        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        clogprob, cstats = chmm.do_estep()

        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(sequence)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, sequence, framelogprob, posteriors, fwdlattice,
            bwdlattice, 'stmc')

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield lambda key=key: np.testing.assert_array_almost_equal(
                stats[key], cstats[key], decimal=3)
def makeGaussHMM(d):
    """Normalize every entry of ``d`` in place and fit a 4-state GaussianHMM.

    Note that ``d`` itself is modified: each element is replaced by
    ``normalize(element)``.

    Returns whatever ``GaussianHMM.fit`` returns.
    """
    for position, sequence in enumerate(d):
        d[position] = normalize(sequence)
    model = GaussianHMM(4, n_iter = 10000)
    return model.fit(d)
# Feed every data row (the first row of ``t`` is skipped) to the seven
# farms: column 0 goes to all of them, column k to farm k.
for i, row in enumerate(t[1:]):
    for column, farm in enumerate(
            (farm1, farm2, farm3, farm4, farm5, farm6, farm7), 1):
        farm.fill(i, row[0], row[column])

model = GaussianHMM(
    algorithm='viterbi',
    covariance_type='full',
    covars_prior=0.01,
    covars_weight=1,
    means_prior=None,
    means_weight=0,
    n_components=5,
    random_state=None,
    startprob=None,
    startprob_prior=1.0,
    transmat=None,
    transmat_prior=1.0,
)

# Only the output of farm1 is used for fitting and decoding.
print("Fitting model...")
model.fit([farm1.get_output()], n_iter=1000)

print("Predicting hidden states...")
hidden_states = model.predict(farm1.get_output())

print("Transition matrix")
print(model.transmat_)
# save tagged sequence to file if save: filename = filenames[i].replace('.csv', '.tagged.csv') observation_sequences[i].save(save + os.sep + filename, include_state=True) return likelihood_of_training_data, observations_per_state ### PROTOTYPE ROUTINE print "Loading training data..." # Load observation sequences from CSV observation_sequences, filenames = readObservationSequences(training_data, return_filenames=True) training_sequences = [ observation_sequence.getNumpyArray() for observation_sequence in observation_sequences ] print "Training multivariate Gaussian HMM (base model)..." # Implements (1.), (2.) base_model = GaussianHMM(n_states, covariance_type=covariance_type, n_iter=num_EM_iterations) base_model.fit(training_sequences) # save base model print "\tSaving base model to file..." saveModel(base_model, 'base_model', observation_sequences[0].getFeatureNames()) # tag training data using base model print "\tTagging training data using base model..." # Implements (3.), (4.) likelihood_of_training_data, observations_per_state = tagTrainingData( base_model, training_sequences, list(observation_sequences), # pass a copy save='base_model/tagged_training_data', filenames=filenames ) print "\tTotal log lokelihood of the training data according to base model: %.4f" % likelihood_of_training_data previous_model = base_model
output_dir = "/Users/sam/Documents/ausbildung/uni/msc_ai/thesis/Models/MultivariateGaussianHMM/keyDown/A/P/"
observations_dir = output_dir + "training_observations/"

# Convert every source file to an observation sequence and save it as CSV.
adaptor = XMLAdaptorMultiWindow1()
for file_path in glob.glob(source):
    observation_sequence = adaptor.convert(file_path)
    if observation_sequence:
        observation_sequence.save(
            observations_dir + os.path.basename(file_path) + '.csv')

print("Loading observations from CSV...")
observation_sequences, filenames = readObservationSequences(
    observations_dir + "*.csv", return_filenames=True)
training_sequences = []
for observation_sequence in observation_sequences:
    training_sequences.append(observation_sequence.getNumpyArray())

print("Training Multivariate Gaussian HMM model...")
n_components = 3
model = GaussianHMM(n_components, covariance_type="full", n_iter=10)
model.fit(training_sequences)

# Save the Gaussian HMM model to file.
model_dir = output_dir + '%sstates/' % n_components
mkdir_p(model_dir)
serialiser = HMMSerialiser(model, feature_names=adaptor.getFeatures())
serialiser.saveXML(model_dir + 'model.xml')

print("Tagging observation sequences...")
tagged_sequences_dir = model_dir + "tagged_sequences/"
mkdir_p(tagged_sequences_dir)
# Write the decoded state of every observation back onto its sequence.
for i, training_sequence in enumerate(training_sequences):
    tagged = observation_sequences[i]
    for j, state in enumerate(model.predict(training_sequence)):
        tagged.getObservation(j).setState("H%s" % state)
def main():
    """
    Main function that performs footprint analysis.

    Reads the experimental matrix named on the command line, loads the region
    set and the DNase / histone BAM signals it lists, applies a Gaussian HMM to
    every (region, histone) pair and writes the merged footprints to
    "<output-location>/<footprint-name>.bed" (optionally converted to bigBed).

    Keyword arguments: None

    Return: None
    """
    # Local import: only `getcwd` and `system` are known to be in scope here.
    from os.path import join as join_path

    ###################################################################################################
    # Processing Input Arguments
    ###################################################################################################

    # Initializing ErrorHandler
    error_handler = ErrorHandler()

    # Parameters
    current_version = "0.0.1"
    usage_message = (
        "\n--------------------------------------------------\n"
        "The 'hint' program predicts TFBSs given open chromatin data.\n"
        "In order to use this tools, please type: \n\n"
        "%prog [options] <experiment_matrix>\n\n"
        "The <experiment matrix> should contain:\n"
        "- One region file representing the regions in which the HMM\n"
        " will be applied. It should contain 'regions' in the type field\n"
        "- One DNase aligned reads file (bam) file with 'DNASE' in the name field.\n"
        "- One to Three histone modification aligned reads file (bam).\n\n"
        "For more information, please refer to:\n"
        "http://www.regulatory-genomics.org/dnasefootprints/\n"
        "--------------------------------------------------")
    version_message = "HINT - Regulatory Analysis Toolbox (RGT). Version: " + str(
        current_version)

    # Initializing Option Parser
    parser = PassThroughOptionParser(usage=usage_message,
                                     version=version_message)

    # Optional Input Options
    # (each fragment now ends with a space so the concatenated help text does
    # not fuse words together)
    parser.add_option(
        "--hmm-file",
        dest="hmm_file",
        type="string",
        metavar="FILE_1[,FILE_2,...,FILE_N]",
        default=None,
        help=("List of HMM files separated by comma. If one file only, then this HMM will be "
              "applied for all histone signals, otherwise, the list must have the same number "
              "of histone files given. The order of the list should be the order of the "
              "histones in the input_matrix file. If the argument is not given, then an HMM "
              "trained with H3K4me3 in K562 will be used."))

    # Parameters Options
    parser.add_option(
        "--organism",
        dest="organism",
        type="string",
        metavar="STRING",
        default="hg19",
        help=("Organism considered on the analysis. Check our full documentation for all available "
              "options. All default files such as genomes will be based on the chosen organism "
              "and the data.config file. This option is used only if a bigbed output is asked."))

    # Output Options
    parser.add_option("--output-location",
                      dest="output_location",
                      type="string",
                      metavar="PATH",
                      default=getcwd(),
                      help=("Path where the output files will be written."))
    parser.add_option("--footprint-name",
                      dest="footprint_name",
                      type="string",
                      metavar="STRING",
                      default="footprints",
                      help=("Name of the footprint file (without extension)."))
    parser.add_option(
        "--print-bb",
        dest="print_bb",
        action="store_true",
        default=False,
        help=("If used, the output will be a bigbed (.bb) file."))

    # Processing Options: exactly one positional argument is expected.
    options, arguments = parser.parse_args()
    if (not arguments or len(arguments) > 1):
        error_handler.throw_error("FP_WRONG_ARGUMENT")

    # Fixed Parameters
    ################
    region_total_ext = 10000
    fp_state_nb = 7      # decoded state index that is reported as a footprint
    fp_limit_size = 50   # footprints closed inside a region must be shorter than this
    ###
    dnase_initial_clip = 1000
    dnase_sg_window_size = 9
    dnase_norm_per = 98
    dnase_slope_per = 98
    dnase_frag_ext = 1
    ###
    histone_initial_clip = 1000
    histone_sg_window_size = 201
    histone_norm_per = 98
    histone_slope_per = 98
    histone_frag_ext = 200
    ###################################

    ###################################################################################################
    # Reading Input Matrix
    ###################################################################################################

    # Reading input argument
    input_matrix = arguments[0]

    # Create experimental matrix
    try:
        exp_matrix = ExperimentalMatrix()
        exp_matrix.read(input_matrix)
    except Exception:
        error_handler.throw_error("FP_WRONG_EXPMAT")

    ###################################################################################################
    # Reading Regions
    ###################################################################################################

    # Fetching region file: none is an error, more than one only a warning
    # (the first region set is used).
    region_set_list = exp_matrix.get_regionsets()
    if (len(region_set_list) == 0):
        error_handler.throw_error("FP_ONE_REGION")
    elif (len(region_set_list) > 1):
        error_handler.throw_warning("FP_ONE_REGION")
    regions = region_set_list[0]

    # Extending + Sorting + Merging / keeping an original copy
    original_regions = deepcopy(regions)
    half_ext = int(region_total_ext / 2)
    regions.extend(half_ext, half_ext)  # Extending
    regions.merge()  # Sort & Merge

    ###################################################################################################
    # Reading Signals
    ###################################################################################################

    # Initialization
    name_list = exp_matrix.names
    type_list = exp_matrix.types
    file_dict = exp_matrix.files
    dnase_label = "DNASE"

    # Fetching signal files
    dnase_file = None
    histone_file_list = []
    for i in range(0, len(name_list)):
        name = name_list[i]
        if (type_list[i] == "regions"):
            continue
        if (name.upper() == dnase_label):  # DNase signal
            if (not dnase_file):
                dnase_file = BamFile(file_dict[name])
                dnase_file.load_sg_coefs(dnase_sg_window_size)
            else:
                # Only the first DNase entry is used.
                error_handler.throw_warning("FP_MANY_DNASE")
        else:  # Histone signal
            histone_file = BamFile(file_dict[name])
            histone_file.load_sg_coefs(histone_sg_window_size)
            histone_file_list.append(histone_file)

    # Handling errors
    if (not dnase_file):
        error_handler.throw_error("FP_NO_DNASE")
    if (len(histone_file_list) == 0):
        error_handler.throw_error("FP_NO_HISTONE")
    elif (len(histone_file_list) > 3):
        error_handler.throw_warning("FP_MANY_HISTONE")

    ###################################################################################################
    # Creating HMM list
    ###################################################################################################

    # Fetching HMM input
    flag_multiple_hmms = False
    if (options.hmm_file):  # Argument is passed

        # Fetching list of HMM files
        hmm_file_list = options.hmm_file.split(",")

        # Verifying HMM application mode (one HMM or multiple HMM files)
        if (len(hmm_file_list) == 1):
            flag_multiple_hmms = False  # One HMM file only
        # Compare against the histone files collected above; the name used
        # here previously was never defined in this function.
        elif (len(hmm_file_list) == len(histone_file_list)):
            flag_multiple_hmms = True  # One HMM file for each histone
        else:
            error_handler.throw_error("FP_NB_HMMS")

    else:  # Argument was not passed
        flag_multiple_hmms = False
        hmm_data = HmmData()
        hmm_file_list = [hmm_data.get_default_hmm()]

    # Creating scikit HMM list
    hmm_list = []
    for hmm_file_name in hmm_file_list:
        try:
            hmm_scaffold = HMM()
            hmm_scaffold.load_hmm(hmm_file_name)
            scikit_hmm = GaussianHMM(n_components=hmm_scaffold.states,
                                     covariance_type="full",
                                     transmat=array(hmm_scaffold.A),
                                     startprob=array(hmm_scaffold.pi))
            scikit_hmm.means_ = array(hmm_scaffold.means)
            scikit_hmm.covars_ = array(hmm_scaffold.covs)
        except Exception:
            error_handler.throw_error("FP_HMM_FILES")
        hmm_list.append(scikit_hmm)

    ###################################################################################################
    # Main Pipeline
    ###################################################################################################

    # Initializing result set
    footprints = GenomicRegionSet("footprints")

    # Iterating over regions
    for r in regions.sequences:

        # Region description reused by every warning raised for this region.
        region_desc = ",".join([r.chrom, str(r.initial), str(r.final)])

        # Fetching DNase signal
        try:
            dnase_norm, dnase_slope = dnase_file.get_signal(
                r.chrom, r.initial, r.final, dnase_frag_ext,
                dnase_initial_clip, dnase_norm_per, dnase_slope_per)
        except Exception:
            error_handler.throw_warning(
                "FP_DNASE_PROC",
                add_msg="for region (" + region_desc +
                "). This iteration will be skipped.")
            continue

        # Iterating over histone modifications
        for i in range(0, len(histone_file_list)):

            # Fetching histone signal
            try:
                histone_file = histone_file_list[i]
                histone_norm, histone_slope = histone_file.get_signal(
                    r.chrom, r.initial, r.final, histone_frag_ext,
                    histone_initial_clip, histone_norm_per, histone_slope_per)
            except Exception:
                error_handler.throw_warning(
                    "FP_HISTONE_PROC",
                    add_msg="for region (" + region_desc +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped for this histone.")
                continue

            # Formatting sequence: one row per position, four signal columns.
            try:
                input_sequence = array(
                    [dnase_norm, dnase_slope, histone_norm, histone_slope]).T
            except Exception:
                error_handler.throw_warning(
                    "FP_SEQ_FORMAT",
                    add_msg="for region (" + region_desc +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Applying HMM
            if (flag_multiple_hmms):
                current_hmm = hmm_list[i]
            else:
                current_hmm = hmm_list[0]
            try:
                posterior_list = current_hmm.predict(input_sequence)
            except Exception:
                error_handler.throw_warning(
                    "FP_HMM_APPLIC",
                    add_msg="in region (" + region_desc +
                    ") and histone modification " + histone_file.file_name +
                    ". This iteration will be skipped.")
                continue

            # Writing results: scan the decoded states and emit one region per
            # consecutive run of the footprint state.
            start_pos = 0
            flag_start = False
            for k in range(r.initial, r.final):
                curr_index = k - r.initial
                if (flag_start):
                    if (posterior_list[curr_index] != fp_state_nb):
                        # Run ended: keep it only if shorter than the limit.
                        if (k - start_pos < fp_limit_size):
                            fp = GenomicRegion(r.chrom, start_pos, k)
                            footprints.add(fp)
                        flag_start = False
                else:
                    if (posterior_list[curr_index] == fp_state_nb):
                        flag_start = True
                        start_pos = k
            # A run still open at the end of the region is closed at r.final.
            if (flag_start):
                fp = GenomicRegion(r.chrom, start_pos, r.final)
                footprints.add(fp)

    # Sorting and Merging
    footprints.merge()

    # Overlapping results with original regions
    footprints = footprints.intersect(original_regions,
                                      mode=OverlapType.ORIGINAL)

    ###################################################################################################
    # Writing output
    ###################################################################################################

    # Creating output file. The path is joined (not concatenated) so the
    # default getcwd() location, which has no trailing separator, works.
    output_file_name = join_path(options.output_location,
                                 options.footprint_name + ".bed")
    footprints.write_bed(output_file_name)

    # Verifying condition to write bb
    if (options.print_bb):

        # Fetching file with chromosome sizes
        genome_data = GenomeData(options.organism)
        chrom_sizes_file = genome_data.get_chromosome_sizes()

        # Converting to big bed
        output_bb_name = join_path(options.output_location,
                                   options.footprint_name + ".bb")
        try:
            exit_status = system(" ".join([
                "bedToBigBed", output_file_name, chrom_sizes_file,
                output_bb_name
            ]))
            #remove(output_file_name)
        except Exception:
            error_handler.throw_error("FP_BB_CREATION")
        else:
            # A failing command is reported through its exit status, not
            # through an exception, so check it explicitly.
            if exit_status:
                error_handler.throw_error("FP_BB_CREATION")
def test_1():
    """Yield one fit() smoke test for each HMM flavour on the same random data."""
    von_mises = VonMisesHMM(n_states=5)
    gaussian = GaussianHMM(n_components=5)
    X1 = np.random.randn(100, 2)

    # Bind each model as a default argument so every yielded callable keeps
    # its own model regardless of when the runner invokes it.
    for model in (von_mises, gaussian):
        yield lambda model=model: model.fit([X1])
def test_2():
    """Compare the CPU E-step sufficient statistics with the pure-Python reference.

    Both implementations receive the same random observations and the same
    random parameters; one check is yielded per statistic
    ('trans', 'post', 'obs', 'obs**2'), each to 3 decimals.
    """
    n_features = 3
    length = 32

    for n_states in [4]:
        # Random observation sequence and model parameters (rows of the
        # transition matrix and the start vector are normalised to sum to 1).
        sequence = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / transmat.sum(axis=1)[:, np.newaxis]
        startprob = np.random.rand(n_states)
        startprob = startprob / startprob.sum()

        chmm = GaussianHMMCPUImpl(n_states, n_features)
        chmm._sequences = [sequence]

        # init_params='' / params='' : nothing is initialised or updated by
        # the reference model, it only evaluates the parameters set below.
        pyhmm = GaussianHMM(n_components=n_states,
                            init_params='',
                            params='',
                            covariance_type='diag')

        # Implementation under test works on float32 copies of the parameters.
        chmm.means_ = means.astype(np.float32)
        chmm.vars_ = variances.astype(np.float32)
        chmm.transmat_ = transmat.astype(np.float32)
        chmm.startprob_ = startprob.astype(np.float32)
        clogprob, cstats = chmm.do_estep()

        # Reference computation.
        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob

        framelogprob = pyhmm._compute_log_likelihood(sequence)
        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, sequence, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield lambda key=key: np.testing.assert_array_almost_equal(
                stats[key], cstats[key], decimal=3)
quantized_set = np.asarray(quantized_set)
nclasses = len(np.unique(classlabels))
hmmclass = []
print(quantized_set.shape)

# Fit one HMM per class label, using the columns of quantized_set whose label
# equals the class index.
for i in range(0, nclasses):
    newtrainset = np.asarray([
        quantized_set[:, k]
        for k in range(0, len(classlabels))
        if classlabels[k] == i
    ])
    hmm = HMM(64)
    hmm.fit([newtrainset])
    hmmclass.append(hmm)

# Cut the testing set into consecutive chunks of `rowdivision` rows and
# quantize each chunk.
rowdivision = datasample.shape[0]
t = []
for i in xrange(int(round(testingset.shape[0] / rowdivision))):
    chunk = testingset[rowdivision * i:rowdivision * (i + 1), :]
    t.append(quantize_data(chunk, kmms))
t = np.asarray(t)

rlabels = []
for ts in t:
    i = 0
# Convert training / test inputs to float arrays.
X = np.asarray(X_training).astype(float)
y = np.array(y_training).astype(float)
X_test = np.asarray(X_test).astype(float)

# pack the training features column-wise for the HMM
X = np.column_stack([X])

###############################################################################
# Run Gaussian HMM
print("fitting to HMM and decoding ..."),

n_components = 2

# make an HMM instance and execute fit
model = GaussianHMM(n_components, "diag")
model.fit([X])

# predict the optimal sequence of internal hidden state
hidden_states = model.predict(X_test)

# Show at most the first 50 decoded states; the bound keeps this from raising
# IndexError when the test set holds fewer than 50 samples.
for i in range(0, min(50, len(hidden_states))):
    print(hidden_states[i]),

print("done\n")

###############################################################################
# print trained parameters and plot
print("Transition matrix")
print (model.transmat_)
print ("")

print ("means and vars of each hidden state")
# this makes len(diff) = len(close_t) - 1 # therefore, others quantity also need to be shifted diff = close_v[1:] - close_v[:-1] dates = dates[1:] close_v = close_v[1:] # pack diff and volume for training X = np.column_stack([diff, volume]) ############################################################################### # Run Gaussian HMM print "fitting to HMM and decoding ...", n_components = 5 # make an HMM instance and execute fit model = GaussianHMM(n_components, "diag") model.fit([X], n_iter=1000) # predict the optimal sequence of internal hidden state hidden_states = model.predict(X) print "done\n" ############################################################################### # print trained parameters and plot print "Transition matrix" print model.transmat_ print "" print "means and vars of each hidden state" for i in xrange(n_components):
# Rescale the selected features before fitting.
X_new = X_new * 1000

#n_features = sum(good_features2)
n_features = X_new.shape[1]
print(n_features)

# Earlier experiments, kept for reference:
# # clf = svm.SVC()
# # clf.fit(X_new, y)
# hmm = MultinomialHMM()
# pos = np.where(np.diff(y) != 0)[0]
# d = np.hstack([0, pos+1, len(y)])
# lens = np.diff(d)
# hmm.fit(X_new, y, lens)

# Fit a 20-state Gaussian HMM and use the decoded state of every sample as
# its cluster id (`clusters` and `pred` name the same array).
hmm = GaussianHMM(n_components=20)
hmm.fit([X_new])
pred = hmm.predict(X_new)
clusters = pred

# neigh = KNeighborsClassifier(n_neighbors=10, weights='distance')
# scores = cross_validation.cross_val_score(neigh, X_new, y, cv=5)
# print(scores)
#
# neigh.fit(X_new, y)
# good_features = ETC.feature_importances_ >= 0.0005
# print(np.sum(good_features))
# X_new2 = X[..., good_features]
# n_features = 20
# pca = PCA(n_components = n_features)
list_of_patient_feats, start_stop_idx, list_of_patient_file_paths = string_patient_feats(train_map, condition, overlap, window) #sirs_feats_stacked = stack_patient_feats(list_of_sirs_patients) feats_as_list = list_patient_feats(list_of_patient_feats) #print np.shape(sirs_feats_stacked) means, covs = get_initial_states(pre_states, condition, feature, end=False, start=False, cov_type=cov_type) print means print covs if cov_type == 'full': for i in range(n_states): print 'checking if initial covs are pos-definite' np.linalg.cholesky(covs[i]) print np.linalg.eigvals(covs[i]) tmat, smat = get_tmat_and_smat(pre_states, end=False, start=False) print tmat, smat model = GaussianHMM(n_components=n_states, n_iter=n_iter, covariance_type=cov_type, startprob=smat, transmat=tmat, init_params='mc') model.means_ = means model.covars_ = covs sum_inital_ll = 0.0 sum_initial_score = 0.0 sum_initial_map = 0.0 remove_idx = [] for idx, feat_from_list in enumerate(feats_as_list): if np.shape(feat_from_list)[0] > n_states: initial_ll, initial_best_seq = model.decode(feat_from_list) initial_map, initial_best_sep_map = model.decode(feat_from_list, algorithm='map') sum_initial_score += model.score(feat_from_list) sum_inital_ll += initial_ll sum_initial_map += initial_map else: remove_idx.append(idx)
trimmed_count = 0 counts.append(trimmed_count) kmer_stash.append(kmer) i += 1 if not len(counts): sys.exit( "No k-mer counts remain after filtering; check thresholds and try again." ) ## fit HMM to counts if len(args.mu) != len(args.sigmasq): sys.exit("Vectors of prior means and variances must be same length.") counts = np.reshape(np.log1p(np.array(counts, dtype="int")), (-1, 1)) hmm = GaussianHMM(len(args.mu)) hmm.fit([counts]) if args.verbose: sys.stderr.write( "Fitting HMM to k-mer counts, assuming {} hidden states...\n".format( len(args.mu))) sys.stderr.write("means:\n" + str(hmm.means_) + "\n") sys.stderr.write("covariances:\n" + str(hmm.covars_) + "\n") sys.stderr.write("\n") sys.stderr.write("Processing possible variant sites...\n") sys.stderr.write("\trejecting haplotypes with read count < {}\n".format( args.maf)) sys.stderr.write( "\taccepting as TE/ME any haplotype with max count > {}\n".format( args.maxhits))
def train_hmm(X):
    """Fit an 8-state GaussianHMM on X and return the fitted model.

    X is passed to fit() unchanged; the score and the shape of its first
    element, X[0], are printed as a quick diagnostic.
    """
    model = GaussianHMM(n_components=8)
    model.fit(X)

    first_sequence = X[0]
    print(model.score(first_sequence))
    print(np.shape(first_sequence))
    return model
volume = [] for row in data: if row[1] != 'close': #list = [] #for i in range(len(row)-2): # list.append(float(row[i+1])) label = float(row[7]) volume.append(float(row[2])) if label > 0: indices.append(1) else: indices.append(0) #matrix.append(list) X = numpy.column_stack([numpy.array(indices), numpy.array(volume)]) model = GaussianHMM(2, covariance_type="diag", n_iter=1000) model.fit([X]) """ reading the dato to be classified """ with open('hackathon-master/AAPL-test.csv', 'rb') as csvfile: data = csv.reader(csvfile, delimiter=',') #matrix = [] volume = [] labels = [] for row in data: if row[1] != 'close': list = []
# Differencing shortens the series by one element, so the other quantities
# are shifted by one as well to stay aligned with `diff`.
diff = close_v[1:] - close_v[:-1]
dates = dates[1:]
close_v = close_v[1:]

# Observation matrix: one row per day, columns = (price change, volume).
X = np.column_stack([diff, volume])

###############################################################################
# Run Gaussian HMM
print("fitting to HMM and decoding ..."),

n_components = 2

# Build the model and fit it on the single observation sequence.
model = GaussianHMM(n_components, covariance_type="diag", n_iter=1000)
model.fit([X])

# Most likely hidden state for every observation.
hidden_states = model.predict(X)

print("done\n")

###############################################################################
# print trained parameters and plot
print("Transition matrix")
print(model.transmat_)
print("")

print("means and vars of each hidden state")
volume = [] for row in data: if row[1] != 'close': #list = [] #for i in range(len(row)-2): # list.append(float(row[i+1])) label = float(row[7]) volume.append(float(row[2])) if label > 0: indices.append(1) else: indices.append(0) #matrix.append(list) X = numpy.column_stack([numpy.array(indices), numpy.array(volume)]) model = GaussianHMM(2, covariance_type="diag", n_iter=1000) model.fit([X]) """ reading the dato to be classified """ with open('hackathon-master/AAPL-test.csv', 'rb') as csvfile: data = csv.reader(csvfile, delimiter=',') #matrix = [] volume = [] labels = [] for row in data: if row[1] != 'close': list = [] volume.append(float(row[2])) #for i in range(len(row)-2):
def test_2():
    """Check every stage of the CUDA E-step against the pure-Python reference.

    Frame log-likelihoods, forward lattice, backward lattice, posteriors and
    the accumulated sufficient statistics are each compared to 3 decimals.
    """

    def check(expected, actual):
        # One deferred comparison, executed by the test runner.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    np.random.seed(42)
    n_features = 32
    length = 20

    # Other state counts tried previously:
    # [3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32]
    for n_states in [8]:
        # Random observations and parameters (stochastic rows / start vector).
        sequence = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / transmat.sum(axis=1)[:, np.newaxis]
        startprob = np.random.rand(n_states)
        startprob = startprob / startprob.sum()

        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [sequence]

        pyhmm = GaussianHMM(n_components=n_states,
                            init_params='',
                            params='',
                            covariance_type='diag')

        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        logprob, custats = cuhmm.do_estep()

        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        pyhmm._initialize_sufficient_statistics()

        framelogprob = pyhmm._compute_log_likelihood(sequence)
        cuframelogprob = cuhmm._get_framelogprob()
        yield check(framelogprob, cuframelogprob)

        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield check(fwdlattice, cufwdlattice)

        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield check(bwdlattice, cubwdlattice)

        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield check(posteriors, cuposteriors)

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, sequence, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        # Diagnostic output: transition counts recomputed from the CUDA
        # lattices next to the counts the CUDA E-step reported.
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield check(stats[key], custats[key])
class GaussianHmmLib:
    """
    Fit a Gaussian HMM on the daily price change and volume of one stock and
    plot the close price coloured by decoded hidden state.

    ref: http://scikit-learn.org/0.14/auto_examples/applications/plot_hmm_stock_analysis.html
         https://www.quantopian.com/posts/inferring-latent-states-using-a-gaussian-hidden-markov-model

    bear market: smaller mean, higher variance
    bull market: higher mean, smaller variance
    """

    def __init__(self, dbhandler, *args, **kwargs):
        """
        dbhandler    -- object exposing ``stock.ids`` (list of stock ids)
        n_components -- keyword, number of hidden states (default 5)
        n_iter       -- keyword, number of EM iterations (default 1000)

        A missing, None or zero keyword value falls back to the default.
        """
        self.dbhandler = dbhandler
        self.sids = self.dbhandler.stock.ids
        self.n_components = self._pop_int(kwargs, 'n_components', 5)
        self.n_iter = self._pop_int(kwargs, 'n_iter', 1000)

    @staticmethod
    def _pop_int(options, key, default):
        """Pop ``key`` from ``options`` as an int; use ``default`` if absent or zero."""
        raw = options.pop(key, None)
        value = int(raw) if raw is not None else 0
        return value or default

    def run(self, data):
        """Fit the model on the first stock id in ``self.sids`` and decode its states.

        ``data[sid]`` must provide 'price', 'close_v' and 'volume' columns.
        """
        sid = self.sids[0]
        self.dates = data[sid]['price'].values
        self.close_v = data[sid]['close_v'].values
        # volume is shifted by one so it lines up with the differenced prices
        self.volume = data[sid]['volume'].values[1:]

        # take diff of close value
        # this makes len(diff) = len(close_t) - 1
        self.diff = self.close_v[1:] - self.close_v[:-1]

        # pack diff and volume for training
        self.X = np.column_stack([self.diff, self.volume])

        # make an HMM instance and execute fit; n_iter is configured on the
        # constructor only (passing it to fit() as well is redundant and is
        # rejected by sklearn releases whose fit() takes no extra keywords)
        self.model = GaussianHMM(self.n_components,
                                 covariance_type="diag",
                                 n_iter=self.n_iter)
        self.model.fit([self.X])

        # predict the optimal sequence of internal hidden state
        self.hidden_states = self.model.predict(self.X)

    def report(self):
        """Print the trained parameters and save the state plot as a PNG."""
        # print trained parameters and plot
        print("Transition matrix")
        print(self.model.transmat_)
        print("")

        print("means and vars of each hidden state")
        for i in range(self.n_components):
            print("%dth hidden state" % i)
            print("%s %s" % ("mean = ", self.model.means_[i]))
            print("%s %s" % ("var = ", np.diag(self.model.covars_[i])))
            print("")

        years = YearLocator()  # every year
        months = MonthLocator()  # every month
        yearsFmt = DateFormatter('%Y')
        fig = plt.figure()
        ax = fig.add_subplot(111)

        # hidden_states has one entry per *difference*, i.e. one fewer than
        # dates / close_v; drop the first sample so the masks line up.
        dates = self.dates[1:]
        close_v = self.close_v[1:]

        for i in range(self.n_components):
            # use fancy indexing to plot data in each state
            idx = (self.hidden_states == i)
            ax.plot_date(dates[idx],
                         close_v[idx],
                         'o',
                         label="%dth hidden state" % i)
        ax.legend()

        # format the ticks
        ax.xaxis.set_major_locator(years)
        ax.xaxis.set_major_formatter(yearsFmt)
        ax.xaxis.set_minor_locator(months)
        ax.autoscale_view()

        # format the coords message box
        ax.fmt_xdata = DateFormatter('%Y-%m-%d')
        ax.fmt_ydata = lambda x: '$%1.2f' % x
        ax.grid(True)

        fig.autofmt_xdate()
        plt.savefig("gaussianhmm_%s.png" % (self.sids[0]))
def test_2():
    """Check every stage of the CUDA E-step against the pure-Python reference.

    Frame log-likelihoods, forward lattice, backward lattice, posteriors and
    the accumulated sufficient statistics are each compared to 3 decimals.
    """

    def check(expected, actual):
        # One deferred comparison, executed by the test runner.
        return lambda: np.testing.assert_array_almost_equal(
            expected, actual, decimal=3)

    np.random.seed(42)
    n_features = 32
    length = 20

    # Other state counts tried previously:
    # [3, 4, 5, 7, 8, 9, 15, 16, 17, 31, 32]
    for n_states in [8]:
        # Random observations and parameters (stochastic rows / start vector).
        sequence = np.random.randn(length, n_features)
        means = np.random.randn(n_states, n_features)
        variances = np.random.rand(n_states, n_features)
        transmat = np.random.rand(n_states, n_states)
        transmat = transmat / transmat.sum(axis=1)[:, np.newaxis]
        startprob = np.random.rand(n_states)
        startprob = startprob / startprob.sum()

        cuhmm = GaussianHMMCUDAImpl(n_states, n_features)
        cuhmm._sequences = [sequence]

        pyhmm = GaussianHMM(n_components=n_states,
                            init_params='',
                            params='',
                            covariance_type='diag')

        cuhmm.means_ = means
        cuhmm.vars_ = variances
        cuhmm.transmat_ = transmat
        cuhmm.startprob_ = startprob
        logprob, custats = cuhmm.do_estep()

        pyhmm.means_ = means
        pyhmm.covars_ = variances
        pyhmm.transmat_ = transmat
        pyhmm.startprob_ = startprob
        pyhmm._initialize_sufficient_statistics()

        framelogprob = pyhmm._compute_log_likelihood(sequence)
        cuframelogprob = cuhmm._get_framelogprob()
        yield check(framelogprob, cuframelogprob)

        fwdlattice = pyhmm._do_forward_pass(framelogprob)[1]
        cufwdlattice = cuhmm._get_fwdlattice()
        yield check(fwdlattice, cufwdlattice)

        bwdlattice = pyhmm._do_backward_pass(framelogprob)
        cubwdlattice = cuhmm._get_bwdlattice()
        yield check(bwdlattice, cubwdlattice)

        gamma = fwdlattice + bwdlattice
        posteriors = np.exp(gamma.T - logsumexp(gamma, axis=1)).T
        cuposteriors = cuhmm._get_posteriors()
        yield check(posteriors, cuposteriors)

        stats = pyhmm._initialize_sufficient_statistics()
        pyhmm._accumulate_sufficient_statistics(
            stats, sequence, framelogprob, posteriors, fwdlattice, bwdlattice,
            'stmc')

        # Diagnostic output: transition counts recomputed from the CUDA
        # lattices next to the counts the CUDA E-step reported.
        print('ref transcounts')
        print(transitioncounts(cufwdlattice, cubwdlattice, cuframelogprob,
                               np.log(transmat)))
        print('cutranscounts')
        print(custats['trans'])

        for key in ('trans', 'post', 'obs', 'obs**2'):
            yield check(stats[key], custats[key])
""" agelessmojo bot implementation """ from __future__ import division # for floating point division import os, json import numpy as np from sklearn.hmm import GaussianHMM from bottle import get, post, request, run, response PORT = os.getenv('VCAP_APP_PORT') HOST = os.getenv('VCAP_APP_HOST') HMM = GaussianHMM(50, "diag") # TODO Variables here WINDOW_SIZE = 5 LAP_DATA = {} LAP_DATA_SMOOTHED = {} LAP_COUNT = 0 LAP_ITERATOR = 0 @get('/ping') def ping(): """ Check for bot health. Returns success in text/plain. """ response.headers['Content-Type'] = 'text/plain' return "success" def send_power_control(power):
matrix with some extra bells and whisles. But, because of the way the E-step works currently, the means and covariances are estimated exactly as with a Gaussian HMM. Then, afterwards, the A, b and Q are estimated. So, we can do a lot of testing by comparing to a reference gaussian HMM implementation ''' import string import numpy as np from sklearn.hmm import GaussianHMM from sklearn.utils.extmath import logsumexp from mixtape.mslds import MetastableSwitchingLDS from mixtape import _switching_var1 N_STATES = 2 data = [np.random.randn(100, 3), np.random.randn(100, 3)] refmodel = GaussianHMM(n_components=N_STATES, covariance_type='full').fit(data) def _sklearn_estep(): # copied from sklearn/hmm.py#L440 curr_logprob = 0 stats = refmodel._initialize_sufficient_statistics() stats['post[1:]'] = np.zeros(refmodel.n_components) stats['post[:-1]'] = np.zeros(refmodel.n_components) stats['obs[1:]'] = np.zeros((refmodel.n_components, refmodel.n_features)) stats['obs[:-1]'] = np.zeros((refmodel.n_components, refmodel.n_features)) stats['obs*obs[t-1].T'] = np.zeros( (refmodel.n_components, refmodel.n_features, refmodel.n_features)) stats['obs[1:]*obs[1:].T'] = np.zeros( (refmodel.n_components, refmodel.n_features, refmodel.n_features)) stats['obs[:-1]*obs[:-1].T'] = np.zeros(
## initialize hmm parameters
rs = check_random_state(None)  # fix RNG seed? maybe?

# One row of (mean, mean) per hidden state, on the log(1 + count) scale.
means = np.array([
    [0.0, 0.0],
    [np.log1p(args.coverage), 0.0],
    [0.0, np.log1p(args.coverage)],
    [np.log1p(args.coverage / 2), np.log1p(args.coverage / 2)],
    [np.log1p(args.coverage), np.log1p(args.coverage)],
])

# Matching per-state variances; `cv` is the variance given to a populated axis.
cv = 1.0
covars = np.array([
    [0.01, 0.01],
    [cv, 0.01],
    [0.01, cv],
    [cv / 2, cv / 2],
    [cv, cv],
])

# Labels for the hidden states.
hidden = ["private"] + ref_samples + ["heterozygous", "pseudohet"]

hmm = GaussianHMM(n_components=len(means), random_state=rs)
hmm._set_means(means)
hmm._set_covars(covars)

## filter sites; compute observation sequence as log(1+count)
below_max = counts.max(1) < args.X_max * args.coverage
above_floor = counts.sum(1) > -1.0
keep = np.logical_and(below_max, above_floor)
counts = counts[keep, :]
obs = np.log1p(counts)

# Interval start positions as a column vector, filtered like `counts`.
starts = np.array([start for start, end in ivls]).reshape((len(ivls), 1))
starts = starts[keep, :]

## run hmm
states = hmm.decode(obs)

## print result to stdout