def load_segmentation_model(modeldata):
    model = HiddenMarkovModel('model')
    states = {}
    for s in modeldata:
        if len(s['emission']) == 1:
            emission = NormalDistribution(*s['emission'][0][:2])
        else:
            weights = np.array([w for _, _, w in s['emission']])
            dists = [NormalDistribution(mu, sigma)
                     for mu, sigma, _ in s['emission']]
            emission = GeneralMixtureModel(dists, weights=weights)
        state = State(emission, name=s['name'])
        states[s['name']] = state
        model.add_state(state)
        if 'start_prob' in s:
            model.add_transition(model.start, state, s['start_prob'])
    for s in modeldata:
        current = states[s['name']]
        for nextstate, prob in s['transition']:
            model.add_transition(current, states[nextstate], prob)
    model.bake()
    return model
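# A minimal sketch of the input that load_segmentation_model appears to
# expect, inferred from the loops above; the state names, parameters, and
# probabilities here are hypothetical.
modeldata = [
    {'name': 'low',
     'emission': [(0.0, 1.0)],  # single (mu, sigma) -> NormalDistribution
     'start_prob': 0.6,
     'transition': [('low', 0.9), ('high', 0.1)]},
    {'name': 'high',
     'emission': [(3.0, 1.0, 0.7), (6.0, 2.0, 0.3)],  # (mu, sigma, weight) -> mixture
     'start_prob': 0.4,
     'transition': [('low', 0.2), ('high', 0.8)]},
]
model = load_segmentation_model(modeldata)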
def update_hmm(self):
    num_states = self.num_states
    start_prob = self.start_prob
    num_emissions = self.num_emissions
    hmm = HiddenMarkovModel('hmm')
    dist = [
        DiscreteDistribution(
            dict(zip(range(num_emissions), self.emissions[i])))
        for i in range(num_states)
    ]
    states = [
        State(dist[i], 's' + str(i).zfill(2)) for i in range(num_states)
    ]
    hmm.add_states(states)
    for i in range(num_states):
        s_i = states[i]
        hmm.add_transition(hmm.start, s_i, start_prob[i])
        for j in range(num_states):
            s_j = states[j]
            p = self.transitions[i, j]
            hmm.add_transition(s_i, s_j, p)
    self.hmm = hmm
    self.hmm.bake()
def _buildModel(self, data):
    '''
    Builds the model, using the data to initialize the distributions at a
    good starting point.
    data: 2D matrix; every row is a vector of features.
    '''
    # we want to call from_matrix(transition, dists, starts, ends)
    tm = np.zeros((self.statesNumber, self.statesNumber))
    indices = [(x, x) for x in range(self.statesNumber)]
    indices.extend([(x, x + 1) for x in range(self.statesNumber)])
    # drop (self.statesNumber - 1, self.statesNumber), which is out of bounds
    indices.pop()
    indices = np.array(indices)
    tm[indices[:, 0], indices[:, 1]] = 0.5
    # end-state probability; written separately since we may change it specifically
    tm[self.statesNumber - 1, self.statesNumber - 1] = 0.5
    dists = self._initDists(data)
    starts = np.zeros((self.statesNumber,))
    starts[0] = 1
    ends = np.zeros((self.statesNumber,))
    ends[-1] = 0.5
    self.model = HiddenMarkovModel.from_matrix(tm, dists, starts, ends,
                                               name=self.mname)
    return self.model
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))
    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i + labels.size]

    state_labels = np.concatenate(((labels + '+'), (labels + '-')))
    labels = np.concatenate((labels, labels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, state_labels, tied, noise, z_range, \
        state_range
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))
    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i + labels.size]

    labels = np.concatenate(((labels + '+'), (labels + '-')))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()
    return sequences, labels, tied, noise
def insert_delete_main_hmm(data_matrix):
    v_columns = column_clasify(data_matrix)
    v_zones = create_zones(v_columns)
    v_grouped_states = group_states(v_zones, 'test')
    v_model = HiddenMarkovModel()
    v_first_state = State(None, name='ali_start')
    v_last_state = State(None, name='ali_end')
    v_model.add_state(v_first_state)
    v_model.add_transition(v_model.start, v_first_state, 1)
    v_model.add_state(v_last_state)
    add_states(v_model, v_grouped_states)
    v_trans = calculate_transitions(v_first_state, v_last_state,
                                    v_grouped_states)
    apply_transitions(v_model, v_trans)
    v_model.bake()
    return v_model
def HiddenSpaceGenerator(cls, X, n_components):
    """This method creates more features by training a HiddenMarkovModel on
    the game statistics, then returns the hidden state space of each
    timestep/game as a new feature.

    HOWEVER, note that this doesn't return a list of features; instead it
    returns an HMM that can generate more features but can still be used for
    classification as well.

    Arguments:
        X: the input features (with a series_idx)
        n_components: the number of hidden state space variables to
            initialize. Note that (sadly) pomegranate does not implement
            continuous HMMs, so it will discretize every continuous variable
            by k-means, and the outputted space will be discrete.
    """
    # the last series id; none are negative, so we will get a new one
    last_series_idx = -1
    # restrict down; since the data is temporal, we need to make a list of entries
    _X = []
    for row in X:
        if row[0] != last_series_idx:
            # create a new series to train on: start with an empty input
            # series, then add to it while the row has the same index
            last_series_idx = row[0]
            _X.append(np.full((0, X.shape[1] - 1), None, dtype=None))
        # add the datapoint to the current series
        _X[-1] = np.vstack((_X[-1], row[1:]))
    # now train an HMM on the data
    return HiddenMarkovModel.from_samples(MultivariateGaussianDistribution,
                                          n_components, _X)
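# Hypothetical call for the routine above, assuming column 0 of X is the
# series index and the remaining columns are per-timestep statistics. It is
# written like a classmethod, so a placeholder is passed for cls; all shapes
# and values are illustrative.
import numpy as np
rng = np.random.RandomState(0)
series = np.repeat(np.arange(5), 20)        # 5 series of 20 timesteps each
feats = rng.randn(100, 3)                   # 3 feature columns
X = np.column_stack([series, feats])
hmm = HiddenSpaceGenerator(None, X, n_components=2)
hidden = hmm.predict(X[X[:, 0] == 0][:, 1:])  # hidden state per timestep of series 0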
def load(self, file_path):
    with open(file_path, 'rb') as f:
        objects = pickle.load(f)
    try:
        self.transformer = objects['transformer']
    except KeyError:  # for backwards compatibility
        self.transformer = objects['lda']
    self.hmm = HiddenMarkovModel.from_json(objects['hmm'])
def get_variable_number_of_repeats_matcher_hmm(patterns, copies=1,
                                               vpaths=None):
    model = get_constant_number_of_repeats_matcher_hmm(patterns, copies,
                                                       vpaths)

    start_repeats_matches = State(None, name='start_repeating_pattern_match')
    end_repeats_matches = State(None, name='end_repeating_pattern_match')
    mat = model.dense_transition_matrix()

    states = model.states
    states.append(start_repeats_matches)
    states.append(end_repeats_matches)

    states_count = len(mat)
    start_repeats_ind = states_count
    end_repeats_ind = states_count + 1
    mat = np.c_[mat, np.zeros(states_count), np.zeros(states_count)]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]

    unit_ends = []
    for i, state in enumerate(model.states):
        if state.name.startswith('unit_end'):
            unit_ends.append(i)

    first_unit_start = None
    for i in range(len(mat[model.start_index])):
        if mat[model.start_index][i] != 0:
            first_unit_start = i
    mat[model.start_index][first_unit_start] = 0.0
    mat[model.start_index][start_repeats_ind] = 1
    mat[start_repeats_ind][first_unit_start] = 1

    for unit_end in unit_ends:
        next_state = None
        for j in range(len(mat[unit_end])):
            if mat[unit_end][j] != 0:
                next_state = j
        mat[unit_end][next_state] = 0.5
        mat[unit_end][end_repeats_ind] = 0.5

    mat[end_repeats_ind][model.end_index] = 1

    starts = np.zeros(states_count + 2)
    starts[model.start_index] = 1.0
    ends = np.zeros(states_count + 2)
    ends[model.end_index] = 1.0

    state_names = [state.name for state in states]
    distributions = [state.distribution for state in states]
    name = 'Repeat Matcher HMM Model'
    new_model = Model.from_matrix(mat, distributions, starts, ends, name=name,
                                  state_names=state_names, merge=None)
    new_model.bake(merge=None)
    return new_model
def hmm(df, emissions, n_states, algorithm):
    model = HiddenMarkovModel.from_samples(
        distribution=MultivariateGaussianDistribution,
        n_components=n_states,
        X=df[emissions].to_numpy(),
        algorithm=algorithm,
        verbose=True,
    )
    return model
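# Hypothetical call for the wrapper above, assuming a DataFrame of per-frame
# features; the column names, state count, and data are illustrative only.
import numpy as np
import pandas as pd
df = pd.DataFrame({'x': np.random.randn(200), 'y': np.random.randn(200)})
model = hmm(df, emissions=['x', 'y'], n_states=3, algorithm='baum-welch')
states = model.predict(df[['x', 'y']].to_numpy())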
def get_read_matcher_model(left_flanking_region, right_flanking_region,
                           patterns, copies=1, vpaths=None):
    model = get_suffix_matcher_hmm(left_flanking_region)
    repeats_matcher = get_variable_number_of_repeats_matcher_hmm(
        patterns, copies, vpaths)
    right_flanking_matcher = get_prefix_matcher_hmm(right_flanking_region)
    model.concatenate(repeats_matcher)
    model.concatenate(right_flanking_matcher)
    model.bake(merge=None)

    mat = model.dense_transition_matrix()

    first_repeat_matches = []
    repeat_match_states = []
    suffix_start = None
    for i, state in enumerate(model.states):
        if state.name[0] == 'M' and state.name.split('_')[-1] == '0':
            first_repeat_matches.append(i)
        if state.name[0] == 'M' and state.name.split('_')[-1] not in [
                'prefix', 'suffix']:
            repeat_match_states.append(i)
        if state.name == 'suffix_start_suffix':
            suffix_start = i

    mat[model.start_index][suffix_start] = 0.3
    for first_repeat_match in first_repeat_matches:
        mat[model.start_index][first_repeat_match] = \
            0.7 / len(first_repeat_matches)

    for match_state in repeat_match_states:
        to_end = 0.7 / len(repeat_match_states)
        total = 1 + to_end
        for next_state in range(len(mat[match_state])):
            if mat[match_state][next_state] != 0:
                mat[match_state][next_state] /= total
        mat[match_state][model.end_index] = to_end / total

    starts = np.zeros(len(model.states))
    starts[model.start_index] = 1.0
    ends = np.zeros(len(model.states))
    ends[model.end_index] = 1.0

    state_names = [state.name for state in model.states]
    distributions = [state.distribution for state in model.states]
    name = 'Read Matcher'
    new_model = Model.from_matrix(mat, distributions, starts, ends, name=name,
                                  state_names=state_names, merge=None)
    new_model.bake(merge=None)
    return new_model
def test_sample_from_site():
    dists = [
        NormalDistribution(5, 1),
        NormalDistribution(1, 7),
        NormalDistribution(8, 2)
    ]
    trans_mat = np.array([[0.7, 0.3, 0.0],
                          [0.0, 0.8, 0.2],
                          [0.0, 0.0, 0.9]])
    starts = np.array([1.0, 0.0, 0.0])
    ends = np.array([0.0, 0.0, 0.1])
    model = HiddenMarkovModel.from_matrix(trans_mat, dists, starts, ends)
    model.plot()
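# A minimal follow-on sketch: the test above only plots the model, but the
# same from_matrix model can also be sampled and scored (reusing the matrices
# defined in test_sample_from_site; the length and test sequence are arbitrary).
model = HiddenMarkovModel.from_matrix(trans_mat, dists, starts, ends)
print(model.sample(length=10))
print(model.log_probability([5.1, 4.8, 1.2]))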
def gaussian_hmm(n_states, lower, upper, variance, model_id):
    """
    Instantiate a model with random parameters.
    Randomly generates start and transition matrices.
    Generates normal distributions for each state from a partition on the
    sequences.
    """
    np.random.seed(int(time.time()))
    model = HiddenMarkovModel(model_id)

    # make states with distributions from random subsets of timepoints
    x = np.linspace(lower, upper, n_states)
    states = []
    for i in range(n_states):
        dist = NormalDistribution(x[i], variance)
        states.append(State(dist, name=str(i)))
    model.add_states(states)

    # add uniform start probabilities
    start_prob = 1.0 / n_states
    start_probs = []
    for i in range(n_states):
        start_probs.append(start_prob + np.random.ranf())
    start_probs = np.array(start_probs)
    start_probs = start_probs / start_probs.sum()
    for i, state in enumerate(states):
        model.add_transition(model.start, state, start_probs[i])

    # add transition probabilities proportional to the probability of
    # generating one state's mean from another
    for state1 in states:
        transitions = []
        for other_state in states:
            transitions.append(np.exp(state1.distribution.log_probability(
                other_state.distribution.parameters[0])) + np.random.ranf())
        transitions = np.array(transitions)
        transitions = transitions / transitions.sum()
        for i, state2 in enumerate(states):
            model.add_transition(state1, state2, transitions[i])

    model.bake()
    print('Initialized HMM: ', model.name)
    return model
def init(base_dir):
    print(base_dir)
    cluster_directories = glob.glob(base_dir + '/*')
    initial_clusterings = {}
    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # read initial clusters
            initial_clusters = {}
            filepath = '/'.join(cluster_dir.split('/') +
                                ['init_assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                initial_clusters[cluster_name] = cluster_members
                l += 4
            initial_clusterings[clustering_id] = initial_clusters

            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print('model loaded from: ', model_file)
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass
    return initial_clusterings, clusterings
def lambda_handler(event, context):
    # TODO implement
    content_object = s3.get_object(Bucket='bhargav-ml-trained-models',
                                   Key='pos_model.txt')
    file_content = content_object['Body'].read().decode()
    json_content = json.loads(file_content)
    model = HiddenMarkovModel.from_json(json_content)
    sentence = event['body'].split(' ')
    output = simplify_decoding(sentence, model)
    return {
        'statusCode': 200,
        'headers': {'Content-Type': 'text/plain',
                    'Access-Control-Allow-Origin': '*'},
        'body': output
    }
def get_vntr_matcher_hmm(self, read_length):
    """Try to load a trained HMM for this VNTR.
    If there is no trained HMM, build one and store it for later use.
    """
    logging.info('Using read length %s' % read_length)
    copies = self.get_copies_for_hmm(read_length)

    base_name = str(self.reference_vntr.id) + '_' + str(read_length) + '.json'
    stored_hmm_file = settings.TRAINED_HMMS_DIR + base_name
    if settings.USE_TRAINED_HMMS and os.path.isfile(stored_hmm_file):
        model = Model()
        model = model.from_json(stored_hmm_file)
        return model

    flanking_region_size = read_length
    vntr_matcher = self.build_vntr_matcher_hmm(copies, flanking_region_size)

    json_str = vntr_matcher.to_json()
    with open(stored_hmm_file, 'w') as outfile:
        outfile.write(json_str)
    return vntr_matcher
def cluster(self):
    if self.preprocessed_data is None:
        print("No preprocessed_data attribute found")
        return -1
    if self.alg == "Kmeans":
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=self.K, precompute_distances=True)
        # flattens all dates together
        km.fit(np.concatenate(self.preprocessed_data))
        self.states = [km.predict(d) for d in self.preprocessed_data]
    elif self.alg == "HMM":
        from pomegranate import HiddenMarkovModel, MultivariateGaussianDistribution
        distribution = MultivariateGaussianDistribution
        hmm = HiddenMarkovModel().from_samples(
            distribution, n_components=self.K,
            X=self.preprocessed_data.copy())
        self.states = [
            np.array(hmm.predict(d.copy())) for d in self.preprocessed_data
        ]
    else:
        print("Unrecognised or undefined clustering algorithm.")
        return -1
    self.experiment_progress = 2
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """
    Creates a Hidden Markov Model from DataFrames.
    @args:
        - e_df (pd.DataFrame): contains the emission probabilities
        - q_df (pd.DataFrame): contains the transition probabilities
    """
    model = HiddenMarkovModel(name="Example Model")

    # 1: Create a dict entry for each key in the transition DataFrame
    model_dict = {}
    for key in q_df.keys().values:
        model_dict[key] = {}

    # 2: Create the states
    for key in model_dict:
        # 2.1: Add the emission probabilities to each state, P(observation | state)
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        # 2.2: Add the start probability for each state
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    # 3: Add the transition probability to each state
    for key, item in q_df.to_dict("index").items():
        for item_name, value in item.items():
            print(key, " , ", item_name, ": ", value)
            tmp_origin = model_dict[key]
            tmp_destination = model_dict[item_name]
            model.add_transition(tmp_origin, tmp_destination,
                                 q_df.loc[key, item_name])

    # finally, call the .bake() method to finalize the model
    model.bake()
    return model
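# Hypothetical usage, assuming e_df columns are states with observations as
# the index, and q_df is a square state-by-state transition table whose rows
# sum to 1; all names and values here are illustrative.
import pandas as pd
e_df = pd.DataFrame({'Rainy': {'walk': 0.1, 'shop': 0.4, 'clean': 0.5},
                     'Sunny': {'walk': 0.6, 'shop': 0.3, 'clean': 0.1}})
q_df = pd.DataFrame({'Rainy': {'Rainy': 0.7, 'Sunny': 0.4},
                     'Sunny': {'Rainy': 0.3, 'Sunny': 0.6}})
start_p = {'Rainy': 0.6, 'Sunny': 0.4}
model = create_hidden_MarkovModel(e_df, q_df, start_p)
print(model.log_probability(['walk', 'shop', 'clean']))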
def hmm(df, num_states):
    "HMM program"
    # df['value'] = df['value'].replace(0, np.nan)  # this removes unmappable areas of chr
    # df_dropna = df.dropna(subset=['value'])  # this removes unmappable areas of chr (NaN is otherwise considered 0)
    vals = df["value"].values
    model = HiddenMarkovModel.from_samples(NormalDistribution, X=[vals],
                                           n_components=num_states)
    states = model.predict(vals)

    # Rename states to increase with mean signal
    order = np.argsort(df['value'].groupby(states).mean())
    states = [order[s] for s in states]
    df["state"] = states
    df.loc[np.isnan(df['value']), 'state'] = np.nan
    return df
def run_test(arg, k):
    np.random.seed(k)
    exp_type = arg['type']
    N = arg['N']
    alpha = arg['alpha']
    n_comp = arg['n_comp']
    norm_params = arg['norm_params']
    save_dir = arg['dir']
    sequence = generator.Sequence(N, alpha, type=exp_type, params=norm_params)
    labels = list(map(myutils.rename_state, sequence.path))
    model = HiddenMarkovModel.from_samples(DiscreteDistribution,
                                           n_components=n_comp,
                                           X=[sequence.sequence],
                                           labels=[labels],
                                           algorithm='labeled')
    return model, sequence.sequence
def generate(
    genre_folder: str,
    bpm: int,
    beats: int,
    steps: int,
    onset: str,
    components: int,
    regex: str,
    output: str,
    include: bool,
):
    """
    This command generates a new unique beat, based on the audio files in the
    given input folder.
    """
    audios = util.read_audio_files(include, Path(genre_folder), regex)
    sequences, samples = util.create_knowledge_base(
        audios, OnsetAlgorithm(onset.lower()), beats, steps
    )

    # Create the model
    # sequences = [add_up_ones(seq) for seq in sequences]
    model: HiddenMarkovModel = HiddenMarkovModel.from_samples(
        DiscreteDistribution,
        n_components=components,
        X=sequences,
        algorithm="viterbi",
        verbose=True,
        name="groover",
    )
    # model: MarkovChain = MarkovChain.from_samples(X=sequences)
    # lengths: List[int] = [len(x) for x in sequences]

    sequence = model.sample(length=beats * steps)
    sequence = sequences[0]
    print(sequence)
    # sequence = ones(sequence)
    # print(len(sequence))
    print(
        "BPM: {}, Beats: {}, Steps:{}, Onset Algorithm: {}".format(
            bpm, beats, steps, onset
        )
    )
    # Save the beat
    util.create_beat(sequence, samples, bpm, beats, steps).save(Path(output))
class HMMWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()
        self.start = self.model.start
        self.end = self.model.end
        self.states_before_bake = []
        self.states = None

    def add_state(self, state, start_prob=0):
        self.states_before_bake.append((state, start_prob))
        self.model.add_state(state)

    def add_transition(self, start_state, end_state, prob):
        # print('adding from', start_state.name, 'to', end_state.name, prob)
        self.model.add_transition(start_state, end_state, prob)

    def bake(self):
        # states with an explicit start probability get it; the remaining
        # start probability mass is split evenly among the rest
        starter_states_no_prob = []
        free_start_prob = 1.0
        for state in self.states_before_bake:
            if 'none' not in state[0].name:
                if not state[1]:
                    starter_states_no_prob.append(state)
                else:
                    free_start_prob -= state[1]
                    print('assigned ' + str(state[1]) + ' to ' + state[0].name)
                    self.add_transition(self.start, state[0], state[1])
        len_no_prob = len(starter_states_no_prob)
        starter_prob = free_start_prob / len_no_prob
        print(len_no_prob, starter_prob)
        for state in starter_states_no_prob:
            self.add_transition(self.start, state[0], starter_prob)
        self.model.bake()
        self.states = self.model.states

    def make_states_from_alignment(self, first_state, last_state, seq_matrix,
                                   name):
        columns = column_clasify(seq_matrix)
        zones = create_zones(columns)
        grouped_states = group_states(zones, name)
        add_states(self, grouped_states)
        trans = calculate_transitions(first_state, last_state, grouped_states)
        apply_transitions(self, trans)

    def predict(self, *args, **kwargs):
        return self.model.predict(*args, **kwargs)
def fit_hmm(self, signal_arrays, state_vectors, distribution,
            state_transition_threshold=1e-4, **kwargs):

    # We want to bunch together artefact states with their
    # corresponding "clean" states.
    state_vectors = [np.abs(vec) for vec in state_vectors]

    # remove 'undefined' samples
    # TODO: let pomegranate handle that
    signal_arrays = [
        arr[vec != 0] for arr, vec in zip(signal_arrays, state_vectors)
    ]
    state_vectors = [vec[vec != 0] for vec in state_vectors]

    # Pomegranate expects string labels for valid states and None for invalid states.
    # labels = [[str(state) if state != 0 else None for state in vec] for vec in state_vectors]
    labels = [[str(state) for state in vec] for vec in state_vectors]

    # construct matching state names
    # state_names = [str(state) for state in np.unique(np.concatenate(state_vectors)) if state != 0]
    state_names = [
        str(state) for state in np.unique(np.concatenate(state_vectors))
    ]

    # fit HMM states to transformed signals
    signals = [self.transform(arr) for arr in signal_arrays]
    hmm = HiddenMarkovModel.from_samples(distribution=distribution,
                                         n_components=len(state_names),
                                         X=signals,
                                         labels=labels,
                                         algorithm='labeled',
                                         state_names=state_names,
                                         **kwargs)

    if state_transition_threshold > 0.:
        new_hmm = _sparsify_hmm(hmm, state_transition_threshold)
        return new_hmm
    else:
        return hmm
def fit(self, data):
    """
    Fits a model---learns transition and emission probabilities.

    Arguments:
        data: list of SMILES
    """
    list_data = [list(smiles) for smiles in data]
    self.model = HiddenMarkovModel.from_samples(
        DiscreteDistribution,
        n_components=self.n_components,
        end_state=True,
        X=list_data,
        init='kmeans||',
        verbose=self.verbose,
        n_jobs=self.n_jobs,
        max_iterations=self.epochs,
        batches_per_epoch=self.batches_per_epoch,
        random_state=self.seed
    )
    self.fitted = True
    return self
def load(cls, path):
    """
    Loads saved model.

    Arguments:
        path: path to saved .pkl file

    Returns:
        Loaded HMM
    """
    with open(path, "rb") as f:
        data = pickle.load(f)
    hmm = data['model']
    del data['model']
    model = cls(**data)
    model.model = HiddenMarkovModel.from_json(hmm)
    model.fitted = True
    return model
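# A plausible save() counterpart to the loader above, assuming the wrapper
# keeps its hyperparameters as attributes; the attribute names here are
# hypothetical, and the HMM is serialized via pomegranate's to_json().
def save(self, path):
    data = {'n_components': self.n_components,   # hypothetical attribute
            'seed': self.seed,                   # hypothetical attribute
            'model': self.model.to_json()}       # HMM stored as a JSON string
    with open(path, 'wb') as f:
        pickle.dump(data, f)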
def fit(self, X, y=None):
    X_processed = self._check_and_preprocess(X, True)
    self.hmmmodel = HiddenMarkovModel.from_samples(
        NormalDistribution,
        self.n_states,
        X_processed,
        algorithm="baum-welch",
        n_jobs=8,
        verbose=self.verbose,
        batches_per_epoch=20,
        max_iterations=self.max_iterations)
    self.hmmmodel.bake()
    self.decision_scores_ = np.zeros(X.shape[0])
    for i, sequence in enumerate(X_processed):
        self.decision_scores_[i] = -self.hmmmodel.log_probability(sequence)
    self._process_decision_scores()
def create_casas7_HMM_with_prepared_train_and_test_based_on_seq_of_activities(
        train_set, list_of_persons_in_train, test_set,
        list_of_persons_in_test):
    '''
    Creates a single HMM for all persons.
    train_set = an ndarray that holds the train set of each person separately
    test_set = an ndarray that holds the test set of each person separately
    '''
    # concatenate the train sets and test sets of all people
    number_of_persons = len(train_set)
    final_train_set = train_set[0]
    final_test_set = test_set[0]
    final_train_set_labels = list_of_persons_in_train[0]
    final_test_set_labels = list_of_persons_in_test[0]
    # print(type(final_train_set), type(train_set), type(train_set[1]))
    for per in range(1, number_of_persons):
        final_train_set = np.concatenate((final_train_set, train_set[per]),
                                         axis=0)
        final_test_set = np.concatenate((final_test_set, test_set[per]),
                                        axis=0)
        final_train_set_labels = np.concatenate(
            (final_train_set_labels, list_of_persons_in_train[per]), axis=0)
        final_test_set_labels = np.concatenate(
            (final_test_set_labels, list_of_persons_in_test[per]), axis=0)

    # r = np.shape(final_train_set)
    # for i in range(r[0]):
    #     print(np.shape(final_train_set[i]))
    # final_train_set = np.array([[1,2,3,0,0], [1,2,0,0,0]], dtype=np.ndarray)
    # final_train_set_labels = np.array([1,2], dtype=np.ndarray)

    print(type(final_train_set[11]), np.shape(final_train_set[11]))
    print(final_train_set[0:2])
    # according to my tests, n_components is the number of hidden states
    model = HiddenMarkovModel.from_samples(
        DiscreteDistribution,
        n_components=2,
        X=final_train_set,
        labels=final_train_set_labels,
        algorithm='labeled'
    )
    print(model)
    # return 0
    # test
    # predicted_labels = np.zeros_like(actual_labels)
def __init__(self, length=None, n_features=None, initial=None,
             match_match=0.9, delete_insert=0.1,
             flank_prob=0):  # last is a polymorphism dummy
    super(ProfileHMM, self).__init__()
    if length is not None:
        n_states = 3 * length + 1
        # print(self.get_emission_dists(n_states, n_features, initial)[:3])
        self.model = HiddenMarkovModel.from_matrix(
            transition_probabilities=self.get_transmat(
                n_states, match_match, delete_insert),
            distributions=self.get_emission_dists(n_states, n_features,
                                                  initial),
            starts=self.get_startprob(n_states),
            ends=self.get_endprob(n_states),
            state_names=self.get_state_names(length))
def build_model(self):
    distributions = []
    for _ in range(self.hidden_size):
        emission_probs = np.random.random(self.num_characters)
        emission_probs = emission_probs / emission_probs.sum()
        distributions.append(
            DiscreteDistribution(
                dict(zip(self.all_characters, emission_probs))))

    trans_mat = np.random.random((self.hidden_size, self.hidden_size))
    trans_mat = trans_mat / trans_mat.sum(axis=1, keepdims=1)

    starts = np.random.random(self.hidden_size)
    starts = starts / starts.sum()

    # testing initializations
    np.testing.assert_almost_equal(starts.sum(), 1)
    np.testing.assert_array_almost_equal(np.ones(self.hidden_size),
                                         trans_mat.sum(axis=1))

    self.model = HiddenMarkovModel.from_matrix(trans_mat, distributions,
                                               starts)
    self.model.bake()
def create_hmm_from_sample(file_address):
    # data, _, _ = read_sequence_based_CSV_file_with_activity(
    #     file_address=file_address, has_header=True,
    #     separate_data_based_on_persons=False)
    # data = read_data_from_CSV_file(dest_file=file_address,
    #     data_type=np.int, has_header=True,
    #     return_as_pandas_data_frame=False)
    '''
    data = np.delete(data, 2, 1)
    data = np.delete(data, 2, 1)
    data = np.delete(data, 0, 1)
    data = np.delete(data, 0, 1)
    data = np.delete(data, 0, 1)
    print(np.shape(data))
    '''
    # print(data)
    data = np.array([['a', 'b'], ['a', 'b']])
    data = np.array([[np.array([1, 2, 3]), np.array([1, 1, 1])],
                     [np.array([1, 1, 2]), np.array([1, 2, 2])]])
    data = [
        np.array([[1, 2, 3], [1, 2, 3]], np.int32),
        np.array([[1, 2, 3], [1, 2, 3]], np.int32),
        np.array([[1, 2, 3], [1, 2, 3]], np.int32)
    ]
    print(data)
    # data = np.array([[['a', 'b'], ['a', 'a']], [['a', 'b'], ['b', 'b']]])
    # data = create_sequence_of_sensor_events_based_on_activity(
    #     address_to_read=file_address, has_header=False,
    #     address_for_save=" ", isSave=False)
    # according to my tests, n_components is the number of hidden states
    model = HiddenMarkovModel.from_samples(
        MultivariateDistribution, n_components=3, X=data)
    # print(model)
    # print(model._baum_welch_summarize())
    # model.plot()
    '''
    print("dense_transition_matrix:", model.dense_transition_matrix())
    print("edge_count:", model.edge_count())
    print("edges:", model.edges)
    print("name:", model.name)
    print("state_count:", model.state_count())
    '''
    print(model)
def oriHMMParams(self, numdists=3):
    """
    Set initial parameters for the Hidden Markov Model (HMM).
    """
    # GMM emissions
    # 3 hidden states: 0--downstream, 1--no bias, 2--upstream
    if numdists == 1:
        dists = [
            NormalDistribution(-2.5, 7.5),
            NormalDistribution(0, 7.5),
            NormalDistribution(2.5, 7.5)
        ]
    else:
        var = 7.5 / (numdists - 1)
        means = [[], [], []]
        for i in range(numdists):
            means[0].append(i * 7.5 / (numdists - 1) + 2.5)
            means[1].append(i * 7.5 * (-1) ** i / (numdists - 1))
            means[2].append(-i * 7.5 / (numdists - 1) - 2.5)
        dists = []
        for i, m in enumerate(means):
            tmp = []
            for j in m:
                tmp.append(NormalDistribution(j, var))
            mixture = GeneralMixtureModel(tmp)
            dists.append(mixture)

    # transition matrix
    A = [[0.34, 0.33, 0.33],
         [0.33, 0.34, 0.33],
         [0.33, 0.33, 0.34]]
    starts = np.ones(3) / 3

    hmm = HiddenMarkovModel.from_matrix(A, dists, starts,
                                        state_names=['0', '1', '2'],
                                        name='mixture{0}'.format(numdists))
    return hmm
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)
    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data.iloc[:m, :])
    sequences = np.concatenate((msequences, gsequences))
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, noise, z_range, state_range
class ModelWrapper:
    def __init__(self):
        self.model = HiddenMarkovModel()

    def add_state(self, distribution, name):
        state = State(distribution, name=name)
        self.model.add_state(state)
        return state

    def bake(self):
        self.model.bake()

    def viterbi(self, seq):
        return self.model.viterbi(seq)

    def add_transition(self, states, next_state_data):
        for state in states:
            for next_data in next_state_data:
                self.model.add_transition(state, next_data[0], next_data[1])
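# Hypothetical usage of the wrapper above: a two-state discrete HMM whose
# distributions and probabilities are illustrative. Note that add_transition
# takes a list of source states plus (target, probability) pairs, while
# start-state transitions go through wrapper.model.start directly.
wrapper = ModelWrapper()
rainy = wrapper.add_state(DiscreteDistribution({'walk': 0.1, 'clean': 0.9}), 'rainy')
sunny = wrapper.add_state(DiscreteDistribution({'walk': 0.8, 'clean': 0.2}), 'sunny')
wrapper.model.add_transition(wrapper.model.start, rainy, 0.5)
wrapper.model.add_transition(wrapper.model.start, sunny, 0.5)
wrapper.add_transition([rainy], [(rainy, 0.7), (sunny, 0.3)])
wrapper.add_transition([sunny], [(rainy, 0.4), (sunny, 0.6)])
wrapper.bake()
logp, path = wrapper.viterbi(['walk', 'walk', 'clean'])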
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()
    return sequences, labels, noise
def __init__(self, length=None, n_features=None, initial=None,
             match_match=0.9, delete_insert=0.1, flank_prob=0.9999999):
    super(ProfileHMM, self).__init__()
    if length is not None:
        n_states = 3 * length + 1
        transmat = self.get_transmat(n_states, match_match, delete_insert,
                                     length, flank_prob)
        # print(transmat.shape)
        # np.set_printoptions(edgeitems=10, linewidth=200)
        # print(transmat.round(2))
        emissions = self.get_emission_dists(n_states, n_features, initial)
        self.model = HiddenMarkovModel.from_matrix(
            transition_probabilities=transmat,
            distributions=emissions,
            starts=self.get_startprob(n_states, flank_prob),
            ends=self.get_endprob(n_states, flank_prob),
            state_names=self.get_state_names(length))
def init(m=None, seed=None):
    gc, mt, track = load_data(m, seed)
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    # khmm clustering over a range of k and states-per-model
    k_range = [10, 25, 50, 100, 200]
    state_range = [5, 10, 25, 50, 100]
    return sequences, labels, noise, k_range, state_range
def create_casas7_hmm(file_address, has_activity):
    if has_activity:
        list_of_data, list_of_persons, _ = \
            read_sequence_based_CSV_file_with_activity(
                file_address=file_address, has_header=True,
                separate_data_based_on_persons=False)
    else:
        list_of_data, list_of_persons = \
            read_sequence_based_CSV_file_without_activity(
                file_address=file_address, has_header=True,
                separate_data_based_on_persons=False)

    model = ""
    try:
        model = HiddenMarkovModel.from_samples(DiscreteDistribution,
                                               n_components=5,
                                               X=list_of_data,
                                               algorithm='baum-welch')
        # model = HiddenMarkovModel.from_samples(
        #     DiscreteDistribution, n_components=2, X=list_of_data,
        #     labels=list_of_persons, algorithm='labeled')
    except KeyError:
        print('there is an exception')

    print(model)
    # print(list_of_persons[0])
    print("np.shape(list_of_data):", np.shape(list_of_data))
    # print(model._baum_welch_summarize())
    model.plot()
    print("dense_transition_matrix:", model.dense_transition_matrix())
    print("edge_count:", model.edge_count())
    print("edges:", model.edges)
    print("name:", model.name)
    print("state_count:", model.state_count())
    # print("summarize:", model.summarize())
    print(model.thaw())
#!/usr/bin/env python2.7
# example.py: Yet Another Hidden Markov Model library
# Contact: Jacob Schreiber ( [email protected] )

"""
A simple example highlighting how to build a model using states, add
transitions, and then run the algorithms, including showing how training
on a sequence improves the probability of the sequence.
"""

import random
from pomegranate import *
from pomegranate import HiddenMarkovModel as Model

random.seed(0)

model = Model(name="ExampleModel")
distribution = UniformDistribution(0.0, 1.0)
state = State(distribution, name="uniform")
state2 = State(NormalDistribution(0, 2), name="normal")
silent = State(None, name="silent")
model.add_state(state)
model.add_state(state2)

model.add_transition(state, state, 0.4)
model.add_transition(state, state2, 0.4)
model.add_transition(state2, state2, 0.4)
model.add_transition(state2, state, 0.4)

model.add_transition(model.start, state, 0.5)
model.add_transition(model.start, state2, 0.5)
model.add_transition(state, model.end, 0.2)
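# The snippet above breaks off before the part its docstring promises; a
# minimal sketch of a continuation under the same setup. The end transition
# for state2 and the sample sequence are assumptions, not the original code.
model.add_transition(state2, model.end, 0.2)  # assumed, so both states can exit
model.bake()
sequence = [0.5, 0.2, 0.7, 0.1, 0.4]
print(model.log_probability(sequence))  # before training
model.fit([sequence])
print(model.log_probability(sequence))  # after training: typically higher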
infinite model, with no extra work! This change is passed on to all the
algorithms.
'''

from pomegranate import *
from pomegranate import HiddenMarkovModel as Model
import itertools as it
import numpy as np

# Define the states
s1 = State(NormalDistribution(5, 2), name="S1")
s2 = State(NormalDistribution(15, 2), name="S2")
s3 = State(NormalDistribution(25, 2), name="S3")

# Define the transitions
model = Model("infinite")
model.add_transition(model.start, s1, 0.7)
model.add_transition(model.start, s2, 0.2)
model.add_transition(model.start, s3, 0.1)
model.add_transition(s1, s1, 0.6)
model.add_transition(s1, s2, 0.1)
model.add_transition(s1, s3, 0.3)
model.add_transition(s2, s1, 0.4)
model.add_transition(s2, s2, 0.4)
model.add_transition(s2, s3, 0.2)
model.add_transition(s3, s1, 0.05)
model.add_transition(s3, s2, 0.15)
model.add_transition(s3, s3, 0.8)
model.bake()

sequence = [4.8, 5.6, 24.1, 25.8, 14.3, 26.5, 15.9, 5.5, 5.1]
s2 = State(b, name="M2")
s22 = State(bb, name="M22")
s222 = State(bbb, name="M222")
s2222 = State(bbbb, name="M2222")
s22222 = State(bbbbb, name="M22222")
s222222 = State(bbbbbb, name="M222222")
s3 = State(c, name="M3")
s33 = State(cc, name="M33")
s333 = State(ccc, name="M333")
s3333 = State(cccc, name="M3333")
s33333 = State(ccccc, name="M33333")
s333333 = State(cccccc, name="M333333")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s11, s111, s2, s22, s222, s3, s33, s333,
               s1111, s11111, s111111, s2222, s22222, s222222,
               s3333, s33333, s333333)
hmm.add_transition(hmm.start, s1, 1.)
hmm.add_transition(hmm.start, s11, 1.)
hmm.add_transition(hmm.start, s111, 1.)
hmm.add_transition(hmm.start, s2, 1.)
hmm.add_transition(hmm.start, s22, 1.)
hmm.add_transition(hmm.start, s222, 1.)
hmm.add_transition(hmm.start, s3, 1.)
hmm.add_transition(hmm.start, s33, 1.)
hmm.add_transition(hmm.start, s333, 1.)
hmm.add_transition(hmm.start, s1111, 1.)
hmm.add_transition(hmm.start, s11111, 1.)
def prep(cluster_directory_root, depth, genefile):
    # load data
    gc, mt, track = load_data(None, 0)
    genes = load(open(genefile, 'r'))
    gc.data = gc.data.loc[genes, :]
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    original_labels = labels
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=(data.as_matrix() * -1), index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])
    print(data.index.values)

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                if cluster_members == ['']:
                    cluster_members = []
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print('model loaded from: ', model_file)
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    """
    background = set()
    for clustering in clusterings.itervalues():
        for cid, members in clustering.iteritems():
            background.update(set(members))
    """
    background = list(original_labels)
    # data = data.loc[background, :]

    # generate random clusterings of the same size k as our models
    random_clusterings = {}
    np.random.seed(int(time.time()))
    for clustering_id, clustering in clusterings.items():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.keys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # generate random signed clustering
    random_signed_clusterings = {}
    pn = np.array(['+', '-'])
    for clustering_id, clustering in clusterings.items():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.keys()):
            members = source[np.where(random_assignments == i)[0]].tolist()
            signed_members = []
            for member in members:
                sign = np.random.choice(pn, 1)[0]
                signed_members.append(member + sign)
            random_clusters[cluster_id] = signed_members
        random_signed_clusterings[clustering_id] = random_clusters

    return clusterings, random_clusterings, random_signed_clusterings, \
        clusterings_models, data, original_labels
def init_lr_hmm(sequences, steps, states_per_step, force_end=False,
                model_id='Left-Right HMM', seed=None):
    """
    Instantiate a left-right model with random parameters.
    Randomly generates start and transition matrices.
    Generates normal distributions for each state from a partition on the
    sequences.
    force_end: if we require the sequence to end in the end state
    """
    # seed random number generator
    if seed is not None:
        np.random.seed(seed)

    model = HiddenMarkovModel(model_id)
    n_states = steps * states_per_step

    # make distributions from chronological subsets of timepoints
    step_size = int(math.ceil(sequences.shape[1] / float(n_states + 1)))

    # generate states
    states = np.empty((steps, states_per_step), dtype=object)
    for i in range(steps):
        for j in range(states_per_step):
            temp_assignment = np.arange(step_size * i, step_size * (i + 1))
            dist = NormalDistribution.from_samples(
                sequences[:, temp_assignment])
            state_name = str(i) + '-' + str(j)
            states[i, j] = State(dist, name=str(state_name))

    # add states to model
    model.add_states(states.flatten().tolist())

    # make random transition from start -> step0
    trans = np.random.ranf(states_per_step)
    trans = trans / trans.sum()
    for j in range(states_per_step):
        model.add_transition(model.start, states[0, j], trans[j])

    # make random transition from step(i) -> step(i+1)
    for i in range(steps - 1):
        for j in range(states_per_step):
            trans = np.random.ranf(states_per_step + 1)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[i, j], states[i, j], trans[0])
            # out transitions
            for x in range(states_per_step):
                model.add_transition(states[i, j], states[i + 1, x],
                                     trans[x + 1])

    # make random transition from stepn -> end
    if force_end:
        for j in range(states_per_step):
            trans = np.random.ranf(2)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[(steps - 1), j],
                                 states[(steps - 1), j], trans[0])
            # end transition
            model.add_transition(states[(steps - 1), j], model.end, trans[1])

    model.bake()
    print('Initialized Left-Right HMM:', model.name, '[',
          steps, states_per_step, ']')
    return model
def init_cycle_hmm(sequences, steps, states_per_step, model_id):
    """
    Instantiate a cyclic model with random parameters.
    Randomly generates start and transition matrices.
    Generates normal distributions for each state from a partition on the
    sequences.
    """
    model = HiddenMarkovModel(model_id)
    n_states = steps * states_per_step

    # make distributions from chronological subsets of timepoints
    step_size = int(math.ceil(sequences.shape[1] / float(n_states + 1)))

    # generate states
    states = np.empty((steps, states_per_step), dtype=object)
    for i in range(steps):
        for j in range(states_per_step):
            temp_assignment = np.arange(step_size * i, step_size * (i + 1))
            dist = NormalDistribution.from_samples(
                sequences[:, temp_assignment])
            state_name = str(i) + '-' + str(j)
            states[i, j] = State(dist, name=str(state_name))

    # add states to model
    model.add_states(states.flatten().tolist())

    # make random transitions from start -> all states
    trans = np.random.ranf(n_states)
    trans = trans / trans.sum()
    for i, state in enumerate(states.flatten().tolist()):
        model.add_transition(model.start, state, trans[i])

    # make random transition from step(i) -> step(i+1)
    for i in range(steps - 1):
        for j in range(states_per_step):
            trans = np.random.ranf(states_per_step + 1)
            trans = trans / trans.sum()
            # self transition
            model.add_transition(states[i, j], states[i, j], trans[0])
            # out transitions
            for x in range(states_per_step):
                model.add_transition(states[i, j], states[i + 1, x],
                                     trans[x + 1])

    # make random transition from stepn -> step0
    for j in range(states_per_step):
        trans = np.random.ranf(states_per_step + 1)
        trans = trans / trans.sum()
        # self transition
        model.add_transition(states[(steps - 1), j],
                             states[(steps - 1), j], trans[0])
        # out transitions
        for x in range(states_per_step):
            model.add_transition(states[(steps - 1), j], states[0, x],
                                 trans[x + 1])

    model.bake()
    print('Initialized Cyclic State HMM:', '[', steps, states_per_step, ']')
    return model
def run(cluster_directory_root, depth, plottype):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print('model loaded from: ', model_file)
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    background = set()
    for clustering in clusterings.values():
        for cid, members in clustering.items():
            background.update(set(members))
    background = list(background)
    # data = data.loc[background, :]

    # generate random clusterings of the same size k as our models
    random_clusterings = {}
    for clustering_id, clustering in clusterings.items():
        source = np.array(background)
        random_assignments = np.random.choice(len(clustering), source.size)
        random_clusters = {}
        for i, cluster_id in enumerate(clustering.keys()):
            random_clusters[cluster_id] = \
                source[np.where(random_assignments == i)[0]].tolist()
        random_clusterings[clustering_id] = random_clusters

    # run dunn and davies_bouldin for clusterings and random permutations
    rand_dunn = report_dunn(random_clusterings, clusterings_models, data)
    savename = cluster_directory_root + 'dunn_index_random'
    dump(rand_dunn, open(savename, 'w'))
    rand_davies = report_davies_bouldin(random_clusterings,
                                        clusterings_models, data)
    savename = cluster_directory_root + 'davies_bouldin_index_random'
    dump(rand_davies, open(savename, 'w'))

    if plottype == 'none':
        pass
    elif plottype == 'kn_grid':
        rand_dunn_df = pd.DataFrame()
        rand_davies_df = pd.DataFrame()
        for clustering_id, clustering in clusterings.items():
            cid = clustering_id.replace('k', '_').\
                replace('n', '_').split('_')
            m = cid[0]
            k = int(cid[1])
            n = int(cid[2])
            rand_dunn_df.loc[k, n] = rand_dunn[clustering_id]
            rand_davies_df.loc[k, n] = rand_davies[clustering_id]
        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)
        rand_dunn_df = rand_dunn_df.sort_index().sort_index(1)
        rand_davies_df = rand_davies_df.sort_index().sort_index(1)

        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix(), rand_dunn_df.index.values,
                rand_dunn_df.columns.values, title=title, odir=odir)
        odir = cluster_directory_root
        title = 'RANDOM_' + str(m) + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix(), rand_davies_df.index.values,
                rand_davies_df.columns.values, title=title, odir=odir)
    elif plottype == 'row':
        rand_dunn_df = pd.Series()
        rand_davies_df = pd.Series()
        for clustering_id, clustering in clusterings.items():
            rand_dunn_df.loc[clustering_id] = rand_dunn[clustering_id]
            rand_davies_df.loc[clustering_id] = rand_davies[clustering_id]
        rand_davies_df = rand_davies_df.fillna(0)
        rand_dunn_df = rand_dunn_df.fillna(0)
        rand_dunn_df = rand_dunn_df.sort_index()
        rand_davies_df = rand_davies_df.sort_index()

        odir = cluster_directory_root
        title = 'RANDOM' + ': Dunn Index'
        HeatMap(rand_dunn_df.as_matrix().reshape(-1, 1),
                rand_dunn_df.index.values, [' '], title=title, odir=odir,
                cmin=0, cmax=.5)
        odir = cluster_directory_root
        title = 'RANDOM' + ': Davies-Bouldin Index'
        HeatMap(rand_davies_df.as_matrix().reshape(-1, 1),
                rand_davies_df.index.values, [' '], title=title, odir=odir,
                cmin=5, cmax=10)
    return clusterings, clusterings_models
    X_1 = X[y == 0]
    X_2 = X[y == 1]
    X_3 = X[y == 2]
else:
    X_1 = X[2000:4000]
    X_2 = X[400:800]
    X_3 = X[7000:8000]

a = MultivariateGaussianDistribution.from_samples(X_1)
b = MultivariateGaussianDistribution.from_samples(X_2)
c = MultivariateGaussianDistribution.from_samples(X_3)
s1 = State(a, name="M1")
s2 = State(b, name="M2")
s3 = State(c, name="M3")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s2, s3)
hmm.add_transition(hmm.start, s1, 0.34)
hmm.add_transition(hmm.start, s3, 0.33)
hmm.add_transition(hmm.start, s2, 0.33)
hmm.add_transition(s1, s1, 0.9)
hmm.add_transition(s1, s2, 0.05)
hmm.add_transition(s1, s3, 0.05)
hmm.add_transition(s2, s1, 0.05)
hmm.add_transition(s2, s3, 0.05)
hmm.add_transition(s2, s2, 0.9)
hmm.add_transition(s3, s3, 0.9)
hmm.add_transition(s3, s2, 0.05)
cc = MultivariateGaussianDistribution.from_samples(X_33)
ccc = MultivariateGaussianDistribution.from_samples(X_333)
s1 = State(a, name="M1")
s11 = State(aa, name="M11")
s111 = State(aaa, name="M111")
s2 = State(b, name="M2")
s22 = State(bb, name="M22")
s222 = State(bbb, name="M222")
s3 = State(c, name="M3")
s33 = State(cc, name="M33")
s333 = State(ccc, name="M333")

hmm = HiddenMarkovModel()
hmm.add_states(s1, s11, s111, s2, s22, s222, s3, s33, s333)
hmm.add_transition(hmm.start, s1, 0.12)
hmm.add_transition(hmm.start, s11, 0.11)
hmm.add_transition(hmm.start, s111, 0.11)
hmm.add_transition(hmm.start, s2, 0.11)
hmm.add_transition(hmm.start, s22, 0.11)
hmm.add_transition(hmm.start, s222, 0.11)
hmm.add_transition(hmm.start, s3, 0.11)
hmm.add_transition(hmm.start, s33, 0.11)
hmm.add_transition(hmm.start, s333, 0.11)
hmm.add_transition(s1, s1, 0.92)
hmm.add_transition(s1, s11, 0.01)
def gen_cluster_plots(cluster_directory_root, depth):
    # load data
    gc, mt, track = load_data(None, 0)
    data = pd.concat([gc.data, mt.data])
    labels = data.index.values
    pos_labels = labels + '+'
    neg_labels = labels + '-'
    pos_data = pd.DataFrame(data=data.as_matrix(), index=pos_labels,
                            columns=data.columns.values)
    neg_data = pd.DataFrame(data=data.as_matrix(), index=neg_labels,
                            columns=data.columns.values)
    data = pd.concat([data, pos_data, neg_data])

    generic_dir = cluster_directory_root.split('/') + (['*'] * depth)
    generic_dir = ('/').join(generic_dir)
    cluster_directories = glob.glob(generic_dir)

    clusterings = {}
    clusterings_models = {}
    for cluster_dir in cluster_directories:
        try:
            clustering_id = cluster_dir.split('/')[-1:][0]

            # read final clusters
            clusters = {}
            filepath = '/'.join(cluster_dir.split('/') + ['assignments.txt'])
            lines = open(filepath, 'r').read().splitlines()
            l = 0
            while l < len(lines):
                cluster_name = lines[l]
                cluster_members = lines[l + 1].split('\t')
                clusters[cluster_name] = cluster_members
                l += 4
            clusterings[clustering_id] = clusters

            # load models
            models = {}
            model_files = glob.glob(cluster_dir + '/*')
            for model_file in model_files:
                try:
                    model_id = model_file.split('/')[-1:][0]
                    json = open(model_file).read()
                    models[model_id] = HiddenMarkovModel.from_json(json)
                    print('model loaded from: ', model_file)
                except:
                    pass
            clusterings_models[clustering_id] = models
        except:
            pass

    background = set()
    for clustering in clusterings.values():
        for cid, members in clustering.items():
            background.update(set(members))
    background = list(background)
    # data = data.loc[background, :]

    # generate line, autocorrelation, and lag plots for each cluster
    for clustering_id, clustering in clusterings.items():
        for model_id, members in clustering.items():
            sequences = data.loc[members, :]
            pltdir = '/'.join(cluster_directory_root.split('/') + ['plots'])

            # make line plots directory
            if not os.path.isdir(pltdir + '/line'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/line')
            savename = pltdir + '/line/' + model_id + '_lineplot'
            plt_title = model_id + ' Line Plot'
            ax = sequences.T.plot(legend=False, rot=2)
            ax.set_title(plt_title)
            ax.set_xlabel('Timepoint')
            ax.set_ylabel('Normalized Expression')
            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make autocorr plots directory
            if not os.path.isdir(pltdir + '/autocorr'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/autocorr')
            savename = pltdir + '/autocorr/' + model_id + '_autocorr'
            plt_title = model_id + ' Autocorr Plot'
            for seq in sequences.index:
                ax = autocorrelation_plot(sequences.loc[seq])
                ax.set_title(plt_title)
            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()

            # make lag plots directory
            if not os.path.isdir(pltdir + '/lag'):
                print("Creating directory...", pltdir)
                os.mkdir(pltdir + '/lag')
            from pylab import *
            NUM_COLORS = len(members)
            cm = get_cmap('gist_rainbow')
            colors = []
            for i in range(NUM_COLORS):
                colors.append(cm(1. * i / NUM_COLORS))
            savename = pltdir + '/lag/' + model_id + '_lagplot'
            plt_title = model_id + ' Lag Plot'
            for i, seq in enumerate(sequences.index):
                ax = lag_plot(sequences.loc[seq], c=colors[i])
                ax.set_title(plt_title)
            print('Saving: ', savename)
            fig = ax.get_figure()
            fig.savefig(savename)
            fig.clear()
            """
def gen_model(sequences, labels, algorithm, initialization, restarts, n, k,
              out_dir, base_id, tied):
    if initialization == 'rand':
        init_method = init_gaussian_hmm
        init_args = {'n_states': n[0]}
    if initialization == 'lr':
        init_method = init_lr_hmm
        s = n[0]
        sps = n[1]
        init_args = {'steps': s, 'states_per_step': sps, 'force_end': True}
    if initialization == 'cycle':
        init_method = init_cycle_hmm
        s = n[0]
        sps = n[1]
        init_args = {'steps': s, 'states_per_step': sps}

    best = 0
    best_score = -1e1000

    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts,
                                          name='noise')
    noise.freeze_distributions()

    np.random.seed(int(time.time()))
    randassigns = []
    for x in range(restarts):
        randassigns.append(np.random.randint(k, size=labels.size))

    for x in range(restarts):
        randassign = randassigns[x]
        assignments = {}
        for i in range(k):
            model_id = str(i)
            assignments[model_id] = np.where(randassign == i)[0].tolist()
            in_model = assignments[model_id]
        print(assignments)

    # gen model for number of restarts
    for x in range(restarts):
        try:
            collection_id = base_id + '_' + str(x)
            odir = '/'.join(out_dir.split('/') + [collection_id])
            print('Learning: ', collection_id)

            # generate random initial assignments
            # initialize models on random assignments
            randassign = randassigns[x]
            assignments = {}
            models = {}
            for i in range(k):
                model_id = str(i)
                assignments[model_id] = np.where(randassign == i)[0].tolist()
                in_model = assignments[model_id]
                models[model_id] = init_method(sequences[in_model, :],
                                               model_id=model_id, **init_args)

            # add noise model
            models['noise'] = noise
            assignments['noise'] = []

            # all are un-fixed
            fixed = {}
            for model_id, model in models.items():
                fixed[model_id] = []

            # perform clustering
            models, assignments, c = cluster(models=models,
                                             sequences=sequences,
                                             assignments=assignments,
                                             algorithm=algorithm,
                                             fixed=fixed, tied=tied,
                                             labels=labels, odir=odir)
            score = total_log_prob(models, sequences, assignments)
            if best_score < score:
                best_score = score
                best = collection_id
                bestfile = '/'.join(out_dir.split('/') + ['best'])
                with open(bestfile, 'w') as f:
                    print(collection_id, file=f)
        except:
            error_file = odir.split('/') + ['errors.txt']
            error_file = '/'.join(error_file)
            f = open(error_file, 'a')
            print('error computing parameters for: ', collection_id, file=f)
            print("Unexpected error:", sys.exc_info()[0], file=f)
            f.close()
    return best
def init_gaussian_hmm(sequences, n_states, model_id, seed=None):
    """
    Instantiate a model with random parameters.
    Randomly generates start and transition matrices.
    Generates normal distributions for each state from a partition on the
    sequences.
    """
    """
    # make random transition probability matrix
    # scale each row to sum to 1
    trans = np.random.ranf((n_states, n_states))
    for i in range(n_states):
        trans[i, :] = trans[i, :] / trans[i, :].sum()

    # make distributions from random subsets of timepoints
    x = int(math.ceil(sequences.shape[1] / float(n_states)))
    # x = math.min(3, x)
    dists = []
    for i in range(n_states):
        temp_assignment = np.random.choice(sequences.shape[1], x)
        dists.append(NormalDistribution.from_samples(
            sequences[:, temp_assignment]))

    # random start probabilities
    # scale to sum to 1
    starts = np.random.ranf(n_states)
    starts = starts / sum(starts)

    model = HiddenMarkovModel.from_matrix(trans, dists, starts, name=model_id)
    """
    # seed random number generator
    if seed is not None:
        np.random.seed(seed)

    model = HiddenMarkovModel(model_id)

    # make states with distributions from random subsets of timepoints
    x = int(math.ceil(sequences.shape[1] / float(n_states)))
    states = []
    for i in range(n_states):
        temp_assignment = np.random.choice(sequences.shape[1], x)
        dist = NormalDistribution.from_samples(sequences[:, temp_assignment])
        states.append(State(dist, name=str(i)))
    model.add_states(states)

    # add random start probabilities
    start_probs = np.random.ranf(n_states)
    start_probs = start_probs / start_probs.sum()
    for i, state in enumerate(states):
        model.add_transition(model.start, state, start_probs[i])

    # add random transition probabilities out of each state
    for state1 in states:
        transitions = np.random.ranf(n_states)
        transitions = transitions / transitions.sum()
        for i, state2 in enumerate(states):
            model.add_transition(state1, state2, transitions[i])

    model.bake()
    print('Initialized HMM: ', model.name)
    return model
# [email protected]
"""
Example rainy-sunny HMM using yahmm. Example drawn from the wikipedia HMM
article: http://en.wikipedia.org/wiki/Hidden_Markov_model describing what
Bob likes to do on rainy or sunny days.
"""

from pomegranate import *
from pomegranate import HiddenMarkovModel as Model
import random
import math

random.seed(0)

model = Model(name="Rainy-Sunny")

# Emission probabilities
rainy = State(DiscreteDistribution({'walk': 0.1, 'shop': 0.4, 'clean': 0.5}),
              name='Rainy')
sunny = State(DiscreteDistribution({'walk': 0.6, 'shop': 0.3, 'clean': 0.1}),
              name='Sunny')

model.add_transition(model.start, rainy, 0.6)
model.add_transition(model.start, sunny, 0.4)

# Transition matrix, with 0.05 subtracted from each probability to add to
# the probability of exiting the hmm
model.add_transition(rainy, rainy, 0.65)
model.add_transition(rainy, sunny, 0.25)
model.add_transition(sunny, rainy, 0.35)
model.add_transition(sunny, sunny, 0.55)
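# The snippet above is cut off before the model is finalized; a minimal
# completion consistent with its comment about exiting the hmm. The 0.1 end
# probabilities and the test sequence are assumptions, not the original code.
model.add_transition(rainy, model.end, 0.1)   # assumed: 0.65 + 0.25 + 0.1 = 1
model.add_transition(sunny, model.end, 0.1)   # assumed: 0.35 + 0.55 + 0.1 = 1
model.bake()
print(model.log_probability(['walk', 'shop', 'clean']))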