def _buildModel(self, data):
    '''
    Assemble a left-to-right HMM and store it on ``self.model``.

    Each state either stays where it is or moves on to the next state,
    both with probability 0.5. The last state keeps a 0.5 self-loop and
    ends the sequence with probability 0.5. The chain always starts in
    the first state.

    data: handed unchanged to ``self._initDists`` to create the emission
          distributions (described by the original author as a 2d matrix
          in which every row is a vector of features)

    returns the model produced by ``HiddenMarkovModel.from_matrix``
    '''
    n_states = self.statesNumber

    # the chain can only begin in state 0 ...
    starts = np.zeros((n_states, ))
    starts[0] = 1
    # ... and can only terminate from the last state; kept as its own
    # assignment because the end probability may be tuned separately
    ends = np.zeros((n_states, ))
    ends[-1] = 0.5

    transitions = np.zeros((n_states, n_states))
    state_ids = np.arange(n_states)
    # self-loops on the main diagonal
    transitions[state_ids, state_ids] = 0.5
    # forward steps on the first super-diagonal; the last state has no
    # successor, hence the [:-1]
    transitions[state_ids[:-1], state_ids[:-1] + 1] = 0.5

    emissions = self._initDists(data)
    self.model = HiddenMarkovModel.from_matrix(
        transitions, emissions, starts, ends, name=self.mname)
    return self.model
def init():
    """Prepare sequences, labels and the noise model for a local test run.

    The sequences derived from ``mt.data`` and ``gc.data`` are stacked and
    then followed by their negated copies, so that every label owns one
    positive and one negative row.

    Returns the tuple ``(gc, mt, sequences, labels, state_labels, tied,
    noise, z_range, state_range)``.
    """
    gene_limit = 1000  # restricts number of genes, used for local testing
    gc, mt, _ = load_data(gene_limit)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    stacked = np.concatenate((msequences, gsequences), axis=0)
    base_labels = np.concatenate((mlabels, glabels))

    # second half of the array holds the sign-flipped sequences
    sequences = np.concatenate((stacked, -1 * stacked))

    # tie positive and negative expression sequences: row i and its
    # negated twin at row i + (number of labels)
    tied = {
        label: [row, row + base_labels.size]
        for row, label in enumerate(base_labels)
    }
    state_labels = np.concatenate(((base_labels + '+'), (base_labels + '-')))
    labels = np.concatenate((base_labels, base_labels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(
        np.array([[1]]),
        [NormalDistribution(0, 1)],
        np.array([1]),
    )
    noise.freeze_distributions()

    z_range = [3, 5, 10, 20]
    state_range = [5, 10, 25, 50, 100]
    return (gc, mt, sequences, labels, state_labels, tied, noise, z_range,
            state_range)
def init(m, seed):
    """Load the data and build tied sequences, labels and the noise model.

    m:    forwarded to ``load_data``; the sentinel ``-1`` is translated to
          ``None`` first
    seed: forwarded to ``load_data``

    Returns ``(sequences, labels, tied, noise)`` where ``sequences`` holds
    the stacked sequences followed by their negated copies, ``labels``
    carries a '+' / '-' suffix for the two halves, and ``tied`` maps every
    original label to the pair of row indices that belong together.
    """
    limit = None if m == -1 else m
    gc, mt, _ = load_data(limit, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    stacked = np.concatenate((msequences, gsequences), axis=0)
    base_labels = np.concatenate((mlabels, glabels))

    # second half of the array holds the sign-flipped sequences
    sequences = np.concatenate((stacked, -1 * stacked))

    # tie positive and negative expression sequences
    tied = {
        label: [row, row + base_labels.size]
        for row, label in enumerate(base_labels)
    }
    labels = np.concatenate(((base_labels + '+'), (base_labels + '-')))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(
        np.array([[1]]),
        [NormalDistribution(0, 1)],
        np.array([1]),
        name='noise',
    )
    noise.freeze_distributions()
    return sequences, labels, tied, noise
def get_variable_number_of_repeats_matcher_hmm(patterns, copies=1, vpaths=None):
    """Wrap the constant-repeat matcher so that matching may stop after any unit.

    Two silent states are added to the model returned by
    ``get_constant_number_of_repeats_matcher_hmm``:

    * ``start_repeating_pattern_match`` is inserted between the model start
      and the state the start used to point at;
    * ``end_repeating_pattern_match`` is reachable with probability 0.5
      from every state whose name starts with ``unit_end`` and leads to the
      model end with probability 1.

    The arguments are forwarded unchanged to
    ``get_constant_number_of_repeats_matcher_hmm``. Returns a freshly baked
    model built with ``Model.from_matrix``.
    """
    model = get_constant_number_of_repeats_matcher_hmm(patterns, copies, vpaths)
    start_repeats_matches = State(None, name='start_repeating_pattern_match')
    end_repeats_matches = State(None, name='end_repeating_pattern_match')

    original = model.dense_transition_matrix()
    states_count = len(original)
    start_repeats_ind, end_repeats_ind = states_count, states_count + 1

    # NOTE: appends to the very list object that model.states returned
    states = model.states
    states.extend([start_repeats_matches, end_repeats_matches])

    # grow the matrix by two all-zero rows and columns for the new states
    mat = np.zeros((states_count + 2, states_count + 2))
    mat[:states_count, :states_count] = original

    def last_nonzero_column(row):
        # column of the last non-zero entry in mat[row]; None if the row is empty
        found = None
        for column, weight in enumerate(mat[row]):
            if weight != 0:
                found = column
        return found

    unit_ends = [
        index for index, state in enumerate(model.states)
        if state.name.startswith('unit_end')
    ]

    # reroute start -> first unit through the new start_repeats state
    first_unit_start = last_nonzero_column(model.start_index)
    mat[model.start_index][first_unit_start] = 0.0
    mat[model.start_index][start_repeats_ind] = 1
    mat[start_repeats_ind][first_unit_start] = 1

    # after every unit either continue as before or leave the repeats
    for unit_end in unit_ends:
        successor = last_nonzero_column(unit_end)
        mat[unit_end][successor] = 0.5
        mat[unit_end][end_repeats_ind] = 0.5

    mat[end_repeats_ind][model.end_index] = 1

    starts = np.zeros(states_count + 2)
    starts[model.start_index] = 1.0
    ends = np.zeros(states_count + 2)
    ends[model.end_index] = 1.0

    state_names = []
    distributions = []
    for state in states:
        state_names.append(state.name)
        distributions.append(state.distribution)

    new_model = Model.from_matrix(mat,
                                  distributions,
                                  starts,
                                  ends,
                                  name='Repeat Matcher HMM Model',
                                  state_names=state_names,
                                  merge=None)
    new_model.bake(merge=None)
    return new_model
def get_read_matcher_model(left_flanking_region, right_flanking_region, patterns, copies=1, vpaths=None):
    """Build an HMM matching left flank + repeats + right flank.

    The suffix matcher for ``left_flanking_region``, the variable-repeat
    matcher for ``patterns`` and the prefix matcher for
    ``right_flanking_region`` are concatenated and baked. The transition
    matrix is then adjusted:

    * the start goes to ``suffix_start_suffix`` with weight 0.3 and shares
      weight 0.7 evenly among the 'M...' states whose name ends in ``_0``;
    * every 'M...' state whose name does not end in ``_prefix`` /
      ``_suffix`` gets an extra transition to the model end, and its row is
      rescaled accordingly.

    Returns a freshly baked model built with ``Model.from_matrix``.
    """
    model = get_suffix_matcher_hmm(left_flanking_region)
    repeats_matcher = get_variable_number_of_repeats_matcher_hmm(
        patterns, copies, vpaths)
    right_flanking_matcher = get_prefix_matcher_hmm(right_flanking_region)
    for part in (repeats_matcher, right_flanking_matcher):
        model.concatenate(part)
    model.bake(merge=None)

    mat = model.dense_transition_matrix()

    flank_tags = ['prefix', 'suffix']
    first_repeat_matches = []
    repeat_match_states = []
    suffix_start = None
    for index, state in enumerate(model.states):
        name = state.name
        if name[0] == 'M':
            tag = name.split('_')[-1]
            if tag == '0':
                first_repeat_matches.append(index)
            if tag not in flank_tags:
                repeat_match_states.append(index)
        if name == 'suffix_start_suffix':
            suffix_start = index

    # a read may start in the left flank or directly at the first
    # position of any repeat unit
    mat[model.start_index][suffix_start] = 0.3
    n_first = len(first_repeat_matches)
    for first_repeat_match in first_repeat_matches:
        mat[model.start_index][first_repeat_match] = 0.7 / n_first

    # a read may also stop at any repeat match state: add the exit and
    # shrink the existing outgoing weights by the same factor
    n_match = len(repeat_match_states)
    for match_state in repeat_match_states:
        to_end = 0.7 / n_match
        total = 1 + to_end
        row = mat[match_state]
        outgoing = row != 0
        row[outgoing] = row[outgoing] / total
        row[model.end_index] = to_end / total

    n_states = len(model.states)
    starts = np.zeros(n_states)
    starts[model.start_index] = 1.0
    ends = np.zeros(n_states)
    ends[model.end_index] = 1.0

    state_names = []
    distributions = []
    for state in model.states:
        state_names.append(state.name)
        distributions.append(state.distribution)

    new_model = Model.from_matrix(mat,
                                  distributions,
                                  starts,
                                  ends,
                                  name='Read Matcher',
                                  state_names=state_names,
                                  merge=None)
    new_model.bake(merge=None)
    return new_model
def test_sample_from_site():
    """Build a three-state Gaussian left-to-right HMM and plot it."""
    gaussian_params = ((5, 1), (1, 7), (8, 2))
    dists = [NormalDistribution(a, b) for a, b in gaussian_params]

    # each state loops or advances; only the last state can end (0.1)
    trans_mat = np.array([
        [0.7, 0.3, 0.0],
        [0.0, 0.8, 0.2],
        [0.0, 0.0, 0.9],
    ])
    starts = np.array([1.0, 0.0, 0.0])
    ends = np.array([0.0, 0.0, 0.1])

    HiddenMarkovModel.from_matrix(trans_mat, dists, starts, ends).plot()
def __init__(self,
             length=None,
             n_features=None,
             initial=None,
             match_match=0.9,
             delete_insert=0.1,
             flank_prob=0):  # flank_prob is an unused placeholder (polymorphism dummy)
    """Create the profile HMM, building the model when ``length`` is given.

    length:        profile length; the model gets ``3 * length + 1`` states.
                   With ``None`` this constructor does not assign
                   ``self.model``.
    n_features:    forwarded to ``get_emission_dists``
    initial:       forwarded to ``get_emission_dists``
    match_match:   forwarded to ``get_transmat``
    delete_insert: forwarded to ``get_transmat``
    flank_prob:    accepted but not used here
    """
    super(ProfileHMM, self).__init__()
    if length is None:
        return

    n_states = 3 * length + 1
    transmat = self.get_transmat(n_states, match_match, delete_insert)
    emissions = self.get_emission_dists(n_states, n_features, initial)
    startprob = self.get_startprob(n_states)
    endprob = self.get_endprob(n_states)
    names = self.get_state_names(length)

    self.model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transmat,
        distributions=emissions,
        starts=startprob,
        ends=endprob,
        state_names=names)
def build_model(self):
    """Create a randomly initialised discrete HMM and store it on ``self.model``.

    For each of the ``self.hidden_size`` states a random distribution over
    ``self.all_characters`` is drawn. The transition matrix rows and the
    start vector are drawn at random and normalised to sum to one, which
    is checked before the model is built and baked.
    """
    n_hidden = self.hidden_size

    distributions = []
    for _ in range(n_hidden):
        weights = np.random.random(self.num_characters)
        weights /= weights.sum()
        char_probs = {ch: p for ch, p in zip(self.all_characters, weights)}
        distributions.append(DiscreteDistribution(char_probs))

    # row-stochastic transition matrix
    trans_mat = np.random.random((n_hidden, n_hidden))
    trans_mat /= trans_mat.sum(axis=1, keepdims=True)

    starts = np.random.random(n_hidden)
    starts /= starts.sum()

    # testing initializations
    np.testing.assert_almost_equal(starts.sum(), 1)
    np.testing.assert_array_almost_equal(np.ones(n_hidden),
                                         trans_mat.sum(axis=1))

    self.model = HiddenMarkovModel.from_matrix(trans_mat, distributions,
                                               starts)
    self.model.bake()
def oriHMMParams(self, numdists=3):
    """
    Set initial parameters for the Hidden Markov Model (HMM).

    numdists is the number of Gaussian components per hidden state. With
    1 every state emits from a single Gaussian, otherwise from a mixture
    of ``numdists`` Gaussians. Returns the constructed three-state model,
    named ``'mixture<numdists>'`` with states '0', '1' and '2'.
    """
    # GMM emissions
    # 3 Hidden States:
    # 0--downstream, 1--no bias, 2--upstream
    if numdists == 1:
        dists = [NormalDistribution(mu, 7.5) for mu in (-2.5, 0, 2.5)]
    else:
        # NOTE(review): here state 0 gets the positive means and state 2
        # the negative ones, the opposite sign of the single-Gaussian
        # branch above -- confirm which orientation is intended.
        denom = numdists - 1
        var = 7.5 / denom
        components = range(numdists)
        means = [
            [i * 7.5 / denom + 2.5 for i in components],
            [i * 7.5 * (-1)**i / denom for i in components],
            [-i * 7.5 / denom - 2.5 for i in components],
        ]
        dists = [
            GeneralMixtureModel([NormalDistribution(mu, var) for mu in state_means])
            for state_means in means
        ]

    # transition matrix: near-uniform with a slight preference to stay
    A = [
        [0.34, 0.33, 0.33],
        [0.33, 0.34, 0.33],
        [0.33, 0.33, 0.34],
    ]
    starts = np.ones(3) / 3

    return HiddenMarkovModel.from_matrix(A,
                                         dists,
                                         starts,
                                         state_names=['0', '1', '2'],
                                         name='mixture{0}'.format(numdists))
def init():
    """Prepare sequences, labels and the noise model for a local test run.

    Returns the tuple ``(gc, mt, sequences, labels, noise, z_range,
    state_range)``; ``sequences`` / ``labels`` stack what
    ``df_to_sequence_list`` yields for ``mt.data`` and for the first
    ``m`` rows of ``gc.data``.
    """
    gene_limit = 1000  # restricts number of genes, used for local testing
    gc, mt, _ = load_data(gene_limit)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data.iloc[:gene_limit, :])
    sequences = np.concatenate((msequences, gsequences))
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(
        np.array([[1]]),
        [NormalDistribution(0, 1)],
        np.array([1]),
    )
    noise.freeze_distributions()

    z_range = [3, 5, 10, 20]
    state_range = [5, 10, 25, 50, 100]
    return gc, mt, sequences, labels, noise, z_range, state_range
def init(m, seed):
    """Load the data and build the stacked sequences, labels and noise model.

    m:    forwarded to ``load_data``; the sentinel ``-1`` is translated to
          ``None`` first
    seed: forwarded to ``load_data``

    Returns ``(sequences, labels, noise)``.
    """
    limit = None if m == -1 else m
    gc, mt, _ = load_data(limit, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), axis=0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(
        np.array([[1]]),
        [NormalDistribution(0, 1)],
        np.array([1]),
        name='noise',
    )
    noise.freeze_distributions()
    return sequences, labels, noise
def __init__(self,
             length=None,
             n_features=None,
             initial=None,
             match_match=0.9,
             delete_insert=0.1,
             flank_prob=0.9999999):
    """Create the profile HMM, building the model when ``length`` is given.

    length:        profile length; the model gets ``3 * length + 1`` states.
                   With ``None`` this constructor does not assign
                   ``self.model``.
    n_features:    forwarded to ``get_emission_dists``
    initial:       forwarded to ``get_emission_dists``
    match_match:   forwarded to ``get_transmat``
    delete_insert: forwarded to ``get_transmat``
    flank_prob:    forwarded to ``get_transmat``, ``get_startprob`` and
                   ``get_endprob``
    """
    super(ProfileHMM, self).__init__()
    if length is None:
        return

    n_states = 3 * length + 1
    transmat = self.get_transmat(n_states, match_match, delete_insert,
                                 length, flank_prob)
    emissions = self.get_emission_dists(n_states, n_features, initial)
    startprob = self.get_startprob(n_states, flank_prob)
    endprob = self.get_endprob(n_states, flank_prob)
    names = self.get_state_names(length)

    self.model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transmat,
        distributions=emissions,
        starts=startprob,
        ends=endprob,
        state_names=names)
def init(m=None, seed=None):
    """Load the data and build sequences, labels, noise model and search ranges.

    Both arguments are forwarded to ``load_data``. Returns
    ``(sequences, labels, noise, k_range, state_range)``.
    """
    gc, mt, _ = load_data(m, seed)

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), axis=0)
    labels = np.concatenate((mlabels, glabels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(
        np.array([[1]]),
        [NormalDistribution(0, 1)],
        np.array([1]),
        name='noise',
    )
    noise.freeze_distributions()

    # khmm clustering over a range of k and states-per model
    k_range = [10, 25, 50, 100, 200]
    state_range = [5, 10, 25, 50, 100]
    return sequences, labels, noise, k_range, state_range
def roh_poissonhmm(gv, pos, phet_roh=0.001, phet_nonroh=(0.0025, 0.01), transition=1e-3,
                   window_size=1000, min_roh=0, is_accessible=None, contig_size=None):
    """Call ROH (runs of homozygosity) in a single individual given a genotype vector.

    This function computes the likely ROH using a Poisson HMM model. The
    chromosome is divided into equally accessible windows of specified size,
    then the number of hets observed in each is used to fit a Poisson HMM.
    Note this is much faster than `roh_mhmm`, but at the cost of some
    resolution.

    The model is provided with a probability of observing a het in a ROH
    (`phet_roh`) and one or more probabilities of observing a het in a
    non-ROH, as this probability may not be constant across the genome
    (`phet_nonroh`).

    Parameters
    ----------
    gv : array_like, int, shape (n_variants, ploidy)
        Genotype vector.
    pos : array_like, int, shape (n_variants,)
        Positions of variants, same 0th dimension as `gv`.
    phet_roh : float, optional
        Probability of observing a heterozygote in a ROH. Appropriate values
        will depend on de novo mutation rate and genotype error rate.
    phet_nonroh : float or tuple of floats, optional
        One or more probabilities of observing a heterozygote outside of ROH.
        Appropriate values will depend primarily on nucleotide diversity
        within the population, but also on mutation rate and genotype error
        rate.
    transition : float, optional
        Probability of moving between states. This is based on windows, so a
        larger window size may call for a larger transitional probability.
    window_size : integer, optional
        Window size (equally accessible bases) to consider as a potential
        ROH. Setting this window too small may result in spurious ROH calls,
        while too large will result in a lack of resolution.
    min_roh : integer, optional
        Minimum size (bp) to consider as a ROH. Will depend on contig size
        and recombination rate.
    is_accessible : array_like, bool, shape (`contig_size`,), optional
        Boolean array for each position in contig describing whether
        accessible or not. Although optional, highly recommended so
        invariant sites are distinguishable from sites where variation is
        inaccessible.
    contig_size : integer, optional
        If is_accessible is not available, use this to specify the size of
        the contig, and assume all sites are accessible. Ignored when
        `is_accessible` is given.

    Returns
    -------
    df_roh : DataFrame
        Data frame where each row describes a run of homozygosity. Columns
        are 'start', 'stop', 'length' and 'is_marginal'. Start and stop are
        1-based, stop-inclusive.
    froh : float
        Proportion of genome in a ROH.

    Raises
    ------
    ValueError
        If neither `is_accessible` nor `contig_size` is provided.

    Notes
    -----
    This function requires `pomegranate` (>= 0.9.0) to be installed.

    """
    # Resolve accessibility first so that bad arguments are reported even
    # when the optional dependency is not installed.
    if is_accessible is None:
        if contig_size is None:
            raise ValueError(
                "If is_accessible argument is not provided, you must provide contig_size"
            )
        is_accessible = np.ones((contig_size, ), dtype="bool")
    else:
        # accept any array_like (e.g. a plain list), as documented
        is_accessible = np.asarray(is_accessible)
        contig_size = is_accessible.size

    from pomegranate import HiddenMarkovModel, PoissonDistribution

    # equally accessible windows
    eqw = equally_accessible_windows(is_accessible, window_size)

    # number of heterozygous calls per window
    ishet = GenotypeVector(gv).is_het()
    counts, _, _ = windowed_statistic(pos, ishet, np.sum, windows=eqw)

    # heterozygote probabilities; state 0 is the ROH state. atleast_1d lets
    # a single float be passed for phet_nonroh.
    het_px = np.concatenate([(phet_roh, ), np.atleast_1d(phet_nonroh)])
    n_states = het_px.size

    # start probabilities (all equal); 1.0 keeps this a true division even
    # under Python 2 integer semantics
    start_prob = np.repeat(1.0 / n_states, n_states)

    # transition between underlying states
    transition_mx = _hmm_derive_transition_matrix(transition, n_states)

    # expected het count per window for each state
    dists = [PoissonDistribution(x * window_size) for x in het_px]

    model = HiddenMarkovModel.from_matrix(
        transition_probabilities=transition_mx,
        distributions=dists,
        starts=start_prob)

    prediction = np.array(model.predict(counts[:, None]))

    df_blocks = tabulate_state_blocks(prediction,
                                      states=list(range(n_states)))
    df_roh = df_blocks[(df_blocks.state == 0)].reset_index(drop=True)

    # adapt the dataframe for ROH: translate window indices into the
    # coordinates of the first / last window of each block
    df_roh["start"] = df_roh.start_ridx.apply(lambda y: eqw[y, 0])
    df_roh["stop"] = df_roh.stop_lidx.apply(lambda y: eqw[y, 1])
    df_roh["length"] = df_roh.stop - df_roh.start

    # filter by ROH size
    if min_roh > 0:
        df_roh = df_roh[df_roh.length >= min_roh]

    # compute FROH (float() guards against integer division)
    froh = df_roh.length.sum() / float(contig_size)

    return df_roh[["start", "stop", "length", "is_marginal"]], froh
def _group_indices_by_model(randassign, k):
    # Map each model id ('0' .. str(k - 1)) to the list of sequence
    # indices that the random assignment vector gave to that model.
    return {str(i): np.where(randassign == i)[0].tolist() for i in range(k)}


def gen_model(sequences, labels, algorithm, initialization, restarts, n, k,
              out_dir, base_id, tied):
    """Cluster the sequences with several random restarts and keep the best.

    For every restart the sequences are randomly split into ``k`` groups,
    one model per group is initialised, a shared frozen noise model is
    added, and ``cluster`` is run. The restart with the highest
    ``total_log_prob`` wins and its collection id is written to
    ``<out_dir>/best``. A failing restart is logged to
    ``<out_dir>/<collection_id>/errors.txt`` and skipped.

    sequences:      2-d array, indexed as ``sequences[rows, :]``
    labels:         array; only ``labels.size`` is used here, the array is
                    otherwise passed on to ``cluster``
    algorithm:      passed on to ``cluster``
    initialization: 'rand', 'lr' or 'cycle' -- selects the initialiser
    restarts:       number of random restarts
    n:              sequence; ``n[0]`` is the state / step count and, for
                    'lr' and 'cycle', ``n[1]`` the states per step
    k:              number of models (excluding the noise model)
    out_dir:        output directory, '/'-separated
    base_id:        prefix of the per-restart collection id
    tied:           passed on to ``cluster``

    Returns the collection id of the best restart, or ``0`` when no
    restart succeeded. Raises ValueError for an unknown ``initialization``.
    """
    # Select the initialiser. The original tested ``init_lr_hmm == 'lr'``
    # (a function compared with a string), so 'lr' could never be chosen.
    if initialization == 'rand':
        init_method = init_gaussian_hmm
        init_args = {'n_states': n[0]}
    elif initialization == 'lr':
        init_method = init_lr_hmm
        init_args = {'steps': n[0], 'state_per_step': n[1],
                     'force_end': True}
    elif initialization == 'cycle':
        init_method = init_cycle_hmm
        init_args = {'steps': n[0], 'state_per_step': n[1]}
    else:
        raise ValueError('unknown initialization: %r' % (initialization,))

    best = 0
    best_score = float('-inf')

    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise = HiddenMarkovModel.from_matrix(np.array([[1]]),
                                          [NormalDistribution(0, 1)],
                                          np.array([1]),
                                          name='noise')
    noise.freeze_distributions()

    # draw all random assignments up front
    np.random.seed(int(time.time()))
    randassigns = [np.random.randint(k, size=labels.size)
                   for _ in range(restarts)]

    # report the initial assignment of every restart
    for randassign in randassigns:
        print(_group_indices_by_model(randassign, k))

    # gen model for number of restarts
    for x, randassign in enumerate(randassigns):
        # computed outside the try so the error handler can rely on them
        collection_id = base_id + '_' + str(x)
        odir = out_dir + '/' + collection_id
        try:
            print('Learning:  %s' % collection_id)

            # initialize models on random assignments
            assignments = _group_indices_by_model(randassign, k)
            models = {}
            for model_id, in_model in assignments.items():
                models[model_id] = init_method(sequences[in_model, :],
                                               model_id=model_id,
                                               **init_args)

            # add noise model
            models['noise'] = noise
            assignments['noise'] = []

            # all are un-fixed
            fixed = {model_id: [] for model_id in models}

            # perform clustering
            models, assignments, c = cluster(models=models,
                                             sequences=sequences,
                                             assignments=assignments,
                                             algorithm=algorithm,
                                             fixed=fixed,
                                             tied=tied,
                                             labels=labels,
                                             odir=odir)

            score = total_log_prob(models, sequences, assignments)
            if best_score < score:
                best_score = score
                best = collection_id
                with open(out_dir + '/best', 'w') as f:
                    f.write('%s\n' % collection_id)
        except Exception:
            # best effort: record the failure and carry on with the next
            # restart (KeyboardInterrupt / SystemExit still propagate)
            with open(odir + '/errors.txt', 'a') as f:
                f.write('error computing parameters for:  %s\n'
                        % collection_id)
                f.write('Unexpected error: %s\n' % (sys.exc_info()[0],))

    return best