def get_dist(data_vec):
    """Summarise each entry of ``data_vec`` as a normal distribution.

    NaN values are dropped from every entry before fitting. An entry with
    remaining values is described by its mean and standard deviation (the
    latter floored at 1E-6); an entry with none gets the placeholder
    ``NormalDistribution(0, 999999)``.

    :param data_vec: iterable of arrays that support boolean-mask indexing
        (e.g. NumPy arrays)
    :return: ``pg.IndependentComponentsDistribution`` holding one normal
        distribution per entry, in input order
    """

    def _fit_normal(values):
        finite = values[~np.isnan(values)]
        if not len(finite):
            # Nothing left after removing NaNs: use the wide placeholder
            return pg.NormalDistribution(0, 999999)
        spread = max(np.std(finite), 1E-6)
        return pg.NormalDistribution(np.nanmean(finite), spread)

    return pg.IndependentComponentsDistribution(
        [_fit_normal(vec) for vec in data_vec])
def generate_fret_states(kind, state_means, trans_mat, trans_prob):
    """Creates artificial FRET states

    Builds an HMM whose emissions are ``NormalDistribution(mean, 0)`` for each
    state mean and samples one trajectory of ``trace_length`` points from it.

    Relies on the enclosing scope for ``random_k_states_max``,
    ``min_state_diff``, ``trace_length``, ``generate_state_means`` and
    ``lib.utils.random_seed_mp``.

    :param kind: ``"random"`` draws the number of states and their means,
        ``"aggregate"`` uses a single state with a uniformly drawn mean, any
        other value uses the supplied ``state_means``. Overridden to
        ``"defined"`` when every element of ``state_means`` is a float.
    :param state_means: iterable of state means (it is iterated for the float
        check regardless of ``kind``)
    :param trans_mat: transition matrix handed to pomegranate, or ``None`` to
        generate one from ``trans_prob``
    :param trans_prob: off-diagonal transition probability used when
        ``trans_mat`` is ``None``
    :return: ``np.ndarray`` with the sampled values
    """
    if all(isinstance(s, float) for s in state_means):
        kind = "defined"

    rand_k_states = np.random.randint(1, random_k_states_max + 1)

    if kind == "random":
        k_states = rand_k_states
        state_means = generate_state_means(min_state_diff, k_states)
    elif kind == "aggregate":
        state_means = np.random.uniform(0, 1)
        k_states = 1
    else:
        if np.size(state_means) <= random_k_states_max:
            # Pick the same amount of k states as state means given
            k_states = np.size(state_means)
        else:
            # Pick no more than k_states_max from the state means (e.g.
            # given [0.1, 0.2, 0.3, 0.4, 0.5] use only
            # random_k_states_max of these)
            k_states = rand_k_states
            state_means = np.random.choice(
                state_means, size=k_states, replace=False
            )

    # The "aggregate" branch leaves a bare scalar rather than a sequence
    if isinstance(state_means, float):
        dists = [pg.NormalDistribution(state_means, 0)]
    else:
        dists = [pg.NormalDistribution(m, 0) for m in state_means]

    starts = np.array([1 / k_states] * k_states)

    lib.utils.random_seed_mp()
    np.random.shuffle(dists)

    # Generate arbitrary transition matrix
    if trans_mat is None:
        trans_mat = np.empty([k_states, k_states])
        trans_mat.fill(trans_prob)
        np.fill_diagonal(trans_mat, 1 - trans_prob)

        # Make sure that each row/column sums to exactly 1
        if trans_prob != 0:
            remaining_prob = 1 - trans_mat.sum(axis=0)
            # Address the diagonal by position, not by value: selecting cells
            # equal to (1 - trans_prob) also matches the off-diagonal cells
            # when trans_prob == 0.5 and then fails to broadcast.
            trans_mat[np.diag_indices(k_states)] += remaining_prob

    # Generate HMM model
    model = pg.HiddenMarkovModel.from_matrix(
        trans_mat, distributions=dists, starts=starts
    )
    model.bake()

    E_true = np.array(model.sample(trace_length))
    return E_true
def get_blank_distribution(self):
    """Return an equally weighted three-component mixture of unit normals.

    Each mixture component is an independent-components distribution with
    one ``NormalDistribution(0, 1)`` per entry of ``self.feature_list``.

    :return: ``pg.GeneralMixtureModel`` with three components of weight 1/3
    """
    n_components = 3
    components = [
        pg.IndependentComponentsDistribution(
            [pg.NormalDistribution(0, 1) for _ in range(len(self.feature_list))])
        for _ in range(n_components)
    ]
    equal_weights = [1 / n_components] * n_components
    return pg.GeneralMixtureModel(components, weights=equal_weights)
def build_model(n_bins, n_cmps, n_features, means, stds, state_names=None):
    """Create a randomly initialised HMM with Gaussian (mixture) emissions.

    Every state ("bin") emits from an independent-components distribution of
    ``n_features`` normals, or from a mixture of ``n_cmps`` such
    distributions when ``n_cmps > 1``. Initial means are drawn uniformly from
    ``[mean - 2 * std, mean + 2 * std)`` and initial standard deviations from
    ``[0, std)``, per feature.

    :param n_bins: number of hidden states
    :param n_cmps: number of mixture components per state
    :param n_features: number of features per observation
    :param means: per-feature centre of the random initial means
    :param stds: per-feature scale of the random initialisation
    :param state_names: forwarded to ``HiddenMarkovModel.from_matrix``
    :return: the ``pgn.HiddenMarkovModel`` built from a random transition
        matrix and all-ones start weights
    """
    # Last axis holds (mean, std) for every bin / component / feature
    init_params = np.random.random((n_bins, n_cmps, n_features, 2))
    init_params[..., 0] -= 0.5  # Center means to 0.0
    for feat_i in range(n_features):
        scale = stds[feat_i]
        # Random init mean in range [-2std, 2std) around the feature mean
        init_params[..., feat_i, 0] *= 4 * scale
        init_params[..., feat_i, 0] += means[feat_i]
        # Random init std in range [0, std)
        init_params[..., feat_i, 1] *= scale

    def _component(bin_i, cmp_i):
        # One normal per feature for the given bin and mixture component
        return pgn.IndependentComponentsDistribution(tuple(
            pgn.NormalDistribution(*init_params[bin_i, cmp_i, feat_i, :])
            for feat_i in range(n_features)))

    if n_cmps > 1:
        dists = tuple(
            pgn.GeneralMixtureModel(
                [_component(bin_i, cmp_i) for cmp_i in range(n_cmps)])
            for bin_i in range(n_bins))
    else:
        dists = tuple(_component(bin_i, 0) for bin_i in range(n_bins))

    trans_mat = np.random.random((n_bins, n_bins))
    starts = np.ones(n_bins)
    return pgn.HiddenMarkovModel.from_matrix(
        trans_mat, dists, starts, state_names=state_names)
def generate_fret_states(kind, state_means, trans_mat, trans_prob):
    """Creates artificial FRET states

    Builds an HMM whose emissions are ``NormalDistribution(mean, eps)`` for
    each state mean, with randomly drawn (normalised) start probabilities,
    and samples one trajectory of ``trace_length`` points from it.

    Relies on the enclosing scope for ``random_k_states_max``,
    ``min_state_diff``, ``trace_length``, ``eps`` and
    ``generate_state_means``.

    :param kind: ``"aggregate"`` uses a single state with a uniformly drawn
        mean, ``"random"`` draws the means (one per row of ``trans_mat`` when
        it is given, otherwise a random count), any other value uses the
        supplied ``state_means``. Overridden to ``"defined"`` when every
        element of ``state_means`` is a float.
    :param state_means: iterable of state means (it is iterated for the float
        check regardless of ``kind``)
    :param trans_mat: transition matrix to use, or ``None`` to generate one
        from ``trans_prob``
    :param trans_prob: off-diagonal transition probability used when
        ``trans_mat`` is ``None``
    :return: tuple ``(E_true, final_matrix)``: the sampled values squeezed
        and rounded to 4 decimals, and the top-left ``k_states x k_states``
        part of the model's dense transition matrix
    :raises ValueError: if ``trans_mat`` is given and its length differs from
        the number of state means
    """
    if all(isinstance(s, float) for s in state_means):
        kind = "defined"

    rand_k_states = np.random.randint(1, random_k_states_max + 1)

    if kind == "aggregate":
        state_means = np.random.uniform(0, 1)
        k_states = 1
    elif kind == "random":
        k_states = (len(trans_mat) if trans_mat is not None else rand_k_states)
        state_means = generate_state_means(min_state_diff, k_states)
    else:
        if np.size(state_means) <= random_k_states_max:
            # Pick the same amount of k states as state means given
            k_states = np.size(state_means)
        else:
            # Pick no more than k_states_max from the state means (e.g.
            # given [0.1, 0.2, 0.3, 0.4, 0.5] use only
            # random_k_states_max of these)
            k_states = rand_k_states
            state_means = np.random.choice(state_means,
                                           size=k_states,
                                           replace=False)

    # The "aggregate" branch leaves a bare scalar rather than a sequence
    if isinstance(state_means, float):
        dists = [pg.NormalDistribution(state_means, eps)]
    else:
        dists = [pg.NormalDistribution(m, eps) for m in state_means]

    starts = np.random.uniform(0, 1, size=k_states)
    starts /= starts.sum()

    # Generate arbitrary transition matrix
    if trans_mat is None:
        matrix = np.empty([k_states, k_states])
        matrix.fill(trans_prob)
        np.fill_diagonal(matrix, 1 - trans_prob)

        # Make sure that each row/column sums to exactly 1
        if trans_prob != 0:
            remaining_prob = 1 - matrix.sum(axis=0)
            # Address the diagonal by position, not by value: selecting cells
            # equal to (1 - trans_prob) also matches the off-diagonal cells
            # when trans_prob == 0.5 and then fails to broadcast.
            matrix[np.diag_indices(k_states)] += remaining_prob
    else:
        # np.size also handles the scalar mean of the "aggregate" branch,
        # on which len() would raise TypeError
        n_means = np.size(state_means)
        if n_means != len(trans_mat):
            raise ValueError(
                "Number of FRET states ({0}) doesn't match transition matrix {1}x{1}"
                .format(n_means, len(trans_mat)))
        matrix = trans_mat

    model = pg.HiddenMarkovModel.from_matrix(matrix,
                                             distributions=dists,
                                             starts=starts)
    model.bake()

    final_matrix = model.dense_transition_matrix()[:k_states, :k_states]

    E_true = np.array(model.sample(n=1, length=trace_length))
    E_true = np.squeeze(E_true).round(4)
    return E_true, final_matrix
def hmm_get_model(cnarr, method, processes):
    """Build and fit a Gaussian-emission HMM for bin-level log2 values.

    Parameters
    ----------
    cnarr : CopyNumArray
        The normalized bin-level values to be segmented.
    method : string
        One of 'hmm', 'hmm-tumor', 'hmm-germline'. Selects the number of
        states, their initial means, and which emissions are frozen.
    processes : int
        Number of parallel jobs to run.

    Returns
    -------
    model :
        A pomegranate HiddenMarkovModel trained on the given dataset.
    """
    assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
    observations = as_observation_matrix(cnarr.autosomes())

    # One robust spread estimate, shared by every state's emission
    stdev = biweight_midvariance(np.concatenate(observations), initial=0)

    # Each emission is described as (initial mean, frozen?)
    if method == 'hmm-tumor':
        state_names = ["del", "loss", "neutral", "gain", "amp"]
        emission_specs = [
            (-2.0, False),
            (-0.5, False),
            (0.0, True),
            (0.3, False),
            (1.0, False),
        ]
    else:
        # 'hmm' and 'hmm-germline' share states and means; only the
        # germline variant keeps its emissions fixed during fitting
        state_names = ["loss", "neutral", "gain"]
        is_frozen = (method == 'hmm-germline')
        emission_specs = [(mu, is_frozen) for mu in (-1.0, 0.0, 0.585)]

    distributions = [
        pom.NormalDistribution(mu, stdev, frozen=frozen)
        for mu, frozen in emission_specs
    ]
    n_states = len(distributions)

    # Starts -- binomial weights, which peak at the middle (neutral) state
    binom_coefs = scipy.special.binom(n_states - 1, range(n_states))
    start_probabilities = binom_coefs / binom_coefs.sum()

    # Prefer to keep the current state in each transition;
    # all other transitions are equally likely, to start
    uniform_part = np.ones((n_states, n_states)) / n_states
    transition_matrix = np.identity(n_states) * 100 + uniform_part

    model = pom.HiddenMarkovModel.from_matrix(transition_matrix,
                                              distributions,
                                              start_probabilities,
                                              state_names=state_names,
                                              name=method)

    fit_options = {
        'sequences': observations,
        'weights': [len(obs) for obs in observations],
        'distribution_inertia': .8,  # Allow updating dists, but slowly
        'edge_inertia': 0.1,
        'pseudocount': 5,
        'use_pseudocount': True,
        'max_iterations': 100000,
        'n_jobs': processes,
        'verbose': False,
    }
    model.fit(**fit_options)
    return model
def variants_in_segment(varr, segment, min_variants=50):
    """Split a segment at allele-frequency state changes, if any are found.

    When ``varr`` holds more than ``min_variants`` entries, a two-state HMM
    ("neutral" at 0.5, "alt" at 0.67, both with fixed emissions) is fitted to
    ``varr.mirrored_baf(above_half=True)`` and adjacent variants with the
    same predicted state are merged. If that yields more than one group, the
    segment is cut midway between neighbouring groups.

    :param varr: variant array for the segment
    :param segment: segment row providing ``chromosome``, ``start``, ``end``,
        ``gene``, ``log2`` and ``probes``
    :param min_variants: the HMM is only run when ``len(varr)`` exceeds this
    :return: ``pd.DataFrame`` with columns chromosome, start, end, gene, log2
        and probes; a single row copying ``segment`` when no split is made
    :raises RuntimeError: if a resulting sub-segment has ``start >= end``
    """
    results = None
    if len(varr) > min_variants:
        observations = varr.mirrored_baf(above_half=True)
        state_names = ["neutral", "alt"]
        distributions = [
            pom.NormalDistribution(center, .1, frozen=True)
            for center in (0.5, 0.67)
        ]
        n_states = len(distributions)
        # Starts -- prefer neutral
        start_probabilities = [.95, .05]
        # Prefer to keep the current state in each transition;
        # all other transitions are equally likely, to start
        uniform_part = np.ones((n_states, n_states)) / n_states
        transition_matrix = np.identity(n_states) * 100 + uniform_part
        model = pom.HiddenMarkovModel.from_matrix(transition_matrix,
                                                  distributions,
                                                  start_probabilities,
                                                  state_names=state_names,
                                                  name="loh")
        fit_options = {
            'sequences': [observations],
            'edge_inertia': 0.1,
            'lr_decay': .75,
            'pseudocount': 5,
            'use_pseudocount': True,
            'max_iterations': 100000,
            'verbose': False,
        }
        model.fit(**fit_options)
        states = np.array(model.predict(observations, algorithm='map'))

        logging.info("Done, now finalizing")
        logging.debug("Model states: %s", model.states)
        logging.debug("Predicted states: %s", states[:100])
        logging.debug(str(collections.Counter(states)))
        logging.debug("Edges: %s", model.edges)

        # Merge adjacent bins with the same state to create segments
        fake_cnarr = CNA(varr.add_columns(weight=1, log2=0, gene='.').data)
        results = squash_by_groups(fake_cnarr,
                                   varr.as_series(states),
                                   by_arm=False)
        assert (results.start < results.end).all()

    if results is None or len(results) <= 1:
        # Nothing to split: hand back the segment as a one-row table
        return pd.DataFrame(
            {
                'chromosome': segment.chromosome,
                'start': segment.start,
                'end': segment.end,
                'gene': segment.gene,
                'log2': segment.log2,
                'probes': segment.probes,
            },
            index=[0])

    logging.info(
        "Segment %s:%d-%d on allele freqs for %d additional breakpoints",
        segment.chromosome, segment.start, segment.end, len(results) - 1)
    # Place breakpoints midway between SNVs
    # XXX TODO use original cnarr bin boundaries to select/adjust breakpoint
    next_starts = results.start.values[1:]
    prev_ends = results.end.values[:-1]
    mid_breakpoints = (next_starts + prev_ends) // 2
    starts = np.concatenate([[segment.start], mid_breakpoints])
    ends = np.concatenate([mid_breakpoints, [segment.end]])
    dframe = pd.DataFrame({
        'chromosome': segment.chromosome,
        'start': starts,
        'end': ends,
        'gene': segment.gene,
        'log2': segment.log2,
        'probes': results['probes'],
    })
    bad_segs_idx = (dframe.start >= dframe.end)
    if bad_segs_idx.any():
        raise RuntimeError("Improper post-processing of segment {} -- "
                           "{} bins start >= end:\n{}\n".format(
                               segment, bad_segs_idx.sum(),
                               dframe[bad_segs_idx]))
    return dframe
def _redistribute_lost_mass(first, second, lost):
    """Split ``lost`` between two probabilities in proportion to their sizes."""
    if not lost:
        # Nothing to hand out; also avoids dividing by a zero total
        return first, second
    # Take the total before either value changes so the two shares add up
    # to exactly ``lost``
    total = first + second
    return first + (first / total) * lost, second + (second / total) * lost


def get_model(r, params, window_size, num_skipped, seq_len, p,
              g, resample_prob, x_chr=False, haploid=False, debug=False, h_t=1,
              skip_score=float("-Inf")):
    """
    Builds the hidden Markov model for a given chromosome or scaffold, using
    the Pomegranate module.

    Arguments:
        r -- (float) the per site, per generation recombination probability
        params -- a dict where keys are names of states (AA, AB, and BB) and
            values are dicts where values are mu and sd, which are floats
            representing means and standard deviations of emission
            probability distributions
        window_size -- (int) the window size for this run, in bp
        num_skipped -- (int) the number of windows that were skipped due to
            not passing criteria
        seq_len -- (int) the number of windows in the current
            chromosome/scaffold
        p -- (float) the percent ancestry the admixed population derives from
            ancestral population A (estimated beforehand)
        g -- (int) the number of generations since admixture (estimated
            beforehand)
        resample_prob -- (float) probability of resampling the same ancestral
            recombination event twice in an individual after the set number
            of generations since admixture (referred to as z in the paper)
        x_chr -- (boolean) does this chromosome/scaffold belong to a
            hemizygous sex chromosome?
        haploid -- (boolean) is this individual haploid along this
            chromosome/scaffold?
        debug -- (boolean) should debugging messages be printed to the screen?
        h_t -- (float) if the user has specified that expected reduction in
            heterozygosity given the number of generations since admixture
            should be incorporated into the model, this is the expected
            fraction of the initial heterozygosity that remains after g
            generations.
        skip_score -- (float) the number emitted by adlibs_score when
            "skipped" windows are encountered

    Returns:
        a Pomegranate HMM object for the current chromosome/scaffold
    """
    global prob_lim

    model = pomegranate.HiddenMarkovModel(name='ancestry')

    # Compute probabilities of transitioning to a skip state or the end. Cap
    # these both at the specified probability limit.
    skip_prob = num_skipped / seq_len
    if skip_prob > prob_lim:
        skip_prob = prob_lim
    state_end = 1 / seq_len
    if state_end > prob_lim:
        state_end = prob_lim

    if x_chr:
        r *= (2 / 3)

    # Determine probabilities of transitions
    if haploid:
        # Should 2 be 1.5? I don't think so -- we already multiplied r by
        # (2/3) so that's in here already.
        aa_bb = g * r * (1 - p)
        bb_aa = g * r * p
        # Eliminate the heterozygous state.
        aa_ab = 0
        ab_aa = 0
        bb_ab = 0
        ab_bb = 0
    else:
        probs = get_trans_probs(r, g, p, resample_prob)
        aa_ab = probs['aa_ab']
        ab_aa = probs['ab_aa']
        aa_bb = probs['aa_bb']
        bb_ab = probs['bb_ab']
        ab_bb = probs['ab_bb']
        bb_aa = probs['bb_aa']

    # Scale the transition probabilities by the window size
    aa_ab *= window_size
    ab_aa *= window_size
    aa_bb *= window_size
    bb_ab *= window_size
    ab_bb *= window_size
    bb_aa *= window_size

    # Self-transitions take whatever probability is left in each row
    aa_aa = 1 - (aa_ab + aa_bb + state_end + skip_prob)
    ab_ab = 1 - (ab_aa + ab_bb + state_end + skip_prob)
    bb_bb = 1 - (bb_aa + bb_ab + state_end + skip_prob)

    # Account for reduction in heterozygosity due to genetic drift: scale
    # the probabilities involving AB by h_t and move the removed mass to the
    # other two entries of the same row, in proportion to their sizes.
    if not haploid:
        aa_aa, aa_bb = _redistribute_lost_mass(aa_aa, aa_bb,
                                               aa_ab - aa_ab * h_t)
        bb_aa, bb_bb = _redistribute_lost_mass(bb_aa, bb_bb,
                                               bb_ab - bb_ab * h_t)
        aa_ab *= h_t
        bb_ab *= h_t
        ab_aa, ab_bb = _redistribute_lost_mass(ab_aa, ab_bb,
                                               ab_ab - ab_ab * h_t)
        ab_ab *= h_t

    if debug:
        for template, value in (
                ("# AA -> AA {}", aa_aa),
                ("# AA -> AB {}", aa_ab),
                ("# AA -> BB {}", aa_bb),
                ("# AB -> AA {}", ab_aa),
                ("# AB -> AB {}", ab_ab),
                ("# AB -> BB {}", ab_bb),
                ("# BB -> AA {}", bb_aa),
                ("# BB -> AB {}", bb_ab),
                ("# BB -> BB {}", bb_bb),
                ("# SKIP {}", skip_prob),
        ):
            print(template.format(value), file=sys.stderr)

    aaDist = pomegranate.NormalDistribution(params['AA']['mu'],
                                            params['AA']['sd'])
    abDist = pomegranate.NormalDistribution(params['AB']['mu'],
                                            params['AB']['sd'])
    bbDist = pomegranate.NormalDistribution(params['BB']['mu'],
                                            params['BB']['sd'])

    aaState = pomegranate.State(aaDist, name="AA")
    abState = pomegranate.State(abDist, name="AB")
    bbState = pomegranate.State(bbDist, name="BB")

    model.add_state(aaState)
    if not haploid:
        model.add_state(abState)
    model.add_state(bbState)

    #### ADD skip states
    # Each ancestry state gets a companion state whose emission is a narrow
    # uniform distribution ending at skip_score
    skip_dist = pomegranate.UniformDistribution(skip_score - 0.01, skip_score)
    aa_skip_state = pomegranate.State(skip_dist, name="skip-AA")
    ab_skip_state = pomegranate.State(skip_dist, name="skip-AB")
    bb_skip_state = pomegranate.State(skip_dist, name="skip-BB")

    model.add_state(aa_skip_state)
    if not haploid:
        model.add_state(ab_skip_state)
    model.add_state(bb_skip_state)

    # Start transitions: ancestry proportions, split between each state and
    # its skip companion
    if haploid:
        model.add_transition(model.start, aaState, p * (1 - skip_prob))
        model.add_transition(model.start, aa_skip_state, p * skip_prob)
        model.add_transition(model.start, bbState, (1 - p) * (1 - skip_prob))
        model.add_transition(model.start, bb_skip_state, (1 - p) * skip_prob)
    else:
        model.add_transition(model.start, aaState, p**2 * (1 - skip_prob))
        model.add_transition(model.start, aa_skip_state, p**2 * skip_prob)
        model.add_transition(model.start, abState,
                             2 * p * (1 - p) * (1 - skip_prob))
        model.add_transition(model.start, ab_skip_state,
                             2 * p * (1 - p) * skip_prob)
        model.add_transition(model.start, bbState,
                             (1 - p)**2 * (1 - skip_prob))
        model.add_transition(model.start, bb_skip_state,
                             (1 - p)**2 * skip_prob)

    # End transitions use the capped end probability, the same value that
    # was subtracted when the self-transitions were computed above
    model.add_transition(aaState, model.end, state_end)
    if not haploid:
        model.add_transition(abState, model.end, state_end)
    model.add_transition(bbState, model.end, state_end)

    model.add_transition(aaState, bbState, aa_bb)
    model.add_transition(aaState, aaState, aa_aa)
    model.add_transition(bbState, aaState, bb_aa)
    model.add_transition(bbState, bbState, bb_bb)

    if not haploid:
        model.add_transition(aaState, abState, aa_ab)
        model.add_transition(abState, aaState, ab_aa)
        model.add_transition(abState, bbState, ab_bb)
        model.add_transition(abState, abState, ab_ab)
        model.add_transition(bbState, abState, bb_ab)

    ### Add skip state transitions
    model.add_transition(aaState, aa_skip_state, skip_prob)
    if not haploid:
        model.add_transition(abState, ab_skip_state, skip_prob)
    model.add_transition(bbState, bb_skip_state, skip_prob)

    model.add_transition(aa_skip_state, aa_skip_state, skip_prob)
    if not haploid:
        model.add_transition(ab_skip_state, ab_skip_state, skip_prob)
    model.add_transition(bb_skip_state, bb_skip_state, skip_prob)

    model.add_transition(aa_skip_state, bbState, aa_bb)
    model.add_transition(bb_skip_state, aaState, bb_aa)

    if not haploid:
        model.add_transition(aa_skip_state, abState, aa_ab)
        model.add_transition(ab_skip_state, aaState, ab_aa)
        model.add_transition(ab_skip_state, bbState, ab_bb)
        model.add_transition(bb_skip_state, abState, bb_ab)

    model.add_transition(aa_skip_state, model.end, state_end)
    if not haploid:
        model.add_transition(ab_skip_state, model.end, state_end)
    model.add_transition(bb_skip_state, model.end, state_end)

    # Leaving a skip state for its own ancestry state takes the remainder
    # of the skip state's row
    model.add_transition(aa_skip_state, aaState,
                         1 - skip_prob - aa_ab - aa_bb - state_end)
    if not haploid:
        model.add_transition(ab_skip_state, abState,
                             1 - skip_prob - ab_aa - ab_bb - state_end)
    model.add_transition(bb_skip_state, bbState,
                         1 - skip_prob - bb_aa - bb_ab - state_end)

    ###
    model.bake()

    return model