def remove_small_clones(previous_est, min_mut_clone, inputMU):
    """
    This function removes clones smaller than a threshold (min_mut_clone).

    After each post-hoc modification, the estimator is refit with
    initialization to previous parameters (adjusted if needed).

    Parameters
    ----------
    previous_est: Estimator object
        current estimator fitted to the data, from which to remove clones
        that are too small
    min_mut_clone: int or float
        if int, the minimal number of mutations per returned clone by hard
        assignment (most likely clone). If the threshold is not met for a
        clone, it is deleted, and attributions to the remaining clones are
        computed for all mutations.
        if float, same principle, but the threshold is applied to the \\xi
        parameters, representing the proportion of each clone with soft
        assignment.
    inputMU: array-like (L, 96)
        known L signatures to be fit by clonesig

    Returns
    -------
    new_est: Estimator object
        new estimator fit with the new number of clones.
    """
    if isinstance(min_mut_clone, float):
        future_clones = previous_est.xi > min_mut_clone
    elif isinstance(min_mut_clone, int):
        useful_counts = np.zeros(previous_est.J)
        pre_counts = np.unique(np.argmax(previous_est.qun, axis=1),
                               return_counts=True)
        useful_counts[pre_counts[0]] = pre_counts[1]
        actual_min_mut_clone = min(np.max(useful_counts), min_mut_clone)
        future_clones = useful_counts >= actual_min_mut_clone
    new_phi = previous_est.phi[future_clones]
    new_xi = previous_est.xi[future_clones] / \
        previous_est.xi[future_clones].sum()
    new_pi = previous_est.pi[future_clones, :]
    new_nb_clones = sum(future_clones)
    new_est = Estimator(previous_est.T, previous_est.B, previous_est.C_normal,
                        previous_est.C_tumor_tot, previous_est.C_tumor_minor,
                        previous_est.D, previous_est.p, new_nb_clones,
                        inputMU=inputMU, pi=new_pi, phi=new_phi, xi=new_xi,
                        nu=previous_est.nu, tau=previous_est.tau)
    new_est.fit()
    return new_est
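# Illustration (not part of the original module): how the int branch of
# remove_small_clones decides which clones to keep. The arrays below are toy
# values standing in for previous_est.qun (mutation-to-clone responsibilities)
# and previous_est.J; min_mut_clone is capped by the largest clone size so
# that at least one clone always survives.
#
#   import numpy as np
#   J = 3
#   hard_assign = np.array([0, 0, 0, 0, 2, 2, 1])   # argmax of qun per mutation
#   useful_counts = np.zeros(J)
#   idx, counts = np.unique(hard_assign, return_counts=True)
#   useful_counts[idx] = counts                     # [4., 1., 2.]
#   actual_min = min(np.max(useful_counts), 2)      # min_mut_clone = 2
#   future_clones = useful_counts >= actual_min     # [True, False, True]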
def score_sig_1D(sim, est_sig, inputMU, cancer_type=None):
    """
    percent of mutations with the right signature
    """
    data_df = sim._get_data_df()
    est = Estimator(data_df.trinucleotide.values, data_df.var_counts.values,
                    data_df.normal_cn.values,
                    data_df.minor_cn.values + data_df.major_cn.values,
                    data_df.minor_cn.values,
                    data_df.var_counts.values + data_df.ref_counts.values,
                    sim.purity, 1, inputMU=inputMU,
                    pi=est_sig.reshape(1, -1))
    est_sig_att = est.rnus[np.arange(est.N),
                           est.qun.argmax(axis=1), :].argmax(axis=1)
    sim_sig = sim.S.copy()
    if sim.MU.shape != est.mu_matrix.shape:
        filter_filename = \
            'data/curated_match_signature_cancertype_tcgawes_literature.csv'
        cancer_type_sig = pd.read_csv(
            pkg_resources.resource_stream('clonesig', filter_filename),
            sep='\t', index_col=0).values
        select = cancer_type_sig[:, cancer_type].astype(bool)
        if sim.MU.shape[0] != 65:
            sim_sig = np.array([np.where(select)[0][int(i)] for i in sim_sig])
        if est.mu_matrix.shape[0] == 47:
            big_select = cancer_type_sig.sum(axis=1).astype(bool)
            est_sig_att = np.array([np.where(big_select)[0][int(i)]
                                    for i in est_sig_att])
        elif est.mu_matrix.shape[0] < 47:
            est_sig_att = np.array([np.where(select)[0][int(i)]
                                    for i in est_sig_att])
    return score_sig_1D_base(sim_sig, est_sig_att)
def format_deconstructsigs(folder_path):
    res_filename = '{}/deconstructsigs/signatures_cancertype.csv'\
        .format(folder_path)
    result_file = pd.read_csv(res_filename, sep=' ')
    pred_signatures = np.zeros(len(all_sigs))
    filename = '{}/subMU.csv'.format(folder_path)
    sub_matrix = pd.read_csv(filename, sep='\t')
    mu_mat_setting = sub_matrix[sub_matrix.columns[1:]].values.T
    sub_sigs = sub_matrix.columns[1:]
    idx = [list(all_sigs).index(s) for s in sub_sigs]
    pred_signatures[np.array(idx)] = result_file.iloc[0].values
    sig_profile = result_file.values.dot(mu_mat_setting)
    input_filename = '{}/deconstructsigs/pattern96.csv'.format(folder_path)
    pattern = pd.read_csv(input_filename, sep='\t')
    data_df = pd.read_csv('{}/input_t.tsv'.format(folder_path), sep='\t')
    est = Estimator(data_df.trinucleotide.values, data_df.var_counts.values,
                    data_df.normal_cn.values,
                    data_df.minor_cn.values + data_df.major_cn.values,
                    data_df.minor_cn.values,
                    data_df.var_counts.values + data_df.ref_counts.values,
                    data_df.purity.mean(), 1, inputMU=mu_mat_setting,
                    pi=result_file.values.reshape(1, -1))
    est_sig_att = est.rnus[np.arange(est.N),
                           est.qun.argmax(axis=1), :].argmax(axis=1)
    nb_mut = pattern.sum().sum()
    pred_profile_1E = np.repeat([sig_profile], nb_mut, axis=0)
    runtime = pd.read_csv(
        '{}/deconstructsigs/deconstructsig_runtime_cancertype.csv'
        .format(folder_path), index_col=0).values[0][0]
    return (None, None, None, None, None, None, None, sig_profile,
            pred_signatures, est_sig_att, pred_profile_1E, runtime)
def remove_small_sigs(previous_est, single_clone_est, min_prop_sig, inputMU):
    """
    This function removes signatures with an exposure smaller than a
    threshold (min_prop_sig) in all subclones of previous_est and in a global
    fit (single_clone_est).

    After each post-hoc modification, the estimator is refit with
    initialization to previous parameters (adjusted if needed).

    Parameters
    ----------
    previous_est: Estimator object
        current estimator fitted to the data, from which to remove signatures
        with too small exposures in the sample
    single_clone_est: Estimator object
        current single clone estimator fitted to the data. Because of the
        likelihood ratio test, it is necessary to adjust it as well
        accordingly.
    min_prop_sig: float
        minimal exposure for signatures. If the maximal exposure of a given
        signature among all clones is smaller than min_prop_sig, then it is
        removed, and the contribution of other signatures is scaled to 1.
    inputMU: array-like (L, 96)
        known L signatures to be fit by clonesig.

    Returns
    -------
    new_est: Estimator object
        previous_est refit with the reduced signature set
    new_sc_est: Estimator object
        single_clone_est refit with the reduced signature set
    new_inputMU: array-like
        the reduced signature matrix
    """
    big_pi = np.concatenate((single_clone_est.pi, previous_est.pi), axis=0)
    future_sigs = np.max(big_pi, axis=0) > min_prop_sig
    new_inputMU = inputMU[future_sigs, :]
    pre_new_single_clone_pi = single_clone_est.pi[:, future_sigs]
    pre_new_pi = previous_est.pi[:, future_sigs]
    new_single_clone_pi = pre_new_single_clone_pi / \
        pre_new_single_clone_pi.sum(axis=1)[:, np.newaxis]
    new_pi = pre_new_pi / pre_new_pi.sum(axis=1)[:, np.newaxis]
    new_est = Estimator(previous_est.T, previous_est.B, previous_est.C_normal,
                        previous_est.C_tumor_tot, previous_est.C_tumor_minor,
                        previous_est.D, previous_est.p, previous_est.J,
                        inputMU=new_inputMU, pi=new_pi, phi=previous_est.phi,
                        xi=previous_est.xi, nu=previous_est.nu,
                        tau=previous_est.tau)
    new_est.fit()
    new_sc_est = Estimator(single_clone_est.T, single_clone_est.B,
                           single_clone_est.C_normal,
                           single_clone_est.C_tumor_tot,
                           single_clone_est.C_tumor_minor,
                           single_clone_est.D, single_clone_est.p,
                           single_clone_est.J, inputMU=new_inputMU,
                           pi=new_single_clone_pi, phi=single_clone_est.phi,
                           xi=single_clone_est.xi, nu=single_clone_est.nu,
                           tau=single_clone_est.tau)
    new_sc_est.fit()
    return new_est, new_sc_est, new_inputMU
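# Illustration (not part of the original module): the signature filter used by
# remove_small_sigs on a toy exposure matrix. A signature is kept if its
# exposure exceeds min_prop_sig in at least one row of the stacked
# (single-clone + per-clone) pi matrix; the kept exposures are then
# renormalized so that each row sums to 1.
#
#   import numpy as np
#   big_pi = np.array([[0.70, 0.28, 0.02],   # single-clone fit
#                      [0.60, 0.38, 0.02],   # clone 1
#                      [0.85, 0.12, 0.03]])  # clone 2
#   future_sigs = np.max(big_pi, axis=0) > 0.05       # [True, True, False]
#   pre_new_pi = big_pi[1:, future_sigs]
#   new_pi = pre_new_pi / pre_new_pi.sum(axis=1)[:, np.newaxis]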
def get_loglikelihood(self):
    est = Estimator(self.T, self.B, self.C_normal, self.C_tumor_tot,
                    self.C_tumor_minor, self.D, self.purity, self.J,
                    inputMU=self.MU, pi=self.pi, phi=self.phi, xi=self.xi,
                    tau=self.tau)
    return est.get_loglikelihood
# get xi
steady_xi = np.random.dirichlet(alpha=np.ones(nb_clones))
# rejection-sample until every clone has a proportion of at least 0.1
while min(steady_xi) < 0.1:
    steady_xi = np.random.dirichlet(alpha=np.ones(nb_clones))
np.random.seed(20190610 + nb_seed)
uu = SimLoader(nb_mut, nb_clones, inputMU=subMU, pi_param=pi,
               phi_param=steady_phi, xi_param=steady_xi, rho_param=100,
               cn=True, dip_prop=perc_dip)
uu._get_unobserved_nodes()
uu._get_observed_nodes()
# fits with the restricted signature matrix (subMU): 1 clone and nb_clones
sc_est_subMU = Estimator(uu.T, uu.B, uu.C_normal, uu.C_tumor_tot,
                         uu.C_tumor_minor, uu.D, uu.purity, 1, inputMU=subMU)
sc_est_subMU.fit()
est_subMU = Estimator(uu.T, uu.B, uu.C_normal, uu.C_tumor_tot,
                      uu.C_tumor_minor, uu.D, uu.purity, nb_clones,
                      inputMU=subMU)
est_subMU.fit()
# fits with the full signature matrix (MU): 1 clone and nb_clones
sc_est_MU = Estimator(uu.T, uu.B, uu.C_normal, uu.C_tumor_tot,
                      uu.C_tumor_minor, uu.D, uu.purity, 1, inputMU=MU)
sc_est_MU.fit()
est_MU = Estimator(uu.T, uu.B, uu.C_normal, uu.C_tumor_tot,
                   uu.C_tumor_minor, uu.D, uu.purity, nb_clones, inputMU=MU)
est_MU.fit()
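# Helper sketch (not in the original script): the truncated-Dirichlet draw
# above, factored into a reusable function; the name and default threshold
# are illustrative only.
#
#   def sample_xi_min_prop(nb_clones, min_prop=0.1):
#       """Draw clone proportions from a flat Dirichlet, rejecting draws in
#       which any clone falls below min_prop."""
#       xi = np.random.dirichlet(alpha=np.ones(nb_clones))
#       while min(xi) < min_prop:
#           xi = np.random.dirichlet(alpha=np.ones(nb_clones))
#       return xi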
def run_clonesig(T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity,
                 inputMU, inputNu=None, nu_heuristics=None, nb_fits=1,
                 seeds=None, max_nb_clones=6, return_sig_change_test=True,
                 min_mut_clone=0, min_prop_sig=0.0, prefit_signatures=False,
                 prefit_thresh=0.05, model_selection_function=None,
                 model_selection_kws=None):
    """
    This function is a wrapper that takes data (and settings) as input, tries
    to fit the clonesig model for various numbers of clones, and returns the
    best fit, with some relevant post-hoc adjustments.

    After each post-hoc modification, the estimator is refit with
    initialization to previous parameters (adjusted if needed).

    Parameters
    ----------
    T : iterable of length N
        the trinucleotide context of each mutation, numbered from 0 to 95
    B : iterable of length N
        the variant allele read count for each mutation
    D : iterable of length N
        the total read count for each mutation
    C_normal : iterable of length N
        copy number of non-tumor cells in the sample at each mutation locus
    C_tumor_tot : iterable of length N
        the total copy number of tumor cells in the sample at each mutation
        locus
    C_tumor_minor : iterable of length N
        the minor copy number of tumor cells in the sample at each mutation
        locus. If this info is not available, set it to zero so that clonesig
        considers all possible genotypes
    purity : float in [0, 1]
        an estimate of the tumor purity of the sample
    inputMU : array-like (L, 96)
        known L signatures to be fit by clonesig.
    inputNu : array-like (N, Mmax) with Mmax = max(C_tumor_tot - C_tumor_minor)
        probability distribution of the number of mutated copies for each
        mutation. Be careful, it is a probability distribution, so one should
        have np.sum(inputNu) / N = 1
    nu_heuristics : string among ('ones', 'minor', 'major', 'clonal')
        automatic generation of the nu parameter with 4 possible heuristics:
        set the number of mutated copies to 1, set it to the minor or major
        copy number, or set it to the number of copies yielding a CCF of 1
        given the purity and total copy number (with a minimum of 1).
        This option will override any inputNu given by the user.
    nb_fits : integer (>=1)
        number of independent fits to perform for this sample (as results
        depend on the random initialization, and might be local maxima of the
        EM objective function)
    seeds : iterable of length nb_fits
        seeds for the different initializations. If not provided, seeds are
        set to 0 to nb_fits-1
    max_nb_clones : integer (>1)
        maximum number of clones wanted to be found by the model
    return_sig_change_test : boolean
        perform a statistical test (adapted from a loglikelihood ratio test)
        to assess whether there is a change of signature in the sample (H1)
        or if all clones have the same signature exposures (H0)
    min_mut_clone : int or float
        if int, the minimal number of mutations per returned clone by hard
        assignment (most likely clone). If the threshold is not met for a
        clone, it is deleted, and attributions to the remaining clones are
        computed for all mutations.
        if float, same principle, but the threshold is applied to the \\xi
        parameters, representing the proportion of each clone with soft
        assignment.
    min_prop_sig : float
        minimal exposure for signatures.
        If the maximal exposure of a given signature among all clones is
        smaller than min_prop_sig, then it is removed, and the contribution
        of other signatures is scaled to 1
    prefit_signatures : boolean
        fit signatures to the sample (globally, with 1 clone), and then just
        use the subset of signatures with an exposure of at least
        prefit_thresh
    prefit_thresh : float
        minimal threshold to select a signature in the prefit step
    model_selection_function : string among (...)
        model selection function to use
    model_selection_kws : dictionary
        parameters to pass to the model_selection_function

    Returns
    -------
    new_est : Estimator object
        the final fitted estimator, after post-hoc adjustments
    lr : float
        the statistic of the signature-change test
    p : float
        the p-value of the signature-change test
    new_inputMU : array-like
        the signature matrix used for the final fit
    cst_est : Estimator object
        the constrained estimator (same signature exposures in all clones)
        used as the null model of the signature-change test
    future_sigs : array-like of bool, or None
        mask of signatures retained by the prefit step (None if
        prefit_signatures is False)
    """
    (T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity, inputMU, inputNu,
     nb_fits, seeds, max_nb_clones, return_sig_change_test, min_mut_clone,
     min_prop_sig, prefit_signatures, prefit_thresh, model_selection_function,
     model_selection_kws) = check_parameters(
        T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity, inputMU,
        inputNu, nu_heuristics, nb_fits, seeds, max_nb_clones,
        return_sig_change_test, min_mut_clone, min_prop_sig,
        prefit_signatures, prefit_thresh, model_selection_function,
        model_selection_kws)

    # prefit of signatures
    if prefit_signatures:
        prefit_est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D,
                               purity, 1, inputMU=inputMU, nu=inputNu)
        prefit_est.fit()
        future_sigs = (prefit_est.pi.T.dot(prefit_est.xi)) > prefit_thresh
        prefit_inputMU = inputMU[future_sigs, :]
    else:
        prefit_inputMU = inputMU.copy()
        future_sigs = None

    criterion = np.zeros((nb_fits, max_nb_clones + 2))
    loglikelihood = np.zeros((nb_fits, max_nb_clones + 2))
    est_matrix = np.zeros((nb_fits, max_nb_clones + 2)).astype(object)
    for j, nb_clones in enumerate(range(1, max_nb_clones + 3)):
        for i, s in enumerate(seeds):
            print(j, i)
            np.random.seed(s)
            if nb_clones >= 2:
                previous_est = est_matrix[i, j - 1]
                new_phi, new_xi, new_pi = \
                    split_clone_initialization(previous_est, prefit_inputMU)
                est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D,
                                purity, nb_clones, inputMU=prefit_inputMU,
                                pi=new_pi, phi=new_phi, xi=new_xi, nu=inputNu,
                                tau=previous_est.tau)
            else:
                est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D,
                                purity, nb_clones, inputMU=prefit_inputMU,
                                nu=inputNu)
            est.fit()
            criterion[i, j] = est.get_bic_heuristics(**model_selection_kws)
            loglikelihood[i, j] = est.get_loglikelihood
            est_matrix[i, j] = est
        if j > 1:
            bm = criterion.mean(axis=0)
            if (bm[j-2] > bm[j-1]) and (bm[j-2] > bm[j]) and (bm[j-1] > bm[j]):
                print('stopped and chosen number of clones is ', nb_clones - 2)
                print(loglikelihood.mean(axis=0))
                print(bm)
                break
    print('stopped and chosen number of clones is ', nb_clones - 2)
    print(loglikelihood.mean(axis=0))
    print(bm)
    # get best run: select the run with the highest log-likelihood for the
    # chosen number of clones
    chosen_nb_clones = max(nb_clones - 2, 1)
    chosen_nb_clones_idx = chosen_nb_clones - 1
    i_best = np.argmax(loglikelihood[:, chosen_nb_clones_idx])
    est_best = est_matrix[i_best, chosen_nb_clones_idx]
    # sc = single clone
    sc_est_best = est_matrix[i_best, 0]
    est_best_big_clones = remove_small_clones(est_best, min_mut_clone,
                                              prefit_inputMU)
    new_est, new_sc_est, new_inputMU = remove_small_sigs(
        est_best_big_clones, sc_est_best, min_prop_sig, prefit_inputMU)
    print(np.repeat(new_sc_est.pi, new_est.J, axis=0).shape,
          new_inputMU.shape, new_est.J)
    cst_est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D, purity,
                        new_est.J, inputMU=new_inputMU,
                        pi=np.repeat(new_sc_est.pi, new_est.J, axis=0),
                        phi=new_est.phi, tau=new_est.tau, xi=new_est.xi,
                        nu=inputNu)
    dof_test = get_ll_test_dof(new_inputMU, new_est.J, new_est.N)
    lr, p = lrtest(cst_est.get_loglikelihood, new_est.get_loglikelihood,
                   dof_test)
    return new_est, lr, p, new_inputMU, cst_est, future_sigs
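# Usage sketch (not part of the original module), assuming per-mutation arrays
# trinucleotide, var_counts, depth, normal_cn, total_cn, minor_cn, a purity
# estimate and a signature matrix MU (L x 96) are already loaded; all of these
# names are placeholders.
#
#   est, lr, pval, used_MU, cst_est, prefit_mask = run_clonesig(
#       trinucleotide, var_counts, depth, normal_cn, total_cn, minor_cn,
#       purity, MU, nu_heuristics='ones', min_mut_clone=5, min_prop_sig=0.05,
#       prefit_signatures=True)
#   # est.J is the selected number of clones, est.xi the clone proportions,
#   # est.pi the per-clone signature exposures; (lr, pval) is the
#   # signature-change test.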
def fit_model_special(T, B, C_normal, C_tumor_tot, C_tumor_minor, D, purity,
                      inputMU, nb_fits=1, seeds=None, max_nb_clones=6,
                      extra=4):
    """
    possible metrics : F, loglikelihood, BIC, AIC, AICc, ICL_q, ICL_qn, SH.
    """
    L = inputMU.shape[0]
    if isinstance(seeds, Iterable):
        if len(seeds) != nb_fits:
            raise ValueError("Number of seeds is incompatible with the "
                             "number of required fits")
    if seeds is None:
        seeds = list(range(nb_fits))
    Fres = np.zeros((nb_fits, max_nb_clones + extra))
    bic = np.zeros((nb_fits, max_nb_clones + extra))
    loglikelihood = np.zeros((nb_fits, max_nb_clones + extra))
    aic = np.zeros((nb_fits, max_nb_clones + extra))
    aicc = np.zeros((nb_fits, max_nb_clones + extra))
    icl_q = np.zeros((nb_fits, max_nb_clones + extra))
    icl_qn = np.zeros((nb_fits, max_nb_clones + extra))
    bic_alt = np.zeros((nb_fits, max_nb_clones + extra))
    for i, s in enumerate(seeds):
        np.random.seed(s)
        for j, nb_clones in enumerate(range(1, max_nb_clones + 1 + extra)):
            print(j, i)
            if nb_clones >= 2:
                # pick the clone with the highest assignment entropy, remove
                # it, and replace it by two new randomly initialized clones;
                # the other clones keep their previous parameters
                to_split = np.argmax(-(est.qun * np.log(est.qun)).sum(axis=0))
                mask = np.ones(nb_clones - 1, dtype=bool)
                mask[to_split] = 0
                new_phi = np.zeros(nb_clones)
                new_phi[:nb_clones - 2] = est.phi[mask]
                new_phi[-2] = np.random.ranf() * 0.8 + 0.1
                new_phi[-1] = np.random.ranf() * 0.8 + 0.1
                new_xi = np.zeros(nb_clones)
                new_xi[:nb_clones - 2] = est.xi[mask]
                new_xi[-1], new_xi[-2] = [est.xi[to_split]] * 2
                new_pi = np.zeros((nb_clones, inputMU.shape[0]))
                new_pi[:nb_clones - 2, :] = est.pi[mask, :]
                new_pi[-1, :] = np.random.dirichlet(
                    alpha=np.ones(inputMU.shape[0]))
                new_pi[-2, :] = np.random.dirichlet(
                    alpha=np.ones(inputMU.shape[0]))
                est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D,
                                purity, nb_clones, inputMU=inputMU, pi=new_pi,
                                phi=new_phi, xi=new_xi)
            else:
                est = Estimator(T, B, C_normal, C_tumor_tot, C_tumor_minor, D,
                                purity, nb_clones, inputMU=inputMU)
            est.fit()
            print(nb_clones, est.tau)
            Fres[i, j] = est.Fs[-1]
            bic[i, j] = est.get_bic()
            bic_alt[i, j] = np.nan
            loglikelihood[i, j] = est.get_loglikelihood
            aic[i, j] = est.get_aic()
            aicc[i, j] = est.get_aicc()
            icl_q[i, j] = est.get_icl()
            icl_qn[i, j] = est.get_icl(norm=True)
    dict_results = {'bic': np.argmax(bic.mean(axis=0)) + 1,
                    'aic': np.argmax(aic.mean(axis=0)) + 1,
                    'aicc': np.argmax(aicc.mean(axis=0)) + 1,
                    'icl_q': np.argmax(icl_q.mean(axis=0)) + 1,
                    'icl_qn': np.argmax(icl_qn.mean(axis=0)) + 1,
                    'bic_alt': np.argmax(bic_alt.mean(axis=0)) + 1}
    # compute the SH (slope heuristics) estimate: regress the mean
    # log-likelihood on the model dimension to estimate the penalty slope,
    # then take the most frequent change-point as the number of clones
    for mc in range(max_nb_clones - 2, max_nb_clones + extra + 1):
        slopes = list()
        chpt = list()
        for end_p in range(0, mc - 1):
            ransac = linear_model.LinearRegression()
            ransac.fit(
                ((np.array(range(end_p + 1, mc + 1))) * (L + 1)).reshape(-1, 1),
                loglikelihood.mean(axis=0)[end_p:mc])
            slopes.append(ransac.coef_)
            # print('pen', mc, end_p, loglikelihood[0][:mc] - np.arange(1, len(loglikelihood[0][:mc])+1) * (M+1) * 2 * ransac.estimator_.coef_)
            chpt.append(np.argmax(
                loglikelihood.mean(axis=0)[:mc] -
                np.arange(1, len(loglikelihood.mean(axis=0)[:mc]) + 1) *
                (L + 1) * 2 * max(ransac.coef_, 0.0)) + 1)
        chpt = np.array(chpt)
        diff = chpt[1:] - chpt[0:-1]
        last_point = np.argmax(diff < 0)
        if (last_point == 0) & (chpt[1] >= chpt[0]):
            last_point = mc
        counts = np.bincount(chpt[:last_point + 1])
        # b = counts[::-1]
        # final_nb_clones = len(b) - np.argmax(b) - 1
        final_nb_clones = np.argmax(counts)
        dict_results['sh_{}'.format(mc)] = final_nb_clones
    ll = loglikelihood.mean(axis=0)
    # maximum-curvature (elbow) criterion: discrete second derivative of the
    # mean log-likelihood curve
    dict_results['max_curvature'] = np.argmax(
        np.abs(ll[2:] + ll[0:-2] - 2 * ll[1:-1])) + 2
    return dict_results, ll
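# Illustration (not part of the original module): the max_curvature criterion
# above on a toy mean log-likelihood curve (1 to 6 clones). The per-clone gain
# drops sharply after 3 clones (+100, +80, +5, +3, +2), so the discrete second
# derivative, and hence the chosen number of clones, peaks at 3.
#
#   ll = np.array([-500., -400., -320., -315., -312., -310.])
#   curvature = np.abs(ll[2:] + ll[0:-2] - 2 * ll[1:-1])   # [20., 75., 2., 1.]
#   chosen = np.argmax(curvature) + 2                      # -> 3 clones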