Example #1
0
def remove_small_clones(previous_est, min_mut_clone, inputMU):
    """
    this function removes clones smaller than a threshold (min_mut_clone)
    After each post-hoc modification, the estimator is refit with
    initialization to previous parameters (adjusted if needed)

    At least one clone is always kept: if no clone meets the threshold, the
    largest clone(s) are retained.

    Parameters
    ----------
    previous_est: Estimator object
                  current estimator fitted to the data, from which to remove
                  clones that are too small
    min_mut_clone: int or float (python or numpy scalar)
                   if int, the minimal number of mutations per returned clone
                   by hard assignment (most likely clone). If the threshold
                   is not met for a clone, it is deleted, and attributions to
                   the remaining clones are computed for all mutations.
                   if float, same principle, but the threshold is applied to
                   the \\xi parameters, representing the proportion of each
                   clone with soft assignment.
    inputMU: array-like (L, 96)
             known L signatures to be fit by clonesig
    Returns
    -------
    new_est: new estimator fit with new number of clones.

    Raises
    ------
    TypeError
        if min_mut_clone is neither an integer nor a float.
    """
    if isinstance(min_mut_clone, (float, np.floating)):
        future_clones = previous_est.xi > min_mut_clone
        if not np.any(future_clones):
            # same safeguard as the integer branch: never drop every clone,
            # keep the one(s) with the largest proportion instead
            future_clones = previous_est.xi >= np.max(previous_est.xi)
    elif isinstance(min_mut_clone, (int, np.integer)):
        # number of mutations whose most likely clone is each clone
        hard_assignment = np.argmax(previous_est.qun, axis=1)
        useful_counts = np.bincount(hard_assignment,
                                    minlength=previous_est.J)
        # cap the threshold at the size of the largest clone so that at
        # least one clone survives
        actual_min_mut_clone = min(np.max(useful_counts), min_mut_clone)
        future_clones = useful_counts >= actual_min_mut_clone
    else:
        raise TypeError(
            "min_mut_clone must be an int or a float, got {}".format(
                type(min_mut_clone).__name__))
    new_phi = previous_est.phi[future_clones]
    kept_xi = previous_est.xi[future_clones]
    # proportions of the remaining clones are rescaled to sum to 1
    new_xi = kept_xi / kept_xi.sum()
    new_pi = previous_est.pi[future_clones, :]
    new_nb_clones = int(np.sum(future_clones))
    new_est = Estimator(previous_est.T, previous_est.B,
                        previous_est.C_normal, previous_est.C_tumor_tot,
                        previous_est.C_tumor_minor, previous_est.D,
                        previous_est.p, new_nb_clones,
                        inputMU=inputMU, pi=new_pi, phi=new_phi,
                        xi=new_xi, nu=previous_est.nu,
                        tau=previous_est.tau)
    new_est.fit()
    return new_est
def score_sig_1D(sim, est_sig, inputMU, cancer_type=None):
    """
    percent of mutations with the right signature

    Parameters
    ----------
    sim: simulation object
         must provide _get_data_df(), purity, S (true signature index of
         each mutation) and MU (signature matrix used for the simulation)
    est_sig: array-like
             signature exposures, reshaped to a single row and used as the
             pi parameter of a one-clone Estimator
    inputMU: array-like
             signature matrix passed to the Estimator
    cancer_type: int, optional
                 column index in the curated signature / cancer type match
                 table. Required whenever signature indices have to be
                 mapped through the cancer-type specific subset.

    Returns
    -------
    the value returned by score_sig_1D_base for the (true, estimated)
    signature attributions.

    Raises
    ------
    ValueError
        if cancer_type is None although the cancer-type specific subset of
        signatures is needed to align the indices.
    """
    data_df = sim._get_data_df()
    est = Estimator(data_df.trinucleotide.values, data_df.var_counts.values,
                    data_df.normal_cn.values,
                    data_df.minor_cn.values + data_df.major_cn.values,
                    data_df.minor_cn.values,
                    data_df.var_counts.values + data_df.ref_counts.values,
                    sim.purity, 1, inputMU=inputMU, pi=est_sig.reshape(1, -1))
    # most likely signature of each mutation, within its most likely clone
    est_sig_att = est.rnus[np.arange(est.N), est.qun.argmax(axis=1), :].argmax(axis=1)
    sim_sig = sim.S.copy()
    if sim.MU.shape != est.mu_matrix.shape:
        # simulation and estimation use different signature sets: map both
        # attributions back to row indices of the match table
        filter_filename = 'data/curated_match_signature_cancertype_tcgawes_literature.csv'
        with pkg_resources.resource_stream('clonesig',
                                           filter_filename) as stream:
            cancer_type_sig = pd.read_csv(stream, sep='\t',
                                          index_col=0).values
        remap_sim = sim.MU.shape[0] != 65
        remap_est_subset = est.mu_matrix.shape[0] < 47
        if remap_sim or remap_est_subset:
            if cancer_type is None:
                # indexing with None would silently add an axis and give a
                # meaningless mapping
                raise ValueError(
                    "cancer_type is required to match the signatures of the "
                    "simulation and of the estimator")
            selected_idx = np.where(
                cancer_type_sig[:, cancer_type].astype(bool))[0]
        if remap_sim:
            sim_sig = np.array([selected_idx[int(i)] for i in sim_sig])
        if est.mu_matrix.shape[0] == 47:
            # signatures active in at least one cancer type
            big_selected_idx = np.where(
                cancer_type_sig.sum(axis=1).astype(bool))[0]
            est_sig_att = np.array([big_selected_idx[int(i)]
                                    for i in est_sig_att])
        elif remap_est_subset:
            est_sig_att = np.array([selected_idx[int(i)]
                                    for i in est_sig_att])
    return score_sig_1D_base(sim_sig, est_sig_att)
def format_deconstructsigs(folder_path):
    """
    Load the deconstructSigs outputs stored under folder_path and format them.

    Parameters
    ----------
    folder_path: str
                 folder containing subMU.csv, input_t.tsv and a
                 deconstructsigs/ sub-folder with the result files

    Returns
    -------
    a 12-tuple whose first 7 entries are None, followed by the reconstructed
    signature profile, the exposures placed at their position in all_sigs,
    the most likely signature of each mutation, the profile repeated once
    per mutation, and the runtime read from the runtime file.
    """
    exposures_df = pd.read_csv(
        '{}/deconstructsigs/signatures_cancertype.csv'.format(folder_path),
        sep=' ')
    pred_signatures = np.zeros(len(all_sigs))

    # signatures available in this setting (first column is skipped)
    sub_matrix = pd.read_csv('{}/subMU.csv'.format(folder_path), sep='\t')
    sub_sigs = sub_matrix.columns[1:]
    mu_mat_setting = sub_matrix[sub_sigs].values.T

    # write the exposures at the position of each signature in all_sigs
    reference_sigs = list(all_sigs)
    positions = np.array([reference_sigs.index(sig) for sig in sub_sigs])
    pred_signatures[positions] = exposures_df.iloc[0].values
    sig_profile = exposures_df.values.dot(mu_mat_setting)

    pattern = pd.read_csv(
        '{}/deconstructsigs/pattern96.csv'.format(folder_path), sep='\t')

    mutations = pd.read_csv('{}/input_t.tsv'.format(folder_path), sep='\t')
    total_cn = mutations.minor_cn.values + mutations.major_cn.values
    depth = mutations.var_counts.values + mutations.ref_counts.values
    est = Estimator(mutations.trinucleotide.values,
                    mutations.var_counts.values,
                    mutations.normal_cn.values,
                    total_cn,
                    mutations.minor_cn.values,
                    depth,
                    mutations.purity.mean(), 1,
                    inputMU=mu_mat_setting,
                    pi=exposures_df.values.reshape(1, -1))
    # most likely signature of each mutation, within its most likely clone
    best_clone = est.qun.argmax(axis=1)
    est_sig_att = est.rnus[np.arange(est.N), best_clone, :].argmax(axis=1)

    nb_mut = pattern.sum().sum()
    pred_profile_1E = np.repeat([sig_profile], nb_mut, axis=0)
    runtime_table = pd.read_csv(
        '{}/deconstructsigs/deconstructsig_runtime_cancertype.csv'
        .format(folder_path),
        index_col=0)
    runtime = runtime_table.values[0][0]
    return (None,) * 7 + (sig_profile, pred_signatures, est_sig_att,
                          pred_profile_1E, runtime)
Example #4
0
def remove_small_sigs(previous_est, single_clone_est, min_prop_sig, inputMU):
    """
    Drop the signatures whose exposure does not exceed min_prop_sig in any
    clone of previous_est nor in the global fit (single_clone_est), then
    refit both estimators, initialized to their previous parameters with
    the exposures of the remaining signatures rescaled to sum to 1.

    Parameters
    ----------
    previous_est: Estimator object
                  current estimator fitted to the data, from which to remove
                  signatures with too small exposures in the sample
    single_clone_est: Estimator object
                      current single clone estimator fitted to the data.
                      Because of the likelihood test ratio, it is necessary to
                      adjust it as well accordingly.
    min_prop_sig: float
                  minimal exposure for signatures. If the maximal exposure of
                  a given signature among all clones is smaller than
                  min_prop_sig, then it is removed, and the contribution of
                  other signatures is scaled to 1.
    inputMU: array-like (L, 96)
             known L signatures to be fit by clonesig.
    Returns
    -------
    new_est: Estimator object
             refit of previous_est on the retained signatures
    new_sc_est: Estimator object
                refit of single_clone_est on the retained signatures
    new_inputMU: array-like
                 rows of inputMU corresponding to the retained signatures
    """
    stacked_pi = np.vstack((single_clone_est.pi, previous_est.pi))
    future_sigs = stacked_pi.max(axis=0) > min_prop_sig
    new_inputMU = inputMU[future_sigs, :]

    def _restricted_pi(pi):
        # keep the retained signatures and rescale each row to sum to 1
        kept = pi[:, future_sigs]
        return kept / kept.sum(axis=1)[:, np.newaxis]

    def _refit(reference, pi_init):
        # same data and parameters as reference, on the reduced signature set
        refit_est = Estimator(reference.T, reference.B,
                              reference.C_normal, reference.C_tumor_tot,
                              reference.C_tumor_minor, reference.D,
                              reference.p, reference.J,
                              inputMU=new_inputMU, pi=pi_init,
                              phi=reference.phi, xi=reference.xi,
                              nu=reference.nu, tau=reference.tau)
        refit_est.fit()
        return refit_est

    new_single_clone_pi = _restricted_pi(single_clone_est.pi)
    new_pi = _restricted_pi(previous_est.pi)
    new_est = _refit(previous_est, new_pi)
    new_sc_est = _refit(single_clone_est, new_single_clone_pi)
    return new_est, new_sc_est, new_inputMU
Example #5
0
 def get_loglikelihood(self):
     """
     Build an Estimator from the data and parameters stored on this object
     and return its ``get_loglikelihood`` attribute. No fit is run here and
     the attribute is returned as accessed, without being called.
     """
     observed = (self.T, self.B, self.C_normal, self.C_tumor_tot,
                 self.C_tumor_minor, self.D, self.purity, self.J)
     parameters = dict(inputMU=self.MU, pi=self.pi, phi=self.phi,
                       xi=self.xi, tau=self.tau)
     return Estimator(*observed, **parameters).get_loglikelihood
Example #6
0
# get xi: redraw the clone proportions until every clone gets at least 0.1
while True:
    steady_xi = np.random.dirichlet(alpha=np.ones(nb_clones))
    if not min(steady_xi) < 0.1:
        break

np.random.seed(20190610 + nb_seed)
uu = SimLoader(nb_mut, nb_clones, inputMU=subMU,
               pi_param=pi, phi_param=steady_phi, xi_param=steady_xi,
               rho_param=100, cn=True, dip_prop=perc_dip)
uu._get_unobserved_nodes()
uu._get_observed_nodes()

# simulated observations shared by the four fits below
sim_data = (uu.T, uu.B, uu.C_normal, uu.C_tumor_tot,
            uu.C_tumor_minor, uu.D, uu.purity)

# fits restricted to the subset of signatures: one clone, then nb_clones
sc_est_subMU = Estimator(*sim_data, 1, inputMU=subMU)
sc_est_subMU.fit()
est_subMU = Estimator(*sim_data, nb_clones, inputMU=subMU)
est_subMU.fit()

# same two fits with the MU signature matrix
sc_est_MU = Estimator(*sim_data, 1, inputMU=MU)
sc_est_MU.fit()
est_MU = Estimator(*sim_data, nb_clones, inputMU=MU)
est_MU.fit()
Example #7
0
def run_clonesig(T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity,
                 inputMU, inputNu=None, nu_heuristics=None, nb_fits=1,
                 seeds=None, max_nb_clones=6, return_sig_change_test=True,
                 min_mut_clone=0, min_prop_sig=0.0, prefit_signatures=False,
                 prefit_thresh=0.05, model_selection_function=None,
                 model_selection_kws=None):
    """
    this function is a wrapper that takes data (and settings) as input, tries
    to fit clonesig model for a various number of clones, and returns the best
    fit, with some relevant selected post-hoc adjustments. After each
    post-hoc modification, the estimator is refit with initialization to
    previous parameters (adjusted if needed)

    All arguments are first passed through check_parameters, and the values
    it returns are the ones used in the rest of the function.

    Parameters
    ----------
    T : iterable of length N
        with the trinucleotide context of each mutation, numbered
        from 0 to 95
    B : iterable of length N
        with the variant allele read count for each mutation
    D : iterable of length N
        with the total read count for each mutation
    C_normal : iterable of length N
               copy number of non-tumor cells in the sample at each mutation
               locus
    C_tumor_tot : iterable of length N
                  the total copy number of tumor cells in the sample at each
                  mutation locus
    C_tumor_minor : iterable of length N
                    the minor copy number of tumor cells in the sample at each
                    mutation locus. If this info is not available, set it to
                    zero so that clonesig considers all possible genotypes
    purity : float in [0, 1]
             an estimate of the tumor purity of the sample
    inputMU : array-like (L, 96)
              known L signatures to be fit by clonesig.
    inputNu : array-like (N, Mmax)
              with Mmax = max(C_tumor_tot - C_tumor_minor)
              probability distribution of number of mutated copies for each
              mutation
              be careful, it is a probability distribution, so one should have
              np.sum(inputNu) / N = 1
    nu_heuristics : string among ('ones', 'minor', 'major', 'clonal')
                    automatic generation of the nu parameter with several
                    possible heuristics: set the number of mutated copy
                    number to 1, or set the number of mutated copy number to
                    major or minor copy number, or set the mutated CN to the
                    max of 1, and the number of mutated copy to get a CCF of
                    1 given purity and total copy number
                    this option will over-ride any inputNu given by user.
                    (only forwarded to check_parameters in this function)
    nb_fits : integer (>1)
              number of independent fits to perform for this sample (as results
              depend on the random initialization, and might be local maxima of
              the EM objective function)
    seeds : iterable, of length nb_fits
            seeds for the different initialization. If not provided, seeds are
            set to 0 to nb_fits-1
    max_nb_clones : integer (>1)
                    maximum number of clones wanted to be found by the model
    return_sig_change_test : boolean
                             perform a statistical test (adapted from a
                             loglikelihood ratio test) to assess whether there
                             is a change of signature in the sample (H1) or if
                             all clones have the same signature exposures (H0)
                             NOTE(review): this flag is not read after
                             check_parameters; the test is always computed.
    min_mut_clone : int or float
                    if int, the minimal number of mutations per returned clone
                    by hard assignment (most likely clone). If the threshold
                    is not met for a clone, it is deleted, and attributions to
                    the remaining clones are computed for all mutations.
                    if float, same principle, but the threshold is applied to
                    the \\xi parameters, representing the proportion of each
                    clone with soft assignment.
    min_prop_sig : float
                   minimal exposure for signatures. If the maximal exposure of
                   a given signature among all clones is smaller than
                   min_prop_sig, then it is removed, and the contribution of
                   other signatures is scaled to 1
    prefit_signatures : boolean
                        fit signatures to the sample (globally, with 1 clone),
                        and then just use the subset of signatures with an
                        exposure of at least prefit_thresh
    prefit_thresh : float
                    minimal threshold to select signature in the prefit step
    model_selection_function : string among (...)
                               model selection function to use
                               NOTE(review): not read after check_parameters;
                               the criterion used below is always
                               est.get_bic_heuristics.
    model_selection_kws : dictionary
                          parameters to pass to the model_selection_function
                          (unpacked as keyword arguments of
                          est.get_bic_heuristics)


    Returns
    -------
    new_est : Estimator
              selected estimator, refit after removal of small clones
              (remove_small_clones) and small signatures (remove_small_sigs)
    lr, p : output of lrtest applied to the loglikelihood of cst_est and of
            new_est, with dof_test degrees of freedom
    new_inputMU : array-like
                  signature matrix returned by remove_small_sigs
    cst_est : Estimator
              estimator built with the parameters of new_est, except pi which
              is the single-clone pi repeated for each clone; it is not fit
              in this function
    future_sigs : boolean mask of the signatures kept by the prefit step, or
                  None if prefit_signatures is False
    """
    (T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity, inputMU,
     inputNu, nb_fits, seeds, max_nb_clones, return_sig_change_test,
     min_mut_clone, min_prop_sig, prefit_signatures, prefit_thresh,
     model_selection_function, model_selection_kws) = check_parameters(
        T, B, D, C_normal, C_tumor_tot, C_tumor_minor, purity, inputMU,
        inputNu, nu_heuristics, nb_fits, seeds, max_nb_clones,
        return_sig_change_test, min_mut_clone, min_prop_sig, prefit_signatures,
        prefit_thresh, model_selection_function, model_selection_kws)
    # prefit of signatures
    if prefit_signatures:
        prefit_est = Estimator(T, B, C_normal, C_tumor_tot,
                               C_tumor_minor, D, purity, 1,
                               inputMU=inputMU, nu=inputNu)
        prefit_est.fit()
        # keep the signatures whose xi-weighted exposure exceeds the threshold
        future_sigs = (prefit_est.pi.T.dot(prefit_est.xi)) > prefit_thresh
        prefit_inputMU = inputMU[future_sigs, :]
    else:
        prefit_inputMU = inputMU.copy()
        future_sigs = None

    # one row per seed, one column per number of clones tried
    # (1 to max_nb_clones + 2)
    criterion = np.zeros((nb_fits, max_nb_clones+2))
    loglikelihood = np.zeros((nb_fits, max_nb_clones+2))
    loglikelihood_nopi = np.zeros((nb_fits, max_nb_clones+2))
    est_matrix = np.zeros((nb_fits, max_nb_clones+2)).astype(object)
    for j, nb_clones in enumerate(range(1, max_nb_clones+3)):
        for i, s in enumerate(seeds):
            print(j, i)
            np.random.seed(s)
            if nb_clones >= 2:
                # initialize from the fit with one clone less obtained for
                # the same seed index
                previous_est = est_matrix[i, j-1]
                new_phi, new_xi, new_pi = \
                    split_clone_initialization(previous_est, prefit_inputMU)
                est = Estimator(T, B, C_normal, C_tumor_tot,
                                C_tumor_minor, D, purity, nb_clones,
                                inputMU=prefit_inputMU, pi=new_pi, phi=new_phi,
                                xi=new_xi, nu=inputNu, tau=previous_est.tau)
            else:
                est = Estimator(T, B, C_normal, C_tumor_tot,
                                C_tumor_minor, D, purity, nb_clones,
                                inputMU=prefit_inputMU, nu=inputNu)

            est.fit()
            criterion[i, j] = est.get_bic_heuristics(**model_selection_kws)
            loglikelihood[i, j] = est.get_loglikelihood
            est_matrix[i, j] = est
        if j > 1:
            # early stop: the criterion averaged over seeds has strictly
            # decreased over the last two increments of the number of clones
            bm = criterion.mean(axis=0)
            if (bm[j-2] > bm[j-1]) and (bm[j-2] > bm[j]) and (bm[j-1] > bm[j]):
                print('stopped and chosen number of clones is ', nb_clones - 2)
                print(loglikelihood.mean(axis=0))
                print(bm)
                break
    # NOTE(review): after an early stop, the same three lines are printed a
    # second time here
    print('stopped and chosen number of clones is ', nb_clones - 2)
    print(loglikelihood.mean(axis=0))
    print(bm)

    # get best run
    chosen_nb_clones = max(nb_clones - 2, 1)
    chosen_nb_clones_idx = chosen_nb_clones - 1
    # NOTE(review): loglikelihood_nopi is allocated above but never written in
    # this function, so this argmin runs over zeros and i_best is always 0
    # (the first seed) -- confirm which array was intended
    i_best = np.argmin(loglikelihood_nopi[:, chosen_nb_clones_idx])
    est_best = est_matrix[i_best, chosen_nb_clones_idx]
    # sc = single clone
    sc_est_best = est_matrix[i_best, 0]

    # post-hoc adjustments: small clones first, then small signatures
    est_best_big_clones = remove_small_clones(est_best, min_mut_clone,
                                              prefit_inputMU)
    new_est, new_sc_est, new_inputMU = remove_small_sigs(est_best_big_clones,
                                                         sc_est_best,
                                                         min_prop_sig,
                                                         prefit_inputMU)
    print(np.repeat(new_sc_est.pi, new_est.J, axis=0).shape, new_inputMU.shape, new_est.J)
    # estimator for the "constant signature" hypothesis: every clone gets the
    # single-clone exposures; no fit is run on it here
    cst_est = Estimator(T, B, C_normal, C_tumor_tot,
                        C_tumor_minor, D, purity, new_est.J,
                        inputMU=new_inputMU,
                        pi=np.repeat(new_sc_est.pi, new_est.J, axis=0),
                        phi=new_est.phi, tau=new_est.tau, xi=new_est.xi,
                        nu=inputNu)
    dof_test = get_ll_test_dof(new_inputMU, new_est.J, new_est.N)
    lr, p = lrtest(cst_est.get_loglikelihood,
                   new_est.get_loglikelihood, dof_test)
    return new_est, lr, p, new_inputMU, cst_est, future_sigs
def fit_model_special(T, B, C_normal, C_tumor_tot, C_tumor_minor, D, purity,
                      inputMU, nb_fits=1, seeds=None, max_nb_clones=6, extra=4):
    """
    Fit Estimators with 1 to max_nb_clones + extra clones, for each seed, and
    report the number of clones selected by several model selection metrics.

    possible metrics : F, loglikelihood, BIC, AIC, AICc, ICL_q, ICL_qn, SH.

    Parameters
    ----------
    T, B, C_normal, C_tumor_tot, C_tumor_minor, D, purity :
        data passed unchanged to the Estimator constructor
    inputMU : array-like
              signature matrix; its first dimension L is the number of
              signatures
    nb_fits : integer
              number of fits (one per seed)
    seeds : iterable of length nb_fits, or None
            seeds for the random generator; defaults to 0 .. nb_fits-1
    max_nb_clones : integer
    extra : integer
            number of additional clone numbers fitted beyond max_nb_clones

    Returns
    -------
    dict_results : dict
                   number of clones selected by each metric, with keys 'bic',
                   'aic', 'aicc', 'icl_q', 'icl_qn', 'bic_alt',
                   'sh_<mc>' for each mc of the slope heuristics loop, and
                   'max_curvature'
    ll : array
         loglikelihood averaged over seeds, one value per number of clones

    Raises
    ------
    ValueError
        if seeds is iterable and its length differs from nb_fits
    """
    L = inputMU.shape[0]
    if isinstance(seeds, Iterable):
        if len(seeds) != nb_fits:
            raise ValueError("Number of seeds is incompatible with number of required fits")
    if seeds is None:
        seeds = list(range(nb_fits))
    # one row per seed, one column per number of clones
    # NOTE: Fres is filled below but is not part of the returned values
    Fres = np.zeros((nb_fits, max_nb_clones+extra))
    bic = np.zeros((nb_fits, max_nb_clones+extra))
    loglikelihood = np.zeros((nb_fits, max_nb_clones+extra))
    aic = np.zeros((nb_fits, max_nb_clones+extra))
    aicc = np.zeros((nb_fits, max_nb_clones+extra))
    icl_q = np.zeros((nb_fits, max_nb_clones+extra))
    icl_qn = np.zeros((nb_fits, max_nb_clones+extra))
    bic_alt = np.zeros((nb_fits, max_nb_clones+extra))
    for i, s in enumerate(seeds):
        np.random.seed(s)
        for j, nb_clones in enumerate(range(1, max_nb_clones+1+extra)):
            print(j, i)
            if nb_clones >= 2:
                # `est` is the fit obtained with nb_clones - 1 clones for
                # this seed; split its clone with the largest entropy term
                # computed from qun
                to_split = np.argmax(-(est.qun * np.log(est.qun)).sum(axis=0))
                mask = np.ones(nb_clones-1, dtype=bool)
                mask[to_split] = 0
                # clones that are not split keep their parameters; the two
                # new clones take the last two positions
                new_phi = np.zeros(nb_clones)
                new_phi[:nb_clones - 2] = est.phi[mask]
                # random phi in [0.1, 0.9) for the two new clones
                new_phi[-2] = np.random.ranf() * 0.8 + 0.1
                new_phi[-1] = np.random.ranf() * 0.8 + 0.1
                new_xi = np.zeros(nb_clones)
                new_xi[:nb_clones - 2] = est.xi[mask]
                # both new clones receive the xi of the split clone
                # NOTE(review): new_xi is not rescaled here, so it does not
                # sum to the same total as est.xi -- confirm Estimator
                # handles this
                new_xi[-1], new_xi[-2] = [est.xi[to_split]] * 2
                new_pi = np.zeros((nb_clones, inputMU.shape[0]))
                new_pi[:nb_clones - 2, :] = est.pi[mask, :]
                new_pi[-1, :] = np.random.dirichlet(alpha=np.ones(inputMU.shape[0]))
                new_pi[-2, :] = np.random.dirichlet(alpha=np.ones(inputMU.shape[0]))
                est = Estimator(T, B, C_normal, C_tumor_tot,
                                C_tumor_minor, D, purity, nb_clones,
                                inputMU=inputMU, pi=new_pi, phi=new_phi, xi=new_xi)
            else:
                est = Estimator(T, B, C_normal, C_tumor_tot,
                                C_tumor_minor, D, purity, nb_clones,
                                inputMU=inputMU)
            est.fit()
            print(nb_clones, est.tau)
            Fres[i, j] = est.Fs[-1]
            bic[i, j] = est.get_bic()
            # placeholder: bic_alt is always NaN in this function
            bic_alt[i, j] = np.nan
            loglikelihood[i, j] = est.get_loglikelihood
            aic[i, j] = est.get_aic()
            aicc[i, j] = est.get_aicc()
            icl_q[i, j] = est.get_icl()
            icl_qn[i, j] = est.get_icl(norm=True)
    # for each criterion, number of clones (column index + 1) maximizing its
    # average over seeds
    dict_results = {'bic': np.argmax(bic.mean(axis=0)) + 1,
                    'aic': np.argmax(aic.mean(axis=0)) + 1,
                    'aicc': np.argmax(aicc.mean(axis=0)) + 1,
                    'icl_q': np.argmax(icl_q.mean(axis=0)) + 1,
                    'icl_qn': np.argmax(icl_qn.mean(axis=0)) + 1,
                    'bic_alt': np.argmax(bic_alt.mean(axis=0)) + 1}

    # compute SH estimate
    # NOTE(review): chpt has mc - 1 entries and chpt[1] is read below, so this
    # loop needs mc >= 3, i.e. max_nb_clones >= 5 -- smaller values look
    # unsupported, confirm
    for mc in range(max_nb_clones-2, max_nb_clones + extra + 1):
        slopes = list()
        chpt = list()
        for end_p in range(0, mc-1):
            # linear regression of the mean loglikelihood against
            # nb_clones * (L + 1), over the numbers of clones end_p+1 .. mc
            ransac = linear_model.LinearRegression()
            ransac.fit(((np.array(range(end_p+1, mc+1)))*(L+1)).reshape(-1, 1), loglikelihood.mean(axis=0)[end_p:mc])
            slopes.append(ransac.coef_)
            # print('pen', mc, end_p, loglikelihood[0][:mc] - np.arange(1, len(loglikelihood[0][:mc])+1) * (M+1) * 2 * ransac.estimator_.coef_)
            # number of clones maximizing the loglikelihood penalized by
            # twice the (non-negative) fitted slope
            chpt.append(np.argmax(loglikelihood.mean(axis=0)[:mc] - np.arange(1, len(loglikelihood.mean(axis=0)[:mc])+1) * (L+1) * 2 * max(ransac.coef_, 0.0))+1)
        chpt = np.array(chpt)
        diff = chpt[1:] - chpt[0:-1]
        # index of the first decrease in the selected number of clones
        # (argmax also returns 0 when there is no decrease at all)
        last_point = np.argmax(diff<0)
        if (last_point == 0) & (chpt[1] >= chpt[0]):
            last_point = mc
        # most frequent selected number of clones up to last_point
        counts = np.bincount(chpt[:last_point+1])
        # b = counts[::-1]
        # final_nb_clones = len(b) - np.argmax(b) - 1
        final_nb_clones = np.argmax(counts)
        dict_results['sh_{}'.format(mc)] = final_nb_clones

    ll = loglikelihood.mean(axis=0)
    # number of clones with the largest absolute second difference of the
    # mean loglikelihood
    dict_results['max_curvature'] = np.argmax(
        np.abs(ll[2:] + ll[0: -2] - 2 * ll[1: -1])) + 2
    return dict_results, ll