Esempio n. 1
0
def sample_cv(dataset, model_name, num_folds, fold, use_cosmic, num_signatures,
              shuffle_seed, random_seed, max_iterations, epsilon, out_dir):
    """Train and evaluate a model on one fold of a sample-wise cross-validation.

    The data is split with ``split_train_test_sample_cv``, a model is fitted
    with ``train_test_stickysig`` and the train/test log-likelihoods together
    with the learned parameters are written with ``save_json``.

    Args:
        dataset: Dataset name, forwarded to ``get_data_by_model_name``.
        model_name: Model name; selects the data and names an output folder.
        num_folds: Total number of folds.
        fold: Index of the held-out fold; must satisfy
            ``0 <= fold < num_folds``.
        use_cosmic: If truthy, the active COSMIC signatures are used
            ("refit"); otherwise ``signatures`` is ``None`` ("denovo").
        num_signatures: Number of signatures. Overridden by the number of
            active signatures when ``use_cosmic`` is truthy or when it is 0.
        shuffle_seed: Seed forwarded to the train/test split.
        random_seed: Seed forwarded to training; ``0`` is replaced by the
            current Unix time.
        max_iterations: Forwarded to ``train_test_stickysig``.
        epsilon: Forwarded to ``train_test_stickysig``.
        out_dir: Root directory under which the result folder is created.

    Returns:
        None. Returns early (without training) when a result file for the
        same configuration and seed already exists.

    Raises:
        ValueError: If ``fold`` is not in ``[0, num_folds)``.
        OSError: If the output directory cannot be created.
    """
    # Reject negative folds as well; they would otherwise silently index
    # from the end (or fail later, after the data has been loaded).
    if not 0 <= fold < num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(
            num_folds, fold))

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    signatures = None
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)

    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), str(shuffle_seed),
                           str(num_folds), str(fold))

    # exist_ok tolerates an already existing folder but still surfaces real
    # failures (e.g. permissions) before the expensive training starts.
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    # presumably save_json appends the '.json' suffix - verify against its
    # definition.
    if os.path.isfile(out_file + '.json'):
        print(
            'Experiment with parameters {} {} {} {} {} {} {} {} already exist'.
            format(dataset_name, model_name, num_folds, fold, use_cosmic,
                   num_signatures, shuffle_seed, random_seed))
        return

    train_data, test_data = split_train_test_sample_cv(dataset, num_folds,
                                                       fold, shuffle_seed)

    model, train_ll, test_ll = train_test_stickysig(train_data, test_data,
                                                    num_signatures, signatures,
                                                    random_seed, epsilon,
                                                    max_iterations)
    parameters = model.get_params()

    # Convert array-valued parameters to plain lists before saving.
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
Esempio n. 2
0
def leave_one_chromosome_out(dataset, model_name, chromosome, use_cosmic,
                             num_signatures, random_seed, max_iterations,
                             epsilon, out_dir):
    """Train on all chromosomes but one and evaluate on the held-out one.

    The data is split with ``split_train_test_loco``, a model is fitted with
    ``train_test_stickysig`` and the train/test log-likelihoods together with
    the learned parameters are written with ``save_json``.

    Args:
        dataset: Dataset name, forwarded to ``get_data_by_model_name``.
        model_name: Model name; selects the data and names an output folder.
        chromosome: Index into ``['1', ..., '22', 'X', 'Y']`` of the held-out
            chromosome; the index itself is forwarded to the split function.
        use_cosmic: If truthy, the active COSMIC signatures are used
            ("refit"); otherwise ``signatures`` is ``None`` ("denovo").
        num_signatures: Number of signatures. Overridden by the number of
            active signatures when ``use_cosmic`` is truthy or when it is 0.
        random_seed: Seed forwarded to training; ``0`` is replaced by the
            current Unix time.
        max_iterations: Forwarded to ``train_test_stickysig``.
        epsilon: Forwarded to ``train_test_stickysig``.
        out_dir: Root directory under which the result folder is created.

    Returns:
        None. Returns early (without training) when a result file for the
        same configuration and seed already exists.

    Raises:
        IndexError: If ``chromosome`` is not a valid index into the 24 names.
        OSError: If the output directory cannot be created.
    """
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'

    all_chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']
    chromosome_name = all_chromosomes[chromosome]

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    signatures = None
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), chromosome_name)

    # exist_ok tolerates an already existing folder but still surfaces real
    # failures (e.g. permissions) before the expensive training starts.
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    # presumably save_json appends the '.json' suffix - verify against its
    # definition.
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} already exist'.
              format(dataset_name, model_name, chromosome, use_cosmic,
                     num_signatures, random_seed))
        return

    train_data, test_data = split_train_test_loco(dataset, chromosome)

    model, train_ll, test_ll = train_test_stickysig(train_data, test_data,
                                                    num_signatures, signatures,
                                                    random_seed, epsilon,
                                                    max_iterations)
    parameters = model.get_params()

    # Convert array-valued parameters to plain lists before saving.
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {
        'log-likelihood-train': train_ll,
        'log-likelihood-test': test_ll,
        'parameters': parameters
    }
    save_json(out_file, out)
Esempio n. 3
0
def plot_sig_correlations(dataset, num_sigs, beta_loss=2, plot_title=True, save_plot=True):
    """Plot how well learned signatures match the COSMIC signatures.

    For each method (MIX, NMF and, for ``'MSK-ALL'`` only, NMF on the
    ``'clustered-MSK-ALL'`` data) the similarities returned by
    ``get_signatures_correlations`` are sorted in decreasing order and drawn
    against their rank. Each point is annotated with the 1-based index of the
    matched signature, and a summary line is printed per method.

    Args:
        dataset: Dataset name used to locate the trained de-novo MIX models
            and to load the data for NMF.
        num_sigs: Number of signatures; selects the MIX model (lowest BIC
            among models with that many signatures) and the NMF rank.
        beta_loss: Forwarded to ``learn_NMF``.
        plot_title: If truthy, add a title with the number of signatures.
        save_plot: If truthy, save the figure as
            ``results/signatures_similarity/<num_sigs>-signatures.pdf`` under
            ``ROOT_DIR``.

    Returns:
        None.
    """
    cosmic_signatures = get_cosmic_signatures()
    mix_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/denovo'.format(dataset))
    scores_dict = get_best_model(mix_dir, return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_signatures = scores_dict['num_signatures']
    model_paths = scores_dict['model_paths']
    signatures_dict = {}

    # MIX signatures: best (lowest BIC) model with the requested size
    indices = num_signatures == num_sigs
    best_model_path = model_paths[indices][np.argmin(BIC_scores[indices])]
    signatures_dict['MIX'] = np.array(load_json(best_model_path)['parameters']['e'])

    # NMF signatures
    data, _ = get_data(dataset)
    signatures_dict['NMF'] = learn_NMF(data, num_sigs, beta_loss=beta_loss).copy()

    # clustered-NMF signatures (if needed)
    if dataset == 'MSK-ALL':
        data, _ = get_data('clustered-MSK-ALL')
        signatures_dict['clustered-NMF'] = learn_NMF(data, num_sigs, beta_loss=beta_loss).copy()

    plt.rcParams.update({'font.size': 12})
    x_axis = np.array([str(i + 1) for i in range(num_sigs)])
    # '_nolegend_' keeps the threshold line out of the legend so the labels
    # passed to plt.legend below pair up with the method curves only.
    plt.axhline(0.80, color='grey', linestyle='--', label='_nolegend_')
    legends = []
    for model_idx, (model, e) in enumerate(signatures_dict.items()):
        legends.append(model)
        sigs, corrs = get_signatures_correlations(e, cosmic_signatures)
        order = np.argsort(-corrs)
        sigs, corrs = sigs[order], corrs[order]
        color = 'C{}'.format(model_idx)
        # The colour comes from the keyword only; a colour letter in the fmt
        # string would be overridden and makes matplotlib emit a warning.
        plt.plot(x_axis, corrs, '.-', color=color)
        for rank, (sig, corr) in enumerate(zip(sigs, corrs)):
            plt.annotate(str(sig + 1), (rank, corr + 0.002), color=color)
        print('{} - {} - {} - {}'.format(model, sigs.tolist(), corrs.tolist(), sum(corrs)))

    plt.yticks(np.arange(2, 6) * 0.2)
    plt.ylabel('Cosine similarity', fontsize='large')
    plt.xlabel('Rank of signature', fontsize='large')
    plt.legend(legends, loc='lower left')
    if plot_title:
        plt.title('{} signatures'.format(num_sigs))
    if save_plot:
        plot_dir = os.path.join(ROOT_DIR, 'results', 'signatures_similarity')
        # savefig does not create missing directories.
        os.makedirs(plot_dir, exist_ok=True)
        plt.savefig(os.path.join(plot_dir, '{}-signatures.pdf'.format(num_sigs)))
Esempio n. 4
0
def train_model(dataset, model_name, use_cosmic, num_signatures, random_seed,
                max_iterations, epsilon, out_dir):
    """Train a model on a whole dataset and save its parameters.

    A model is fitted with ``train_stickysig`` and its log-likelihood together
    with the learned parameters are written with ``save_json``.

    Args:
        dataset: Dataset name, forwarded to ``get_data_by_model_name``.
        model_name: Model name; selects the data and names an output folder.
        use_cosmic: If truthy, the active COSMIC signatures are used
            ("refit"); otherwise ``signatures`` is ``None`` ("denovo").
        num_signatures: Number of signatures. Overridden by the number of
            active signatures when ``use_cosmic`` is truthy or when it is 0.
        random_seed: Seed forwarded to training; ``0`` is replaced by the
            current Unix time.
        max_iterations: Forwarded to ``train_stickysig``.
        epsilon: Forwarded to ``train_stickysig``.
        out_dir: Root directory under which the result folder is created.

    Returns:
        None. Returns early (without training) when a result file for the
        same configuration and seed already exists.

    Raises:
        OSError: If the output directory cannot be created.
    """
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)
    signatures = None
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print(
            'use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'
            .format(len(active_signatures)))
        num_signatures = len(active_signatures)

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures))

    # exist_ok tolerates an already existing folder but still surfaces real
    # failures (e.g. permissions) before the expensive training starts.
    os.makedirs(out_dir, exist_ok=True)

    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    # presumably save_json appends the '.json' suffix - verify against its
    # definition.
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exist'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        return

    model, ll = train_stickysig(dataset, num_signatures, signatures,
                                random_seed, epsilon, max_iterations)
    parameters = model.get_params()

    # Convert array-valued parameters to plain lists before saving.
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)
Esempio n. 5
0
def ROC_HRD():
    """Compare several scores as predictors of the HRD labels via ROC curves.

    Loads the ``'nature2019-panel'`` mutations and ``'nature2019-labels'``
    labels, computes one score per sample for every entry of ``models``,
    prints the ROC AUC, draws all ROC curves into one figure saved as
    ``results/HRD/ROC_HRD.pdf`` and writes the AUC table to
    ``results/HRD/ROC_AUC.tsv`` (both under ``ROOT_DIR``).

    Returns:
        None.
    """
    from src.analyze_results import get_best_model, stack_nnls
    import matplotlib.pyplot as plt
    from src.utils import get_data, get_cosmic_signatures
    import pandas as pd
    from sklearn.metrics import roc_curve
    from sklearn.metrics import roc_auc_score
    import seaborn as sns

    out_dir = os.path.join(ROOT_DIR, 'results', 'HRD')
    # Tolerate an existing folder, but do not hide real creation failures.
    os.makedirs(out_dir, exist_ok=True)

    # Loading data
    test_mutations, signatures = get_data('nature2019-panel')
    test_labels, _ = get_data('nature2019-labels')

    # fixing to 0, 1. Removing intermediate hrd
    test_labels = test_labels[:, 0]
    test_data_samples = test_labels != 0  # label 0 marks the samples to drop
    test_labels = test_labels[test_data_samples]
    test_labels[test_labels == -1] += 1  # -1 -> 0, so labels become {0, 1}
    # models = ['MIX Sig3', 'SigMA', 'TMB', 'NNLS Sig3', 'WGS Sig3']
    models = ['MIX Sig3', 'SigMA', 'NNLS Sig3']
    scores = []
    for model in models:
        if 'MIX Sig3' in model:
            # MIX
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/BRCA-panel/refit')
            mix = get_best_model(directory, return_model=True)
            test_data = mix.weighted_exposures(test_mutations)

            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures
            # presumably column 2 is Signature 3 among the active signatures
            test_data = test_data[:, [2]]

        elif model == 'SigMA':
            sigma_output = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(sigma_output, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(sigma_output, sep=',')
            test_data = all_df[['Signature_3_mva']].values

        elif model == 'TMB':
            test_data = test_mutations.sum(1, keepdims=True)

        elif model == 'WGS Sig3':
            # NNLS on WGS
            a, _ = get_data('nature2019-full')
            test_data = stack_nnls(a, get_cosmic_signatures()[signatures])
            test_data = test_data[:, [2]]

        else:
            # NNLS on panel
            test_data = stack_nnls(test_mutations, get_cosmic_signatures()[signatures])
            test_data = test_data[:, [2]]

        test_data = test_data[test_data_samples]

        # Test estimator on data
        prediction_probabilities = test_data
        auc_roc = roc_auc_score(test_labels, prediction_probabilities)
        print(model, 'auc: {:.2f}'.format(auc_roc))
        scores.append([model, '{:.2f}'.format(auc_roc)])
        fpr, tpr, _ = roc_curve(test_labels, prediction_probabilities)
        # x/y must be keywords: seaborn >= 0.12 rejects them as positionals.
        sns.lineplot(x=fpr, y=tpr, ci=None)

    plt.legend(models, loc='lower right')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig(os.path.join(out_dir, 'ROC_HRD.pdf'))
    # plt.show()
    np.savetxt(os.path.join(out_dir, 'ROC_AUC.tsv'), scores, '%s', '\t')
Esempio n. 6
0
def ROC_immunotherapy():
    """Compare several scores as predictors of the msk2018-LUAD labels.

    Loads the ``'msk2018-LUAD'`` mutations and ``'msk2018-LUAD-labels'``
    labels, computes one score per sample for every entry of ``models``,
    prints the ROC AUC, draws all ROC curves into one figure saved as
    ``results/immunotherapy/ROC_immunotherapy.pdf`` and writes the AUC table
    to ``results/immunotherapy/ROC_AUC.tsv`` (both under ``ROOT_DIR``).

    Returns:
        None.
    """
    from src.analyze_results import get_best_model, stack_nnls, get_signatures_correlations
    import matplotlib.pyplot as plt
    from src.utils import get_data, get_cosmic_signatures
    from sklearn.metrics import roc_curve
    from sklearn.metrics import roc_auc_score
    import seaborn as sns

    out_dir = os.path.join(ROOT_DIR, 'results', 'immunotherapy')
    # Tolerate an existing folder, but do not hide real creation failures.
    os.makedirs(out_dir, exist_ok=True)

    # Loading data
    test_mutations, _ = get_data('msk2018-LUAD')
    # The active signatures are taken from MSK-ALL, the dataset the MIX
    # models below were trained on.
    _, sigs = get_data('MSK-ALL')
    test_labels, _ = get_data('msk2018-LUAD-labels')

    models = ['MIX refit Sig4', 'MIX denovo M5', 'TMB', 'NNLS Sig4']
    scores = []
    for model in models:
        if 'MIX refit Sig4' in model:
            # MIX
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/refit')
            mix = get_best_model(directory, return_model=True)
            test_data = mix.weighted_exposures(test_mutations)

            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures
            # presumably column 3 is Signature 4 among the active signatures
            test_data = test_data[:, [3]]

        elif 'MIX denovo M5' in model:
            # MIX
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
            mix = get_best_model(directory, return_model=True)

            test_data = mix.weighted_exposures(test_mutations)

            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures

            # Pick the learned signature with the largest correlation value
            # against the same reference signature used by the refit branch.
            sig4 = get_cosmic_signatures()[sigs][[3]]
            _, corrs = get_signatures_correlations(mix.e, sig4)
            best_sig = np.argmax(corrs)
            test_data = test_data[:, [best_sig]]

        elif model == 'TMB':
            test_data = test_mutations.sum(1, keepdims=True)

        else:
            # NNLS
            test_data = stack_nnls(test_mutations, get_cosmic_signatures()[sigs])
            test_data = test_data[:, [3]]

        # Test estimator on data
        prediction_probabilities = test_data
        auc_roc = roc_auc_score(test_labels, prediction_probabilities)
        print(model, 'auc: {:.2f}'.format(auc_roc))
        scores.append([model, '{:.2f}'.format(auc_roc)])
        fpr, tpr, _ = roc_curve(test_labels, prediction_probabilities)
        # x/y must be keywords: seaborn >= 0.12 rejects them as positionals.
        sns.lineplot(x=fpr, y=tpr, ci=None)

    plt.legend(models, loc='lower right')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig(os.path.join(out_dir, 'ROC_immunotherapy.pdf'))
    # plt.show()
    np.savetxt(os.path.join(out_dir, 'ROC_AUC.tsv'), scores, '%s', '\t')
Esempio n. 7
0
def compare_panel_clusters():
    """Compare MIX and SigMA clusterings against full-data exposure similarity.

    NNLS exposures are computed on the ``'nature2019-full'`` data and their
    pairwise cosine similarity is split, per clustering, into pairs that share
    a cluster and pairs that do not. For each method 200 pairs of each kind
    are sampled without replacement (fixed NumPy seed), a rank-sum test and
    the two means are printed, and a split violin plot is saved as
    ``results/clusters_quality/clusters_quality.pdf`` under ``ROOT_DIR``.

    Returns:
        None.

    Raises:
        ValueError: From ``np.random.choice`` if a method yields fewer than
            200 intra- or inter-cluster pairs.
    """
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    # NOTE(review): panel_data is loaded but never used; the MIX clusters
    # below are computed from full_data - confirm this is intended.
    panel_data, _ = get_data(panel_dataset)
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)

    full_data_exposures = stack_nnls(full_data, signatures)

    full_data_exposures_dists = cosine_similarity(full_data_exposures)

    # All unordered sample pairs (i < j), in the same row-major order a nested
    # ``for i / for j in range(i + 1, n)`` loop would visit them, so the
    # seeded sampling below selects the same pairs.
    pair_rows, pair_cols = np.triu_indices(num_samples, k=1)
    pair_similarities = full_data_exposures_dists[pair_rows, pair_cols]

    corrs = []
    models = []
    relations = []

    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))

            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)

        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            clusters = all_df['categ'].values
            # Map the category labels to consecutive integer ids.
            cluster_to_num = {c: i for i, c in enumerate(np.unique(clusters))}
            clusters = np.array([cluster_to_num[c] for c in clusters])

        else:
            raise ValueError('error')

        same_cluster = clusters[pair_rows] == clusters[pair_cols]
        dists_in_clusters = pair_similarities[same_cluster]
        dists_out_clusters = pair_similarities[~same_cluster]

        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        models.extend([model] * len(dists_out_clusters) * 2)
        relations.extend(['Intra-cluster pairs'] * len(dists_out_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))

        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters), np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = {'Cosine similarity': corrs, 'model': models, 'relation': relations}
    df = pd.DataFrame(df)
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df, split=True, inner='stick')

    plt.xlabel('')
    out_dir = os.path.join(ROOT_DIR, 'results', 'clusters_quality')
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)
    plt.savefig(os.path.join(out_dir, 'clusters_quality.pdf'))
Esempio n. 8
0
def RE(dataset, models=None, computation='mutations'):
    """Compute the mean reconstruction error of several exposure estimators.

    For every down-sampled dataset found under
    ``experiments/trained_models`` (folders whose name contains ``dataset``
    and ``'part'``), exposures are estimated on the two parts and scored
    against the normalized full data with the selected RE function. The MIX
    variants are cross-applied: the model trained on part 1 is applied to
    part 2 and vice versa.

    Args:
        dataset: ``'BRCA'`` or ``'OV'``.
        models: Iterable of estimator names among
            ``'MIX-conditional-clustering'``, ``'MIX-soft-clustering'``,
            ``'NNLS'`` and ``'SigMA'``. Defaults to the first three.
        computation: ``'mutations'`` (uses ``compute_RE_per_sample``) or
            ``'exposures'`` (uses ``compute_exposures_RE_per_sample``).

    Returns:
        pandas.DataFrame with one row per down-sampled dataset and one column
        per model, holding the sum of the per-sample results divided by their
        number.

    Raises:
        ValueError: If ``dataset`` or ``computation`` is not one of the
            supported values.
    """
    # Handle input
    if models is None:
        models = ['MIX-conditional-clustering', 'MIX-soft-clustering', 'NNLS']

    if dataset == 'BRCA':
        full_dataset_name = 'ICGC-BRCA'
    elif dataset == 'OV':
        full_dataset_name = 'TCGA-OV'
    else:
        raise ValueError('{} is no a valid dataset'.format(dataset))

    if computation == 'mutations':
        RE_func = compute_RE_per_sample
    elif computation == 'exposures':
        RE_func = compute_exposures_RE_per_sample
    else:
        raise ValueError('{} is no a valid computation inpute'.format(computation))

    # Prepare data
    full_data, active_signatures = get_data(full_dataset_name)
    normalized_full_data = full_data / full_data.sum(1, keepdims=1)
    full_panel_data, _ = get_data('{}-panel-full'.format(dataset))
    normalized_full_panel_data = full_panel_data / full_panel_data.sum(1, keepdims=1)

    signatures = get_cosmic_signatures()[active_signatures]

    trained_model_dir = os.path.join(ROOT_DIR, 'experiments/trained_models')
    all_experiments = os.listdir(trained_model_dir)
    # '<name>-part1' and '<name>-part2' collapse onto the single key '<name>'.
    results = {s.split('-par')[0]: {} for s in all_experiments if dataset in s and 'part' in s}
    for ds_dataset_name in results:
        results[ds_dataset_name] = {model: [] for model in models}

        # Prepare data
        ds_dataset_part1_name = ds_dataset_name + '-part' + str(1)
        ds_dataset_part2_name = ds_dataset_name + '-part' + str(2)

        ds_data_part1, _ = get_data(ds_dataset_part1_name)
        ds_data_part2, _ = get_data(ds_dataset_part2_name)
        # np.vstack is the non-deprecated equivalent of np.row_stack.
        ds_data = np.vstack((ds_data_part1, ds_data_part2))

        if 'panel' in ds_dataset_name:
            curr_normalized_full_data = normalized_full_panel_data
        else:
            curr_normalized_full_data = normalized_full_data

        # Find the best model
        mix_part1 = get_best_model(os.path.join(trained_model_dir, ds_dataset_part1_name, 'refit'), return_model=True)
        mix_part2 = get_best_model(os.path.join(trained_model_dir, ds_dataset_part2_name, 'refit'), return_model=True)

        # MIX RE with cluster's pi using conditional probability
        if 'MIX-conditional-clustering' in models:
            clusters = np.argmax(mix_part1.soft_cluster(ds_data_part2), axis=1)
            exposures_part2 = mix_part1.pi[clusters]
            clusters = np.argmax(mix_part2.soft_cluster(ds_data_part1), axis=1)
            exposures_part1 = mix_part2.pi[clusters]
            exposures = np.vstack((exposures_part1, exposures_part2))
            results[ds_dataset_name]['MIX-conditional-clustering'] = \
                RE_func(curr_normalized_full_data, exposures, signatures)

        # MIX RE with weighted cluster pi
        if 'MIX-soft-clustering' in models:
            exposures_part2 = mix_part1.weighted_exposures(ds_data_part2)
            exposures_part1 = mix_part2.weighted_exposures(ds_data_part1)
            exposures = np.vstack((exposures_part1, exposures_part2))
            results[ds_dataset_name]['MIX-soft-clustering'] = \
                RE_func(curr_normalized_full_data, exposures, signatures)

        # NNLS RE
        if 'NNLS' in models:
            exposures = np.array([nnls(signatures.T, m)[0] for m in ds_data])
            results[ds_dataset_name]['NNLS'] = RE_func(curr_normalized_full_data, exposures, signatures)

        if 'SigMA' in models:
            if 'panel' in ds_dataset_name:
                path = 'data/sigma_exposure/out-sigma-{}-panel-full.tsv'.format(dataset.lower())
            else:
                ds = ds_dataset_name.split('ds')[-1]
                path = 'data/sigma_exposure/out-sigma-{}-downsize{}.tsv'.format(dataset.lower(), ds.zfill(3))
            # NOTE(review): unlike the other paths here this one is not joined
            # with ROOT_DIR - confirm sigma_output_to_exposures resolves it.
            exposures = sigma_output_to_exposures(path)
            exposures /= exposures.sum(1, keepdims=True)
            # SigMA exposures are scored against all COSMIC signatures, not
            # only the active ones.
            results[ds_dataset_name]['SigMA'] = RE_func(curr_normalized_full_data, exposures, get_cosmic_signatures())

    # Shallow analysis (no p-values)
    summed_RE_results = {}
    for s in results:
        summed_RE_results[s] = {}
        for m in results[s]:
            summed_RE_results[s][m] = np.sum(results[s][m]) / len(results[s][m])

    df = pd.DataFrame(summed_RE_results).T
    return df
Esempio n. 9
0
def _plot_cluster_scores(range_clusters, curves, row, computation, title, out_path):
    """Draw one score-vs-number-of-clusters figure and save it to ``out_path``."""
    # A fresh figure per plot: drawing on the current one would carry the
    # curves of a previously saved plot over into this one.
    plt.figure()
    for label, scores in curves:
        plt.plot(range_clusters, scores[row], label=label)
    plt.title(title)
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(out_path)
    plt.close()


def plot_cluster_AMI(range_clusters, computation='AMI'):
    """Score clusterings of MSK-ALL against cancer types and plot the results.

    For every number of clusters in ``range_clusters`` the samples are
    clustered with the best de-novo MIX model, the best refit MIX run, KMeans
    on the raw data and KMeans on NNLS exposures. Each clustering is scored
    against the per-sample cancer-type ids, once on all samples and once on
    samples with at least 10 mutations. Two figures are saved under
    ``results/AMI`` in ``ROOT_DIR``: ``cluster_score_all.pdf`` and
    ``cluster_score_filtered.pdf``.

    Args:
        range_clusters: Sequence of cluster counts to evaluate.
        computation: ``'AMI'``, ``'MI'`` or ``'jaccard'``; selects the score
            function. With ``'MI'`` the soft-clustering MI scores of the two
            MIX variants are plotted as well.

    Returns:
        None.

    Raises:
        ValueError: If ``computation`` is not one of the supported values.
    """
    if computation == 'AMI':
        score_func = AMI_score
    elif computation == 'MI':
        score_func = MI_score
    elif computation == 'jaccard':
        score_func = Jaccard_score
    else:
        raise ValueError('{} is not a valid computation'.format(computation))

    rich_sample_threshold = 10
    data, active_signatures = get_data('MSK-ALL')
    signatures = get_cosmic_signatures()[active_signatures]

    nnls_exposures = np.zeros((len(data), len(signatures)))
    for i in range(len(data)):
        nnls_exposures[i] = nnls(signatures.T, data[i])[0]

    num_mutations_per_sample = data.sum(1)
    rich_samples = num_mutations_per_sample >= rich_sample_threshold

    all_df = pd.read_csv(os.path.join(ROOT_DIR, 'data/MSK-processed/oncotype_counts.txt'), sep='\t')
    all_df['Counts'] = all_df['Counts'].astype(int)
    all_df = all_df[all_df['Counts'] > 100]
    cancer_types = np.array(all_df['Oncotree'])

    # One cancer-type id per sample, in the order the per-type files are read.
    # presumably this matches the row order of the 'MSK-ALL' data - verify.
    sample_cancer_id_assignments = []
    for i, oc in enumerate(cancer_types):
        dat_f = os.path.join(ROOT_DIR, 'data/MSK-processed/{}_counts.npy'.format(oc))
        tmp_data = np.array(np.load(dat_f, allow_pickle=True), dtype=np.float64)
        sample_cancer_id_assignments.extend([i] * len(tmp_data))
    sample_cancer_id_assignments = np.array(sample_cancer_id_assignments)
    shuffled_indices = np.arange(len(sample_cancer_id_assignments))

    # Finding best_models
    denovo_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
    BIC_summary = get_best_model(denovo_dir, return_params=True)
    BIC_scores, BIC_clusters, BIC_paths = BIC_summary['BIC_scores'], BIC_summary['num_clusters'], BIC_summary['model_paths']

    # Row 0: all samples, row 1: only samples above the mutation threshold.
    MIX_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_scores = np.zeros((2, len(range_clusters)))
    MIX_refit_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_refit_scores = np.zeros((2, len(range_clusters)))
    KMeans_scores = np.zeros((2, len(range_clusters)))
    NNLS_KMeans_scores = np.zeros((2, len(range_clusters)))
    for idx, num_clusters in enumerate(range_clusters):
        # MIX denovo: lowest BIC among models with this number of clusters
        matching = BIC_clusters == num_clusters
        best_model_path = BIC_paths[matching][np.argmin(BIC_scores[matching])]

        model = get_model(load_json(best_model_path)['parameters'])
        MIX_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX = np.argmax(MIX_soft_clustering, 1)
        MIX_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX)
        MIX_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                        sample_cluster_assignment_MIX[rich_samples])
        if computation == 'MI':
            MIX_soft_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_soft_clustering)
            MIX_soft_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples],
                                                               MIX_soft_clustering[rich_samples])

        # MIX refit
        refit_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/refit/mix_{}clusters_017signatures'.format(str(num_clusters).zfill(3)))
        model = get_model(load_json(get_best_run(refit_dir))['parameters'])
        MIX_refit_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX_refit = np.argmax(MIX_refit_soft_clustering, 1)
        MIX_refit_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX_refit)
        MIX_refit_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                              sample_cluster_assignment_MIX_refit[rich_samples])
        if computation == 'MI':
            MIX_soft_refit_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_refit_soft_clustering)
            MIX_soft_refit_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples],
                                                                     MIX_refit_soft_clustering[rich_samples])

        # KMeans clustering (fitted on shuffled rows, predicted in data order)
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        cluster_model.fit(data[shuffled_indices])
        kmeans_clusters = cluster_model.predict(data)
        KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, kmeans_clusters)
        KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                           kmeans_clusters[rich_samples])

        # NNLS + KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        cluster_model.fit(nnls_exposures[shuffled_indices])
        nnls_kmeans_clusters = cluster_model.predict(nnls_exposures)
        NNLS_KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, nnls_kmeans_clusters)
        NNLS_KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                                nnls_kmeans_clusters[rich_samples])

        print('finished {}'.format(num_clusters))

    # Curves in legend order; the soft MI variants exist only for 'MI'.
    curves = [('MIX-denovo', MIX_scores)]
    if computation == 'MI':
        curves.append(('MIX-denovo-soft', MIX_soft_scores))
    curves.append(('MIX-refit', MIX_refit_scores))
    if computation == 'MI':
        curves.append(('MIX-refit-soft', MIX_soft_refit_scores))
    curves.append(('KMeans', KMeans_scores))
    curves.append(('NNLS+KMeans', NNLS_KMeans_scores))

    out_dir = os.path.join(ROOT_DIR, 'results', 'AMI')
    # savefig does not create missing directories.
    os.makedirs(out_dir, exist_ok=True)

    _plot_cluster_scores(range_clusters, curves, 0, computation, 'All samples AMI score',
                         os.path.join(out_dir, 'cluster_score_all.pdf'))
    _plot_cluster_scores(range_clusters, curves, 1, computation, 'Filtered AMI score',
                         os.path.join(out_dir, 'cluster_score_filtered.pdf'))
    return