def sample_cv(dataset, model_name, num_folds, fold, use_cosmic, num_signatures,
              shuffle_seed, random_seed, max_iterations, epsilon, out_dir):
    """Train and test a model on one fold of a sample-level cross-validation split.

    The result (train/test log-likelihoods and model parameters) is written as a
    JSON file whose directory path encodes every experiment parameter; if that
    file already exists the experiment is skipped.

    Args:
        dataset: dataset name understood by ``get_data_by_model_name``.
        model_name: model configuration name.
        num_folds, fold: CV configuration; ``fold`` is 0-based.
        use_cosmic: if True, refit fixed COSMIC signatures (de-novo otherwise).
        num_signatures: number of signatures to learn; 0 means "use the number
            of active COSMIC signatures".
        shuffle_seed: seed for the train/test shuffling.
        random_seed: training seed; 0 means "derive a fresh seed from the clock".
        max_iterations, epsilon: optimizer stopping criteria.
        out_dir: root output directory.

    Raises:
        ValueError: if ``fold`` is not smaller than ``num_folds``.
    """
    if fold >= num_folds:
        raise ValueError('num_folds is {} but fold is {}'.format(num_folds, fold))

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)

    # With COSMIC refitting the signatures are fixed, so their number is forced.
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), str(shuffle_seed), str(num_folds), str(fold))
    os.makedirs(out_dir, exist_ok=True)

    # random_seed == 0 means "pick a fresh seed from the clock".
    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} {} {} already exist'.format(
            dataset_name, model_name, num_folds, fold, use_cosmic, num_signatures, shuffle_seed, random_seed))
        return

    train_data, test_data = split_train_test_sample_cv(dataset, num_folds, fold, shuffle_seed)
    model, train_ll, test_ll = train_test_stickysig(
        train_data, test_data, num_signatures, signatures, random_seed, epsilon, max_iterations)

    # Convert numpy arrays to lists so the parameters are JSON-serializable.
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood-train': train_ll,
           'log-likelihood-test': test_ll,
           'parameters': parameters}
    save_json(out_file, out)
def leave_one_chromosome_out(dataset, model_name, chromosome, use_cosmic,
                             num_signatures, random_seed, max_iterations, epsilon, out_dir):
    """Train a model with one chromosome held out and evaluate on that chromosome.

    ``chromosome`` is a 0-based index into the chromosome list 1..22, X, Y.
    Results are written as a JSON file whose directory path encodes the
    experiment parameters; an existing result file makes this a no-op.
    """
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'

    # Map the numeric index to a chromosome name (1..22, X, Y).
    all_chromosomes = [str(i) for i in range(1, 23)]
    all_chromosomes.extend(['X', 'Y'])
    chromosome_name = all_chromosomes[chromosome]

    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)

    # With COSMIC refitting the signatures are fixed, so their number is forced.
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name,
                           str(num_signatures), chromosome_name)
    os.makedirs(out_dir, exist_ok=True)

    # random_seed == 0 means "pick a fresh seed from the clock".
    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} {} already exist'.format(
            dataset_name, model_name, chromosome, use_cosmic, num_signatures, random_seed))
        return

    train_data, test_data = split_train_test_loco(dataset, chromosome)
    model, train_ll, test_ll = train_test_stickysig(
        train_data, test_data, num_signatures, signatures, random_seed, epsilon, max_iterations)

    # Convert numpy arrays to lists so the parameters are JSON-serializable.
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood-train': train_ll,
           'log-likelihood-test': test_ll,
           'parameters': parameters}
    save_json(out_file, out)
def plot_sig_correlations(dataset, num_sigs, beta_loss=2, plot_title=True, save_plot=True):
    """Plot cosine similarities between learned signatures and best-matching COSMIC ones.

    Compares the MIX de-novo signatures (best BIC model with ``num_sigs``
    signatures), plain NMF signatures, and — for 'MSK-ALL' — clustered-NMF
    signatures. Similarities are plotted in decreasing order per model and each
    point is annotated with the matched COSMIC signature number (1-based).
    """
    cosmic_signatures = get_cosmic_signatures()
    mix_dir = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/denovo'.format(dataset))
    scores_dict = get_best_model(mix_dir, return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_signatures = scores_dict['num_signatures']
    model_paths = scores_dict['model_paths']
    signatures_dict = {}

    # MIX signatures: lowest-BIC model among those with the requested size.
    indices = num_signatures == num_sigs
    best_model_path = model_paths[indices][np.argmin(BIC_scores[indices])]
    e = np.array(load_json(best_model_path)['parameters']['e'])
    signatures_dict['MIX'] = e.copy()

    # NMF signatures
    data, _ = get_data(dataset)
    e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
    signatures_dict['NMF'] = e.copy()

    # clustered-NMF signatures (if needed)
    if dataset == 'MSK-ALL':
        data, _ = get_data('clustered-MSK-ALL')
        e = learn_NMF(data, num_sigs, beta_loss=beta_loss)
        signatures_dict['clustered-NMF'] = e.copy()

    plt.rcParams.update({'font.size': 12})
    x_axis = np.array([str(i + 1) for i in range(num_sigs)])
    plt.axhline(0.80, color='grey', linestyle='--', label='_nolegend_')
    legends = []
    for model_idx, model in enumerate(signatures_dict.keys()):
        legends.append(model)
        e = signatures_dict[model]
        sigs, corrs = get_signatures_correlations(e, cosmic_signatures)
        # Sort once, descending by similarity (was computed twice before).
        order = np.argsort(-corrs)
        sigs = sigs[order]
        corrs = corrs[order]
        color = 'C{}'.format(model_idx)
        plt.plot(x_axis, corrs, '.-k', color=color)
        # Annotate with the 1-based COSMIC signature number. NOTE: this loop
        # previously reused the outer loop index `i`, shadowing it.
        for j in range(len(sigs)):
            plt.annotate(str(sigs[j] + 1), (j, corrs[j] + 0.002), color=color)
        print('{} - {} - {} - {}'.format(model, sigs.tolist(), corrs.tolist(), sum(corrs)))

    plt.yticks(np.arange(2, 6) * 0.2)
    plt.ylabel('Cosine similarity', fontsize='large')
    plt.xlabel('Rank of signature', fontsize='large')
    plt.legend(legends, loc='lower left')
    if plot_title:
        plt.title('{} signatures'.format(num_sigs))
    if save_plot:
        plt.savefig(os.path.join(ROOT_DIR, 'results', 'signatures_similarity',
                                 '{}-signatures.pdf'.format(num_sigs)))
def train_model(dataset, model_name, use_cosmic, num_signatures, random_seed,
                max_iterations, epsilon, out_dir):
    """Train a model on the full dataset and save its parameters and log-likelihood.

    The output JSON path encodes the experiment parameters; if a result with
    the same parameters (including seed) already exists, training is skipped.
    """
    use_cosmic_dir = 'refit' if use_cosmic else 'denovo'
    dataset_name = dataset
    dataset, active_signatures = get_data_by_model_name(dataset, model_name)

    # With COSMIC refitting the signatures are fixed, so their number is forced.
    if use_cosmic:
        num_signatures = len(active_signatures)
        signatures = get_cosmic_signatures()[active_signatures]
    elif num_signatures == 0:
        print('use_cosmic is False and num_signatures is 0, using number of active cosmic signatures {}'.format(len(active_signatures)))
        num_signatures = len(active_signatures)
        signatures = None
    else:
        signatures = None

    out_dir = os.path.join(out_dir, dataset_name, use_cosmic_dir, model_name, str(num_signatures))
    os.makedirs(out_dir, exist_ok=True)

    # random_seed == 0 means "pick a fresh seed from the clock".
    random_seed = int(time.time()) if random_seed == 0 else random_seed
    out_file = os.path.join(out_dir, str(random_seed))
    if os.path.isfile(out_file + '.json'):
        print('Experiment with parameters {} {} {} {} {} already exist'.format(
            dataset_name, model_name, use_cosmic, num_signatures, random_seed))
        return

    model, ll = train_stickysig(dataset, num_signatures, signatures, random_seed, epsilon, max_iterations)

    # Convert numpy arrays to lists so the parameters are JSON-serializable.
    parameters = model.get_params()
    parameters['alpha'] = parameters['alpha'].tolist()
    parameters['e'] = parameters['e'].tolist()
    for sample in parameters['pi']:
        parameters['pi'][sample] = parameters['pi'][sample].tolist()

    out = {'log-likelihood': ll, 'parameters': parameters}
    save_json(out_file, out)
def ROC_HRD():
    """Plot ROC curves and save AUC scores for HRD prediction on the nature2019 panel.

    Compares MIX Sig3 exposures, SigMA's Signature_3_mva score and NNLS Sig3
    exposures against the binary HRD labels. Writes ``ROC_HRD.pdf`` and
    ``ROC_AUC.tsv`` under ``results/HRD``.
    """
    from src.analyze_results import get_best_model, stack_nnls
    import matplotlib.pyplot as plt
    from src.utils import get_data, get_cosmic_signatures
    import pandas as pd
    from sklearn.metrics import roc_curve
    from sklearn.metrics import roc_auc_score
    import seaborn as sns

    os.makedirs(os.path.join(ROOT_DIR, 'results', 'HRD'), exist_ok=True)

    # Loading data
    test_mutations, signatures = get_data('nature2019-panel')
    test_labels, _ = get_data('nature2019-labels')
    # fixing to 0, 1. Removing intermediate hrd: drop label-0 samples and map
    # the remaining {-1, 1} labels onto {0, 1}.
    test_labels = test_labels[:, 0]
    test_data_samples = test_labels != 0
    test_labels = test_labels[test_data_samples]
    test_labels[test_labels == -1] += 1

    # models = ['MIX Sig3', 'SigMA', 'TMB', 'NNLS Sig3', 'WGS Sig3']
    models = ['MIX Sig3', 'SigMA', 'NNLS Sig3']
    scores = []
    for model in models:
        if 'MIX Sig3' in model:
            # MIX: use the (un-normalized) exposure of signature index 2 (Sig3).
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/BRCA-panel/refit')
            mix = get_best_model(directory, return_model=True)
            test_data = mix.weighted_exposures(test_mutations)
            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures
            test_data = test_data[:, [2]]
        elif model == 'SigMA':
            sigma_output = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(sigma_output, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(sigma_output, sep=',')
            test_data = all_df[['Signature_3_mva']].values
        elif model == 'TMB':
            # Total mutational burden.
            test_data = test_mutations.sum(1, keepdims=True)
        elif model == 'WGS Sig3':
            # NNLS on WGS
            a, _ = get_data('nature2019-full')
            test_data = stack_nnls(a, get_cosmic_signatures()[signatures])
            test_data = test_data[:, [2]]
        else:
            # NNLS on panel
            test_data = stack_nnls(test_mutations, get_cosmic_signatures()[signatures])
            test_data = test_data[:, [2]]
        test_data = test_data[test_data_samples]

        # Test estimator on data
        prediction_probabilities = test_data
        auc_roc = roc_auc_score(test_labels, prediction_probabilities)
        print(model, 'auc: {:.2f}'.format(auc_roc))
        scores.append([model, '{:.2f}'.format(auc_roc)])
        fpr, tpr, thresholds = roc_curve(test_labels, prediction_probabilities)
        sns.lineplot(fpr, tpr, ci=None)

    plt.legend(models, loc='lower right')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'HRD', 'ROC_HRD.pdf'))
    # plt.show()
    np.savetxt(os.path.join(ROOT_DIR, 'results', 'HRD', 'ROC_AUC.tsv'), scores, '%s', '\t')
def ROC_immunotherapy():
    """Plot ROC curves and save AUC scores for immunotherapy-response prediction.

    Evaluates MIX refit/denovo Sig4 exposures, TMB and NNLS Sig4 exposures on
    the msk2018-LUAD cohort. Writes ``ROC_immunotherapy.pdf`` and
    ``ROC_AUC.tsv`` under ``results/immunotherapy``.
    """
    from src.analyze_results import get_best_model, stack_nnls, get_signatures_correlations
    import matplotlib.pyplot as plt
    from src.utils import get_data, get_cosmic_signatures
    from sklearn.metrics import roc_curve
    from sklearn.metrics import roc_auc_score
    import seaborn as sns

    os.makedirs(os.path.join(ROOT_DIR, 'results', 'immunotherapy'), exist_ok=True)

    # Loading data
    test_mutations, sigs = get_data('msk2018-LUAD')
    # NOTE(review): `sigs` is deliberately overwritten with the MSK-ALL active
    # signatures — the models below were trained on MSK-ALL.
    _, sigs = get_data('MSK-ALL')
    test_labels, _ = get_data('msk2018-LUAD-labels')

    models = ['MIX refit Sig4', 'MIX denovo M5', 'TMB', 'NNLS Sig4']
    scores = []
    for model in models:
        if 'MIX refit Sig4' in model:
            # MIX refit: use the (un-normalized) exposure of signature index 3 (Sig4).
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/refit')
            mix = get_best_model(directory, return_model=True)
            test_data = mix.weighted_exposures(test_mutations)
            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures
            test_data = test_data[:, [3]]
        elif 'MIX denovo M5' in model:
            # MIX denovo: pick the learned signature most correlated with COSMIC Sig4.
            directory = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
            mix = get_best_model(directory, return_model=True)
            test_data = mix.weighted_exposures(test_mutations)
            if 'normalized' not in model:
                test_data *= test_mutations.sum(1, keepdims=True)  # un-normalizing exposures
            sig4 = get_cosmic_signatures()[sigs][[3]]
            _, corrs = get_signatures_correlations(mix.e, sig4)
            best_sig = np.argmax(corrs)
            test_data = test_data[:, [best_sig]]
        elif model == 'TMB':
            # Total mutational burden.
            test_data = test_mutations.sum(1, keepdims=True)
        else:
            # NNLS
            test_data = stack_nnls(test_mutations, get_cosmic_signatures()[sigs])
            test_data = test_data[:, [3]]

        # Test estimator on data
        prediction_probabilities = test_data
        auc_roc = roc_auc_score(test_labels, prediction_probabilities)
        print(model, 'auc: {:.2f}'.format(auc_roc))
        scores.append([model, '{:.2f}'.format(auc_roc)])
        fpr, tpr, thresholds = roc_curve(test_labels, prediction_probabilities)
        sns.lineplot(fpr, tpr, ci=None)

    plt.legend(models, loc='lower right')
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'immunotherapy', 'ROC_immunotherapy.pdf'))
    # plt.show()
    np.savetxt(os.path.join(ROOT_DIR, 'results', 'immunotherapy', 'ROC_AUC.tsv'), scores, '%s', '\t')
def compare_panel_clusters():
    """Compare cluster quality of MIX vs SigMA assignments on panel data.

    Pairs of samples are compared by the cosine similarity of their WGS NNLS
    exposures; for each clustering method the intra- vs inter-cluster
    similarity distributions (subsampled to 200 pairs each) are tested with a
    rank-sum test and shown as a split violin plot saved under
    ``results/clusters_quality``.
    """
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    panel_data, _ = get_data(panel_dataset)  # loaded for parity; not used below
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)

    # "Ground truth" pairwise similarity from WGS NNLS exposures.
    full_data_exposures = stack_nnls(full_data, signatures)
    full_data_exposures_dists = cosine_similarity(full_data_exposures)

    corrs = []
    models = []
    relations = []
    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))
            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)
        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            # Map SigMA's categorical cluster labels to integer ids.
            clusters = all_df['categ'].values
            unique_clusters = np.unique(clusters)
            cluster_to_num = {c: i for i, c in enumerate(unique_clusters)}
            clusters = np.array([cluster_to_num[c] for c in clusters])
        else:
            raise ValueError('error')

        # Split all sample pairs into same-cluster / different-cluster groups.
        dists_in_clusters = []
        dists_out_clusters = []
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                if clusters[i] == clusters[j]:
                    dists_in_clusters.append(full_data_exposures_dists[i, j])
                else:
                    dists_out_clusters.append(full_data_exposures_dists[i, j])
        dists_in_clusters = np.array(dists_in_clusters)
        dists_out_clusters = np.array(dists_out_clusters)

        # Subsample for a readable violin plot.
        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        # BUG FIX: label counts now track each subsample's own length (the old
        # code sized both with len(dists_out_clusters), which was only
        # accidentally correct because both are subsampled to 200).
        models.extend([model] * (len(dists_in_clusters) + len(dists_out_clusters)))
        relations.extend(['Intra-cluster pairs'] * len(dists_in_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))
        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters),
              np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = pd.DataFrame({'Cosine similarity': corrs, 'model': models, 'relation': relations})
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df,
                   split=True, inner='stick')
    plt.xlabel('')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'clusters_quality', 'clusters_quality.pdf'))
def RE(dataset, models=None, computation='mutations'):
    """Compute mean per-sample reconstruction errors for down-sampled datasets.

    For each down-sampled variant of ``dataset`` (found by scanning the trained
    model directory for '-part' experiments), estimate exposures on each half
    with the model trained on the other half, and measure how well they
    reconstruct the full (WGS) data.

    Args:
        dataset: 'BRCA' or 'OV'.
        models: subset of ['MIX-conditional-clustering', 'MIX-soft-clustering',
            'NNLS', 'SigMA']; defaults to the first three.
        computation: 'mutations' (RE in mutation space) or 'exposures'
            (RE in exposure space).

    Returns:
        pandas.DataFrame of mean RE, rows = down-sampled datasets, columns = models.

    Raises:
        ValueError: for an unknown ``dataset`` or ``computation``.
    """
    # Handle input
    if models is None:
        models = ['MIX-conditional-clustering', 'MIX-soft-clustering', 'NNLS']
    if dataset == 'BRCA':
        full_dataset_name = 'ICGC-BRCA'
    elif dataset == 'OV':
        full_dataset_name = 'TCGA-OV'
    else:
        # BUG FIX: corrected typo in the error message ("is no a valid").
        raise ValueError('{} is not a valid dataset'.format(dataset))
    if computation == 'mutations':
        RE_func = compute_RE_per_sample
    elif computation == 'exposures':
        RE_func = compute_exposures_RE_per_sample
    else:
        # BUG FIX: corrected typos in the error message ("is no a valid ... inpute").
        raise ValueError('{} is not a valid computation input'.format(computation))

    # Prepare data
    full_data, active_signatures = get_data(full_dataset_name)
    normalized_full_data = full_data / full_data.sum(1, keepdims=True)
    full_panel_data, _ = get_data('{}-panel-full'.format(dataset))
    normalized_full_panel_data = full_panel_data / full_panel_data.sum(1, keepdims=True)
    signatures = get_cosmic_signatures()[active_signatures]

    trained_model_dir = os.path.join(ROOT_DIR, 'experiments/trained_models')
    all_experiments = os.listdir(trained_model_dir)
    results = {s.split('-par')[0]: {} for s in all_experiments if dataset in s and 'part' in s}
    for ds_dataset_name in results:
        results[ds_dataset_name] = {model: [] for model in models}

        # Prepare data for this down-sampled dataset (two halves).
        ds_dataset_part1_name = ds_dataset_name + '-part1'
        ds_dataset_part2_name = ds_dataset_name + '-part2'
        ds_data_part1, _ = get_data(ds_dataset_part1_name)
        ds_data_part2, _ = get_data(ds_dataset_part2_name)
        ds_data = np.row_stack((ds_data_part1, ds_data_part2))
        if 'panel' in ds_dataset_name:
            curr_normalized_full_data = normalized_full_panel_data
        else:
            curr_normalized_full_data = normalized_full_data

        # Find the best model trained on each half.
        mix_part1 = get_best_model(os.path.join(trained_model_dir, ds_dataset_part1_name, 'refit'),
                                   return_model=True)
        mix_part2 = get_best_model(os.path.join(trained_model_dir, ds_dataset_part2_name, 'refit'),
                                   return_model=True)

        # MIX RE with cluster's pi using conditional probability (hard clustering).
        if 'MIX-conditional-clustering' in models:
            clusters = np.argmax(mix_part1.soft_cluster(ds_data_part2), axis=1)
            exposures_part2 = mix_part1.pi[clusters]
            clusters = np.argmax(mix_part2.soft_cluster(ds_data_part1), axis=1)
            exposures_part1 = mix_part2.pi[clusters]
            exposures = np.row_stack((exposures_part1, exposures_part2))
            results[ds_dataset_name]['MIX-conditional-clustering'] = \
                RE_func(curr_normalized_full_data, exposures, signatures)

        # MIX RE with weighted cluster pi (soft clustering).
        if 'MIX-soft-clustering' in models:
            exposures_part2 = mix_part1.weighted_exposures(ds_data_part2)
            exposures_part1 = mix_part2.weighted_exposures(ds_data_part1)
            exposures = np.row_stack((exposures_part1, exposures_part2))
            results[ds_dataset_name]['MIX-soft-clustering'] = \
                RE_func(curr_normalized_full_data, exposures, signatures)

        # NNLS RE
        if 'NNLS' in models:
            exposures = np.array([nnls(signatures.T, m)[0] for m in ds_data])
            results[ds_dataset_name]['NNLS'] = RE_func(curr_normalized_full_data, exposures, signatures)

        # SigMA RE from pre-computed exposure files.
        if 'SigMA' in models:
            if 'panel' in ds_dataset_name:
                path = 'data/sigma_exposure/out-sigma-{}-panel-full.tsv'.format(dataset.lower())
            else:
                ds = ds_dataset_name.split('ds')[-1]
                path = 'data/sigma_exposure/out-sigma-{}-downsize{}.tsv'.format(dataset.lower(), ds.zfill(3))
            # exposures = sigma_output_to_exposures(path)[:, active_signatures]
            # print('Active_signatures: {}'.format(np.where(sigma_output_to_exposures(path).sum(0) > 0)[0] + 1))
            exposures = sigma_output_to_exposures(path)
            exposures /= exposures.sum(1, keepdims=True)
            results[ds_dataset_name]['SigMA'] = RE_func(curr_normalized_full_data, exposures,
                                                        get_cosmic_signatures())

    # Shallow analysis (no p-values): mean RE per dataset/model.
    summed_RE_results = {}
    for s in results:
        summed_RE_results[s] = {m: np.sum(results[s][m]) / len(results[s][m]) for m in results[s]}
    df = pd.DataFrame(summed_RE_results).T
    return df
def plot_cluster_AMI(range_clusters, computation='AMI'):
    """Score clusterings against MSK-ALL cancer-type labels across cluster counts.

    For each number of clusters in ``range_clusters``, scores MIX (denovo and
    refit, hard and — for MI — soft), KMeans on raw counts, and KMeans on NNLS
    exposures against the per-sample cancer-type assignment, both for all
    samples and for "rich" samples (>= 10 mutations). Saves two plots under
    ``results/AMI``.

    Args:
        range_clusters: iterable of cluster counts to evaluate.
        computation: 'AMI', 'MI' or 'jaccard'.

    Raises:
        ValueError: for an unknown ``computation``.
    """
    if computation == 'AMI':
        score_func = AMI_score
    elif computation == 'MI':
        score_func = MI_score
    elif computation == 'jaccard':
        score_func = Jaccard_score
    else:
        raise ValueError('{} is not a valid computation'.format(computation))

    rich_sample_threshold = 10
    data, active_signatures = get_data('MSK-ALL')
    signatures = get_cosmic_signatures()[active_signatures]

    # Per-sample NNLS exposures (used for the NNLS+KMeans baseline).
    nnls_exposures = np.zeros((len(data), len(signatures)))
    for i in range(len(data)):
        nnls_exposures[i] = nnls(signatures.T, data[i])[0]

    num_mutations_per_sample = data.sum(1)
    rich_samples = num_mutations_per_sample >= rich_sample_threshold

    # Cancer types with more than 100 samples.
    all_df = pd.read_csv(os.path.join(ROOT_DIR, 'data/MSK-processed/oncotype_counts.txt'), sep='\t')
    all_df['Counts'] = all_df['Counts'].astype(int)
    all_df = all_df[all_df['Counts'] > 100]
    cancer_types = np.array(all_df['Oncotree'])

    # Ground-truth assignment: one cancer-type id per sample, in file order.
    sample_cancer_assignments = []
    sample_cancer_id_assignments = []
    for i, oc in enumerate(cancer_types):
        dat_f = os.path.join(ROOT_DIR, 'data/MSK-processed/{}_counts.npy'.format(oc))
        tmp_data = np.array(np.load(dat_f, allow_pickle=True), dtype=np.float64)
        sample_cancer_assignments.extend([oc] * len(tmp_data))
        sample_cancer_id_assignments.extend([i] * len(tmp_data))
    sample_cancer_assignments = np.array(sample_cancer_assignments)
    sample_cancer_id_assignments = np.array(sample_cancer_id_assignments)
    shuffled_indices = np.arange(len(sample_cancer_assignments))

    # Finding best_models
    d = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
    BIC_summary = get_best_model(d, return_params=True)
    BIC_scores, BIC_clusters, BIC_paths = (BIC_summary['BIC_scores'],
                                           BIC_summary['num_clusters'],
                                           BIC_summary['model_paths'])

    # Row 0: all samples, row 1: rich samples only.
    MIX_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_scores = np.zeros((2, len(range_clusters)))
    MIX_refit_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_refit_scores = np.zeros((2, len(range_clusters)))
    KMeans_scores = np.zeros((2, len(range_clusters)))
    NNLS_KMeans_scores = np.zeros((2, len(range_clusters)))
    for idx, num_clusters in enumerate(range_clusters):
        # MIX denovo: best (lowest-BIC) model with this many clusters.
        best_model_path = BIC_paths[BIC_clusters == num_clusters][
            np.argmin(BIC_scores[BIC_clusters == num_clusters])]
        model = get_model(load_json(best_model_path)['parameters'])
        MIX_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX = np.argmax(MIX_soft_clustering, 1)
        MIX_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX)
        MIX_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                        sample_cluster_assignment_MIX[rich_samples])
        if computation == 'MI':
            MIX_soft_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments,
                                                               MIX_soft_clustering)
            MIX_soft_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples],
                                                               MIX_soft_clustering[rich_samples])

        # MIX refit
        d = os.path.join(ROOT_DIR,
                         'experiments/trained_models/MSK-ALL/refit/mix_{}clusters_017signatures'.format(
                             str(num_clusters).zfill(3)))
        model = get_model(load_json(get_best_run(d))['parameters'])
        MIX_refit_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX_refit = np.argmax(MIX_refit_soft_clustering, 1)
        MIX_refit_scores[0, idx] = score_func(sample_cancer_id_assignments,
                                              sample_cluster_assignment_MIX_refit)
        MIX_refit_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                              sample_cluster_assignment_MIX_refit[rich_samples])
        if computation == 'MI':
            MIX_soft_refit_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments,
                                                                     MIX_refit_soft_clustering)
            MIX_soft_refit_scores[1, idx] = MI_score_soft_clustering(
                sample_cancer_id_assignments[rich_samples], MIX_refit_soft_clustering[rich_samples])

        # KMeans clustering (fit on shuffled data to avoid order effects).
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = data[shuffled_indices]
        cluster_model.fit(shuffled_data)
        kmeans_clusters = cluster_model.predict(data)
        KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, kmeans_clusters)
        KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                           kmeans_clusters[rich_samples])

        # NNLS + KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = nnls_exposures[shuffled_indices]
        cluster_model.fit(shuffled_data)
        nnls_kmeans_clusters = cluster_model.predict(nnls_exposures)
        NNLS_KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, nnls_kmeans_clusters)
        NNLS_KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples],
                                                nnls_kmeans_clusters[rich_samples])
        print('finished {}'.format(num_clusters))

    # All-samples plot.
    plt.plot(range_clusters, MIX_scores[0], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[0], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[0], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[0], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[0], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[0], label='NNLS+KMeans')
    # BUG FIX: title previously hard-coded 'AMI' regardless of `computation`.
    plt.title('All samples {} score'.format(computation))
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_all.pdf'))
    # plt.show()

    # BUG FIX: start a fresh figure — previously the filtered curves were drawn
    # on top of the all-samples figure, so the second PDF contained both.
    plt.figure()
    plt.plot(range_clusters, MIX_scores[1], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[1], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[1], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[1], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[1], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[1], label='NNLS+KMeans')
    plt.title('Filtered {} score'.format(computation))
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_filtered.pdf'))
    # plt.show()
    return