def main():
    data_provider = DataProvider(data_directory=Path('./data'))
    item_users = transform_to_item_user_csr_matrix(
        data_provider.get_purchases_train())

    # baseline model
    model = get_model()
    np.random.seed(42)
    model.fit(item_users=item_users)

    test_user_ids, test_purchases = get_purchases_by_customer(
        data_provider.get_purchases_test())
    recommendations = get_recommendations(model, test_user_ids, item_users)
    score = mapk(test_purchases, recommendations, k=10)
    return score
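# Usage sketch (an assumption, not part of the original module): run the baseline
# recommendation pipeline end-to-end and print the MAP@10 returned by main().
print('MAP@10:', main())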
def run_simulate(user_document_params, click_exposure_params):
    user_document_generator = UserDocumentDataGenerator(**user_document_params)
    click_exposure_generator = ClickExposureDataGenerator(
        user_document=user_document_generator, **click_exposure_params)
    user_document_data = user_document_generator.generate_data()
    relevance, exposure, click, exposure_labels, implicit_feedback = \
        click_exposure_generator.generate_data()

    # model setup
    model = get_model(implicit_feedback, user_document_data, exposure_labels)

    # train
    trainer = Trainer(model)
    trainer.train(relevance[0].reshape(-1), user_document_data.reshape(-1, 1100))
    return trainer
def examinate(algorithm_name):
    table_name = 'score_' + algorithm_name  # name of the table storing results
    for mall_id in malls:
        print(mall_id, ' with ', algorithm_name, ' starts...')
        sql = "SELECT train_time FROM {s} WHERE mall_id='{m}'".format(
            m=mall_id, s=table_name)
        # Check whether a model has already been trained for this mall; if so, a record exists
        try:
            # Check whether the table exists; if not, create it
            cur.execute(sql)
        except pymysql.err.ProgrammingError:
            sql2 = '''CREATE TABLE `{n}` (
                `mall_id` varchar(255) NOT NULL,
                `result` varchar(255) NULL,
                `param` varchar(255) NULL,
                `train_time` int NULL,
                PRIMARY KEY (`mall_id`)
            );'''.format(n=table_name)
            cur.execute(sql2)
            cur.execute(sql)
        if cur.rowcount != 0:  # a model has already been trained
            print(mall_id, ' has already been fitted with ', algorithm_name)
            continue
        metrix, tar = get_data(mall_id)
        x_train, x_test, y_train, y_test = train_test_split(
            metrix, tar, test_size=0.1, random_state=random_state)  # split into train and test sets
        save_dir = root_path + "model/" + algorithm_name + "_" + mall_id + "_model.m"  # where the model is saved
        clf = get_model(algorithm_name)  # get a fresh model by name
        train_time = time.time()
        clf.fit(x_train, y_train)
        train_time = time.time() - train_time
        print('time : ', train_time)
        score = clf.score(x_test, y_test)  # evaluate on the test set to get accuracy
        train_time = int(train_time)
        sql = "INSERT INTO {tn} SET result='{s}', train_time={tt}, mall_id='{m}' " \
              "ON DUPLICATE KEY UPDATE result='{s}', train_time={tt}".format(
                  s=score, m=mall_id, tt=train_time, tn=table_name)
        cur.execute(sql)
        joblib.dump(clf, save_dir)
        print(get_time(), ' saved a model for ', mall_id, ' with ', algorithm_name, ' . score ', score)
        conn.commit()
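# Usage sketch (an assumption): benchmark each configured algorithm across all malls.
# The algorithm names mirror those passed to get_model elsewhere in this codebase.
for algorithm_name in ['RF_1000', 'knn_5']:
    examinate(algorithm_name)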
def get_best_model(dataset_dir, return_model=False, return_params=False):
    if len(os.listdir(dataset_dir)) == 0:
        return None
    dataset = os.path.split(os.path.split(dataset_dir)[0])[-1]
    data, _ = get_data(dataset)
    num_data_points = np.sum(data)

    models = []
    BIC_scores = []
    sigs = []
    clusters = []
    for model in os.listdir(dataset_dir):
        experiment_dir = os.path.join(dataset_dir, model)
        best_run = get_best_run(experiment_dir)
        if len(best_run) > 0:
            best_score = load_json(best_run)['log-likelihood']
            num_sigs = int(model.split('_')[2][:3])
            num_clusters = int(model.split('_')[1][:3])
            num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (96 - 1) * num_sigs
            models.append(best_run)
            clusters.append(num_clusters)
            sigs.append(num_sigs)
            BIC_scores.append(np.log(num_data_points) * num_params - 2 * best_score)

    models = np.array(models)
    BIC_scores = np.array(BIC_scores)
    sigs = np.array(sigs, dtype='int')
    clusters = np.array(clusters, dtype='int')
    best_model = models[np.argmin(BIC_scores)]
    if return_model:
        return get_model(load_json(best_model)['parameters'])
    if return_params:
        return {'BIC_scores': BIC_scores, 'num_clusters': clusters,
                'model_paths': models, 'num_signatures': sigs}
    return best_model
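# For reference, a minimal sketch of the BIC bookkeeping used above (mix_bic is a
# hypothetical helper, not part of the codebase). With k clusters, m signatures and
# 96 mutation categories, the free-parameter count is (k - 1) + (m - 1) * k + 95 * m,
# and BIC = ln(n) * num_params - 2 * log_likelihood, where n is the total mutation count.
def mix_bic(log_likelihood, num_clusters, num_sigs, num_data_points):
    num_params = (num_clusters - 1) + (num_sigs - 1) * num_clusters + (96 - 1) * num_sigs
    return np.log(num_data_points) * num_params - 2 * log_likelihood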
def get_model_rf(mall_id):
    data, tar = u.get_data(mall_id)
    clf = u.get_model('RF_1000')
    clf.fit(data, tar)
    return clf
def get_model_knn(mall_id):
    data, tar = u.get_data(mall_id)
    clf = u.get_model('knn_5')
    clf.fit(data, tar)
    return clf
# x_train, x_test, y_train, y_test = train_test_split(data, tar, test_size=test_size, random_state=random_state)
x_train, x_test, y_train, y_test = data, data, tar, tar
labels = sorted(set(y_train))

print('start ', 'xgb')
model = u.get_model_xgb(mall)
score = model.score(x_test, y_test)
print(mall, ' score : ', score, ' ', 'xgb')
print(mall, ' predicting ', 'xgb')
result = []
result_proba = []
result.append(model.predict(x_test))
result_proba.append(model.predict_proba(x_test))

for al in algs:
    gc.collect()
    print('start ', al)
    model = u.get_model(al)
    print(mall, ' training model ', al)
    model.fit(x_train, y_train)
    score = model.score(x_test, y_test)
    print(mall, ' score : ', score, ' ', al)
    print(mall, ' predicting ', al)
    result.append(model.predict(x_test))
    result_proba.append(model.predict_proba(x_test))

wrong = 0
# print(result)
result = [[result[0][i], result[1][i], result[2][i]] for i in range(0, len(result[0]))]
for i in range(0, len(y_test)):
    if y_test[i] not in result[i] or len(set(result[i])) > 1:
        print(y_test[i], '-------', result[i])
        if y_test[i] not in result[i]:
            wrong += 1  # none of the models predicted the true label
def main():
    # Argparse custom actions
    class SetModes(argparse.Action):
        """Set the modes of operations."""
        def __call__(self, parser, args, values, option_string=None):
            for value in values:
                setattr(args, value, True)

    # yapf: disable
    parser = argparse.ArgumentParser(description='Fake News Classifier')
    # Initialization
    parser.add_argument('--init', action='store_true', default=False, help='perform initialization')
    # Modes
    parser.add_argument('-m', '--mode', action=SetModes, nargs='+', choices=['train', 'test', 'demo', 'plot'], help='specify the mode of operation: train, test, demo, plot')
    parser.add_argument('--train', action='store_true', default=False, help='train the model')
    parser.add_argument('--test', action='store_true', default=False, help='test the model (must either train or load a model)')
    parser.add_argument('--demo', action='store_true', default=False, help='demo the model on linewise samples from a file (must either train or load a model)')
    parser.add_argument('--plot', action='store_true', default=False, help='plot training data (must either train or have existing training data)')
    # Options
    parser.add_argument('-b', '--batch-size', type=int, default=64, metavar='N', help='input batch size for training (default: 64)')
    parser.add_argument('-c', '--config', type=str, help='path to configuration json file (overrides args)')
    parser.add_argument('--data-loader', type=str, default='BatchLoader', help='data loader to use (default: "BatchLoader")')
    parser.add_argument('--dataset', type=str, default='FakeRealNews', help='dataset to use (default: "FakeRealNews")')
    parser.add_argument('-e', '--epochs', type=int, default=10, help='number of epochs to train (default: 10)')
    parser.add_argument('-f', '--file', type=str, help='specify a file for another argument')
    parser.add_argument('--lr', '--learning-rate', dest='learning_rate', type=float, default=1e-4, help='learning rate (default: 1e-4)')
    parser.add_argument('-l', '--load', type=int, metavar='EPOCH', help='load a model and its training data')
    parser.add_argument('--loss', type=str, default='BCEWithLogitsLoss', help='loss function (default: "BCEWithLogitsLoss")')
    parser.add_argument('--model', type=str, default='FakeNewsNet', help='model architecture to use (default: "FakeNewsNet")')
    parser.add_argument('-s', '--sample-size', type=int, metavar='N', help='limit sample size for training')
    parser.add_argument('--seed', type=int, default=0, help='random seed (default: 0)')
    parser.add_argument('--save', action='store_true', default=True, help='save model checkpoints and training data (default: True)')
    parser.add_argument('--no-save', dest='save', action='store_false')
    args = parser.parse_args()
    # yapf: enable

    # Print help if no args
    if len(sys.argv) == 1:
        parser.print_help()
        parser.exit()

    # Configure logger
    logging.basicConfig(level=logging.DEBUG)
    logging.getLogger('matplotlib').setLevel(logging.WARNING)

    # Load configuration file if specified
    if args.config is not None:
        utils.load_config(args)

    # Exit if no mode is specified
    if not args.init and not args.train and not args.test and not args.demo and not args.plot:
        logging.error(
            'No mode specified. Please specify with: --mode {init,train,test,demo,plot}'
        )
        exit(1)

    # Exit on `--load` if run directory not found
    if (args.load is not None or (args.plot and not args.train)) and not os.path.isdir(utils.get_path(args)):
        logging.error(
            'Could not find directory for current configuration {}'.format(
                utils.get_path(args)))
        exit(1)

    # Exit on `test` or `demo` without `train` or `--load EPOCH`
    if (args.test or args.demo) and not (args.train or args.load is not None):
        logging.error(
            'Cannot run `test` or `demo` without a model. Try again with either `train` or `--load EPOCH`.'
        )
        exit(1)

    # Exit on `demo` without a string file
    if args.demo and not args.file:
        logging.error(
            'Cannot run `demo` without a file. Try again with `--file FILE`.')
        exit(1)

    # Setup run directory
    if args.save and not args.init and (args.train or args.test or args.demo or args.plot):
        utils.save_config(args)
        path = utils.get_path(args) + '/output.log'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        logging.getLogger().addHandler(logging.FileHandler(path))

    # Set random seeds
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # Variable declarations
    training_data = None

    # Load GloVe vocabulary
    if args.init or args.train or args.test or args.demo:
        glove = torchtext.vocab.GloVe(name='6B', dim=50)

    # Perform initialization
    if args.init or args.train or args.test:
        # Determine which dataset to use
        dataset = utils.get_dataset(args)
        # Preload the dataset
        dataset.load()
        # Get preprocessed samples
        samples = preprocessing.get_samples(dataset, glove, args.init)
        random.shuffle(samples)

    # DataLoader setup for `train`, `test`
    if args.train or args.test:
        # Select data loader to use
        DataLoader = utils.get_data_loader(args)
        # Split samples
        split_ratio = [.6, .2, .2]
        trainset, validset, testset = list(
            DataLoader.splits(samples, split_ratio))
        if args.sample_size is not None:  # limit samples used in training
            trainset = trainset[:args.sample_size]
            validset = validset[:int(args.sample_size * split_ratio[1] / split_ratio[0])]
        # Get data loaders
        train_loader, valid_loader, test_loader = [
            DataLoader(split, batch_size=args.batch_size)
            for split in [trainset, validset, testset]
        ]

    # Load samples for demo
    if args.demo:
        if os.path.isfile(args.file):
            # Read samples from the input file
            with open(args.file, 'r') as f:
                samples = [line for line in f if line.strip()]
            data = pd.DataFrame({
                'text': samples,
                'label': [0.5] * len(samples)
            })
            # Preprocess samples
            preprocessing.clean(data)
            samples = preprocessing.encode(data, glove)
            samples = [(torch.tensor(text).long(), label)
                       for text, label in samples]
            # Select data loader to use
            DataLoader = utils.get_data_loader(args)
            # Get data loader
            data_loader = DataLoader(samples, batch_size=1, shuffle=False)
        else:
            logging.error('Could not find file for demo at {}'.format(
                args.file))
            exit(1)

    # Model setup for `train`, `test`, `demo`
    if args.train or args.test or args.demo:
        # Create the model
        model = utils.get_model(glove, args)
        # Load a model
        if args.load is not None:
            utils.load_model(args.load, model, args)

    # Run `train`
    if args.train:
        training_data = training.train(model, train_loader, valid_loader, args)

    # Run `test`
    if args.test:
        if args.train or args.load is not None:
            criterion = utils.get_criterion(args.loss)
            acc, loss = training.evaluate(model, test_loader, criterion)
            logging.info('Testing accuracy: {:.4%}, loss: {:.6f}'.format(
                acc, loss))
        else:
            logging.error('No model loaded for testing')
            exit(1)

    # Run `demo`
    if args.demo:
        if args.train or args.load is not None:
            model.eval()  # set model to evaluate mode
            logging.info('-- Results --')
            for i, (text, _) in enumerate(data_loader):
                preview = data['text'][i][:32] + '...'
                out = model(text).flatten()
                prob = torch.sigmoid(out)  # apply sigmoid to get probability
                pred = (prob > 0.5).long()  # predict `true` if greater than 0.5
                label = ['fake', 'true'][pred.item()]
                label = '{}{}{}'.format(
                    '\033[92m' if pred.item() else '\033[93m', label, '\033[0m')
                confidence = (prob if pred.item() else 1 - prob).item()
                logging.info(
                    'Report {}: {} with {:.2%} confidence - "{}"'.format(
                        i, label, confidence, preview))
        else:
            logging.error('No model loaded for demo')
            exit(1)

    # Run `plot`
    if args.plot:
        if training_data is None:
            training_data = utils.load_training_data(args, allow_missing=False)
            if args.load is not None and not args.train:
                for k, v in training_data.items():
                    training_data[k] = v[:args.load + 1]
        logging.info('Plotting training data')
        training.plot(training_data)
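# Entry point and example invocations (an assumption: the original script presumably has
# a guard like this, and the file name main.py is illustrative). All flags shown below
# exist in the argument parser above; samples.txt is a placeholder file name.
#   python main.py --init
#   python main.py -m train test -e 10 --lr 1e-4
#   python main.py -m demo --load 9 -f samples.txt
if __name__ == '__main__':
    main()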
def compare_panel_clusters():
    np.random.seed(1359)
    # full_dataset, panel_dataset = 'BRCA-panel-full', 'BRCA-panel'
    full_dataset, panel_dataset = 'nature2019-full', 'nature2019-panel'
    full_data, active_signatures = get_data(full_dataset)
    panel_data, _ = get_data(panel_dataset)
    signatures = get_cosmic_signatures()[active_signatures]
    num_samples = len(full_data)
    full_data_exposures = stack_nnls(full_data, signatures)
    full_data_exposures_dists = cosine_similarity(full_data_exposures)

    corrs = []
    models = []
    relations = []
    for model in ['MIX', 'SigMA']:
        if model == 'MIX':
            d = os.path.join(ROOT_DIR, 'experiments/trained_models/{}/refit'.format('BRCA-panel'))
            mix = get_model(load_json(get_best_model(d))['parameters'])
            clusters = np.argmax(mix.soft_cluster(full_data), 1)
        elif model == 'SigMA':
            # d = os.path.join(ROOT_DIR, 'data/ICGC-BRCA/out-sigma-brca-panel.tsv')
            d = os.path.join(ROOT_DIR, 'data/nature2019/SigMA_output.tsv')
            all_df = pd.read_csv(d, sep='\t')
            # In case this is comma separated
            if len(all_df.columns) == 1:
                all_df = pd.read_csv(d, sep=',')
            clusters = all_df['categ'].values
            unique_clusters = np.unique(clusters)
            cluster_to_num = {}
            for i, c in enumerate(unique_clusters):
                cluster_to_num[c] = i
            clusters = np.array([cluster_to_num[c] for c in clusters])
        else:
            raise ValueError('error')

        dists_in_clusters = []
        dists_out_clusters = []
        for i in range(num_samples):
            for j in range(i + 1, num_samples):
                if clusters[i] == clusters[j]:
                    dists_in_clusters.append(full_data_exposures_dists[i, j])
                else:
                    dists_out_clusters.append(full_data_exposures_dists[i, j])

        dists_in_clusters = np.array(dists_in_clusters)
        dists_out_clusters = np.array(dists_out_clusters)
        dists_in_clusters = np.random.choice(dists_in_clusters, 200, replace=False)
        dists_out_clusters = np.random.choice(dists_out_clusters, 200, replace=False)
        corrs.extend(dists_in_clusters)
        corrs.extend(dists_out_clusters)
        models.extend([model] * len(dists_out_clusters) * 2)
        relations.extend(['Intra-cluster pairs'] * len(dists_out_clusters))
        relations.extend(['Inter-cluster pairs'] * len(dists_out_clusters))
        print(model, len(np.unique(clusters)))
        print(ranksums(dists_in_clusters, dists_out_clusters),
              np.mean(dists_in_clusters), np.mean(dists_out_clusters))

    df = {'Cosine similarity': corrs, 'model': models, 'relation': relations}
    df = pd.DataFrame(df)
    sns.violinplot(x='relation', y='Cosine similarity', hue='model', data=df, split=True, inner='stick')
    plt.xlabel('')
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'clusters_quality', 'clusters_quality.pdf'))
def simulated_data_analysis(dataset, trained_models_dir=def_trained_models_dir):
    if 'simulated' not in dataset:
        raise ValueError('dataset is not synthetic')

    data, _ = get_data(dataset)
    num_data_points = data.sum()
    dataset_dir = os.path.join(trained_models_dir, dataset)
    dataset_params = dataset[10:]
    original_model = get_model(load_json(os.path.join(
        ROOT_DIR, 'data/simulated-data/{}/model.json'.format(dataset_params))))
    original_num_clusters, original_num_sigs = original_model.num_clusters, original_model.num_topics
    original_model_ll = original_model.log_likelihood(data)
    original_num_params = (original_num_clusters - 1) + (original_num_sigs - 1) * original_num_clusters + (96 - 1) * original_num_sigs
    original_bic = np.log(num_data_points) * original_num_params - 2 * original_model_ll

    # Plot BIC
    scores_dict = get_best_model(os.path.join(dataset_dir, 'denovo'), return_params=True)
    BIC_scores = scores_dict['BIC_scores']
    num_clusters = scores_dict['num_clusters']
    num_signatures = scores_dict['num_signatures']
    # print(dataset, signature_learning, model_paths[np.argmin(BIC_scores)])

    unique_clusters = np.unique(num_clusters)
    unique_signatures = np.unique(num_signatures)

    from mpl_toolkits.mplot3d import axes3d
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    for c in unique_clusters:
        tmp = num_clusters == c
        curr_sigs = num_signatures[tmp]
        curr_clusters = num_clusters[tmp]
        curr_BIC_scores = BIC_scores[tmp]
        arg_sort_curr_sigs = np.argsort(curr_sigs)
        curr_clusters = np.array([curr_clusters[arg_sort_curr_sigs]])
        curr_sigs = np.array([curr_sigs[arg_sort_curr_sigs]])
        curr_BIC_scores = np.array([curr_BIC_scores[arg_sort_curr_sigs]])
        ax.plot_wireframe(curr_clusters, curr_sigs, curr_BIC_scores, rstride=1, cstride=1)

    for s in unique_signatures:
        tmp = num_signatures == s
        curr_sigs = num_signatures[tmp]
        curr_clusters = num_clusters[tmp]
        curr_BIC_scores = BIC_scores[tmp]
        arg_sort_curr_clusters = np.argsort(curr_clusters)
        curr_clusters = np.array([curr_clusters[arg_sort_curr_clusters]])
        curr_sigs = np.array([curr_sigs[arg_sort_curr_clusters]])
        curr_BIC_scores = np.array([curr_BIC_scores[arg_sort_curr_clusters]])
        ax.plot_wireframe(curr_clusters, curr_sigs, curr_BIC_scores, rstride=1, cstride=1)

    ax.set_xlabel('clusters')
    ax.set_ylabel('signatures')
    ax.set_zlabel('BIC score')
    plt.xticks(unique_clusters)
    plt.yticks(unique_signatures)
    # if plot_title:
    plt.title(dataset)
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'synthetic', dataset, 'BIC.pdf'))
    # plt.show()

    ### Test sig/cluster/weight correlations
    results = [['Model', 'Average clusters similarity', '# Unique clusters',
                'Average signatures similarity', '# Unique signatures']]
    best_model = get_best_model(os.path.join(dataset_dir, 'denovo'), return_model=True)
    best_num_clusters, best_num_sigs = best_model.num_clusters, best_model.num_topics

    original_sigs, original_clusters, original_weights = original_model.e.copy(), original_model.pi.copy(), original_model.w.copy()
    best_model_sigs, best_model_clusters, best_model_weights = best_model.e, best_model.pi, best_model.w

    sig, sig_corr = get_signatures_correlations(best_model_sigs, original_sigs)
    # print(sig, sig_corr)

    # rearrange clusters
    rearranged_clusters = original_clusters[:, sig]
    rearranged_clusters /= rearranged_clusters.sum(1, keepdims=True)
    cluster, cluster_corr = get_signatures_correlations(best_model_clusters, rearranged_clusters)
    # print(cluster, cluster_corr)

    # rearrange weights
    rearranged_weights = original_weights[cluster]
    rearranged_weights /= rearranged_weights.sum()
    weight_corr = cosine_similarity(best_model_weights, rearranged_weights)[0, 1]
    # print(weight_corr)

    best_model_ll = best_model.log_likelihood(data)
    best_num_params = (best_num_clusters - 1) + (best_num_sigs - 1) * best_num_clusters + (96 - 1) * best_num_sigs
    best_bic = np.log(num_data_points) * best_num_params - 2 * best_model_ll
    # print(best_bic)
    # print(best_num_clusters, best_num_sigs, best_bic, best_model_ll, weight_corr, np.min(cluster_corr), np.max(cluster_corr), np.min(sig_corr), np.max(sig_corr), len(np.unique(cluster[cluster_corr > 0.8])), len(np.unique(sig[sig_corr > 0.8])))
    results.append(['Mix ({}, {})'.format(best_num_clusters, best_num_sigs),
                    str(np.mean(cluster_corr)), str(len(np.unique(cluster[cluster_corr > 0.8]))),
                    str(np.mean(sig_corr)), str(len(np.unique(sig[sig_corr > 0.8])))])

    ### Test the same with the best model with the same parameters
    same_params_model = get_best_run(os.path.join(
        dataset_dir, 'denovo',
        'mix_{}clusters_{}signatures'.format(str(original_num_clusters).zfill(3), str(original_num_sigs).zfill(3))))
    same_params_model = get_model(load_json(same_params_model)['parameters'])

    original_sigs, original_clusters, original_weights = original_model.e.copy(), original_model.pi.copy(), original_model.w.copy()
    same_params_model_sigs, same_params_model_clusters, same_params_model_weights = same_params_model.e, same_params_model.pi, same_params_model.w

    sig, sig_corr = get_signatures_correlations(same_params_model_sigs, original_sigs)
    # print(sig, sig_corr)

    # rearrange clusters
    rearranged_clusters = original_clusters[:, sig]
    rearranged_clusters /= rearranged_clusters.sum(1, keepdims=True)
    cluster, cluster_corr = get_signatures_correlations(same_params_model_clusters, rearranged_clusters)
    # print(cluster, cluster_corr)

    # rearrange weights
    rearranged_weights = original_weights[cluster]
    rearranged_weights /= rearranged_weights.sum()
    weight_corr = cosine_similarity(same_params_model_weights, rearranged_weights)[0, 1]
    # print(weight_corr)

    same_params_ll = same_params_model.log_likelihood(data)
    same_params_bic = np.log(num_data_points) * original_num_params - 2 * same_params_ll
    # print(best_bic, same_params_bic, original_bic)
    # print(original_num_clusters, original_num_sigs, same_params_bic, same_params_ll, weight_corr, np.min(cluster_corr), np.max(cluster_corr), np.min(sig_corr), np.max(sig_corr), len(np.unique(cluster[cluster_corr > 0.8])), len(np.unique(sig[sig_corr > 0.8])))
    # print('-', '-', original_bic, original_model_ll, '-', '-', '-', '-', '-', '-', '-')
    results.append(['Mix ({}, {})'.format(original_num_clusters, original_num_sigs),
                    str(np.mean(cluster_corr)), str(len(np.unique(cluster[cluster_corr > 0.8]))),
                    str(np.mean(sig_corr)), str(len(np.unique(sig[sig_corr > 0.8])))])

    np.savetxt(os.path.join(ROOT_DIR, 'results', 'synthetic', dataset, 'summary.tsv'),
               results, fmt='%s', delimiter='\t',
               header='{} clusters | {} signatures'.format(original_num_clusters, original_num_sigs))
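# Usage sketch (an assumption): the dataset name is hypothetical. The function strips the
# first ten characters (the 'simulated' prefix plus a separator) to recover the parameter
# string, which is expected to match the '{clusters}_{signatures}_{samples}_{seed}'
# directory naming produced by simulate() below.
simulated_data_analysis('simulated-5_10_1000_140296')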
def plot_cluster_AMI(range_clusters, computation='AMI'):
    if computation == 'AMI':
        score_func = AMI_score
    elif computation == 'MI':
        score_func = MI_score
    elif computation == 'jaccard':
        score_func = Jaccard_score
    else:
        raise ValueError('{} is not a valid computation'.format(computation))

    rich_sample_threshold = 10
    data, active_signatures = get_data('MSK-ALL')
    signatures = get_cosmic_signatures()[active_signatures]
    num_data_points = data.sum()

    nnls_exposures = np.zeros((len(data), len(signatures)))
    for i in range(len(data)):
        nnls_exposures[i] = nnls(signatures.T, data[i])[0]

    num_mutations_per_sample = data.sum(1)
    rich_samples = num_mutations_per_sample >= rich_sample_threshold

    all_df = pd.read_csv(os.path.join(ROOT_DIR, 'data/MSK-processed/oncotype_counts.txt'), sep='\t')
    all_df['Counts'] = all_df['Counts'].astype(int)
    all_df = all_df[all_df['Counts'] > 100]
    cancer_types = np.array(all_df['Oncotree'])

    sample_cancer_assignments = []
    sample_cancer_id_assignments = []
    for i, oc in enumerate(cancer_types):
        # dat_f = "data/processed/%s_counts.npy" % oc
        dat_f = os.path.join(ROOT_DIR, 'data/MSK-processed/{}_counts.npy'.format(oc))
        tmp_data = np.array(np.load(dat_f, allow_pickle=True), dtype=np.float64)
        sample_cancer_assignments.extend([oc] * len(tmp_data))
        sample_cancer_id_assignments.extend([i] * len(tmp_data))
    sample_cancer_assignments = np.array(sample_cancer_assignments)
    sample_cancer_id_assignments = np.array(sample_cancer_id_assignments)
    shuffled_indices = np.arange(len(sample_cancer_assignments))

    # Finding best_models
    d = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/denovo')
    BIC_summary = get_best_model(d, return_params=True)
    BIC_scores, BIC_clusters, BIC_paths = BIC_summary['BIC_scores'], BIC_summary['num_clusters'], BIC_summary['model_paths']

    MIX_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_scores = np.zeros((2, len(range_clusters)))
    MIX_refit_scores = np.zeros((2, len(range_clusters)))
    MIX_soft_refit_scores = np.zeros((2, len(range_clusters)))
    KMeans_scores = np.zeros((2, len(range_clusters)))
    NNLS_KMeans_scores = np.zeros((2, len(range_clusters)))
    for idx, num_clusters in enumerate(range_clusters):
        best_model_path = BIC_paths[BIC_clusters == num_clusters][np.argmin(BIC_scores[BIC_clusters == num_clusters])]
        model = get_model(load_json(best_model_path)['parameters'])
        MIX_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX = np.argmax(MIX_soft_clustering, 1)
        MIX_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX)
        MIX_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples], sample_cluster_assignment_MIX[rich_samples])
        if computation == 'MI':
            MIX_soft_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_soft_clustering)
            MIX_soft_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples], MIX_soft_clustering[rich_samples])

        # MIX refit
        d = os.path.join(ROOT_DIR, 'experiments/trained_models/MSK-ALL/refit/mix_{}clusters_017signatures'.format(str(num_clusters).zfill(3)))
        model = get_model(load_json(get_best_run(d))['parameters'])
        MIX_refit_soft_clustering = model.soft_cluster(data)
        sample_cluster_assignment_MIX_refit = np.argmax(MIX_refit_soft_clustering, 1)
        MIX_refit_scores[0, idx] = score_func(sample_cancer_id_assignments, sample_cluster_assignment_MIX_refit)
        MIX_refit_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples], sample_cluster_assignment_MIX_refit[rich_samples])
        if computation == 'MI':
            MIX_soft_refit_scores[0, idx] = MI_score_soft_clustering(sample_cancer_id_assignments, MIX_refit_soft_clustering)
            MIX_soft_refit_scores[1, idx] = MI_score_soft_clustering(sample_cancer_id_assignments[rich_samples], MIX_refit_soft_clustering[rich_samples])

        # KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = data[shuffled_indices]
        cluster_model.fit(shuffled_data)
        kmeans_clusters = cluster_model.predict(data)
        KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, kmeans_clusters)
        KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples], kmeans_clusters[rich_samples])

        # NNLS + KMeans clustering
        cluster_model = KMeans(num_clusters, n_init=100, random_state=140296)
        np.random.shuffle(shuffled_indices)
        shuffled_data = nnls_exposures[shuffled_indices]
        cluster_model.fit(shuffled_data)
        nnls_kmeans_clusters = cluster_model.predict(nnls_exposures)
        NNLS_KMeans_scores[0, idx] = score_func(sample_cancer_id_assignments, nnls_kmeans_clusters)
        NNLS_KMeans_scores[1, idx] = score_func(sample_cancer_id_assignments[rich_samples], nnls_kmeans_clusters[rich_samples])
        print('finished {}'.format(num_clusters))

    plt.plot(range_clusters, MIX_scores[0], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[0], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[0], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[0], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[0], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[0], label='NNLS+KMeans')
    plt.title('All samples AMI score')
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_all.pdf'))
    # plt.show()

    plt.figure()  # start a fresh figure so the filtered plot does not overlay the previous one
    plt.plot(range_clusters, MIX_scores[1], label='MIX-denovo')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_scores[1], label='MIX-denovo-soft')
    plt.plot(range_clusters, MIX_refit_scores[1], label='MIX-refit')
    if computation == 'MI':
        plt.plot(range_clusters, MIX_soft_refit_scores[1], label='MIX-refit-soft')
    plt.plot(range_clusters, KMeans_scores[1], label='KMeans')
    plt.plot(range_clusters, NNLS_KMeans_scores[1], label='NNLS+KMeans')
    plt.title('Filtered AMI score')
    plt.xlabel('clusters')
    plt.ylabel(computation)
    plt.legend(loc='lower right')
    plt.xticks(np.arange(min(range_clusters), max(range_clusters) + 1, 2))
    plt.savefig(os.path.join(ROOT_DIR, 'results', 'AMI', 'cluster_score_filtered.pdf'))
    # plt.show()
    return
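# Usage sketch (an assumption): sweep cluster counts and write both AMI plots. The range
# below is illustrative and should match the de novo/refit MIX models trained on MSK-ALL.
plot_cluster_AMI(range(2, 21), computation='AMI')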
def simulate(num_clusters, num_signatures, num_samples, random_seed):
    np.random.seed(random_seed)
    base_model = get_model(
        load_json(
            os.path.join(ROOT_DIR, 'data', 'simulated-data', 'base_model.json'))['parameters'])
    if num_clusters > base_model.num_clusters:
        raise ValueError(
            'num_clusters cannot be larger than base_model.num_clusters ({})'.format(
                base_model.num_clusters))
    if num_signatures > base_model.num_topics:
        raise ValueError(
            'num_signatures cannot be larger than base_model.num_topics ({})'.format(
                base_model.num_topics))

    msk_data, _ = get_data('MSK-ALL')
    msk_sizes = np.sum(msk_data, 1).astype('int')

    clusters = np.random.choice(base_model.num_clusters, size=num_clusters, replace=False, p=base_model.w)
    pi = base_model.pi[clusters]
    w = base_model.w[clusters]
    w /= w.sum()

    prob_sig = np.dot(w, pi)
    signatures = np.random.choice(base_model.num_topics, size=num_signatures, replace=False, p=prob_sig)
    pi = pi[:, signatures]
    pi /= pi.sum(1, keepdims=True)
    e = base_model.e[signatures]

    model = Mix(num_clusters, num_signatures, init_params={'w': w, 'pi': pi, 'e': e})

    sample_sizes = np.random.choice(msk_sizes, num_samples)
    clusters, signatures, mutations = model.sample(sample_sizes)

    curr_dir = os.path.join(
        ROOT_DIR, 'data', 'simulated-data',
        '{}_{}_{}_{}'.format(num_clusters, num_signatures, num_samples, random_seed))
    try:
        os.makedirs(curr_dir)
    except OSError:
        pass

    # Save model, base data
    save_json(os.path.join(curr_dir, 'full_simulated'), {
        'clusters': clusters,
        'signatures': signatures,
        'mutations': mutations
    })
    parameters = model.get_params()
    parameters['w'] = parameters['w'].tolist()
    parameters['pi'] = parameters['pi'].tolist()
    parameters['e'] = parameters['e'].tolist()
    save_json(os.path.join(curr_dir, 'model'), parameters)

    # Transform the basic data into a mutation matrix
    mutation_mat = np.zeros((num_samples, 96), dtype='int')
    for i in range(num_samples):
        a, b = np.unique(mutations[i], return_counts=True)
        mutation_mat[i, a] = b
    np.save(os.path.join(curr_dir, 'mutations'), mutation_mat)
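# Usage sketch: the argument values below are illustrative assumptions; num_clusters and
# num_signatures must not exceed the base model's counts, and the seed is arbitrary.
simulate(num_clusters=5, num_signatures=10, num_samples=1000, random_seed=140296)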