def run_ngmc(X, fit_params, param_space, val_hidden_fraction, hidden_fraction, n_repeats, hyperopt_iters=10, seed=None, logistic=True):
    """Run the NGMC matrix-completion experiment via the shared run_mc_alg driver.

    Args:
        X: target GI matrix to complete.
        fit_params: fixed fitting parameters forwarded to the trainer.
        param_space: hyperopt search space.
        val_hidden_fraction: fraction of training entries hidden for validation.
        hidden_fraction: fraction of entries hidden for testing.
        n_repeats: number of outer repeats/folds.
        hyperopt_iters: hyperparameter-search iterations per fold.
        seed: seed forwarded to hyperopt for reproducibility.
        logistic: dummy argument so all run_<alg> entry points share one
            signature; NGMC always uses 0-1 scaling, so it must stay True.

    Returns:
        Whatever run_mc_alg returns for the NGMC objective/trainer pair.
    """
    # Fixed idiom: the original compared `logistic == True`; a bare truthiness
    # check with a message documents why the argument exists at all.
    assert logistic, 'NGMC supports only logistic=True (dummy arg for uniform run_<alg> signatures)'
    log = get_logger()
    log.info('[Training NGMC model]')
    return run_mc_alg(X,
                      fold_objective=ngmc_objective,
                      retrain_model=train_ngmc,
                      fit_params=fit_params,
                      space=param_space,
                      scaler=MCScaler(mode='0-1'),  # NGMC operates on 0-1 scaled data
                      val_hidden_fraction=val_hidden_fraction,
                      hidden_fraction=hidden_fraction,
                      train_with_validation=True,
                      n_repeats=n_repeats,
                      hyperopt_iters=hyperopt_iters,
                      hyperopt_seed=seed)
def run_kpmf(X, fit_params, param_space, val_hidden_fraction, hidden_fraction, n_repeats, hyperopt_iters=10, seed=None, logistic=True):
    """Run the KPMF matrix-completion experiment via the shared run_mc_alg driver.

    Args:
        X: target GI matrix to complete.
        fit_params: fixed fitting parameters forwarded to the trainer (required).
        param_space: hyperopt search space.
        val_hidden_fraction: fraction of training entries hidden for validation.
        hidden_fraction: fraction of entries hidden for testing.
        n_repeats: number of outer repeats/folds.
        hyperopt_iters: hyperparameter-search iterations per fold.
        seed: seed forwarded to hyperopt for reproducibility.
        logistic: selects 0-1 scaling when True, standardization otherwise.

    Returns:
        Whatever run_mc_alg returns for the KPMF objective/trainer pair.
    """
    assert fit_params is not None
    logger = get_logger()
    logger.info('[Training KPMF model]')
    # Scaling mode follows the logistic flag: 0-1 for logistic, std otherwise.
    scale_mode = '0-1' if logistic else 'std'
    return run_mc_alg(X,
                      fold_objective=kpmf_objective,
                      retrain_model=train_kpmf,
                      fit_params=fit_params,
                      space=param_space,
                      scaler=MCScaler(mode=scale_mode),
                      val_hidden_fraction=val_hidden_fraction,
                      hidden_fraction=hidden_fraction,
                      train_with_validation=True,
                      n_repeats=n_repeats,
                      hyperopt_iters=hyperopt_iters,
                      hyperopt_seed=seed)
def _load_scaled_source(args, log):
    """Load and std-scale the source GI matrix plus max-normalized sim scores.

    Shared by the XSMF/KXSMF/KXSMF_b branches (was triplicated inline).
    Returns (src_gi_data, X_src, sim_scores).
    """
    with open(args.source_gis, 'rb') as f:
        src_gi_data = cpkl.load(f)
    X_src = MCScaler(mode='std').fit_transform(src_gi_data['values'])
    log.info('[Loading sim scores]')
    with open(args.sim_scores, 'rb') as f:
        sim_scores_data = cpkl.load(f)
    sim_scores = sim_scores_data['values']
    sim_scores = sim_scores / np.max(sim_scores)  # Normalize to [_, 1]
    return src_gi_data, X_src, sim_scores


def main():
    """Run one matrix-completion experiment end to end.

    Parses CLI args, loads the target GI data, builds n_repeats train/test
    splits, dispatches on args.mc_alg to train the chosen model family,
    evaluates imputations (optionally restricted to significant pairs via a
    p-value file), and serializes results (JSON) and models/fold data (pickle).
    """
    args = parse_args()
    setup_logging(args.logfile)
    log = get_logger()
    assert 0 <= args.hidden_fraction <= 1
    # Seed both RNGs up front for reproducibility.
    np.random.seed(args.random_seed)
    tf.set_random_seed(args.random_seed)
    # NOTE: a second, redundant parse_args() call was removed here.

    log.info('*' * 100)
    log.info('[Starting MC experiment]')
    log_dict(log.info, vars(args))

    log.info('[Loading input data]')
    with open(args.target_gis, 'rb') as f:
        gi_data = cpkl.load(f)
    row_genes = gi_data['rows']

    log.info('\t- setting up training and test sets')
    train_test_sets = [gi_train_test_split(gi_data, args.hidden_fraction)
                       for _ in range(args.n_repeats)]
    train_Xs, test_Xs, test_masks = zip(*train_test_sets)

    # NGMC operates on 0-1 scaled data; every other algorithm standardizes.
    if args.mc_alg == 'NGMC':
        scalers = [MCScaler('0-1') for _ in range(args.n_repeats)]
    else:
        scalers = [MCScaler('std') for _ in range(args.n_repeats)]
    train_Xs = [scaler.fit_transform(X) for scaler, X in zip(scalers, train_Xs)]

    # Dispatch on the requested algorithm; each branch yields the imputed
    # matrices for every fold plus per-model training info.
    if args.mc_alg == 'PMF':
        imputed_Xs, models_info = train_pmf_models(train_Xs=train_Xs,
                                                   rank=args.rank,
                                                   iters=args.iters,
                                                   lr=args.lr,
                                                   lam=args.lambda_f,
                                                   report_every=args.report_every)
    elif args.mc_alg == 'PMF_b':
        imputed_Xs, models_info = train_pmf_b_models(train_Xs=train_Xs,
                                                     rank=args.rank,
                                                     iters=args.iters,
                                                     lr=args.lr,
                                                     lam=args.lambda_f,
                                                     lam_b=args.lambda_b,
                                                     report_every=args.report_every)
    elif args.mc_alg == 'KPMF':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_models(train_Xs=train_Xs,
                                                    L=L,
                                                    rank=args.rank,
                                                    iters=args.iters,
                                                    lr=args.lr,
                                                    lambda_f=args.lambda_f,
                                                    lambda_h=args.lambda_h,
                                                    rl_lambda=args.rl_lambda,
                                                    report_every=args.report_every)
    elif args.mc_alg == 'KPMF_b':
        L = get_laplacian(list(row_genes), args.target_ppi)
        imputed_Xs, models_info = train_kpmf_b_models(train_Xs=train_Xs,
                                                      L=L,
                                                      rank=args.rank,
                                                      iters=args.iters,
                                                      lr=args.lr,
                                                      lambda_b=args.lambda_b,
                                                      lambda_f=args.lambda_f,
                                                      lambda_h=args.lambda_h,
                                                      rl_lambda=args.rl_lambda,
                                                      report_every=args.report_every)
    elif args.mc_alg == 'NGMC':
        ppi = nx.read_edgelist(args.target_ppi)
        A = get_ppi_data(list(row_genes), ppi, mode='normalized_adjacency')
        imputed_Xs, models_info = train_ngmc_models(train_Xs=train_Xs,
                                                    A=A,
                                                    rank=args.rank,
                                                    iters=args.iters,
                                                    lr=args.lr,
                                                    alpha_p=args.alpha_p,
                                                    lambda_f=args.lambda_f,
                                                    lambda_h=args.lambda_h,
                                                    lambda_p=args.lambda_p)
    elif args.mc_alg == 'XSMF':
        src_gi_data, X_src, sim_scores = _load_scaled_source(args, log)
        imputed_Xs, models_info = train_xsmf_models(train_Xs=train_Xs,
                                                    X_src=X_src,
                                                    sim_scores=sim_scores,
                                                    rank=args.rank,
                                                    iters=args.iters,
                                                    lr=args.lr,
                                                    lambda_sim=args.lambda_sim,
                                                    lambda_src=args.lambda_src,
                                                    lambda_u=args.lambda_u,
                                                    lambda_v=args.lambda_v,
                                                    lambda_us=args.lambda_us,
                                                    lambda_vs=args.lambda_vs,
                                                    report_every=args.report_every)
    elif args.mc_alg == 'KXSMF':
        src_gi_data, X_src, sim_scores = _load_scaled_source(args, log)
        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        # log.warn is a deprecated alias for log.warning.
        log.warning('%s, %s' % L_src.shape)
        log.warning('%s, %s' % X_src.shape)
        imputed_Xs, models_info = train_kxsmf_models(train_Xs=train_Xs,
                                                     X_src=X_src,
                                                     L_tgt=L_tgt,
                                                     L_src=L_src,
                                                     sim_scores=sim_scores,
                                                     rank=args.rank,
                                                     iters=args.iters,
                                                     lr=args.lr,
                                                     lambda_sim=args.lambda_sim,
                                                     lambda_src=args.lambda_src,
                                                     lambda_u=args.lambda_u,
                                                     lambda_v=args.lambda_v,
                                                     lambda_us=args.lambda_us,
                                                     lambda_vs=args.lambda_vs,
                                                     lambda_tgt_rl=args.lambda_tgt_rl,
                                                     lambda_src_rl=args.lambda_src_rl,
                                                     report_every=args.report_every)
    elif args.mc_alg == 'KXSMF_b':
        src_gi_data, X_src, sim_scores = _load_scaled_source(args, log)
        L_tgt = get_laplacian(list(gi_data['rows']), args.target_ppi)
        L_src = get_laplacian(list(src_gi_data['rows']), args.source_ppi)
        log.warning('%s, %s' % L_src.shape)
        log.warning('%s, %s' % X_src.shape)
        imputed_Xs, models_info = train_kxsmfb_models(train_Xs=train_Xs,
                                                      X_src=X_src,
                                                      L_tgt=L_tgt,
                                                      L_src=L_src,
                                                      sim_scores=sim_scores,
                                                      rank=args.rank,
                                                      iters=args.iters,
                                                      lr=args.lr,
                                                      lambda_b=args.lambda_b,
                                                      lambda_sim=args.lambda_sim,
                                                      lambda_src=args.lambda_src,
                                                      lambda_u=args.lambda_u,
                                                      lambda_v=args.lambda_v,
                                                      lambda_us=args.lambda_us,
                                                      lambda_vs=args.lambda_vs,
                                                      lambda_tgt_rl=args.lambda_tgt_rl,
                                                      lambda_src_rl=args.lambda_src_rl,
                                                      report_every=args.report_every)
    else:
        raise NotImplementedError

    # Undo the per-fold scaling so predictions are in the original units.
    imputed_Xs = [scaler.inverse_transform(X) for scaler, X in zip(scalers, imputed_Xs)]
    # Take transposes here for XSMF, KXSMF
    results = evaluate_preds(test_Xs, imputed_Xs, test_masks)
    results, fold_results = summarize_results(results)
    log_results(results)
    results_dict = dict(summary=results, collected=fold_results, args=vars(args))

    pvals_data = None
    if args.pval_file:  # given pval file, also score only the significant pairs
        with open(args.pval_file, 'rb') as f:
            pvals_data = cpkl.load(f)
        assert np.all(pvals_data['cols'] == gi_data['cols'])
        assert np.all(pvals_data['rows'] == gi_data['rows'])
        pvals = pvals_data['values']
        # NaN p-values get a large sentinel so they can never pass the threshold.
        pvals_filled = np.where(np.isnan(pvals), 1000, pvals)
        sig_mask = pvals_filled < args.pval_thresh
        sig_test_Xs = [np.where(sig_mask, _X, np.nan) for _X in test_Xs]
        sig_imputed_Xs = [np.where(sig_mask, _X, np.nan) for _X in imputed_Xs]
        sig_results = evaluate_preds(sig_test_Xs, sig_imputed_Xs, test_masks)
        sig_results, sig_fold_results = summarize_results(sig_results)
        log_results(sig_results)
        results_dict['sig_summary'] = sig_results
        results_dict['sig_collected'] = sig_fold_results

    # Persist the summary (JSON) and the full fold/model data (pickle).
    with open(args.results_output, 'w') as f:
        json.dump(results_dict, f, indent=2)
    serialized_data = {
        'GIs': gi_data,
        'alg': args.mc_alg,
        'fold_data': dict(train_Xs=train_Xs, test_Xs=test_Xs, masks=test_masks),
        'imputed_Xs': imputed_Xs,
        'models_info': models_info,
        'pvals': pvals_data,
    }
    with open(args.models_output, 'wb') as f:
        cpkl.dump(serialized_data, f)
def run_kxsmfb_experiment(tgt_gis, src_gis, sim_scores, L_tgt, L_src, space, val_hf, test_hf, n_repeats, hp_iters, hp_seed):
    """Run the KXSMF_b experiment: n_repeats outer folds of hp-search + retrain.

    Per fold: hide test_hf of the target GIs, std-scale the train matrix,
    hide val_hf of it for validation, run hyperopt for hp_iters evaluations,
    retrain the best model on all training data, and evaluate on the held-out
    test entries.

    Args:
        tgt_gis / src_gis: GI dicts with 'values', 'rows', 'cols'.
        sim_scores: gene similarity scores shared across folds.
        L_tgt / L_src: graph Laplacians for target and source PPI networks.
        space: hyperopt search space.
        val_hf / test_hf: hidden fractions for validation / test splits.
        n_repeats: number of outer folds.
        hp_iters: hyperopt evaluations per fold.
        hp_seed: base seed for hyperopt (incremented per fold); may be None.

    Returns:
        (results dict with summary/fold_results/best_params, models list,
         training curves list (currently unused), hyperopt trial results list).
    """
    log = get_logger()
    all_results, all_params, all_models = [], [], []
    param_search_training_curves = []
    hp_trials = []
    src_X_scaled = MCScaler(mode='std').fit_transform(src_gis['values'])
    for i in range(n_repeats):
        log.info('[Outer fold: %i]' % i)
        scaler = MCScaler(mode='std')
        X_train, X_test, eval_mask = gi_train_test_split(tgt_gis, test_hf)
        X_train = scaler.fit_transform(X_train)
        X_train_all = X_train.copy()
        # BUGFIX: split the validation set from a per-fold shallow copy. The
        # original code assigned the scaled train matrix into tgt_gis['values'],
        # so fold i+1's train/test split was drawn from fold i's scaled,
        # already-masked data, and the caller's dict was clobbered.
        fold_gis = dict(tgt_gis)
        fold_gis['values'] = X_train
        log.info('- Holding out %.3f fraction of data for validation' % val_hf)
        X_train, X_val, _ = gi_train_test_split(fold_gis, val_hf)
        log.info('- Performing hyperparameter search for %i iterations' % hp_iters)
        trials = hyperopt.Trials()
        # NB: hyperopt's random state cannot be set globally, so we pass a
        # np.random.RandomState for reproducibility. We ignore fmin's return
        # value: trials records the *explicit* parameter dict (including
        # defaults), which makes reporting and retraining unambiguous.
        _ = hyperopt.fmin(fn=get_kxsmfb_obj(X_train, X_val, src_X_scaled,
                                            sim_scores, L_tgt, L_src),
                          space=space,
                          algo=hyperopt.tpe.suggest,
                          max_evals=hp_iters,
                          trials=trials,
                          show_progressbar=True,
                          rstate=np.random.RandomState(hp_seed))
        if hp_seed is not None:
            # Distinct but reproducible seed per fold; None stays None
            # (the original `hp_seed += 1` raised TypeError in that case).
            hp_seed += 1
        best_params = trials.best_trial['result']['params']
        # TODO: serializing models as hyperopt attachments proved too fussy;
        # retrain to recover training curves if they are ever needed again.

        # Retrain with the parameters found in the hp search, on the full
        # (unsplit) training data, to get the model we actually evaluate.
        log.info('- Retraining model without validation to get best model')
        best_model = KXSMF_b(X_tgt=X_train_all,
                             X_val=None,
                             X_src=src_X_scaled,
                             sim_scores=sim_scores,
                             L_tgt=L_tgt,
                             L_src=L_src,
                             **best_params)
        best_model.fit()

        # Predictions back in original units; symmetrize when rows == cols.
        X_fitted = scaler.inverse_transform(best_model.X_fitted)
        if len(tgt_gis['rows']) == len(tgt_gis['cols']) and np.all(tgt_gis['rows'] == tgt_gis['cols']):
            log.info('* Averaging over pairs because input is symmetric')
            X_fitted = (X_fitted.T + X_fitted) / 2.
        results = evaluate_model(X_test[eval_mask], X_fitted[eval_mask])

        log.info('[Results for fold %i]' % i)
        log.info('- Best params for model')
        log_dict(log.info, best_params)
        log.info('- Results:')
        log_dict(log.info, results)
        hp_trials.append(trials.results)
        all_results.append(results)
        all_params.append(best_params)
        all_models.append(best_model)

    # Collate the per-fold results and return
    summarized, collected = summarize_results(all_results)
    return dict(summary=summarized, fold_results=collected, best_params=all_params), \
        all_models, \
        param_search_training_curves, \
        hp_trials