def ensemble_cluster_results(
        directory=r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_izf_ob',
        ensemble_type='median',
        models_to_ensemble=('all',)):
    """Merge pickled per-model results and add an ensemble entry to them.

    Every ``*.pickle`` file in *directory* whose file name does not contain
    ``'learn_options'`` is unpickled into a ``(results, learn_options)`` pair
    of dicts keyed by model name, and the pairs are merged.  For each selected
    model, ``results[model][1][0]`` is unpacked as ``(truth, predictions)``,
    both indexed by gene; ``truth[gene]['ranks']`` and ``predictions[gene]``
    are the values used here.  Per gene, the selected models' predictions are
    placed side by side (one column per model) and combined by the
    ``ensembles`` module.

    NOTE: this name is defined a second time further down in the file; the
    later definition is the one in effect after import.

    Args:
        directory: Folder that holds the result pickles.
        ensemble_type: ``'majority'``, ``'median'`` or ``'stacking'``.  It is
            also the key under which the ensemble is stored in the output.
        models_to_ensemble: Substrings; a model takes part when any of them
            occurs in its name.  A single string is treated as a one-element
            list.  NOTE(review): ``'all'`` is matched as a plain substring
            like any other entry, not as a wildcard -- confirm this is the
            intended meaning of the default.

    Returns:
        ``(all_results, all_learn_options)``: the merged dicts, extended with
        ``all_results[ensemble_type] = [None, [[truths, predictions]], None,
        None]`` and ``all_learn_options[ensemble_type] = None``.

    Raises:
        ValueError: unknown *ensemble_type*, no results found in *directory*,
            or no model name matches *models_to_ensemble*.
        AssertionError: a model name occurs in more than one pickle, or two
            models disagree on the ground-truth ranks of a gene.
    """
    import os  # local import: only needed here for portable path handling

    valid_types = ('majority', 'median', 'stacking')
    if ensemble_type not in valid_types:
        raise ValueError('unknown ensemble_type %r; expected one of %s'
                         % (ensemble_type, ', '.join(valid_types)))

    if isinstance(models_to_ensemble, str):
        # A bare string would otherwise be iterated character by character.
        models_to_ensemble = [models_to_ensemble]

    all_results = {}
    all_learn_options = {}
    for results_file in glob.glob(os.path.join(directory, '*.pickle')):
        # Test the file name only, so a directory that happens to contain
        # 'learn_options' in its path does not cause every file to be skipped.
        if 'learn_options' in os.path.basename(results_file):
            continue
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point this function at directories whose contents are trusted.
        with open(results_file, 'rb') as f:
            results, learn_options = pickle.load(f)
        for k in results:
            if k in all_results:
                # Raised explicitly (not via `assert`) so it survives `-O`.
                raise AssertionError(
                    'model %r appears in more than one results file' % (k,))
            all_results[k] = results[k]
            all_learn_options[k] = learn_options[k]

    if not all_results:
        raise ValueError('no result pickles found in %r' % (directory,))

    models = list(all_results)
    # The gene list is taken from the truth dict of the first loaded model.
    genes = list(all_results[models[0]][1][0][0].keys())

    selected_models = [
        model for model in models
        if any(m in model for m in models_to_ensemble)
    ]
    if not selected_models:
        raise ValueError('no model name matches models_to_ensemble=%r'
                         % (list(models_to_ensemble),))

    # The stacked "other genes" data is only consumed by linear stacking.
    needs_cv = ensemble_type == 'stacking'

    ens_predictions = {}
    ens_truths = {}
    for gene in genes:
        test_columns = []
        cv_columns = []
        cv_truth = None
        reference_ranks = None
        truth = None
        other_genes = [g for g in genes if g != gene]

        for model in selected_models:
            truth, predictions = all_results[model][1][0]
            test_columns.append(predictions[gene][:, None])

            # All models must use the same ordering of the ground truth, and
            # hence of the samples; otherwise the columns are misaligned and
            # the ensemble is meaningless.
            ranks = truth[gene]['ranks']
            if reference_ranks is None:
                reference_ranks = ranks
            elif not np.all(ranks == reference_ranks):
                raise AssertionError(
                    'model %r orders the ground truth of gene %r differently'
                    % (model, gene))

            if not needs_cv:
                continue

            # Stack this model's predictions for every other gene.  The
            # leading empty float array keeps the result a float array and
            # covers the case of there being no other gene.
            cv_column = np.concatenate(
                [np.array([])]
                + [np.ravel(predictions[g]) for g in other_genes])
            cv_columns.append(cv_column[:, None])
            if cv_truth is None:
                # The truth is shared between models (checked above per
                # gene), so it is taken from the first selected model only.
                cv_truth = np.concatenate(
                    [np.array([])]
                    + [np.ravel(truth[g]['ranks']) for g in other_genes]
                )[:, None]

        test_predictions = np.concatenate(test_columns, axis=1)

        if ensemble_type == 'majority':
            y_pred = ensembles.pairwise_majority_voting(test_predictions)
        elif ensemble_type == 'median':
            y_pred = ensembles.median(test_predictions)
        else:  # 'stacking'
            cv_predictions = np.concatenate(cv_columns, axis=1)
            y_pred = ensembles.linear_stacking(
                cv_truth, cv_predictions, test_predictions)

        ens_predictions[gene] = y_pred
        # Truth object of the last selected model for this gene.
        ens_truths[gene] = truth[gene]

    all_results[ensemble_type] = [
        None, [[ens_truths, ens_predictions]], None, None]
    all_learn_options[ensemble_type] = None

    return all_results, all_learn_options
def ensemble_cluster_results(
        directory=r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_izf_ob',
        ensemble_type='median',
        models_to_ensemble=('all',)):
    """Merge pickled per-model results and add an ensemble entry to them.

    Every ``*.pickle`` file in *directory* whose file name does not contain
    ``'learn_options'`` is unpickled into a ``(results, learn_options)`` pair
    of dicts keyed by model name, and the pairs are merged.  For each selected
    model, ``results[model][1][0]`` is unpacked as ``(truth, predictions)``,
    both indexed by gene; ``truth[gene]['ranks']`` and ``predictions[gene]``
    are the values used here.  Per gene, the selected models' predictions are
    placed side by side (one column per model) and combined by the
    ``ensembles`` module.

    NOTE: an earlier definition with the same name exists in this file; this
    later one is the definition in effect after import.

    Args:
        directory: Folder that holds the result pickles.
        ensemble_type: ``'majority'``, ``'median'`` or ``'stacking'``.  It is
            also the key under which the ensemble is stored in the output.
        models_to_ensemble: Substrings; a model takes part when any of them
            occurs in its name.  A single string is treated as a one-element
            list.  NOTE(review): ``'all'`` is matched as a plain substring
            like any other entry, not as a wildcard -- confirm this is the
            intended meaning of the default.

    Returns:
        ``(all_results, all_learn_options)``: the merged dicts, extended with
        ``all_results[ensemble_type] = [None, [[truths, predictions]], None,
        None]`` and ``all_learn_options[ensemble_type] = None``.

    Raises:
        ValueError: unknown *ensemble_type*, no results found in *directory*,
            or no model name matches *models_to_ensemble*.
        AssertionError: a model name occurs in more than one pickle, or two
            models disagree on the ground-truth ranks of a gene.
    """
    import os  # local import: only needed here for portable path handling

    valid_types = ('majority', 'median', 'stacking')
    if ensemble_type not in valid_types:
        raise ValueError('unknown ensemble_type %r; expected one of %s'
                         % (ensemble_type, ', '.join(valid_types)))

    if isinstance(models_to_ensemble, str):
        # A bare string would otherwise be iterated character by character.
        models_to_ensemble = [models_to_ensemble]

    all_results = {}
    all_learn_options = {}
    for results_file in glob.glob(os.path.join(directory, '*.pickle')):
        # Test the file name only, so a directory that happens to contain
        # 'learn_options' in its path does not cause every file to be skipped.
        if 'learn_options' in os.path.basename(results_file):
            continue
        # SECURITY: pickle.load executes arbitrary code from the file; only
        # point this function at directories whose contents are trusted.
        with open(results_file, 'rb') as f:
            results, learn_options = pickle.load(f)
        for k in results:
            if k in all_results:
                # Raised explicitly (not via `assert`) so it survives `-O`.
                raise AssertionError(
                    'model %r appears in more than one results file' % (k,))
            all_results[k] = results[k]
            all_learn_options[k] = learn_options[k]

    if not all_results:
        raise ValueError('no result pickles found in %r' % (directory,))

    models = list(all_results)
    # The gene list is taken from the truth dict of the first loaded model.
    genes = list(all_results[models[0]][1][0][0].keys())

    selected_models = [
        model for model in models
        if any(m in model for m in models_to_ensemble)
    ]
    if not selected_models:
        raise ValueError('no model name matches models_to_ensemble=%r'
                         % (list(models_to_ensemble),))

    # The stacked "other genes" data is only consumed by linear stacking.
    needs_cv = ensemble_type == 'stacking'

    ens_predictions = {}
    ens_truths = {}
    for gene in genes:
        test_columns = []
        cv_columns = []
        cv_truth = None
        reference_ranks = None
        truth = None
        other_genes = [g for g in genes if g != gene]

        for model in selected_models:
            truth, predictions = all_results[model][1][0]
            test_columns.append(predictions[gene][:, None])

            # All models must use the same ordering of the ground truth, and
            # hence of the samples; otherwise the columns are misaligned and
            # the ensemble is meaningless.
            ranks = truth[gene]['ranks']
            if reference_ranks is None:
                reference_ranks = ranks
            elif not np.all(ranks == reference_ranks):
                raise AssertionError(
                    'model %r orders the ground truth of gene %r differently'
                    % (model, gene))

            if not needs_cv:
                continue

            # Stack this model's predictions for every other gene.  The
            # leading empty float array keeps the result a float array and
            # covers the case of there being no other gene.
            cv_column = np.concatenate(
                [np.array([])]
                + [np.ravel(predictions[g]) for g in other_genes])
            cv_columns.append(cv_column[:, None])
            if cv_truth is None:
                # The truth is shared between models (checked above per
                # gene), so it is taken from the first selected model only.
                cv_truth = np.concatenate(
                    [np.array([])]
                    + [np.ravel(truth[g]['ranks']) for g in other_genes]
                )[:, None]

        test_predictions = np.concatenate(test_columns, axis=1)

        if ensemble_type == 'majority':
            y_pred = ensembles.pairwise_majority_voting(test_predictions)
        elif ensemble_type == 'median':
            y_pred = ensembles.median(test_predictions)
        else:  # 'stacking'
            cv_predictions = np.concatenate(cv_columns, axis=1)
            y_pred = ensembles.linear_stacking(
                cv_truth, cv_predictions, test_predictions)

        ens_predictions[gene] = y_pred
        # Truth object of the last selected model for this gene.
        ens_truths[gene] = truth[gene]

    all_results[ensemble_type] = [
        None, [[ens_truths, ens_predictions]], None, None]
    all_learn_options[ensemble_type] = None

    return all_results, all_learn_options