Beispiel #1
0
def ensemble_cluster_results(
        directory=r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_izf_ob',
        ensemble_type='median',
        models_to_ensemble=['all']):
    """Merge pickled per-model results and add an ensemble entry to them.

    Every file matching ``directory + '\\*.pickle'`` whose path does not
    contain ``'learn_options'`` is unpickled into a ``(results, learn_options)``
    pair of dicts keyed by model name, and all pairs are merged. For each gene
    the predictions of the selected models are stacked column-wise and
    combined with the requested ensemble method from the ``ensembles`` module.

    For each model, ``results[model][1][0]`` is unpacked as a
    ``(truth, predictions)`` pair where ``predictions[gene]`` is indexed with
    ``[:, None]`` (presumably a 1-D NumPy array) and ``truth[gene]['ranks']``
    holds the ground truth used for the ordering check and for stacking.

    Args:
        directory: Directory holding the ``*.pickle`` result files. The glob
            pattern is built with a backslash separator, as in the original.
        ensemble_type: One of ``'majority'``, ``'median'`` or ``'stacking'``.
        models_to_ensemble: A model takes part in the ensemble when at least
            one of these entries satisfies ``entry in model_name`` (a
            substring test if the model names are strings). Never mutated.

    Returns:
        A tuple ``(all_results, all_learn_options)``. ``all_results`` holds
        every loaded model plus ``all_results[ensemble_type] =
        [None, [[ens_truths, ens_predictions]], None, None]``;
        ``all_learn_options[ensemble_type]`` is ``None``.

    Raises:
        ValueError: If ``ensemble_type`` is not supported, if no results were
            loaded from ``directory``, or if no loaded model matches
            ``models_to_ensemble``.
        AssertionError: If a model name appears in more than one file, or if
            two models disagree on ``truth[gene]['ranks']``.
    """
    valid_ensemble_types = ('majority', 'median', 'stacking')
    # Validate before any I/O; the original compared with `is` and left
    # `y_pred` unbound (or stale from the previous gene) when nothing matched.
    if ensemble_type not in valid_ensemble_types:
        raise ValueError("unknown ensemble_type %r; expected one of %r"
                         % (ensemble_type, valid_ensemble_types))

    all_results = {}
    all_learn_options = {}

    for results_file in glob.glob(directory + '\\*.pickle'):
        if 'learn_options' in results_file:
            continue

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this function at directories whose contents are trusted.
        with open(results_file, 'rb') as f:
            results, learn_options = pickle.load(f)

        for k in results:
            assert k not in all_results, (
                "model %r appears in more than one results file (%s)"
                % (k, results_file))
            all_results[k] = results[k]
            all_learn_options[k] = learn_options[k]

    if not all_results:
        raise ValueError("no pickled results found in %r" % (directory,))

    # Snapshot the keys: dict views are not indexable on Python 3 and would
    # also pick up the ensemble entry added at the end.
    models = list(all_results)
    genes = list(all_results[models[0]][1][0][0].keys())

    # The model filter does not depend on the gene, so compute it once.
    selected_models = [model for model in models
                       if any(m in model for m in models_to_ensemble)]
    if not selected_models:
        raise ValueError("no loaded model matches models_to_ensemble=%r"
                         % (models_to_ensemble,))

    def flat_column(arrays):
        # Seeding with an empty float array mirrors the original
        # np.append-from-empty behaviour: the result is flattened, promoted to
        # float, and is a valid (0, 1) column when `arrays` is empty.
        pieces = [np.array([])] + [np.ravel(a) for a in arrays]
        return np.concatenate(pieces)[:, None]

    use_stacking = ensemble_type == 'stacking'

    ens_predictions = {}
    ens_truths = {}
    for gene in genes:
        other_genes = [g for g in genes if g != gene]

        test_columns = []
        cv_columns = []
        cv_truth = None
        reference_ranks = None
        truth = None

        for model in selected_models:
            truth, predictions = all_results[model][1][0]
            test_columns.append(predictions[gene][:, None])

            # All models must use the same ordering of the ground truth (and
            # hence of the samples), otherwise the ensemble mixes up samples.
            if reference_ranks is None:
                reference_ranks = truth[gene]['ranks']
            else:
                assert np.all(truth[gene]['ranks'] == reference_ranks), (
                    "model %r orders gene %r differently" % (model, gene))

            # Only stacking consumes the held-out data: for this model, stack
            # the predictions on every other gene into one column.
            if use_stacking:
                cv_columns.append(
                    flat_column(predictions[g] for g in other_genes))
                if cv_truth is None:
                    cv_truth = flat_column(
                        truth[g]['ranks'] for g in other_genes)

        test_predictions = np.concatenate(test_columns, axis=1)

        if ensemble_type == 'majority':
            y_pred = ensembles.pairwise_majority_voting(test_predictions)
        elif ensemble_type == 'median':
            y_pred = ensembles.median(test_predictions)
        else:  # 'stacking'
            cv_predictions = np.concatenate(cv_columns, axis=1)
            y_pred = ensembles.linear_stacking(cv_truth, cv_predictions,
                                               test_predictions)

        ens_predictions[gene] = y_pred
        # Truth of the last selected model, as in the original code.
        ens_truths[gene] = truth[gene]

    all_results[ensemble_type] = [
        None, [[ens_truths, ens_predictions]], None, None
    ]
    all_learn_options[ensemble_type] = None

    return all_results, all_learn_options
Beispiel #2
0
def ensemble_cluster_results(directory=r'\\fusi1\crispr2\analysis\cluster\results\cluster_experiment_izf_ob', ensemble_type='median', models_to_ensemble=['all']):
    """Merge pickled per-model results and add an ensemble entry to them.

    Every file matching ``directory + '\\*.pickle'`` whose path does not
    contain ``'learn_options'`` is unpickled into a ``(results, learn_options)``
    pair of dicts keyed by model name, and all pairs are merged. For each gene
    the predictions of the selected models are stacked column-wise and
    combined with the requested ensemble method from the ``ensembles`` module.

    For each model, ``results[model][1][0]`` is unpacked as a
    ``(truth, predictions)`` pair where ``predictions[gene]`` is indexed with
    ``[:, None]`` (presumably a 1-D NumPy array) and ``truth[gene]['ranks']``
    holds the ground truth used for the ordering check and for stacking.

    Args:
        directory: Directory holding the ``*.pickle`` result files. The glob
            pattern is built with a backslash separator, as in the original.
        ensemble_type: One of ``'majority'``, ``'median'`` or ``'stacking'``.
        models_to_ensemble: A model takes part in the ensemble when at least
            one of these entries satisfies ``entry in model_name`` (a
            substring test if the model names are strings). Never mutated.

    Returns:
        A tuple ``(all_results, all_learn_options)``. ``all_results`` holds
        every loaded model plus ``all_results[ensemble_type] =
        [None, [[ens_truths, ens_predictions]], None, None]``;
        ``all_learn_options[ensemble_type]`` is ``None``.

    Raises:
        ValueError: If ``ensemble_type`` is not supported, if no results were
            loaded from ``directory``, or if no loaded model matches
            ``models_to_ensemble``.
        AssertionError: If a model name appears in more than one file, or if
            two models disagree on ``truth[gene]['ranks']``.
    """
    valid_ensemble_types = ('majority', 'median', 'stacking')
    # Validate before any I/O; the original compared with `is` and left
    # `y_pred` unbound (or stale from the previous gene) when nothing matched.
    if ensemble_type not in valid_ensemble_types:
        raise ValueError("unknown ensemble_type %r; expected one of %r"
                         % (ensemble_type, valid_ensemble_types))

    all_results = {}
    all_learn_options = {}

    for results_file in glob.glob(directory + '\\*.pickle'):
        if 'learn_options' in results_file:
            continue

        # NOTE(security): pickle.load can execute arbitrary code; only point
        # this function at directories whose contents are trusted.
        with open(results_file, 'rb') as f:
            results, learn_options = pickle.load(f)

        for k in results:
            assert k not in all_results, (
                "model %r appears in more than one results file (%s)"
                % (k, results_file))
            all_results[k] = results[k]
            all_learn_options[k] = learn_options[k]

    if not all_results:
        raise ValueError("no pickled results found in %r" % (directory,))

    # Snapshot the keys: dict views are not indexable on Python 3 and would
    # also pick up the ensemble entry added at the end.
    models = list(all_results)
    genes = list(all_results[models[0]][1][0][0].keys())

    # The model filter does not depend on the gene, so compute it once.
    selected_models = [model for model in models
                       if any(m in model for m in models_to_ensemble)]
    if not selected_models:
        raise ValueError("no loaded model matches models_to_ensemble=%r"
                         % (models_to_ensemble,))

    def flat_column(arrays):
        # Seeding with an empty float array mirrors the original
        # np.append-from-empty behaviour: the result is flattened, promoted to
        # float, and is a valid (0, 1) column when `arrays` is empty.
        pieces = [np.array([])] + [np.ravel(a) for a in arrays]
        return np.concatenate(pieces)[:, None]

    use_stacking = ensemble_type == 'stacking'

    ens_predictions = {}
    ens_truths = {}
    for gene in genes:
        other_genes = [g for g in genes if g != gene]

        test_columns = []
        cv_columns = []
        cv_truth = None
        reference_ranks = None
        truth = None

        for model in selected_models:
            truth, predictions = all_results[model][1][0]
            test_columns.append(predictions[gene][:, None])

            # All models must use the same ordering of the ground truth (and
            # hence of the samples), otherwise the ensemble mixes up samples.
            if reference_ranks is None:
                reference_ranks = truth[gene]['ranks']
            else:
                assert np.all(truth[gene]['ranks'] == reference_ranks), (
                    "model %r orders gene %r differently" % (model, gene))

            # Only stacking consumes the held-out data: for this model, stack
            # the predictions on every other gene into one column.
            if use_stacking:
                cv_columns.append(
                    flat_column(predictions[g] for g in other_genes))
                if cv_truth is None:
                    cv_truth = flat_column(
                        truth[g]['ranks'] for g in other_genes)

        test_predictions = np.concatenate(test_columns, axis=1)

        if ensemble_type == 'majority':
            y_pred = ensembles.pairwise_majority_voting(test_predictions)
        elif ensemble_type == 'median':
            y_pred = ensembles.median(test_predictions)
        else:  # 'stacking'
            cv_predictions = np.concatenate(cv_columns, axis=1)
            y_pred = ensembles.linear_stacking(cv_truth, cv_predictions, test_predictions)

        ens_predictions[gene] = y_pred
        # Truth of the last selected model, as in the original code.
        ens_truths[gene] = truth[gene]

    all_results[ensemble_type] = [None, [[ens_truths, ens_predictions]], None, None]
    all_learn_options[ensemble_type] = None

    return all_results, all_learn_options