Example #1
0
def run_cross_validation(settings, targets, classifiers, pipelines):
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n  '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])

    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets))
                cross_validation_score(settings, target, pipeline, classifier, classifier_name,
                    strategy=cross_validation_strategy, pool=pool, progress_str=progress_str, return_data=False, quiet=True)
    pool.close()
    pool.join()

    summaries = []
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            datas = []
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(settings, target, pipeline, classifier, classifier_name,
                    strategy=cross_validation_strategy, quiet=True)
                datas.append(data)
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)

                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [cur_score, pipeline, classifier, classifier_name]

            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name, pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary

    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        print target, best[target][0], classifier_name, pipeline.get_names()
Example #2
0
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    """Cross-validate with feature-mask ensembles of increasing size.

    Phase 1 dispatches every scoring job (full-feature plus one job per
    generated mask) to a process pool so results are computed and cached
    in parallel.  Phase 2 re-runs each combination synchronously
    (pool=None) to collect the cached results, averages predictions over
    the first N masks for each N in mask_range, and summarizes AUC scores.
    """
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                # full-feature (unmasked) run for this combination
                pool.apply_async(cross_validation_score, [settings, target, pipeline, classifier, classifier_name], {'quiet': True})
                for split_num, split_ratio in enumerate(split_ratios):
                    # random_state=0 so phase 2 regenerates identical masks
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_num, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (i+1, len(pipelines), j+1, len(classifiers), k+1, len(targets), split_num+1, len(split_ratios), mask_num+1, len(masks))
                        cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, quiet=True, return_data=False, pool=pool, progress_str=progress_str)
    pool.close()
    pool.join()
    print 'Finished cross validation mp'

    # Phase 2: synchronous re-runs pick up cached results and aggregate them.
    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            scores_full = []
            # scores_masked[split_index][mask_index] -> per-target score list
            scores_masked = [[[] for y in mask_range] for x in split_ratios]
            for i, target in enumerate(targets):
                run_prepare_data_for_cross_validation(settings, [target], [pipeline], quiet=True)
                data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, pool=None, quiet=True)
                scores_full.append(data.mean_score)

                for split_index, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline, np.max(mask_range), split_ratio, random_state=0, quiet=True)
                    for mask_index, num_masks in enumerate(mask_range):
                        predictions = []
                        y_cvs = None
                        # ensemble: average predictions over the first num_masks masks
                        for mask in masks[0:num_masks]:
                            data = cross_validation_score(settings, target, pipeline, classifier, classifier_name, feature_mask=mask, pool=None, quiet=True)
                            predictions.append(data.mean_predictions)
                            if y_cvs is None:
                                y_cvs = data.y_cvs
                            else:
                                # all masked runs must see identical CV label folds,
                                # otherwise averaging their predictions is invalid
                                for y_cv_1, y_cv_2 in zip(y_cvs, data.y_cvs):
                                    assert np.alltrue(y_cv_1 == y_cv_2)

                        predictions = np.mean(predictions, axis=0)
                        scores = [roc_auc_score(y_cv, p) for p, y_cv in zip(predictions, y_cvs)]
                        score = np.mean(scores)
                        scores_masked[split_index][mask_index].append(score)

            # NOTE(review): this 4-argument get_score_summary call differs from
            # the 2-argument form used elsewhere in this file — confirm which
            # signature the current get_score_summary actually expects.
            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num), scores_full, np.mean(scores_full), targets)
            summaries.append((summary, np.mean(scores_full)))
            for split_index, split_ratio in enumerate(split_ratios):
                for mask_index, num_masks in enumerate(mask_range):
                    scores = scores_masked[split_index][mask_index]
                    summary = get_score_summary('%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num, split_ratio, num_masks), scores, np.mean(scores), targets)
                    summaries.append((summary, np.mean(scores)))
                    print summary

    print_results(summaries)
Example #3
0
def run_cross_validation(settings, targets, pipelines, mask_range, split_ratios, classifiers):
    """Cross-validate with feature-mask ensembles of increasing size.

    First dispatches every scoring job (one full-feature job plus one job
    per generated mask) to a process pool so results are computed and
    cached in parallel.  Then re-runs each combination synchronously
    (pool=None) to collect the cached results, averages predictions over
    the first N masks for each N in mask_range, and prints AUC summaries.
    """
    pool = Pool(settings.N_jobs)
    n_pipelines, n_classifiers, n_targets = len(pipelines), len(classifiers), len(targets)
    for p_i, pipeline in enumerate(pipelines):
        for c_i, (classifier, classifier_name) in enumerate(classifiers):
            for t_i, target in enumerate(targets):
                # full-feature (unmasked) run for this combination
                pool.apply_async(cross_validation_score,
                                 [settings, target, pipeline, classifier, classifier_name],
                                 {'quiet': True})
                for s_i, split_ratio in enumerate(split_ratios):
                    # random_state=0 so the collection pass below regenerates
                    # exactly the same masks
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for m_i, mask in enumerate(masks):
                        progress_str = 'P=%d/%d C=%d/%d T=%d/%d S=%d/%d M=%d/%d' % (
                            p_i + 1, n_pipelines, c_i + 1, n_classifiers,
                            t_i + 1, n_targets, s_i + 1, len(split_ratios),
                            m_i + 1, len(masks))
                        cross_validation_score(settings, target, pipeline, classifier,
                                               classifier_name, feature_mask=mask,
                                               quiet=True, return_data=False,
                                               pool=pool, progress_str=progress_str)
    pool.close()
    pool.join()
    print('Finished cross validation mp')

    # Collection pass: synchronous re-runs pick up cached results.
    summaries = []
    for p_num, pipeline in enumerate(pipelines):
        for classifier, classifier_name in classifiers:
            full_scores = []
            # masked_scores[split_index][mask_index] -> per-target score list
            masked_scores = [[[] for _ in mask_range] for _ in split_ratios]
            for target in targets:
                run_prepare_data_for_cross_validation(settings, [target], [pipeline],
                                                      quiet=True)
                cv_data = cross_validation_score(settings, target, pipeline, classifier,
                                                 classifier_name, pool=None, quiet=True)
                full_scores.append(cv_data.mean_score)

                for s_i, split_ratio in enumerate(split_ratios):
                    masks = generate_feature_masks(settings, target, pipeline,
                                                   np.max(mask_range), split_ratio,
                                                   random_state=0, quiet=True)
                    for m_i, num_masks in enumerate(mask_range):
                        ensemble_predictions = []
                        y_cvs = None
                        # ensemble: average predictions over the first num_masks masks
                        for mask in masks[0:num_masks]:
                            cv_data = cross_validation_score(settings, target, pipeline,
                                                             classifier, classifier_name,
                                                             feature_mask=mask, pool=None,
                                                             quiet=True)
                            ensemble_predictions.append(cv_data.mean_predictions)
                            if y_cvs is None:
                                y_cvs = cv_data.y_cvs
                            else:
                                # all masked runs must share identical CV label folds
                                for fold_a, fold_b in zip(y_cvs, cv_data.y_cvs):
                                    assert np.alltrue(fold_a == fold_b)

                        averaged = np.mean(ensemble_predictions, axis=0)
                        fold_scores = [roc_auc_score(y_cv, pred)
                                       for pred, y_cv in zip(averaged, y_cvs)]
                        masked_scores[s_i][m_i].append(np.mean(fold_scores))

            summary = get_score_summary('%s p=%d full' % (classifier_name, p_num),
                                        full_scores)
            summaries.append((summary, np.mean(full_scores)))
            for s_i, split_ratio in enumerate(split_ratios):
                for m_i, num_masks in enumerate(mask_range):
                    scores = masked_scores[s_i][m_i]
                    summary = get_score_summary(
                        '%s p=%d split_ratio=%s masks=%d' % (classifier_name, p_num,
                                                             split_ratio, num_masks),
                        scores)
                    summaries.append((summary, np.mean(scores)))
                    print(summary)

    print_results(summaries)
Example #4
0
def main():
    """Entry point: run the GA feature-selection study, or (with the
    'submission' CLI argument) build the final submission instead."""
    settings = load_settings()

    # Target subjects for the seizure-prediction task.
    targets = [
        'Dog_1', 'Dog_2', 'Dog_3', 'Dog_4', 'Dog_5', 'Patient_1', 'Patient_2'
    ]

    # The genetic algorithm will be run individually on each pipeline group
    # Each entry is ([pipelines], ratio) — ratio is forwarded to process_target.
    pipeline_groups = [
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
        ], 0.55),
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ], 0.55),
        ([
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy(
                    [0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5,
                     24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([6, 15, 24])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
        ], 0.55),
    ]

    # Mode selection: `script.py submission` builds the submission,
    # otherwise the genetic algorithm study is run.
    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    run_ga = not make_submission

    # This classifier is used in the genetic algorithm
    ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)

    if run_ga:
        quiet = False
        summaries = []
        for ngen in [10]:
            for pipelines, ratio in pipeline_groups:
                out = []
                for target in targets:
                    print 'Running target', target
                    run_prepare_data_for_cross_validation(settings, [target],
                                                          pipelines,
                                                          quiet=True)
                    pipeline = FeatureConcatPipeline(*pipelines)
                    score, best_N = process_target(settings,
                                                   target,
                                                   pipeline,
                                                   ga_classifier,
                                                   ga_classifier_name,
                                                   ratio=ratio,
                                                   ngen=ngen,
                                                   quiet=quiet)
                    print target, score, [
                        np.sum(mask) for mask in best_N[0:10]
                    ]
                    out.append((target, score, pipeline, best_N))

            # NOTE(review): `out` is re-initialised for every pipeline group,
            # but this summary block sits at the `ngen` loop level, so only
            # the final group's results are summarised — confirm whether it
            # should be indented one level deeper (inside the groups loop).
            scores = np.array([score for _, score, _, _ in out])
            summary = get_score_summary(
                '%s ngen=%d' % (ga_classifier_name, ngen), scores,
                np.mean(scores), targets)
            summaries.append((summary, np.mean(scores)))
            print summary

        print_results(summaries)

    if make_submission:
        random_pipelines = [
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75),
                     FreqCorrelation(1, None, 'none')),
            Pipeline(
                InputSource(Preprocess(), Windower(75), FFT(), Magnitude()),
                FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
        ]

        # These classifiers are used to make the final predictions
        final_classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]
        targets_and_pipelines = get_submission_targets_and_masks(
            settings, targets, ga_classifier, ga_classifier_name,
            pipeline_groups, random_pipelines)
        for classifier, classifier_name in final_classifiers:
            run_make_submission(settings, targets_and_pipelines, classifier,
                                classifier_name)
def main():
    """Entry point: run the GA feature-selection study, or (with the
    'submission' CLI argument) build the final submission instead.

    NOTE(review): this is a second `def main()` — at import time it shadows
    the `main` defined earlier in this file; confirm which copy is intended.
    """
    settings = load_settings()

    # Target subjects for the seizure-prediction task.
    targets = [
        'Dog_1',
        'Dog_2',
        'Dog_3',
        'Dog_4',
        'Dog_5',
        'Patient_1',
        'Patient_2'
    ]

    # The genetic algorithm will be run individually on each pipeline group
    # Each entry is ([pipelines], ratio) — ratio is forwarded to process_target.
    pipeline_groups = [
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), PFD()),
        ], 0.55),
        ([
            Pipeline(InputSource(), Preprocess(), Windower(75), Hurst()),
        ], 0.55),
        ([
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 1, 1.75, 2.5, 3.25, 4, 5, 8.5, 12, 15.5, 19.5, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5, 6, 15])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([0.25, 2, 3.5])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([6, 15, 24])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([2, 3.5, 6])),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), PIBSpectralEntropy([3.5, 6, 15])),
            Pipeline(InputSource(), Preprocess(), Windower(75), HFD(2)),
        ], 0.55),
    ]

    # Mode selection: `script.py submission` builds the submission,
    # otherwise the genetic algorithm study is run.
    make_submission = len(sys.argv) >= 2 and sys.argv[1] == 'submission'
    run_ga = not make_submission

    # This classifier is used in the genetic algorithm
    ga_classifier, ga_classifier_name = make_svm(gamma=0.0079, C=2.7)

    if run_ga:
        quiet = False
        summaries = []
        for ngen in [10]:
            for pipelines, ratio in pipeline_groups:
                out = []
                for target in targets:
                    print 'Running target', target
                    run_prepare_data_for_cross_validation(settings, [target], pipelines, quiet=True)
                    pipeline = FeatureConcatPipeline(*pipelines)
                    score, best_N = process_target(settings, target, pipeline, ga_classifier, ga_classifier_name, ratio=ratio, ngen=ngen, quiet=quiet)
                    print target, score, [np.sum(mask) for mask in best_N[0:10]]
                    out.append((target, score, pipeline, best_N))

            # NOTE(review): `out` is re-initialised for every pipeline group,
            # but this summary block sits at the `ngen` loop level, so only
            # the final group's results are summarised — confirm whether it
            # should be indented one level deeper (inside the groups loop).
            scores = np.array([score for _, score, _, _ in out])
            summary = get_score_summary('%s ngen=%d' % (ga_classifier_name, ngen), scores)
            summaries.append((summary, np.mean(scores)))
            print summary

        print_results(summaries)

    if make_submission:
        random_pipelines = [
            Pipeline(InputSource(), Preprocess(), Windower(75), Correlation('none')),
            Pipeline(InputSource(), Preprocess(), Windower(75), FreqCorrelation(1, None, 'none')),
            Pipeline(InputSource(Preprocess(), Windower(75), FFT(), Magnitude()), FreqBinning(winning_bins, 'mean'), Log10(), FlattenChannels()),
        ]

        # These classifiers are used to make the final predictions
        final_classifiers = [
            # make_svm(gamma=0.0079, C=2.7),
            make_svm(gamma=0.0068, C=2.0),
            # make_svm(gamma=0.003, C=150.0),
            # make_lr(C=0.04),
            # make_simple_lr(),
        ]
        targets_and_pipelines = get_submission_targets_and_masks(settings, targets, ga_classifier, ga_classifier_name, pipeline_groups, random_pipelines)
        for classifier, classifier_name in final_classifiers:
            run_make_submission(settings, targets_and_pipelines, classifier, classifier_name)
Example #6
0
def run_cross_validation(settings, targets, classifiers, pipelines):
    """Cross-validate every pipeline x classifier x target combination.

    Phase 1 dispatches all scoring jobs to a process pool (results are
    computed in parallel and cached); phase 2 re-runs each combination
    synchronously to collect the cached scores, prints per-target results,
    and tracks the best-scoring pipeline/classifier for each target.
    """
    print 'Cross-validation task'
    print 'Targets', ', '.join(targets)
    print 'Pipelines:\n ', '\n  '.join([p.get_name() for p in pipelines])
    print 'Classifiers', ', '.join([c[1] for c in classifiers])

    run_prepare_data_for_cross_validation(settings, targets, pipelines)

    # run on pool first, then show results after
    pool = Pool(settings.N_jobs)
    for i, pipeline in enumerate(pipelines):
        for j, (classifier, classifier_name) in enumerate(classifiers):
            for k, target in enumerate(targets):
                progress_str = 'P=%d/%d C=%d/%d T=%d/%d' % (
                    i + 1, len(pipelines), j + 1, len(classifiers), k + 1,
                    len(targets))
                # return_data=False: only the side effect of computing and
                # caching the score in the worker process is wanted here
                cross_validation_score(settings,
                                       target,
                                       pipeline,
                                       classifier,
                                       classifier_name,
                                       strategy=cross_validation_strategy,
                                       pool=pool,
                                       progress_str=progress_str,
                                       return_data=False,
                                       quiet=True)
    pool.close()
    pool.join()

    summaries = []
    # best[target] = [score, pipeline, classifier, classifier_name]
    best = {}
    for p_num, pipeline in enumerate(pipelines):
        for c_num, (classifier, classifier_name) in enumerate(classifiers):
            mean_scores = []
            median_scores = []
            # NOTE(review): datas is populated but never read — candidate
            # for removal.
            datas = []
            for target in targets:
                print 'Running %s pipeline %s classifier %s' % (
                    target, pipeline.get_name(), classifier_name)
                data = cross_validation_score(
                    settings,
                    target,
                    pipeline,
                    classifier,
                    classifier_name,
                    strategy=cross_validation_strategy,
                    quiet=True)
                datas.append(data)
                if data.mean_score != data.median_score:
                    print '%.3f (mean)' % data.mean_score, data.mean_scores
                    print '%.3f (median)' % data.median_score, data.median_scores
                else:
                    print '%.3f' % data.mean_score
                mean_scores.append(data.mean_score)
                median_scores.append(data.median_score)

                # keep whichever combination scored highest on this target,
                # judged by the better of its mean and median CV score
                best_score = best.get(target, [0, None, None, None])[0]
                cur_score = max(data.mean_score, data.median_score)
                if cur_score > best_score:
                    best[target] = [
                        cur_score, pipeline, classifier, classifier_name
                    ]

            name = 'p=%d c=%d %s mean %s' % (p_num, c_num, classifier_name,
                                             pipeline.get_name())
            summary = get_score_summary(name, mean_scores)
            summaries.append((summary, np.mean(mean_scores)))
            print summary
            name = 'p=%d c=%d %s median %s' % (p_num, c_num, classifier_name,
                                               pipeline.get_name())
            summary = get_score_summary(name, median_scores)
            summaries.append((summary, np.mean(median_scores)))
            print summary

    print_results(summaries)

    print '\nbest'
    for target in targets:
        pipeline = best[target][1]
        classifier_name = best[target][3]
        # NOTE(review): get_names() — elsewhere in this function the method
        # is get_name(); confirm get_names() exists on Pipeline, otherwise
        # this raises AttributeError at report time.
        print target, best[target][0], classifier_name, pipeline.get_names()