def _evaluate_combinations(values, train_splits, test_splits, args):
    scores = []
    keys = list(values.keys())  # materialize so the keys can be indexed below
    combinations = list(itertools.product(*values.values()))
    for combination in combinations:
        for idx, value in enumerate(combination):
            setattr(args, "decision_maker_" + keys[idx], value)
        print("evaluating combination %s ..." % str(combination))
        start = timeit.default_timer()
        y_true = []
        y_pred = []
        for rnd, (train, test) in enumerate(zip(train_splits, test_splits)):
            decision_maker = decision.decision_maker_from_args(args)
            y_pred.append(evaluate_decision_maker(decision_maker, train, test, args))
            y_true.append(test[1])
        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        assert y_true.shape == y_pred.shape
        score = _compute_score(y_true, y_pred, args)
        scores.append(score)
        print("score: %f" % score)
        print("done, took %fs" % (timeit.default_timer() - start))
        print("")

    best_idx = _select_best_score(scores, args)
    print("best combination with score %f: %s" % (scores[best_idx], str(combinations[best_idx])))

    # Save results
    if args.output is not None:
        with open(args.output, "wb") as f:
            writer = csv.writer(f, delimiter=";")
            writer.writerow(["", "idx", "score"] + keys)
            for idx, (score, combination) in enumerate(zip(scores, combinations)):
                selected = "*" if best_idx == idx else ""
                writer.writerow([selected, "%d" % idx, "%f" % score] + list(combination))
Code example #2
def _evaluate_combinations(values, train_splits, test_splits, args):
    scores = []
    keys = list(values.keys())  # materialize so the keys can be indexed below
    combinations = list(itertools.product(*values.values()))
    for combination in combinations:
        for idx, value in enumerate(combination):
            setattr(args, 'decision_maker_' + keys[idx], value)
        print('evaluating combination %s ...' % str(combination))
        start = timeit.default_timer()
        y_true = []
        y_pred = []
        for rnd, (train, test) in enumerate(zip(train_splits, test_splits)):
            decision_maker = decision.decision_maker_from_args(args)
            y_pred.append(
                evaluate_decision_maker(decision_maker, train, test, args))
            y_true.append(test[1])
        y_true = np.vstack(y_true)
        y_pred = np.vstack(y_pred)
        assert y_true.shape == y_pred.shape
        score = _compute_score(y_true, y_pred, args)
        scores.append(score)
        print('score: %f' % score)
        print('done, took %fs' % (timeit.default_timer() - start))
        print('')

    best_idx = _select_best_score(scores, args)
    print('best combination with score %f: %s' %
          (scores[best_idx], str(combinations[best_idx])))

    # Save results
    if args.output is not None:
        with open(args.output, 'wb') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(['', 'idx', 'score'] + keys)
            for idx, (score,
                      combination) in enumerate(zip(scores, combinations)):
                selected = '*' if best_idx == idx else ''
                writer.writerow([selected, '%d' %
                                 idx, '%f' % score] + list(combination))
Code example #3
def evaluate_end_to_end(dataset, iterator, args):
    # Things to evaluate
    feature_sets = [('normalized_root_rot_norm',),
                    ('normalized_root_pos', 'normalized_root_vel', 'normalized_extremity_pos', 'normalized_root_rot', 'normalized_root_rot_norm'),
                    ('normalized_extremity_pos', 'normalized_root_rot', 'normalized_root_rot_norm',),
                    ('normalized_root_pos', 'normalized_root_vel', 'normalized_com_pos', 'normalized_extremity_pos', 'normalized_root_rot', 'normalized_root_rot_norm'),
                    ('normalized_root_vel', 'normalized_extremity_pos', 'normalized_root_rot', 'normalized_root_rot_norm'),
                    ('normalized_root_pos', 'normalized_root_vel', 'normalized_com_pos', 'normalized_extremity_pos', 'normalized_root_rot', 'normalized_root_rot_norm', 'normalized_marker_vel_norm')]
    values = {'features': range(len(feature_sets)),
              'hyperparams': [('left-to-right-full', 5), ('left-to-right-1', 6), ('left-to-right-2', 5), ('full', 8)],
              'init': [('uniform', 'k-means', 'diag')],
              'model': [('hmm', None), ('fhmm-seq', 2)],
              'decision': ['all']}
    decision_makers = ['log-regression', 'svm', 'decision-tree', 'random-forest', 'zero', 'max']

    datasets = []
    print('selecting features ...')
    start = timeit.default_timer()
    for feature_set in feature_sets:
        features = _explode_features(feature_set)
        curr_dataset = dataset.dataset_from_feature_names(features)
        datasets.append(curr_dataset)
    dataset = None  # ensure that dataset is not usable hereinafter
    assert len(datasets) == len(feature_sets)
    print('done, took %fs' % (timeit.default_timer() - start))
    print('')

    # Save state
    output_dir = args.output_dir

    # Stats
    combinations = []
    total_accuracies = []
    precisions_mean = []
    precisions_std = []
    precisions_min = []
    precisions_max = []
    recalls_mean = []
    recalls_std = []
    recalls_min = []
    recalls_max = []
    fscores_mean = []
    fscores_std = []
    fscores_max = []
    fscores_min = []
    pos_ll_means = []
    pos_ll_stds = []
    neg_ll_means = []
    neg_ll_stds = []

    keys = list(values.keys())  # materialize so keys.index() works below
    iterable_combinations = list(itertools.product(*values.values()))
    curr_step = 0
    for idx, combination in enumerate(iterable_combinations):
        print('(%.3d/%.3d) evaluating combination %s + decision makers ...' % (idx + 1, len(iterable_combinations), str(combination)))
        start = timeit.default_timer()

        curr_dataset = datasets[combination[keys.index('features')]]
        topology, n_states = combination[keys.index('hyperparams')]
        transition_init, emission_init, covar_type = combination[keys.index('init')]
        model, n_chains = combination[keys.index('model')]

        if output_dir is not None:
            curr_path = os.path.join(output_dir, '%.3d' % curr_step)
            os.mkdir(curr_path)
            args.output_dir = curr_path

        # Configure HMMs
        args.topology = topology
        args.n_states = n_states
        args.transition_init = transition_init
        args.emission_init = emission_init
        args.covar_type = covar_type
        args.model = model
        args.n_chains = n_chains
        args.decision_maker = None

        train_ll, train_y, test_ll, test_y = _evaluate_model(curr_dataset, iterator, args, print_results=False)
        assert len(train_ll) == len(train_y)
        assert len(train_ll) == len(test_ll)
        assert len(train_ll) == len(test_y)
        train_ll_combined = np.vstack(train_ll)
        test_ll_combined = np.vstack(test_ll)
        test_y_combined = np.vstack(test_y)
        assert train_ll_combined.shape == np.vstack(train_y).shape
        assert test_ll_combined.shape == test_y_combined.shape
        assert train_ll_combined.shape[0] > test_ll_combined.shape[0]  # sanity check so that train and test are not confused

        n_samples, n_labels = test_ll_combined.shape
        curr_pos_ll_means = []
        curr_pos_ll_stds = []
        curr_neg_ll_means = []
        curr_neg_ll_stds = []
        for label_idx in xrange(n_labels):
            label_y = test_y_combined[:, label_idx]
            pos_indexes = np.where(label_y == 1)[0]
            neg_indexes = np.where(label_y == 0)[0]
            pos_ll = test_ll_combined[pos_indexes, label_idx]
            neg_ll = test_ll_combined[neg_indexes, label_idx]
            assert np.size(pos_ll) + np.size(neg_ll) == n_samples
            curr_pos_ll_means.append(np.mean(pos_ll))
            curr_pos_ll_stds.append(np.std(pos_ll))
            curr_neg_ll_means.append(np.mean(neg_ll))
            curr_neg_ll_stds.append(np.std(neg_ll))

        for name in decision_makers:
            args.decision_maker = name
            modified_combination = list(combination)
            modified_combination[keys.index('decision')] = name
            if name == 'svm':
                args.decision_maker_C = 1e-2
                args.decision_maker_penalty = 'l1'
            elif name == 'log-regression':
                args.decision_maker_C = 1e-3
                args.decision_maker_penalty = 'l1'
            elif name == 'decision-tree':
                args.decision_maker_criterion = 'entropy'
                args.decision_maker_max_depth = 15
            elif name == 'random-forest':
                args.decision_maker_criterion = 'entropy'
                args.decision_maker_n_estimators = 40
                args.decision_maker_max_depth = 15

            curr_preds = []
            for curr_train_ll, curr_train_y, curr_test_ll in zip(train_ll, train_y, test_ll):
                print('training decision maker')
                assert curr_train_ll.shape == curr_train_y.shape
                assert curr_train_ll.shape[0] > curr_test_ll.shape[0]
                decision_maker = decision.decision_maker_from_args(args)
                assert decision_maker is not None

                # Fit and predict using the decision maker
                if hasattr(decision_maker, 'fit') and callable(decision_maker.fit):
                    decision_maker.fit(curr_train_ll, curr_train_y)
                curr_preds.append(decision_maker.predict(curr_test_ll))
            print('')
            test_pred_combined = np.vstack(curr_preds)
            assert test_y_combined.shape == test_pred_combined.shape

            # Track everything
            combinations.append(modified_combination)
            total_accuracies.append(sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            precision, recall, fscore, _ = sk_metrics.precision_recall_fscore_support(test_y_combined, test_pred_combined)

            precisions_mean.append(np.mean(precision))
            precisions_std.append(np.std(precision))
            precisions_min.append(np.min(precision))
            precisions_max.append(np.max(precision))

            recalls_mean.append(np.mean(recall))
            recalls_std.append(np.std(recall))
            recalls_min.append(np.min(recall))
            recalls_max.append(np.max(recall))

            fscores_mean.append(np.mean(fscore))
            fscores_std.append(np.std(fscore))
            fscores_min.append(np.min(fscore))
            fscores_max.append(np.max(fscore))

            pos_ll_means.append(np.array(np.median(curr_pos_ll_means)))
            pos_ll_stds.append(np.array(np.median(curr_pos_ll_stds)))
            neg_ll_means.append(np.array(np.median(curr_neg_ll_means)))
            neg_ll_stds.append(np.array(np.median(curr_neg_ll_stds)))

            curr_step += 1
        print('done, took %fs' % (timeit.default_timer() - start))
        print('')
    assert len(combinations) == len(fscores_mean)
    assert len(combinations) == len(fscores_std)
    assert len(combinations) == len(fscores_min)
    assert len(combinations) == len(fscores_max)

    assert len(combinations) == len(precisions_mean)
    assert len(combinations) == len(precisions_std)
    assert len(combinations) == len(precisions_min)
    assert len(combinations) == len(precisions_max)

    assert len(combinations) == len(recalls_mean)
    assert len(combinations) == len(recalls_std)
    assert len(combinations) == len(recalls_min)
    assert len(combinations) == len(recalls_max)

    assert len(combinations) == len(pos_ll_means)
    assert len(combinations) == len(pos_ll_stds)
    assert len(combinations) == len(neg_ll_means)
    assert len(combinations) == len(neg_ll_stds)

    assert len(combinations) == len(total_accuracies)

    # Save results
    if output_dir is not None:
        filename = 'results.csv'
        with open(os.path.join(output_dir, filename), 'wb') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow(['idx', 'combination', 'f1-score-mean', 'f1-score-std', 'f1-score-min', 'f1-score-max',
                             'precision-mean', 'precision-std', 'precision-min', 'precision-max',
                             'recall-mean', 'recall-std', 'recall-min', 'recall-max', 'total-accuracy',
                             'pos-ll-mean', 'pos-ll-std', 'neg-ll-mean', 'neg-ll-std'])
            for idx, d in enumerate(zip(combinations, fscores_mean, fscores_std, fscores_min, fscores_max,
                                        precisions_mean, precisions_std, precisions_min, precisions_max,
                                        recalls_mean, recalls_std, recalls_min, recalls_max,
                                        total_accuracies, pos_ll_means, pos_ll_stds, neg_ll_means, neg_ll_stds)):
                combination = ', '.join([str(x) for x in d[0]])
                new_data = ['%d' % idx] + list((combination, ) + d[1:])
                writer.writerow(new_data)
    print(len(combinations))
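
The results.csv written above uses ';' as the delimiter and contains one row per (HMM configuration, decision maker) pair. A short sketch of inspecting the best rows, assuming pandas is available and the file was written to output/results.csv (both assumptions, not part of the original code):

import pandas as pd

# Path and pandas usage are assumptions; column names follow the header row
# written by evaluate_end_to_end.
results = pd.read_csv('output/results.csv', sep=';')
top = results.sort_values('f1-score-mean', ascending=False).head(10)
print(top[['idx', 'combination', 'f1-score-mean', 'total-accuracy']])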
Code example #4
def _evaluate_model(dataset, iterator, args, print_results=False):
    loglikelihood_method = args.loglikelihood_method

    # Collect stats
    train_loglikelihoods = []
    train_predictions = []
    train_labels = []
    test_loglikelihoods = []
    test_predictions = []
    test_labels = []
    for rnd, (train_indexes, test_indexes) in enumerate(iterator):
        assert len(set(train_indexes).intersection(set(test_indexes))) == 0
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes, transformers)
        assert train.n_samples == len(train_indexes)
        assert test.n_samples == len(test_indexes)
        train_labels.append(train.y)
        test_labels.append(test.y)
        classifier = get_classifier(args)

        if print_results:
            print('evaluation round %d' % (rnd + 1))
            print('  train split: %s' % train_indexes)
            print('  test split:  %s' % test_indexes)
            print('  training classifier on training samples ...')
        start = timeit.default_timer()
        classifier.fit(train.X, train.y)
        stop = timeit.default_timer()
        if args.output_dir is not None:
            name = 'rnd%d_model.pkl' % (rnd+1)
            with open(os.path.join(args.output_dir, name), 'wb') as f:
                pickle.dump(classifier, f)
        if print_results:
            print('  done, took %fs' % (stop - start))

        if print_results:
            print('  computing %s loglikelihoods on train dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        train_ll = classifier.loglikelihoods(train.X, method=loglikelihood_method)
        train_loglikelihoods.append(train_ll)
        stop = timeit.default_timer()
        if print_results:
            print('  done, took %fs' % (stop - start))

        if print_results:
            print('  computing %s loglikelihoods on test dataset ...' % loglikelihood_method)
        start = timeit.default_timer()
        test_ll = classifier.loglikelihoods(test.X, method=loglikelihood_method)
        test_loglikelihoods.append(test_ll)
        stop = timeit.default_timer()
        if print_results:
            print('  done, took %fs' % (stop - start))

        decision_maker = decision.decision_maker_from_args(args)
        train_pred, test_pred = None, None
        if decision_maker is not None:
            if hasattr(decision_maker, 'fit') and callable(decision_maker.fit):
                if print_results:
                    print('  training decision maker %s on train loglikelihoods ...' % args.decision_maker)
                start = timeit.default_timer()
                decision_maker.fit(train_ll, train.y)
                stop = timeit.default_timer()
                if print_results:
                    print('  done, took %fs' % (stop - start))

            if print_results:
                print('  predicting labels on train dataset ...')
            start = timeit.default_timer()
            train_pred = decision_maker.predict(train_ll)
            train_predictions.append(train_pred)
            stop = timeit.default_timer()
            if print_results:
                print('  done, took %fs' % (stop - start))

            if print_results:
                print('  predicting labels on test dataset ...')
            start = timeit.default_timer()
            test_pred = decision_maker.predict(test_ll)
            test_predictions.append(test_pred)
            stop = timeit.default_timer()
            if print_results:
                print('  done, took %fs' % (stop - start))
        if print_results:
            print('')

        # Save round results
        if args.output_dir is not None:
            save_results(args.output_dir, train.y, train_pred, train_ll, prefix='rnd%d_train' % (rnd+1))
            save_results(args.output_dir, test.y, test_pred, test_ll, prefix='rnd%d_test' % (rnd+1))

    # Combine and save combined results
    train_y_combined = np.vstack(train_labels)
    train_ll_combined = np.vstack(train_loglikelihoods)
    train_pred_combined = np.vstack(train_predictions) if len(train_predictions) > 0 else None
    test_ll_combined = np.vstack(test_loglikelihoods)
    test_y_combined = np.vstack(test_labels)
    test_pred_combined = np.vstack(test_predictions) if len(test_predictions) > 0 else None
    if args.output_dir is not None:
        save_results(args.output_dir, train_y_combined, train_pred_combined, train_ll_combined, 'combined_train')
        save_results(args.output_dir, test_y_combined, test_pred_combined, test_ll_combined, 'combined_test')

    if print_results:
        # Print report
        label_names = dataset.unique_labels
        print('*** train dataset summary ***')
        print('')
        print(metrics.multilabel_loglikelihood_summary_report(train_y_combined, train_ll_combined, target_names=label_names))
        print('')
        if train_pred_combined is not None:
            print(metrics.multilabel_classification_report(train_y_combined, train_pred_combined, target_names=label_names))
            print('total accuracy: %.3f' % sk_metrics.accuracy_score(train_y_combined, train_pred_combined))
            print('')

        print('')
        print('*** test dataset summary ***')
        print('')
        print(metrics.multilabel_loglikelihood_summary_report(test_y_combined, test_ll_combined, target_names=label_names))
        print('')
        if test_pred_combined is not None:
            print(metrics.multilabel_classification_report(test_y_combined, test_pred_combined, target_names=label_names))
            print('total accuracy: %.3f' % sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            print('')

    return train_loglikelihoods, train_labels, test_loglikelihoods, test_labels
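
_evaluate_model expects iterator to yield disjoint (train_indexes, test_indexes) pairs, one per evaluation round (the first assert enforces the disjointness), and returns the per-round log-likelihoods and label matrices. A minimal sketch of building such an iterator, assuming scikit-learn's KFold is an acceptable source of splits:

import numpy as np
from sklearn.model_selection import KFold

n_samples = 100  # assumed number of samples in the dataset being split
kf = KFold(n_splits=5, shuffle=True, random_state=0)
# Each element is a disjoint (train_indexes, test_indexes) pair, which is
# exactly the kind of split _evaluate_model iterates over.
iterator = list(kf.split(np.arange(n_samples)))
# train_ll, train_y, test_ll, test_y = _evaluate_model(dataset, iterator, args)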
Code example #5
def _evaluate_model(dataset, iterator, args, print_results=False):
    loglikelihood_method = args.loglikelihood_method

    # Collect stats
    train_loglikelihoods = []
    train_predictions = []
    train_labels = []
    test_loglikelihoods = []
    test_predictions = []
    test_labels = []
    for rnd, (train_indexes, test_indexes) in enumerate(iterator):
        assert len(set(train_indexes).intersection(set(test_indexes))) == 0
        transformers = data.transformers_from_args(args)
        train, test = dataset.split_train_test(train_indexes, test_indexes,
                                               transformers)
        assert train.n_samples == len(train_indexes)
        assert test.n_samples == len(test_indexes)
        train_labels.append(train.y)
        test_labels.append(test.y)
        classifier = get_classifier(args)

        if print_results:
            print('evaluation round %d' % (rnd + 1))
            print('  train split: %s' % train_indexes)
            print('  test split:  %s' % test_indexes)
            print('  training classifier on training samples ...')
        start = timeit.default_timer()
        classifier.fit(train.X, train.y)
        stop = timeit.default_timer()
        if args.output_dir is not None:
            name = 'rnd%d_model.pkl' % (rnd + 1)
            with open(os.path.join(args.output_dir, name), 'wb') as f:
                pickle.dump(classifier, f)
        if print_results:
            print('  done, took %fs' % (stop - start))

        if print_results:
            print('  computing %s loglikelihoods on train dataset ...' %
                  loglikelihood_method)
        start = timeit.default_timer()
        train_ll = classifier.loglikelihoods(train.X,
                                             method=loglikelihood_method)
        train_loglikelihoods.append(train_ll)
        stop = timeit.default_timer()
        if print_results:
            print('  done, took %fs' % (stop - start))

        if print_results:
            print('  computing %s loglikelihoods on test dataset ...' %
                  loglikelihood_method)
        start = timeit.default_timer()
        test_ll = classifier.loglikelihoods(test.X,
                                            method=loglikelihood_method)
        test_loglikelihoods.append(test_ll)
        stop = timeit.default_timer()
        if print_results:
            print('  done, took %fs' % (stop - start))

        decision_maker = decision.decision_maker_from_args(args)
        train_pred, test_pred = None, None
        if decision_maker is not None:
            if hasattr(decision_maker, 'fit') and callable(decision_maker.fit):
                if print_results:
                    print(
                        '  training decision maker %s on train loglikelihoods ...'
                        % args.decision_maker)
                start = timeit.default_timer()
                decision_maker.fit(train_ll, train.y)
                stop = timeit.default_timer()
                if print_results:
                    print('  done, took %fs' % (stop - start))

            if print_results:
                print('  predicting labels on train dataset ...')
            start = timeit.default_timer()
            train_pred = decision_maker.predict(train_ll)
            train_predictions.append(train_pred)
            stop = timeit.default_timer()
            if print_results:
                print('  done, took %fs' % (stop - start))

            if print_results:
                print('  predicting labels on test dataset ...')
            start = timeit.default_timer()
            test_pred = decision_maker.predict(test_ll)
            test_predictions.append(test_pred)
            stop = timeit.default_timer()
            if print_results:
                print('  done, took %fs' % (stop - start))
        if print_results:
            print('')

        # Save round results
        if args.output_dir is not None:
            save_results(args.output_dir,
                         train.y,
                         train_pred,
                         train_ll,
                         prefix='rnd%d_train' % (rnd + 1))
            save_results(args.output_dir,
                         test.y,
                         test_pred,
                         test_ll,
                         prefix='rnd%d_test' % (rnd + 1))

    # Combine and save combined results
    train_y_combined = np.vstack(train_labels)
    train_ll_combined = np.vstack(train_loglikelihoods)
    train_pred_combined = np.vstack(
        train_predictions) if len(train_predictions) > 0 else None
    test_ll_combined = np.vstack(test_loglikelihoods)
    test_y_combined = np.vstack(test_labels)
    test_pred_combined = np.vstack(
        test_predictions) if len(test_predictions) > 0 else None
    if args.output_dir is not None:
        save_results(args.output_dir, train_y_combined, train_pred_combined,
                     train_ll_combined, 'combined_train')
        save_results(args.output_dir, test_y_combined, test_pred_combined,
                     test_ll_combined, 'combined_test')

    if print_results:
        # Print report
        label_names = dataset.unique_labels
        print('*** train dataset summary ***')
        print('')
        print(
            metrics.multilabel_loglikelihood_summary_report(
                train_y_combined, train_ll_combined, target_names=label_names))
        print('')
        if train_pred_combined is not None:
            print(
                metrics.multilabel_classification_report(
                    train_y_combined,
                    train_pred_combined,
                    target_names=label_names))
            print(
                'total accuracy: %.3f' % sk_metrics.accuracy_score(
                    train_y_combined, train_pred_combined))
            print('')

        print('')
        print('*** test dataset summary ***')
        print('')
        print(
            metrics.multilabel_loglikelihood_summary_report(
                test_y_combined, test_ll_combined, target_names=label_names))
        print('')
        if test_pred_combined is not None:
            print(
                metrics.multilabel_classification_report(
                    test_y_combined,
                    test_pred_combined,
                    target_names=label_names))
            print(
                'total accuracy: %.3f' %
                sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            print('')

    return train_loglikelihoods, train_labels, test_loglikelihoods, test_labels
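
Each round's fitted classifier is pickled to rnd<N>_model.pkl inside args.output_dir. A small sketch of restoring one of those models later, assuming round 1 was saved under output/ (the path is an assumed example):

import os
import pickle

# The file name mirrors the 'rnd%d_model.pkl' pattern used above; the
# directory is an assumed example.
model_path = os.path.join('output', 'rnd1_model.pkl')
with open(model_path, 'rb') as f:
    classifier = pickle.load(f)
# The restored object exposes the same interface used during evaluation,
# e.g. classifier.loglikelihoods(X, method=args.loglikelihood_method).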
Code example #6
def evaluate_end_to_end(dataset, iterator, args):
    # Things to evaluate
    feature_sets = [
        ('normalized_root_rot_norm', ),
        ('normalized_root_pos', 'normalized_root_vel',
         'normalized_extremity_pos', 'normalized_root_rot',
         'normalized_root_rot_norm'),
        (
            'normalized_extremity_pos',
            'normalized_root_rot',
            'normalized_root_rot_norm',
        ),
        ('normalized_root_pos', 'normalized_root_vel', 'normalized_com_pos',
         'normalized_extremity_pos', 'normalized_root_rot',
         'normalized_root_rot_norm'),
        ('normalized_root_vel', 'normalized_extremity_pos',
         'normalized_root_rot', 'normalized_root_rot_norm'),
        ('normalized_root_pos', 'normalized_root_vel', 'normalized_com_pos',
         'normalized_extremity_pos', 'normalized_root_rot',
         'normalized_root_rot_norm', 'normalized_marker_vel_norm')
    ]
    values = {
        'features':
        range(len(feature_sets)),
        'hyperparams': [('left-to-right-full', 5), ('left-to-right-1', 6),
                        ('left-to-right-2', 5), ('full', 8)],
        'init': [('uniform', 'k-means', 'diag')],
        'model': [('hmm', None), ('fhmm-seq', 2)],
        'decision': ['all']
    }
    decision_makers = [
        'log-regression', 'svm', 'decision-tree', 'random-forest', 'zero',
        'max'
    ]

    datasets = []
    print('selecting features ...')
    start = timeit.default_timer()
    for feature_set in feature_sets:
        features = _explode_features(feature_set)
        curr_dataset = dataset.dataset_from_feature_names(features)
        datasets.append(curr_dataset)
    dataset = None  # ensure that dataset is not usable hereinafter
    assert len(datasets) == len(feature_sets)
    print('done, took %fs' % (timeit.default_timer() - start))
    print('')

    # Save state
    output_dir = args.output_dir

    # Stats
    combinations = []
    total_accuracies = []
    precisions_mean = []
    precisions_std = []
    precisions_min = []
    precisions_max = []
    recalls_mean = []
    recalls_std = []
    recalls_min = []
    recalls_max = []
    fscores_mean = []
    fscores_std = []
    fscores_max = []
    fscores_min = []
    pos_ll_means = []
    pos_ll_stds = []
    neg_ll_means = []
    neg_ll_stds = []

    keys = list(values.keys())  # materialize so keys.index() works below
    iterable_combinations = list(itertools.product(*values.values()))
    curr_step = 0
    for idx, combination in enumerate(iterable_combinations):
        print('(%.3d/%.3d) evaluating combination %s + decision makers ...' %
              (idx + 1, len(iterable_combinations), str(combination)))
        start = timeit.default_timer()

        curr_dataset = datasets[combination[keys.index('features')]]
        topology, n_states = combination[keys.index('hyperparams')]
        transition_init, emission_init, covar_type = combination[keys.index(
            'init')]
        model, n_chains = combination[keys.index('model')]

        if output_dir is not None:
            curr_path = os.path.join(output_dir, '%.3d' % curr_step)
            os.mkdir(curr_path)
            args.output_dir = curr_path

        # Configure HMMs
        args.topology = topology
        args.n_states = n_states
        args.transition_init = transition_init
        args.emission_init = emission_init
        args.covar_type = covar_type
        args.model = model
        args.n_chains = n_chains
        args.decision_maker = None

        train_ll, train_y, test_ll, test_y = _evaluate_model(
            curr_dataset, iterator, args, print_results=False)
        assert len(train_ll) == len(train_y)
        assert len(train_ll) == len(test_ll)
        assert len(train_ll) == len(test_y)
        train_ll_combined = np.vstack(train_ll)
        test_ll_combined = np.vstack(test_ll)
        test_y_combined = np.vstack(test_y)
        assert train_ll_combined.shape == np.vstack(train_y).shape
        assert test_ll_combined.shape == test_y_combined.shape
        assert train_ll_combined.shape[0] > test_ll_combined.shape[
            0]  # sanity check so that train and test are not confused

        n_samples, n_labels = test_ll_combined.shape
        curr_pos_ll_means = []
        curr_pos_ll_stds = []
        curr_neg_ll_means = []
        curr_neg_ll_stds = []
        for label_idx in xrange(n_labels):
            label_y = test_y_combined[:, label_idx]
            pos_indexes = np.where(label_y == 1)[0]
            neg_indexes = np.where(label_y == 0)[0]
            pos_ll = test_ll_combined[pos_indexes, label_idx]
            neg_ll = test_ll_combined[neg_indexes, label_idx]
            assert np.size(pos_ll) + np.size(neg_ll) == n_samples
            curr_pos_ll_means.append(np.mean(pos_ll))
            curr_pos_ll_stds.append(np.std(pos_ll))
            curr_neg_ll_means.append(np.mean(neg_ll))
            curr_neg_ll_stds.append(np.std(neg_ll))

        for name in decision_makers:
            args.decision_maker = name
            modified_combination = list(combination)
            modified_combination[keys.index('decision')] = name
            if name == 'svm':
                args.decision_maker_C = 1e-2
                args.decision_maker_penalty = 'l1'
            elif name == 'log-regression':
                args.decision_maker_C = 1e-3
                args.decision_maker_penalty = 'l1'
            elif name == 'decision-tree':
                args.decision_maker_criterion = 'entropy'
                args.decision_maker_max_depth = 15
            elif name == 'random-forest':
                args.decision_maker_criterion = 'entropy'
                args.decision_maker_n_estimators = 40
                args.decision_maker_max_depth = 15

            curr_preds = []
            for curr_train_ll, curr_train_y, curr_test_ll in zip(
                    train_ll, train_y, test_ll):
                print('training decision maker')
                assert curr_train_ll.shape == curr_train_y.shape
                assert curr_train_ll.shape[0] > curr_test_ll.shape[0]
                decision_maker = decision.decision_maker_from_args(args)
                assert decision_maker is not None

                # Fit and predict using the decision maker
                if hasattr(decision_maker, 'fit') and callable(
                        decision_maker.fit):
                    decision_maker.fit(curr_train_ll, curr_train_y)
                curr_preds.append(decision_maker.predict(curr_test_ll))
            print('')
            test_pred_combined = np.vstack(curr_preds)
            assert test_y_combined.shape == test_pred_combined.shape

            # Track everything
            combinations.append(modified_combination)
            total_accuracies.append(
                sk_metrics.accuracy_score(test_y_combined, test_pred_combined))
            precision, recall, fscore, _ = sk_metrics.precision_recall_fscore_support(
                test_y_combined, test_pred_combined)

            precisions_mean.append(np.mean(precision))
            precisions_std.append(np.std(precision))
            precisions_min.append(np.min(precision))
            precisions_max.append(np.max(precision))

            recalls_mean.append(np.mean(recall))
            recalls_std.append(np.std(recall))
            recalls_min.append(np.min(recall))
            recalls_max.append(np.max(recall))

            fscores_mean.append(np.mean(fscore))
            fscores_std.append(np.std(fscore))
            fscores_min.append(np.min(fscore))
            fscores_max.append(np.max(fscore))

            pos_ll_means.append(np.array(np.median(curr_pos_ll_means)))
            pos_ll_stds.append(np.array(np.median(curr_pos_ll_stds)))
            neg_ll_means.append(np.array(np.median(curr_neg_ll_means)))
            neg_ll_stds.append(np.array(np.median(curr_neg_ll_stds)))

            curr_step += 1
        print('done, took %fs' % (timeit.default_timer() - start))
        print('')
    assert len(combinations) == len(fscores_mean)
    assert len(combinations) == len(fscores_std)
    assert len(combinations) == len(fscores_min)
    assert len(combinations) == len(fscores_max)

    assert len(combinations) == len(precisions_mean)
    assert len(combinations) == len(precisions_std)
    assert len(combinations) == len(precisions_min)
    assert len(combinations) == len(precisions_max)

    assert len(combinations) == len(recalls_mean)
    assert len(combinations) == len(recalls_std)
    assert len(combinations) == len(recalls_min)
    assert len(combinations) == len(recalls_max)

    assert len(combinations) == len(pos_ll_means)
    assert len(combinations) == len(pos_ll_stds)
    assert len(combinations) == len(neg_ll_means)
    assert len(combinations) == len(neg_ll_stds)

    assert len(combinations) == len(total_accuracies)

    # Save results
    if output_dir is not None:
        filename = 'results.csv'
        with open(os.path.join(output_dir, filename), 'wb') as f:
            writer = csv.writer(f, delimiter=';')
            writer.writerow([
                'idx', 'combination', 'f1-score-mean', 'f1-score-std',
                'f1-score-min', 'f1-score-max', 'precision-mean',
                'precision-std', 'precision-min', 'precision-max',
                'recall-mean', 'recall-std', 'recall-min', 'recall-max',
                'total-accuracy', 'pos-ll-mean', 'pos-ll-std', 'neg-ll-mean',
                'neg-ll-std'
            ])
            for idx, d in enumerate(
                    zip(combinations, fscores_mean, fscores_std, fscores_min,
                        fscores_max, precisions_mean, precisions_std,
                        precisions_min, precisions_max, recalls_mean,
                        recalls_std, recalls_min, recalls_max,
                        total_accuracies, pos_ll_means, pos_ll_stds,
                        neg_ll_means, neg_ll_stds)):
                combination = ', '.join([str(x) for x in d[0]])
                new_data = ['%d' % idx] + list((combination, ) + d[1:])
                writer.writerow(new_data)
    print(len(combinations))