def test_calibration(X, y, cal_method='sigmoid'):
    """Fit a probability-calibrated XGB classifier under walk-forward CV and
    print its cross-validated accuracy and log-loss for *cal_method*."""
    # Inner split drives the calibration wrapper; outer split drives evaluation.
    outer_split = WindowSplit(test_size=120, step_size=120, sliding_size=5480, initial_test_index=-480)
    inner_split = WindowSplit(test_size=120, step_size=120, sliding_size=5000, initial_test_index=-480)
    raw_model = XGBClassifier(objective='binary:logitraw')
    calibrated = CalibratedClassifierCV(raw_model, method=cal_method, cv=inner_split)
    cv_clf = ClassifierCV(calibrated, cv=outer_split)
    cv_clf.fit(X, y)

    print(cal_method + ' Calibrated accuracy: %s' % cv_clf.score_cv())
    print(cal_method + ' Calibrated logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_threshold(X, y, thr_method='youden'):
    """Fit a decision-threshold-tuned XGB classifier under walk-forward CV and
    print the mean tuned threshold plus CV accuracy and log-loss.

    X/y are the feature matrix and labels; thr_method selects the
    threshold-search strategy passed to ThresholdClassifierCV.
    """
    # ThresholdClassifierCV tunes the cutoff on its own inner split;
    # ClassifierCV evaluates the whole pipeline on the outer split.
    base_clf = XGBClassifier(objective='binary:logistic')
    base_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480, initial_test_index=-480)
    thr_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000, initial_test_index=-480)
    thr_clf = ThresholdClassifierCV(base_clf, method=thr_method, cv=thr_cv)
    cv_clf = ClassifierCV(thr_clf, cv=base_cv)
    cv_clf.fit(X, y)

    # Fix: the original rebuilt the same list comprehension twice (once for
    # sum, once for len); collect the per-fold thresholds once.
    thresholds = [unit.threshold for unit in cv_clf.classifiers_]
    print(thr_method + ' Threshold: %s' % (sum(thresholds) / len(thresholds)))
    print(thr_method + ' Threshold accuracy: %s' % cv_clf.score_cv())
    print(thr_method + ' Threshold logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_calibration_threshold(X, y, cal_method='sigmoid', thr_method='youden'):
    """Chain CalibratedClassifierCV -> ThresholdClassifierCV -> ClassifierCV
    and print the mean tuned threshold plus CV accuracy and log-loss.

    cal_method is forwarded to the calibration wrapper, thr_method to the
    threshold tuner; each stage gets its own walk-forward split.
    """
    cal_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000, initial_test_index=-480)
    thr_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480, initial_test_index=-480)
    cv_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5960, initial_test_index=-480)
    base_clf = XGBClassifier(objective='binary:logitraw')
    cal_clf = CalibratedClassifierCV(base_clf, method=cal_method, cv=cal_cv)
    thr_clf = ThresholdClassifierCV(cal_clf, method=thr_method, cv=thr_cv)
    cv_clf = ClassifierCV(thr_clf, cv=cv_cv)
    cv_clf.fit(X, y)

    # Fix: the original rebuilt the same list comprehension twice (once for
    # sum, once for len); collect the per-fold thresholds once.
    thresholds = [unit.threshold for unit in cv_clf.classifiers_]
    print(cal_method + ' ' + thr_method + ' Threshold: %s' % (sum(thresholds) / len(thresholds)))
    print(cal_method + ' ' + thr_method + ' Threshold accuracy: %s' % cv_clf.score_cv())
    print(cal_method + ' ' + thr_method + ' Threshold logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_cutoff(X, y):
    """Fit a cutoff-tuned XGB classifier under walk-forward CV and print the
    average (upper, lower) cutoff pair plus CV accuracy and log-loss."""
    model = XGBClassifier(objective='binary:logistic')
    eval_split = WindowSplit(test_size=120, step_size=120, sliding_size=5480, initial_test_index=-480)
    tune_split = WindowSplit(test_size=120, step_size=120, sliding_size=5000, initial_test_index=-480)
    tuned = CutoffClassifierCV(model, cv=tune_split)
    cv_clf = ClassifierCV(tuned, cv=eval_split)
    cv_clf.fit(X, y)

    # Average the per-fold cutoff pairs element-wise.
    uppers = [unit.cutoff[0] for unit in cv_clf.classifiers_]
    lowers = [unit.cutoff[1] for unit in cv_clf.classifiers_]
    mean_cutoffs = (sum(uppers) / len(uppers), sum(lowers) / len(lowers))
    print('Cutoff: ' + str(mean_cutoffs))
    print('Cutoff accuracy: %s' % cv_clf.score_cv())
    print('Cutoff logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_calibration_cutoff(X, y, cal_method='sigmoid'):
    """Chain CalibratedClassifierCV -> CutoffClassifierCV -> ClassifierCV and
    print the average (upper, lower) cutoff pair plus CV accuracy and log-loss."""
    # One walk-forward split per stage of the chain.
    calibration_split = WindowSplit(test_size=120, step_size=120, sliding_size=5000, initial_test_index=-480)
    cutoff_split = WindowSplit(test_size=120, step_size=120, sliding_size=5480, initial_test_index=-480)
    eval_split = WindowSplit(test_size=120, step_size=120, sliding_size=5960, initial_test_index=-480)
    raw_model = XGBClassifier(objective='binary:logitraw')
    calibrated = CalibratedClassifierCV(raw_model, method=cal_method, cv=calibration_split)
    tuned = CutoffClassifierCV(calibrated, cv=cutoff_split)
    cv_clf = ClassifierCV(tuned, cv=eval_split)
    cv_clf.fit(X, y)

    # Average the per-fold cutoff pairs element-wise.
    uppers = [unit.cutoff[0] for unit in cv_clf.classifiers_]
    lowers = [unit.cutoff[1] for unit in cv_clf.classifiers_]
    mean_cutoffs = (sum(uppers) / len(uppers), sum(lowers) / len(lowers))
    print(cal_method + ' Cutoff: ' + str(mean_cutoffs))
    print(cal_method + ' Cutoff accuracy: %s' % cv_clf.score_cv())
    print(cal_method + ' Cutoff logloss: %s' % cv_clf.score_cv(skm.log_loss))
def init():
    """Generate a synthetic binary-classification dataset, report the baseline
    walk-forward CV scores, then run every calibration / threshold / cutoff
    experiment combination defined above.

    Fixes: removed the leftover debugger breakpoint
    (``import pdb; pdb.set_trace()``) and the dead trailing ``pass``; the
    base ClassifierCV now passes ``cv`` by keyword, consistent with every
    other helper in this file.
    """
    X, y = make_classification(7000)

    # Report class frequencies of the synthetic labels, e.g. {0: ..., 1: ...}.
    print('Truth freqs: %s' % str({k: v for k, v in zip(*[x.tolist() for x in np.unique(y, return_counts=True)])}))

    # Base CV: plain, uncalibrated, untuned classifier as the reference score.
    cv = WindowSplit(test_size=120, step_size=120, sliding_size=1000, initial_test_index=-480)
    base_clf = XGBClassifier()
    cv_clf = ClassifierCV(base_clf, cv=cv)
    cv_clf.fit(X, y)

    print('Base accuracy: %s' % cv_clf.score_cv())
    print('Base logloss: %s' % cv_clf.score_cv(skm.log_loss))

    test_calibration(X, y, cal_method='sigmoid')
    test_calibration(X, y, cal_method='isotonic')
    test_calibration(X, y, cal_method='rocch')
    test_calibration(X, y, cal_method='beta')
    test_threshold(X, y, thr_method='youden')
    test_threshold(X, y, thr_method='roc')
    test_cutoff(X, y)
    test_calibration_threshold(X, y, cal_method='sigmoid', thr_method='youden')
    test_calibration_threshold(X, y, cal_method='isotonic', thr_method='youden')
    test_calibration_threshold(X, y, cal_method='rocch', thr_method='youden')
    test_calibration_threshold(X, y, cal_method='beta', thr_method='youden')
    test_calibration_threshold(X, y, cal_method='sigmoid', thr_method='roc')
    test_calibration_threshold(X, y, cal_method='isotonic', thr_method='roc')
    test_calibration_threshold(X, y, cal_method='rocch', thr_method='roc')
    test_calibration_threshold(X, y, cal_method='beta', thr_method='roc')
    test_calibration_cutoff(X, y, cal_method='sigmoid')
    test_calibration_cutoff(X, y, cal_method='isotonic')
    test_calibration_cutoff(X, y, cal_method='rocch')
    test_calibration_cutoff(X, y, cal_method='beta')
# Beispiel #7
# 0
# NOTE(review): the two lines above are artifacts from the code-listing site
# this snippet was extracted from (example separator + vote count); commented
# out so the module parses.
def get_cv(prices, data_params, cv_params, base_only=False, do_verify=False):
    """Build the cross-validation splitter(s) described by ``cv_params``.

    Resolution order:
      1. ``cv_params['cv']``           -> instantiate that CV class directly.
      2. ``cv_params['single_split']`` -> a ``SingleSplit`` of that test size.
      3. Otherwise construct chained ``WindowSplit`` CVs, one per transform,
         repeated once per verification factor when *do_verify* is set.

    Parameters
    ----------
    prices : sized sequence — only ``len(prices)`` is used here, to anchor
        the window indices at the end of the data.
    data_params : dict; reads ``end_buffer`` (and ``start_buffer`` on the
        verify path).
    cv_params : dict driving the split layout — ``train_size``,
        ``train_sliding``, optional ``target_gap`` and ``verify_factor``,
        plus whatever ``get_transforms``/``get_sample_len`` consume.
    base_only : when True, return only the master (last) split of each chain.
    do_verify : when True, also build verification (and, if the data is long
        enough, post-verification) split chains.

    Returns a single CV object (cases 1/2), a flat list of WindowSplits (one
    chain), or a nested list per verify factor — see the tail of the function.
    """
    if 'cv' in cv_params:
        return cv_params['cv'](**cv_params['params'])
    elif 'single_split' in cv_params:
        return SingleSplit(test_size=cv_params['single_split'])

    # else, construct chained WindowSplit CV
    # base params go last
    transforms = get_transforms(cv_params)
    master_transform = transforms[-1]  # should have master in it
    sample_len = get_sample_len(data_params, cv_params)
    target_gap = cv_params['target_gap'] if 'target_gap' in cv_params else False

    verify_cv = []
    # Normalize cv_params['verify_factor'] to a list: [1] when not verifying
    # or when the factor is None; the value as-is when it is already iterable;
    # otherwise a single-element list wrapping the scalar factor.
    verify_factors = [1] if not doing_verify(
        cv_params) else cv_params['verify_factor'] if isinstance(
            cv_params['verify_factor'], Iterable) else [
                cv_params['verify_factor']
            ] if cv_params['verify_factor'] is not None else [1]

    total_test_size, total_verify_size, total_post_size = get_split_sizes(
        transforms, verify_factors=verify_factors)

    # assume that data bounds accurately encompass verify_n*test_size + test_n*test_size
    # Region boundaries are computed backwards from the end of the data:
    # [... train | test | verify | post | end_buffer]
    data_post_end = len(prices) - data_params[
        'end_buffer']  # exclusive post end
    data_verify_end = data_post_end - total_post_size + 1  # exclusive verify end, inclusive post start
    data_test_end = data_verify_end - total_verify_size  # exclusive test end, inclusive verify start
    data_train_end = data_test_end - total_test_size  # exclusive train end, inclusive test start

    data_post_start = data_verify_end

    # With a target gap, shift every non-post boundary back by one target
    # sample length so targets never leak across region edges.
    if target_gap:
        data_verify_end -= sample_len['target']
        data_test_end -= sample_len['target']
        data_train_end -= sample_len['target']

    data_verify_start = data_test_end - sum([
        transform['test_size'] * transform['test_n']
        for transform in transforms if 'master' not in transform
    ])
    # this is different from verify master split, as pre-transforms must run before master split, unless separate_verify is true (todo)

    # Post-verification splits are only possible when there is enough data to
    # cover all sample lengths plus both buffers (target counted once, twice
    # with a gap).
    post_able = do_verify and len(prices) >= sum(sample_len.values(
    )) + data_params['start_buffer'] + data_params['end_buffer'] - sample_len[
        'target'] + (sample_len['target'] if target_gap else 0)
    ### todo: do per factor, not just all of them

    for verify_factor in verify_factors:
        # verify factor for verification split, verify subfactor for post split (if available)
        if post_able:  ### todo: do per factor, not just all of them
            verify_subfactors = [{
                'post': False,
                'factor': verify_factor
            }, {
                'post': True,
                'factor': 1 - verify_factor
            }]
        else:
            verify_subfactors = [{'post': False, 'factor': verify_factor}]

        verify_subcv = []
        for verify_subfactor_unit in verify_subfactors:
            factor_is_post, verify_subfactor = verify_subfactor_unit[
                'post'], verify_subfactor_unit['factor']
            transform_cv = []
            prior_train_size = cv_params['train_size']

            # Pick where the first test window of this chain starts.
            if not do_verify:
                test_start = data_train_end
            else:
                if factor_is_post and target_gap:
                    test_start = data_verify_start + sample_len['target']
                else:
                    test_start = data_verify_start

            # Offset past the windows already consumed by the verification
            # split(s) that precede this sub-chain.
            if factor_is_post:
                prior_test_size = master_transform['test_size'] * get_verify_n(
                    master_transform['test_n'], max(verify_factors))
            else:
                prior_test_size = master_transform['test_size'] * (
                    get_verify_n(master_transform['test_n'],
                                 max(verify_factors)) -
                    get_verify_n(master_transform['test_n'], verify_subfactor))

            prior_data_size = cv_params['train_size']

            for transform in transforms:
                # Window size calculation: [train = (sum(test len) + train len)] + sum(test len)
                if not do_verify:
                    current_test_size = transform['test_size'] * transform[
                        'test_n']
                    initial_test_index = test_start + prior_test_size
                    final_index = initial_test_index + current_test_size
                    prices_size = len(prices)
                else:
                    if 'master' in transform:
                        current_test_size = transform[
                            'test_size'] * get_verify_n(
                                transform['test_n'], verify_subfactor)
                    else:
                        current_test_size = transform['test_size'] * transform[
                            'test_n']
                    initial_test_index = test_start + prior_test_size  # inclusive start of verify split
                    final_index = initial_test_index + current_test_size

                # NOTE(review): 'True or' makes this condition always true, so
                # the else branch below is dead code — presumably disabled on
                # purpose (see its hack comment); confirm before re-enabling.
                if True or 'master' in transform:
                    prices_size = len(prices)
                else:
                    # hack: CV needs to be relative to data size, which for transforms is
                    # truncated in CalibratedCV
                    # so if transform is 'master', pass the original prices size
                    # else, pass the sum of train size; all prior test sizes; and current test size
                    prior_data_size += current_test_size
                    prices_size = prior_data_size
                    prices_size_diff = len(prices) - prices_size

                train_size = prior_train_size

                if 'master' in transform:
                    # Master split: indices are expressed relative to the end
                    # of the data (negative offsets from prices_size).
                    args = {
                        'test_size': abs(transform['test_size']),
                        'step_size': abs(transform['test_size']),
                        'initial_test_index':
                        initial_test_index - prices_size - 1,
                        'final_index': final_index - prices_size
                    }

                    if final_index - prices_size >= 0 and final_index - prices_size <= 1:
                        # hack: this should only happen if we're at the last data row (e.g., factor_is_post)
                        # clear final_index so we don't erroneously clip it severely
                        args['final_index'] = None

                    if cv_params['train_sliding']:
                        args['initial_train_index'] = 0
                        if base_only and 'master' in transform:  # HACK: change train length to base; all else is correct
                            args['sliding_size'] = cv_params['train_size']
                        else:
                            args['sliding_size'] = train_size
                    else:
                        if base_only and 'master' in transform:  # HACK: change train length to base; all else is correct
                            args['initial_train_index'] = min(
                                0, args['initial_test_index'] -
                                cv_params['train_size'])
                        else:
                            args['initial_train_index'] = min(
                                0, args['initial_test_index'] - train_size)
                        args['sliding_size'] = None
                else:
                    # hack: transform CVs are relative to the required size of transform
                    # because in classifyCV, data length passed is exactly what is needed for
                    # non-master transforms
                    args = {
                        'test_size': abs(transform['test_size']),
                        'step_size': abs(transform['test_size']),
                        'initial_test_index': -current_test_size,
                        'final_index': None
                    }
                    if cv_params['train_sliding']:
                        args['initial_train_index'] = 0
                        args['sliding_size'] = train_size
                    else:
                        args[
                            'initial_train_index'] = -current_test_size - train_size
                        args['sliding_size'] = None

                transform_cv.append(WindowSplit(**args))
                # Each transform's test windows become training data for the
                # next transform in the chain.
                prior_train_size += current_test_size
                prior_test_size += current_test_size
            verify_subcv.append(transform_cv)

        if not do_verify:
            verify_cv.append(verify_subcv[-1])
        else:
            verify_cv.append(verify_subcv)

    # Shape of the return value depends on the mode:
    #   not do_verify: one chain -> flat list (or [master] when base_only)
    #   do_verify:     nested list per verify factor (masters only if base_only)
    if not do_verify and len(verify_cv) > 0:
        if base_only:
            return [verify_cv[0][-1]]
        else:
            return verify_cv[0]
    else:
        if base_only:
            return [[unit[-1]] for unit in verify_cv]
        else:
            return verify_cv