from collections.abc import Iterable

import numpy as np
import sklearn.metrics as skm
from sklearn.datasets import make_classification
from xgboost import XGBClassifier

# WindowSplit, SingleSplit, ClassifierCV, CalibratedClassifierCV (the
# project's own variant: it takes a CV splitter and supports 'rocch' and
# 'beta' in addition to 'sigmoid'/'isotonic'), ThresholdClassifierCV,
# CutoffClassifierCV, and the get_transforms / get_sample_len /
# get_split_sizes / get_verify_n / doing_verify helpers are assumed to be
# defined elsewhere in this project.


def test_calibration(X, y, cal_method='sigmoid'):
    # CalibratedClassifierCV
    base_clf = XGBClassifier(objective='binary:logitraw')
    base_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480,
                          initial_test_index=-480)
    cal_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000,
                         initial_test_index=-480)
    cal_clf = CalibratedClassifierCV(base_clf, method=cal_method, cv=cal_cv)
    cv_clf = ClassifierCV(cal_clf, cv=base_cv)
    cv_clf.fit(X, y)
    print(cal_method + ' Calibrated accuracy: %s' % cv_clf.score_cv())
    print(cal_method + ' Calibrated logloss: %s' % cv_clf.score_cv(skm.log_loss))
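
# Note: the calibration tests pass the calibrator an XGBClassifier trained
# with objective='binary:logitraw' (raw margin scores), presumably so the
# calibration step itself maps scores to probabilities, while the plain
# threshold/cutoff tests below operate on 'binary:logistic' probabilities
# directly.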
def test_threshold(X, y, thr_method='youden'):
    # ThresholdClassifierCV
    base_clf = XGBClassifier(objective='binary:logistic')
    base_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480,
                          initial_test_index=-480)
    thr_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000,
                         initial_test_index=-480)
    thr_clf = ThresholdClassifierCV(base_clf, method=thr_method, cv=thr_cv)
    cv_clf = ClassifierCV(thr_clf, cv=base_cv)
    cv_clf.fit(X, y)
    # Average the per-fold thresholds
    mean_threshold = np.mean([unit.threshold for unit in cv_clf.classifiers_])
    print(thr_method + ' Threshold: %s' % mean_threshold)
    print(thr_method + ' Threshold accuracy: %s' % cv_clf.score_cv())
    print(thr_method + ' Threshold logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_calibration_threshold(X, y, cal_method='sigmoid', thr_method='youden'):
    # CalibratedClassifierCV chained into ThresholdClassifierCV
    cal_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000,
                         initial_test_index=-480)
    thr_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480,
                         initial_test_index=-480)
    cv_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5960,
                        initial_test_index=-480)
    base_clf = XGBClassifier(objective='binary:logitraw')
    cal_clf = CalibratedClassifierCV(base_clf, method=cal_method, cv=cal_cv)
    thr_clf = ThresholdClassifierCV(cal_clf, method=thr_method, cv=thr_cv)
    cv_clf = ClassifierCV(thr_clf, cv=cv_cv)
    cv_clf.fit(X, y)
    # Average the per-fold thresholds
    mean_threshold = np.mean([unit.threshold for unit in cv_clf.classifiers_])
    label = cal_method + ' ' + thr_method
    print(label + ' Threshold: %s' % mean_threshold)
    print(label + ' Threshold accuracy: %s' % cv_clf.score_cv())
    print(label + ' Threshold logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_cutoff(X, y):
    # CutoffClassifierCV
    base_clf = XGBClassifier(objective='binary:logistic')
    base_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480,
                          initial_test_index=-480)
    cut_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000,
                         initial_test_index=-480)
    cut_clf = CutoffClassifierCV(base_clf, cv=cut_cv)
    cv_clf = ClassifierCV(cut_clf, cv=base_cv)
    cv_clf.fit(X, y)
    # Average the per-fold (upper, lower) cutoffs
    upper_cutoffs = [unit.cutoff[0] for unit in cv_clf.classifiers_]
    lower_cutoffs = [unit.cutoff[1] for unit in cv_clf.classifiers_]
    print('Cutoff: ' + str((np.mean(upper_cutoffs), np.mean(lower_cutoffs))))
    print('Cutoff accuracy: %s' % cv_clf.score_cv())
    print('Cutoff logloss: %s' % cv_clf.score_cv(skm.log_loss))
def test_calibration_cutoff(X, y, cal_method='sigmoid'):
    # CalibratedClassifierCV chained into CutoffClassifierCV
    cal_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5000,
                         initial_test_index=-480)
    cut_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5480,
                         initial_test_index=-480)
    cv_cv = WindowSplit(test_size=120, step_size=120, sliding_size=5960,
                        initial_test_index=-480)
    base_clf = XGBClassifier(objective='binary:logitraw')
    cal_clf = CalibratedClassifierCV(base_clf, method=cal_method, cv=cal_cv)
    cut_clf = CutoffClassifierCV(cal_clf, cv=cut_cv)
    cv_clf = ClassifierCV(cut_clf, cv=cv_cv)
    cv_clf.fit(X, y)
    # Average the per-fold (upper, lower) cutoffs
    upper_cutoffs = [unit.cutoff[0] for unit in cv_clf.classifiers_]
    lower_cutoffs = [unit.cutoff[1] for unit in cv_clf.classifiers_]
    print(cal_method + ' Cutoff: ' + str((np.mean(upper_cutoffs), np.mean(lower_cutoffs))))
    print(cal_method + ' Cutoff accuracy: %s' % cv_clf.score_cv())
    print(cal_method + ' Cutoff logloss: %s' % cv_clf.score_cv(skm.log_loss))
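
# Note the window arithmetic in the chained tests above (an observation about
# these particular configs, not something the classes enforce): each wrapper's
# sliding_size grows by 480 = 4 folds * 120 test rows over the estimator it
# wraps (5000 -> 5480 -> 5960), so every inner layer's holdout folds fit
# inside the next outer layer's training window.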
def init():
    X, y = make_classification(7000)
    values, counts = np.unique(y, return_counts=True)
    print('Truth freqs: %s' % dict(zip(values.tolist(), counts.tolist())))

    # Base CV: bare XGBClassifier, no calibration/threshold/cutoff wrappers
    cv = WindowSplit(test_size=120, step_size=120, sliding_size=1000,
                     initial_test_index=-480)
    base_clf = XGBClassifier()
    cv_clf = ClassifierCV(base_clf, cv=cv)
    cv_clf.fit(X, y)
    print('Base accuracy: %s' % cv_clf.score_cv())
    print('Base logloss: %s' % cv_clf.score_cv(skm.log_loss))

    cal_methods = ('sigmoid', 'isotonic', 'rocch', 'beta')
    thr_methods = ('youden', 'roc')

    for cal_method in cal_methods:
        test_calibration(X, y, cal_method=cal_method)
    for thr_method in thr_methods:
        test_threshold(X, y, thr_method=thr_method)
    test_cutoff(X, y)
    for thr_method in thr_methods:
        for cal_method in cal_methods:
            test_calibration_threshold(X, y, cal_method=cal_method,
                                       thr_method=thr_method)
    for cal_method in cal_methods:
        test_calibration_cutoff(X, y, cal_method=cal_method)

    # Drop into the debugger to inspect the fitted classifiers interactively
    import pdb
    pdb.set_trace()
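
# Entry-point guard (an addition, assuming this module also doubles as a
# runnable script): without it, importing the module would trigger the full
# test sweep above.
if __name__ == '__main__':
    init()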
def get_cv(prices, data_params, cv_params, base_only=False, do_verify=False):
    # Early returns: an explicit 'cv' factory, or a single holdout split
    if 'cv' in cv_params:
        return cv_params['cv'](**cv_params['params'])
    elif 'single_split' in cv_params:
        return SingleSplit(test_size=cv_params['single_split'])

    # Else, construct chained WindowSplit CV; base params go last
    transforms = get_transforms(cv_params)
    master_transform = transforms[-1]  # should have 'master' in it
    sample_len = get_sample_len(data_params, cv_params)
    target_gap = cv_params.get('target_gap', False)

    verify_cv = []
    if not doing_verify(cv_params):
        verify_factors = [1]
    elif isinstance(cv_params['verify_factor'], Iterable):
        verify_factors = cv_params['verify_factor']
    elif cv_params['verify_factor'] is not None:
        verify_factors = [cv_params['verify_factor']]
    else:
        verify_factors = [1]

    total_test_size, total_verify_size, total_post_size = get_split_sizes(
        transforms, verify_factors=verify_factors)

    # Assume that data bounds accurately encompass
    # verify_n*test_size + test_n*test_size
    data_post_end = len(prices) - data_params['end_buffer']  # exclusive post end
    data_verify_end = data_post_end - total_post_size + 1  # exclusive verify end, inclusive post start
    data_test_end = data_verify_end - total_verify_size  # exclusive test end, inclusive verify start
    data_train_end = data_test_end - total_test_size  # exclusive train end, inclusive test start
    data_post_start = data_verify_end
    if target_gap:
        data_verify_end -= sample_len['target']
        data_test_end -= sample_len['target']
        data_train_end -= sample_len['target']
    # This is different from the verify master split, as pre-transforms must
    # run before the master split, unless separate_verify is true (todo)
    data_verify_start = data_test_end - sum(
        transform['test_size'] * transform['test_n']
        for transform in transforms if 'master' not in transform)

    post_able = do_verify and len(prices) >= (
        sum(sample_len.values()) + data_params['start_buffer']
        + data_params['end_buffer'] - sample_len['target']
        + (sample_len['target'] if target_gap else 0))

    ### todo: do per factor, not just all of them
    for verify_factor in verify_factors:
        # verify factor for verification split, verify subfactor for post
        # split (if available)
        if post_able:
            ### todo: do per factor, not just all of them
            verify_subfactors = [
                {'post': False, 'factor': verify_factor},
                {'post': True, 'factor': 1 - verify_factor},
            ]
        else:
            verify_subfactors = [{'post': False, 'factor': verify_factor}]

        verify_subcv = []
        for verify_subfactor_unit in verify_subfactors:
            factor_is_post = verify_subfactor_unit['post']
            verify_subfactor = verify_subfactor_unit['factor']
            transform_cv = []
            prior_train_size = cv_params['train_size']

            if not do_verify:
                test_start = data_train_end
            elif factor_is_post and target_gap:
                test_start = data_verify_start + sample_len['target']
            else:
                test_start = data_verify_start

            if factor_is_post:
                prior_test_size = master_transform['test_size'] * get_verify_n(
                    master_transform['test_n'], max(verify_factors))
            else:
                prior_test_size = master_transform['test_size'] * (
                    get_verify_n(master_transform['test_n'], max(verify_factors))
                    - get_verify_n(master_transform['test_n'], verify_subfactor))

            prior_data_size = cv_params['train_size']
            for transform in transforms:
                # Window size calculation:
                # [train = (sum(test len) + train len)] + sum(test len)
                if not do_verify:
                    current_test_size = transform['test_size'] * transform['test_n']
                    initial_test_index = test_start + prior_test_size
                    final_index = initial_test_index + current_test_size
                    prices_size = len(prices)
                else:
                    if 'master' in transform:
                        current_test_size = transform['test_size'] * get_verify_n(
                            transform['test_n'], verify_subfactor)
                    else:
                        current_test_size = transform['test_size'] * transform['test_n']
                    initial_test_index = test_start + prior_test_size  # inclusive start of verify split
                    final_index = initial_test_index + current_test_size
                    if True or 'master' in transform:  # 'True or' disables the else branch below
                        prices_size = len(prices)
                    else:
                        # hack: CV needs to be relative to data size, which for
                        # transforms is truncated in CalibratedCV; so if the
                        # transform is 'master', pass the original prices size,
                        # else pass the sum of train size, all prior test
                        # sizes, and the current test size
                        prior_data_size += current_test_size
                        prices_size = prior_data_size

                prices_size_diff = len(prices) - prices_size  # currently unused
                train_size = prior_train_size

                if 'master' in transform:
                    args = {
                        'test_size': abs(transform['test_size']),
                        'step_size': abs(transform['test_size']),
                        'initial_test_index': initial_test_index - prices_size - 1,
                        'final_index': final_index - prices_size,
                    }
                    if 0 <= final_index - prices_size <= 1:
                        # hack: this should only happen if we're at the last
                        # data row (e.g., factor_is_post); clear final_index so
                        # we don't erroneously clip it severely
                        args['final_index'] = None
                    if cv_params['train_sliding']:
                        args['initial_train_index'] = 0
                        if base_only:
                            # HACK: change train length to base; all else is correct
                            args['sliding_size'] = cv_params['train_size']
                        else:
                            args['sliding_size'] = train_size
                    else:
                        if base_only:
                            # HACK: change train length to base; all else is correct
                            args['initial_train_index'] = min(
                                0, args['initial_test_index'] - cv_params['train_size'])
                        else:
                            args['initial_train_index'] = min(
                                0, args['initial_test_index'] - train_size)
                        args['sliding_size'] = None
                else:
                    # hack: transform CVs are relative to the required size of
                    # the transform, because in classifyCV the data length
                    # passed is exactly what is needed for non-master transforms
                    args = {
                        'test_size': abs(transform['test_size']),
                        'step_size': abs(transform['test_size']),
                        'initial_test_index': -current_test_size,
                        'final_index': None,
                    }
                    if cv_params['train_sliding']:
                        args['initial_train_index'] = 0
                        args['sliding_size'] = train_size
                    else:
                        args['initial_train_index'] = -current_test_size - train_size
                        args['sliding_size'] = None

                transform_cv.append(WindowSplit(**args))
                prior_train_size += current_test_size
                prior_test_size += current_test_size

            verify_subcv.append(transform_cv)

        if not do_verify:
            verify_cv.append(verify_subcv[-1])
        else:
            verify_cv.append(verify_subcv)

    if not do_verify and len(verify_cv) > 0:
        if base_only:
            return [verify_cv[0][-1]]
        return verify_cv[0]
    if base_only:
        return [[unit[-1]] for unit in verify_cv]
    return verify_cv
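
# A minimal usage sketch for get_cv's two early-return paths (all values here
# are hypothetical; data_params is unused on these paths but shown for shape).
# The full chained-WindowSplit path additionally needs the transform config
# consumed by get_transforms/get_sample_len and is not exercised here.
def _demo_get_cv_early_returns(prices):
    # An explicit 'cv' entry constructs that splitter with the given params
    window_cv = get_cv(prices, data_params={}, cv_params={
        'cv': WindowSplit,
        'params': dict(test_size=120, step_size=120,
                       sliding_size=5000, initial_test_index=-480),
    })
    # A 'single_split' entry yields a one-shot holdout of the last 480 rows
    single_cv = get_cv(prices, data_params={}, cv_params={'single_split': 480})
    return window_cv, single_cv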