def test_value_error_raise(self):
    """
    Check that get_ind_matrix raises ValueError for the raw meta-labeled events.

    The unfiltered t1 column of self.meta_labeled_events is passed together with
    self.data (presumably t1 still contains NaN end times at this point --
    confirm against the fixture that builds meta_labeled_events).
    """
    with self.assertRaises(ValueError):
        label_end_times = self.meta_labeled_events.t1
        get_ind_matrix(label_end_times, self.data)
def test_value_error_raise(self):
    """
    Check that get_ind_matrix raises ValueError when a label end time is missing.

    A copy of self.samples_info_sets gets its 't1' entry at 2019-01-01 set to
    None; the indicator matrix is then requested for that modified copy, so the
    shared fixture itself is left untouched.
    """
    corrupted_info_sets = self.samples_info_sets.copy()
    missing_label_time = pd.Timestamp(2019, 1, 1)
    corrupted_info_sets.loc[missing_label_time, 't1'] = None

    with self.assertRaises(ValueError):
        corrupted_end_times = corrupted_info_sets.t1
        get_ind_matrix(corrupted_end_times, self.price_bars)
def __init__(self, samples_info_sets, price_bars, base_estimator=None, n_estimators=10,
             max_samples=1.0, max_features=1.0, bootstrap_features=False, oob_score=False,
             warm_start=False, n_jobs=None, random_state=None, verbose=0):
    """
    Initialise the parent ensemble and precompute the indicator matrix.

    All estimator options are forwarded unchanged to the parent constructor,
    except that ``bootstrap`` is always passed as True.

    :param samples_info_sets: object with an ``.index``; handed to get_ind_matrix
        together with ``price_bars`` to build the indicator matrix.
    :param price_bars: second argument handed to get_ind_matrix.
    :param base_estimator: forwarded to the parent constructor.
    :param n_estimators: forwarded to the parent constructor.
    :param max_samples: forwarded to the parent constructor.
    :param max_features: forwarded to the parent constructor.
    :param bootstrap_features: forwarded to the parent constructor.
    :param oob_score: forwarded to the parent constructor.
    :param warm_start: forwarded to the parent constructor.
    :param n_jobs: forwarded to the parent constructor.
    :param random_state: forwarded to the parent constructor.
    :param verbose: forwarded to the parent constructor.
    """
    super().__init__(
        base_estimator=base_estimator,
        n_estimators=n_estimators,
        bootstrap=True,
        max_samples=max_samples,
        max_features=max_features,
        bootstrap_features=bootstrap_features,
        oob_score=oob_score,
        warm_start=warm_start,
        n_jobs=n_jobs,
        random_state=random_state,
        verbose=verbose,
    )

    # pylint: disable=invalid-name
    self.samples_info_sets = samples_info_sets
    self.price_bars = price_bars
    self.ind_mat = get_ind_matrix(samples_info_sets, price_bars)

    # Maps every entry of samples_info_sets.index to an integer position
    # 0..ind_mat.shape[1]-1; per the original author's note this is used to
    # take ind_mat sub-samples during cross-validation.
    n_indicator_columns = self.ind_mat.shape[1]
    self.timestamp_int_index_mapping = pd.Series(range(n_indicator_columns),
                                                 index=samples_info_sets.index)

    self.X_time_index = None  # Timestamp index of X_train
def test_seq_bootstrap(self):
    """
    Test the indicator matrix against a reference implementation and spot values,
    the length of sequentially bootstrapped samples, and (Monte-Carlo) that
    sequential bootstrapping yields at least the average uniqueness of plain
    random sampling on the small worked example.
    """
    events = self.meta_labeled_events.dropna()
    ind_mat = get_ind_matrix(events, self.data)

    # Rebuild the expected row index of the indicator matrix: every label start,
    # every label end (t1) and every price bar lying between the first label
    # start and the last label end, de-duplicated and sorted.
    label_endtime = events.t1
    within_label_span = (self.data.index >= events.index.min()) & (self.data.index <= events.t1.max())
    trimmed_price_bars_index = self.data[within_label_span].index
    bar_index = sorted(set(events.index) | set(events.t1) | set(trimmed_price_bars_index))

    reference_ind_mat = book_ind_mat_implementation(bar_index, label_endtime)
    self.assertTrue((reference_ind_mat.values == ind_mat).all())

    # Indicator matrix shape should be (unique(meta_label_index+t1+price_bars_index), t1)
    self.assertTrue(ind_mat.shape == (782, 7))

    # Check indicator matrix values for specific labels: (row slice, label column, expected values)
    expected_blocks = (
        (slice(None, 100), 0, np.ones(100)),
        (slice(191, 340), 2, np.ones(149)),
        (slice(341, 420), 2, np.zeros(79)),
        (slice(406, 412), 4, np.ones(6)),
        (slice(662, None), 6, np.ones(120)),
    )
    for rows, label, expected in expected_blocks:
        self.assertTrue((ind_mat[rows, label] == expected).all())

    default_length_draw = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
    fixed_length_draw = seq_bootstrap(ind_mat, compare=True, sample_length=100)
    self.assertTrue(len(default_length_draw) == events.shape[0])
    self.assertTrue(len(fixed_length_draw) == 100)

    # Test sequential bootstrapping on example from a book
    ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
    book_label_columns = (
        [1, 1, 1, 0, 0, 0],
        [0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1],
    )
    for label, lifespan in enumerate(book_label_columns):
        ind_mat.loc[:, label] = lifespan
    ind_mat = ind_mat.values

    # Show printed probabilities
    seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])

    # Perform Monte-Carlo test
    n_trials = 1000
    standard_unq_array = np.full(n_trials, np.nan)
    seq_unq_array = np.full(n_trials, np.nan)
    for trial in range(n_trials):
        # Keep this call order: both draws may consume the global random state.
        sequential_draw = seq_bootstrap(ind_mat, sample_length=3)
        random_draw = np.random.choice(ind_mat.shape[1], size=3)
        standard_unq_array[trial] = get_ind_mat_average_uniqueness(ind_mat[:, random_draw])
        seq_unq_array[trial] = get_ind_mat_average_uniqueness(ind_mat[:, sequential_draw])

    self.assertTrue(np.mean(seq_unq_array) >= np.mean(standard_unq_array))
    self.assertTrue(np.median(seq_unq_array) >= np.median(standard_unq_array))
def setUp(self):
    """
    Load the sample dollar bars, build triple-barrier meta-labels and generate features.

    Steps performed:
      1. Read ``test_data/dollar_bar_sample.csv`` (next to this file) into ``self.data``.
      2. Derive a long/short side from a 20/50 moving-average crossover, lagged by one bar.
      3. Build meta-labeled events (CUSUM events, 2-hour vertical barriers) and their bins.
      4. Draw a synthetic sample set via ``_get_synthetic_samples`` and attach label-derived
         features produced by ``_generate_label_with_prob`` plus their rolling means.
      5. Split chronologically (``shuffle=False``) into train and test parts.

    Attributes set on the test case:
      ``path``, ``data``, ``X_train``, ``X_test``, ``y_train_clf``, ``y_test_clf``,
      ``y_train_reg`` / ``y_test_reg`` (classification targets shifted by +1),
      ``samples_info_sets`` (``t1`` of the events at the training index) and
      ``price_bars_trim`` (close prices between the first and last training timestamp).
    """
    project_path = os.path.dirname(__file__)
    # os.path.join keeps the path relative when dirname(__file__) is '' (plain string
    # concatenation would yield the absolute path '/test_data/...') and uses the
    # platform's separator.
    self.path = os.path.join(project_path, 'test_data', 'dollar_bar_sample.csv')
    self.data = pd.read_csv(self.path, index_col='date_time')
    self.data.index = pd.to_datetime(self.data.index)

    # Compute moving averages
    self.data['fast_mavg'] = self.data['close'].rolling(window=20, min_periods=20, center=False).mean()
    self.data['slow_mavg'] = self.data['close'].rolling(window=50, min_periods=50, center=False).mean()

    # Compute sides
    self.data['side'] = np.nan
    long_signals = self.data['fast_mavg'] >= self.data['slow_mavg']
    short_signals = self.data['fast_mavg'] < self.data['slow_mavg']
    self.data.loc[long_signals, 'side'] = 1
    self.data.loc[short_signals, 'side'] = -1

    # Remove Look ahead bias by lagging the signal
    self.data['side'] = self.data['side'].shift(1)

    daily_vol = get_daily_vol(close=self.data['close'], lookback=50) * 0.5
    cusum_events = cusum_filter(self.data['close'], threshold=0.005)
    vertical_barriers = add_vertical_barrier(t_events=cusum_events, close=self.data['close'], num_hours=2)
    meta_labeled_events = get_events(close=self.data['close'],
                                     t_events=cusum_events,
                                     pt_sl=[1, 4],
                                     target=daily_vol,
                                     min_ret=5e-5,
                                     num_threads=3,
                                     vertical_barrier_times=vertical_barriers,
                                     side_prediction=self.data['side'])
    meta_labeled_events.dropna(inplace=True)
    labels = get_bins(meta_labeled_events, self.data['close'])

    # Generate data set which shows the power of SB Bagging vs Standard Bagging
    ind_mat = get_ind_matrix(meta_labeled_events.t1, self.data.close)
    unique_samples = _get_synthetic_samples(ind_mat, 0.5, 0.1)

    # Get synthetic data set with drawn samples
    X = self.data.loc[labels.index, :].iloc[unique_samples].dropna()
    labels = labels.loc[X.index, :]
    X.loc[labels.index, 'y'] = labels.bin

    # Generate features (some of them are informative, others are just noise).
    # Series.items() is used because Series.iteritems() was removed in pandas 2.0.
    # The row-by-row loop and the per-row probability order are kept on purpose so that
    # _generate_label_with_prob is called in exactly the same sequence as before.
    label_probabilities = (0.6, 0.5, 0.3, 0.2, 0.1)
    for index, value in X.y.items():
        for prob in label_probabilities:
            X.loc[index, 'label_prob_{}'.format(prob)] = _generate_label_with_prob(value, prob)

    # Columns used directly as features.
    # NOTE(review): the original comment called these "two super-informative features",
    # but three columns are listed -- confirm which ones were meant.
    features = ['label_prob_0.6', 'label_prob_0.2', 'label_prob_0.1']
    for prob in [0.5, 0.3, 0.2, 0.1]:
        for window in [2, 5]:
            sma_column = 'label_prob_{}_sma_{}'.format(prob, window)
            X[sma_column] = X['label_prob_{}'.format(prob)].rolling(window=window).mean()
            features.append(sma_column)
    X.dropna(inplace=True)
    y = X.pop('y')

    self.X_train, self.X_test, self.y_train_clf, self.y_test_clf = train_test_split(
        X[features], y, test_size=0.4, random_state=1, shuffle=False)

    self.y_train_reg = (1 + self.y_train_clf)
    self.y_test_reg = (1 + self.y_test_clf)

    self.samples_info_sets = meta_labeled_events.loc[self.X_train.index, 't1']
    train_span = ((self.data.index >= self.X_train.index.min()) &
                  (self.data.index <= self.X_train.index.max()))
    self.price_bars_trim = self.data[train_span].close
### Bagging, Bootstrapping and Random Forest
# Ensemble learning technique (bagging with replacement): the goal is to randomly choose
# data samples that are unique and non-concurrent for each decision tree.
# With sequential bootstrapping the goal is to select samples such that each iteration
# maximizes the average uniqueness of the subsample.

# Small hand-written indicator matrix: 6 rows, 3 label columns.
ind_mat = pd.DataFrame(index=range(6), columns=range(3))
ind_mat.loc[:, 0] = [1, 1, 1, 0, 0, 0]
ind_mat.loc[:, 1] = [0, 0, 1, 1, 0, 0]
ind_mat.loc[:, 2] = [0, 0, 0, 0, 1, 1]
ind_mat  # bare expression left over from the notebook; has no effect in a script
print(ind_mat)

# Get the triple-barrier method indicator matrix
triple_barrier_ind_mat = get_ind_matrix(
    barrier_events,
    price_bars=close_prices['close'],
)
print(triple_barrier_ind_mat)

# TODO: check back after fixing duplicate T values
ind_mat_uniqueness = get_ind_mat_average_uniqueness(triple_barrier_ind_mat)
print(ind_mat_uniqueness)

first_sample = ind_mat_uniqueness
first_sample[first_sample > 0].mean()  # Jupyter notebook output; result is discarded in a script

# Get the values
ind_mat = ind_mat.values
def test_seq_bootstrap(self):
    """
    Test the indicator matrix shape and selected columns, the length of sequentially
    bootstrapped samples, and (Monte-Carlo) that sequential bootstrapping yields at
    least the average uniqueness of plain random sampling on the small worked example.
    """
    events = self.meta_labeled_events.dropna()
    ind_mat = get_ind_matrix(events)

    # Indicator matrix shape should be (meta_label_index+t1, t1)
    self.assertTrue(ind_mat.shape == (13, 7))

    # Check indicator matrix values for specific labels: (label column, expected column values)
    expected_columns = (
        (0, [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
        (2, [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]),
        (4, [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]),
        (6, [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]),
    )
    for label, expected in expected_columns:
        self.assertTrue((ind_mat[:, label] == expected).all())

    default_length_draw = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
    fixed_length_draw = seq_bootstrap(ind_mat, compare=True, sample_length=100)
    self.assertTrue(len(default_length_draw) == events.shape[0])
    self.assertTrue(len(fixed_length_draw) == 100)

    # Test sequential bootstrapping on example from a book
    ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
    book_label_columns = (
        [1, 1, 1, 0, 0, 0],
        [0, 0, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1],
    )
    for label, lifespan in enumerate(book_label_columns):
        ind_mat.loc[:, label] = lifespan
    ind_mat = ind_mat.values

    # Show printed probabilities
    seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])

    # Perform Monte-Carlo test
    n_trials = 1000
    standard_unq_array = np.full(n_trials, np.nan)
    seq_unq_array = np.full(n_trials, np.nan)
    for trial in range(n_trials):
        # Keep this call order: both draws may consume the global random state.
        sequential_draw = seq_bootstrap(ind_mat, sample_length=3)
        random_draw = np.random.choice(ind_mat.shape[1], size=3)

        # Average only over the strictly positive uniqueness entries.
        random_unq = get_ind_mat_average_uniqueness(ind_mat[:, random_draw])
        random_unq_mean = random_unq[random_unq > 0].mean()
        sequential_unq = get_ind_mat_average_uniqueness(ind_mat[:, sequential_draw])
        sequential_unq_mean = sequential_unq[sequential_unq > 0].mean()

        standard_unq_array[trial] = random_unq_mean
        seq_unq_array[trial] = sequential_unq_mean

    self.assertTrue(np.mean(seq_unq_array) >= np.mean(standard_unq_array))
    self.assertTrue(np.median(seq_unq_array) >= np.median(standard_unq_array))