def test_seq_bootstrap(self):
    """
    Test sequential bootstrapping length, indicator matrix length and NaN checks.

    Stages:
      1. Build the indicator matrix from the non-NaN meta-labeled events and the price bars,
         and compare it with the matrix produced by ``book_ind_mat_implementation``.
      2. Check the indicator matrix shape and a few hand-picked label spans.
      3. Check the number of samples returned by ``seq_bootstrap``.
      4. Run a Monte-Carlo comparison on a small example matrix: the average uniqueness of
         sequentially bootstrapped samples must be at least that of plain random samples.
    """
    non_nan_meta_labels = self.meta_labeled_events.dropna()
    ind_mat = get_ind_matrix(non_nan_meta_labels, self.data)

    label_endtime = non_nan_meta_labels.t1
    in_label_span = ((self.data.index >= non_nan_meta_labels.index.min()) &
                     (self.data.index <= label_endtime.max()))
    trimmed_price_bars_index = self.data[in_label_span].index

    # Index for the reference indicator matrix: label start times, label end times and the
    # price bars between the first label start and the last label end (de-duplicated, sorted).
    bar_index = sorted(set(non_nan_meta_labels.index) | set(label_endtime) | set(trimmed_price_bars_index))
    ind_mat_book_implementation = book_ind_mat_implementation(bar_index, label_endtime)

    # np.array_equal reports a shape mismatch as a plain failure instead of raising or broadcasting.
    self.assertTrue(np.array_equal(ind_mat_book_implementation.values, ind_mat))

    # Indicator matrix shape should be (unique(meta_label_index+t1+price_bars_index), t1)
    self.assertEqual(ind_mat.shape, (782, 7))

    # Check indicator matrix values for specific labels
    self.assertTrue(np.array_equal(ind_mat[:100, 0], np.ones(100)))
    self.assertTrue(np.array_equal(ind_mat[191:340, 2], np.ones(149)))
    self.assertTrue(np.array_equal(ind_mat[341:420, 2], np.zeros(79)))
    self.assertTrue(np.array_equal(ind_mat[406:412, 4], np.ones(6)))
    self.assertTrue(np.array_equal(ind_mat[662:, 6], np.ones(120)))

    bootstrapped_samples = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
    bootstrapped_samples_1000 = seq_bootstrap(ind_mat, compare=True, sample_length=100)
    # Without an explicit sample_length one sample per label is expected.
    self.assertEqual(len(bootstrapped_samples), non_nan_meta_labels.shape[0])
    self.assertEqual(len(bootstrapped_samples_1000), 100)

    # Test sequential bootstrapping on example from a book
    ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
    ind_mat.loc[:, 0] = [1, 1, 1, 0, 0, 0]
    ind_mat.loc[:, 1] = [0, 0, 1, 1, 0, 0]
    ind_mat.loc[:, 2] = [0, 0, 0, 0, 1, 1]
    ind_mat = ind_mat.values

    seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])  # Show printed probabilities

    # Perform Monte-Carlo test
    # NOTE(review): no seed is set in this block, so the two final assertions are statistical,
    # not exact.
    n_trials = 1000
    standard_unq_array = np.full(n_trials, np.nan)
    seq_unq_array = np.full(n_trials, np.nan)
    for i in range(n_trials):
        bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
        random_samples = np.random.choice(ind_mat.shape[1], size=3)

        standard_unq_array[i] = get_ind_mat_average_uniqueness(ind_mat[:, random_samples])
        seq_unq_array[i] = get_ind_mat_average_uniqueness(ind_mat[:, bootstrapped_samples])

    self.assertGreaterEqual(np.mean(seq_unq_array), np.mean(standard_unq_array))
    self.assertGreaterEqual(np.median(seq_unq_array), np.median(standard_unq_array))
def _generate_bagging_indices(random_state, bootstrap_features, n_features,
                              max_features, max_samples, ind_mat):
    """
    Draw the feature indices and the sample indices for one bagging draw.

    Feature indices come from ``_generate_random_features``; sample indices come from
    ``seq_bootstrap`` applied to ``ind_mat`` with ``sample_length=max_samples``.

    :param random_state: seed or generator, normalised through ``check_random_state``
    :param bootstrap_features: forwarded to ``_generate_random_features``
    :param n_features: forwarded to ``_generate_random_features``
    :param max_features: forwarded to ``_generate_random_features``
    :param max_samples: number of samples requested from ``seq_bootstrap``
    :param ind_mat: indicator matrix handed to ``seq_bootstrap``
    :return: tuple ``(feature_indices, sample_indices)``
    """
    rng = check_random_state(random_state)

    # Both draws share one generator: features first, then samples.
    drawn_features = _generate_random_features(rng, bootstrap_features, n_features, max_features)
    drawn_samples = seq_bootstrap(ind_mat, sample_length=max_samples, random_state=rng)
    return drawn_features, drawn_samples
uniqueness_array[i] = (label_uniqueness[label_uniqueness > 0].mean()) prob_array = uniqueness_array / sum(uniqueness_array) phi = [1, 2, 0] uniqueness_array = np.array([None, None, None]) for i in range(0, 3): ind_mat_reduced = ind_mat[:, phi + [i]] label_uniqueness = get_ind_mat_average_uniqueness( ind_mat_reduced) #[-1] TODO fix this uniqueness_array[i] = (label_uniqueness[label_uniqueness > 0].mean()) prob_array = uniqueness_array / sum(uniqueness_array) print(prob_array) samples = seq_bootstrap(ind_mat, sample_length=4, warmup_samples=[1], verbose=True) print(samples) ### Monte-Carlo experiment (checks to see how sequential bootstrapping will improve average label uniqueness) standard_unq_array = np.zeros( 10000) * np.nan # Array of random sampling uniqueness seq_unq_array = np.zeros( 10000) * np.nan # Array of Sequential Bootstrapping uniqueness for i in range(0, 10000): bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3) random_samples = np.random.choice(ind_mat.shape[1], size=3) random_unq = get_ind_mat_average_uniqueness(ind_mat[:, random_samples]) random_unq_mean = random_unq[random_unq > 0].mean()
def test_seq_bootstrap(self):
    """
    Test sequential bootstrapping length, indicator matrix length and NaN checks.

    Stages:
      1. Build the indicator matrix from the non-NaN meta-labeled events and check its shape
         and a few hand-picked label columns.
      2. Check the number of samples returned by ``seq_bootstrap``.
      3. Run a Monte-Carlo comparison on a small example matrix: the mean positive uniqueness of
         sequentially bootstrapped samples must be at least that of plain random samples.
    """
    non_nan_meta_labels = self.meta_labeled_events.dropna()
    ind_mat = get_ind_matrix(non_nan_meta_labels)

    # Indicator matrix shape should be (meta_label_index+t1, t1)
    self.assertEqual(ind_mat.shape, (13, 7))

    # Check indicator matrix values for specific labels.
    # np.array_equal reports a shape mismatch as a plain failure instead of raising or broadcasting.
    self.assertTrue(np.array_equal(ind_mat[:, 0], [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))
    self.assertTrue(np.array_equal(ind_mat[:, 2], [0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]))
    self.assertTrue(np.array_equal(ind_mat[:, 4], [0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]))
    self.assertTrue(np.array_equal(ind_mat[:, 6], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1]))

    bootstrapped_samples = seq_bootstrap(ind_mat, compare=False, verbose=True, warmup_samples=None)
    bootstrapped_samples_1000 = seq_bootstrap(ind_mat, compare=True, sample_length=100)
    # Without an explicit sample_length one sample per label is expected.
    self.assertEqual(len(bootstrapped_samples), non_nan_meta_labels.shape[0])
    self.assertEqual(len(bootstrapped_samples_1000), 100)

    # Test sequential bootstrapping on example from a book
    ind_mat = pd.DataFrame(index=range(0, 6), columns=range(0, 3))
    ind_mat.loc[:, 0] = [1, 1, 1, 0, 0, 0]
    ind_mat.loc[:, 1] = [0, 0, 1, 1, 0, 0]
    ind_mat.loc[:, 2] = [0, 0, 0, 0, 1, 1]
    ind_mat = ind_mat.values

    seq_bootstrap(ind_mat, sample_length=3, verbose=True, warmup_samples=[1])  # Show printed probabilities

    # Perform Monte-Carlo test
    # NOTE(review): no seed is set in this block, so the two final assertions are statistical,
    # not exact.
    n_trials = 1000
    standard_unq_array = np.full(n_trials, np.nan)
    seq_unq_array = np.full(n_trials, np.nan)
    for i in range(n_trials):
        bootstrapped_samples = seq_bootstrap(ind_mat, sample_length=3)
        random_samples = np.random.choice(ind_mat.shape[1], size=3)

        # Only strictly positive uniqueness values enter each mean.
        random_unq = get_ind_mat_average_uniqueness(ind_mat[:, random_samples])
        standard_unq_array[i] = random_unq[random_unq > 0].mean()

        sequential_unq = get_ind_mat_average_uniqueness(ind_mat[:, bootstrapped_samples])
        seq_unq_array[i] = sequential_unq[sequential_unq > 0].mean()

    self.assertGreaterEqual(np.mean(seq_unq_array), np.mean(standard_unq_array))
    self.assertGreaterEqual(np.median(seq_unq_array), np.median(standard_unq_array))