def test_numpy_projection_logic(self):
    """
    Check that an index projection over numpy data keeps only the
    requested rows, for both the explicit and the implicit access path
    """
    dataset = MagicMock()
    dataset.get_split.return_value = Split(
        X=np.ones((100, 10)) * np.array(range(100)).reshape(-1, 1),
        y=np.array(range(100)),
    )
    projection = self.mock_cls(
        dataset=dataset, split="ddefg", indices=range(10, 30)
    )
    # Rows 10..29 of the full split are what the projection should yield
    wanted = Split(
        X=np.ones((20, 10)) * np.array(range(10, 30)).reshape(-1, 1),
        y=np.array(range(10, 30)),
    )

    raw = projection.dataset_split
    explicit = projection.apply_projection(raw)
    implicit = projection.projected_split

    def assert_same_split(left, right):
        # Same sections, and every section equal element-wise
        self.assertEqual(left.keys(), right.keys())
        for section, values in left.items():
            np.testing.assert_equal(values, right[section])

    # The unprojected split must differ from both projected outputs
    for projected in (explicit, implicit):
        with self.assertRaises(AssertionError):
            assert_same_split(raw, projected)

    # Both outputs match the expectation, and each other
    for projected in (explicit, implicit):
        assert_same_split(wanted, projected)
    assert_same_split(implicit, explicit)
def __getitem__(self, index) -> Split:
    """Gets batch at position `index`.

    # Arguments
        index: position of the batch in the Sequence.

    # Returns
        A batch (an ordered tuple instead of a Split when
        `self.return_tuple` is set)
    """
    # Positions of the batch inside `self.indices`, clipped to the dataset
    start = index * self.batch_size
    stop = min(start + self.batch_size, self.dataset_size)
    batch = self.indices[start:stop]

    X = self.validated_split(self.split.X)
    y = self.validated_split(self.split.y)

    # `y is None` means unsupervised: the batch then only carries X
    if isinstance(X, (pd.DataFrame, pd.Series)):
        sections = {"X": X.loc[batch]}
        if y is not None:
            # NOTE(review): squeeze() may collapse a one-row selection to a
            # scalar, which has no `.values` - confirm single-row batches
            sections["y"] = np.stack(y.loc[batch].squeeze().values)
    else:
        sections = {"X": X[batch]}
        if y is not None:
            sections["y"] = y[batch]

    split = Split(**sections)
    return split_to_ordered_tuple(split) if self.return_tuple else split
def test_squeeze_behavior(self):
    """
    A split holding only null-like values compares equal to an empty
    dict after squeeze() has been called on it
    """
    null_like_sections = {
        "a": None,
        "b": pd.DataFrame(),
        "c": pd.Series(),
        "d": [],
        "e": (),
        "f": {},
    }
    split = Split(**null_like_sections)
    split.squeeze()
    self.assertEqual(split, {})
def apply_projection(self, dataset_split: Split) -> Split:
    """
    Index subset return

    Runs `self.indexing_method` with `self.indices` over every section of
    the split, then squeezes the rebuilt split.
    """
    projected_sections = {}
    for section, values in dataset_split.items():
        projected_sections[section] = self.indexing_method(values, self.indices)
    return Split(**projected_sections).squeeze()
def __getitem__(self, *args, **kwargs) -> Union[Split, Tuple]:
    """
    Pass-through to dataset sequence - applies transform on data and returns
    transformed batch

    # Arguments
        *args, **kwargs: forwarded unchanged to `self.dataset_sequence`.

    # Returns
        For a tuple batch, a tuple with the first element replaced by the
        transformed X and the remaining elements unchanged. Otherwise a
        Split with `X` replaced by the transformed X and every other
        section carried over.
    """
    # NOTE(review): the sequence is invoked as a callable rather than
    # indexed - confirm `dataset_sequence` defines `__call__`
    batch: Union[Tuple, Split] = self.dataset_sequence(*args, **kwargs)

    if isinstance(batch, tuple):
        # Explicitly ordered for *args consumers -- X, y, other...
        # The transformed output (not the raw X) must lead the tuple,
        # otherwise the pipeline result is silently discarded
        output = self.pipeline.transform(batch[0])
        return (output, *batch[1:])

    output = self.pipeline.transform(batch.X)
    # Propagate y and any other named inputs downstream untouched
    return Split(X=output, **{k: v for k, v in batch.items() if k != "X"})
def __next__(self) -> Union[Split, Tuple]:
    """
    Pull the next batch from `self.data_iterator` and return it with X
    replaced by the pipeline-transformed X

    NOTE: Some downstream objects expect to consume a generator with a
    tuple of X, y, other... not a Split object, so an ordered tuple will be
    returned if the dataset iterator returns a tuple

    # Returns
        A tuple (transformed X, *remaining elements) for tuple batches,
        otherwise a Split with `X` replaced and other sections carried over.

    # Raises
        StopIteration: propagated from the wrapped iterator when exhausted.
    """
    batch = next(self.data_iterator)

    if isinstance(batch, tuple):
        # Explicitly ordered for *args consumers -- X, y, other...
        # The transformed output (not the raw X) must lead the tuple,
        # otherwise the pipeline result is silently discarded
        output = self.pipeline.transform(batch[0])
        return (output, *batch[1:])

    output = self.pipeline.transform(batch.X)
    # Propagate y and any other named inputs downstream untouched
    return Split(X=output, **{k: v for k, v in batch.items() if k != "X"})
def __next__(self) -> Union[Split, Tuple]:
    """
    Turn a dataset split into a generator

    Each call yields the next batch of at most `self.batch_size` entries of
    `self.indices` and advances `self.current_index`.

    # Returns
        A Split holding the batch, or the ordered tuple produced by
        `split_to_ordered_tuple` when `self.return_tuple` is set.

    # Raises
        StopIteration: when the dataset is empty, or when every index has
            been consumed and `self.infinite_loop` is false.
    """
    X = self.split.X
    y = self.split.y

    if self.dataset_size == 0:
        raise StopIteration

    # Loop so that infinite batches can be generated
    if self.current_index >= self.dataset_size:
        if not self.infinite_loop:
            raise StopIteration
        self.current_index = 0
        self.first_run = False

    # Shuffle on new loops (never on the very first pass)
    if self.current_index == 0 and self.shuffle and not self.first_run:
        # np.random.shuffle works in place and returns None, so assigning
        # its result wiped out the indices. permutation returns a shuffled
        # copy and also accepts immutable inputs such as `range`.
        self.indices = np.random.permutation(self.indices)

    # Next batch indices, clipped to the dataset size
    batch_end = min(self.current_index + self.batch_size, self.dataset_size)
    batch = self.indices[self.current_index:batch_end]
    self.current_index += self.batch_size

    pandas_types = (pd.DataFrame, pd.Series)
    x_is_pandas = isinstance(X, pandas_types)

    # NOTE(review): only a non-empty pandas `y` counts as supervised here,
    # so a numpy `y` is dropped from the batch - confirm this is intended
    if y is not None and (isinstance(y, pandas_types) and not y.empty):
        # Supervised
        if x_is_pandas:
            split = Split(
                X=X.loc[batch], y=np.stack(y.loc[batch].squeeze().values)
            )
        else:
            split = Split(X=X[batch], y=y[batch])
    elif x_is_pandas:
        # Unsupervised
        split = Split(X=X.loc[batch])
    else:
        split = Split(X=X[batch])

    if self.return_tuple:
        return split_to_ordered_tuple(split)
    return split
def test_projection_logic(self):
    """
    With no indices supplied, the explicit and the implicit projection
    both equal the underlying dataset split
    """
    dataset = MagicMock()
    dataset.get_split.return_value = Split(X="a", y="b")
    projection = self.mock_cls(dataset=dataset, split="ddefg")

    raw = projection.dataset_split
    explicit = projection.apply_projection(raw)
    implicit = projection.projected_split

    comparisons = ((raw, explicit), (raw, implicit), (implicit, explicit))
    for left, right in comparisons:
        self.assertEqual(left, right)
def test_projection_logic(self):
    """
    Check that an index projection over pandas data keeps only the
    requested rows, for both the explicit and the implicit access path
    """
    dataset = MagicMock()
    dataset.get_split.return_value = Split(
        X=pd.DataFrame(range(100)), y=pd.Series(range(100))
    )
    projection = self.mock_cls(
        dataset=dataset, split="ddefg", indices=range(10, 30)
    )
    # The expectation is built with index labels 10..29
    wanted = Split(
        X=pd.DataFrame(range(10, 30), index=range(10, 30)),
        y=pd.Series(range(10, 30), index=range(10, 30)),
    )

    raw = projection.dataset_split
    explicit = projection.apply_projection(raw)
    implicit = projection.projected_split

    def assert_same_split(left, right):
        # Same sections, each compared with the matching pandas checker
        self.assertEqual(left.keys(), right.keys())
        for section, values in left.items():
            checker = (
                pd.testing.assert_frame_equal
                if isinstance(values, pd.DataFrame)
                else pd.testing.assert_series_equal
            )
            checker(values, right[section])

    # The unprojected split must differ from both projected outputs
    for projected in (explicit, implicit):
        with self.assertRaises(AssertionError):
            assert_same_split(raw, projected)

    # Both outputs match the expectation, and each other
    for projected in (explicit, implicit):
        assert_same_split(wanted, projected)
    assert_same_split(implicit, explicit)
def test_itemgetter_behavior(self):
    """
    Item access and ** unpacking on the object under test mirror the
    split exposed through its `projected_split` attribute
    """
    backing_split = Split(X="abc", y="def", other="xyz")
    property_stub = PropertyMock(return_value=backing_split)

    # NOTE(review): a PropertyMock *instance* is handed to new_callable;
    # presumably patch calls it and installs its return value - confirm
    with patch.object(
        self.test_cls, "projected_split", new_callable=property_stub
    ):
        proxy = self.mock_cls()

        # Key-by-key lookups agree with the backing split
        for key in backing_split:
            self.assertEqual(backing_split[key], proxy[key])

        # **behavior
        self.assertEqual({**backing_split}, {**proxy})
        self.assertEqual(
            {"X": "abc", "y": "def", "other": "xyz"}, {**proxy}
        )
def split_to_ordered_tuple(split: Split) -> Tuple:
    """
    Helper to convert a split object into an ordered tuple of X, y, other

    # Arguments
        split: mapping-like split; `X` and `y` are read as attributes and
            the remaining sections through `items()`.

    # Returns
        A tuple with X first, then y, then every other section in the
        split's iteration order. Sections whose value is None are skipped.
    """
    ordered = [value for value in (split.X, split.y) if value is not None]
    ordered.extend(
        value
        for name, value in split.items()
        if name not in ("X", "y") and value is not None
    )
    # Return a real tuple: the annotation promises one, and code that
    # dispatches on isinstance(batch, tuple) would not recognise a list
    return tuple(ordered)
def test_default_value(self):
    """
    Looking up an unknown key on a fresh SplitContainer gives a value
    equal to an empty Split
    """
    missing_entry = SplitContainer()["nonexistent_key"]
    self.assertEqual(missing_entry, Split())
def test_getattr_behavior(self):
    """
    Attribute access returns the stored section, and None for a section
    that was never set
    """
    split = Split(a="ab")
    expectations = (("a", "ab"), ("b", None))
    for attribute, expected in expectations:
        self.assertEqual(getattr(split, attribute), expected)
def test_null_type_check(self):
    """
    None and empty pandas/builtin containers are all reported as null
    by Split.is_null_type
    """
    null_candidates = (None, pd.DataFrame(), pd.Series(), [], (), {})
    for candidate in null_candidates:
        self.assertTrue(Split.is_null_type(candidate))
def apply_projection(self, dataset_split: Split) -> Split:
    """
    Identity return

    No rows are selected or reordered; the incoming split is only squeezed.
    """
    squeezed_split = dataset_split.squeeze()
    return squeezed_split