def test_sparse_with_header_drop_double_col(self):
    """Dropping columns 1 and 2 from sparse rows with a header keeps only column 0."""
    header = ['a', 'b', 'c']
    rows_in = [{0: 1, 1: 2, 2: 3}, {0: 4, 1: 5, 2: 6}]
    rows_out = [{0: 1}, {0: 4}]

    actual = list(Drop(drop_cols=[1, 2]).filter([header] + rows_in))

    self.assertEqual([['a']] + rows_out, actual)
def test_dense_with_header_drop_double_col(self):
    """Dropping columns 1 and 2 from dense rows with a header keeps only column 0."""
    header = ['a', 'b', 'c']
    rows_in = [[1, 2, 3], [4, 5, 6]]
    rows_out = [[1], [4]]

    actual = list(Drop(drop_cols=[1, 2]).filter([header] + rows_in))

    self.assertEqual([['a']] + rows_out, actual)
def test_sparse_sans_header_drop_single_col(self):
    """Dropping column 1 from sparse rows (None header) keeps columns 0 and 2."""
    rows_in = [{0: 1, 1: 2, 2: 3}, {0: 4, 1: 5, 2: 6}]
    rows_out = [{0: 1, 2: 3}, {0: 4, 2: 6}]

    # A leading None stands in for "no header row" and must pass through unchanged.
    actual = list(Drop(drop_cols=[1]).filter([None] + rows_in))

    self.assertEqual([None] + rows_out, actual)
def test_dense_sans_header_drop_single_col(self):
    """Dropping column 1 from dense rows without any header keeps columns 0 and 2."""
    rows_in = [[1, 2, 3], [4, 5, 6]]
    rows_out = [[1, 3], [4, 6]]

    actual = list(Drop(drop_cols=[1]).filter(rows_in))

    self.assertEqual(rows_out, actual)
def read(self) -> Iterable[Tuple[Any, Any]]:
    """Read and parse the openml source.

    Builds a pipeline (source -> ARFF reader -> column/row drop -> structure)
    over the dataset's raw lines and returns its iteration.

    Returns:
        An iterable of (features, label) pairs produced by the joined pipeline.

    Raises:
        CobaException: If the openml dataset has been deactivated.
    """
    try:
        dataset_description = self._get_dataset_description(self._data_id)

        if dataset_description['status'] == 'deactivated':
            raise CobaException(
                f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
            )

        feature_descriptions = self._get_feature_descriptions(self._data_id)
        task_descriptions = self._get_task_descriptions(self._data_id)

        # A feature is excluded when openml flags it, when it is a row
        # identifier, or when its type is neither numeric nor nominal.
        def is_ignore(r):
            return (
                r['is_ignore'] == 'true'
                or r['is_row_identifier'] == 'true'
                or r['data_type'] not in ['numeric', 'nominal']
            )

        ignore = [self._name_cleaning(f['name']) for f in feature_descriptions if is_ignore(f)]
        target = self._name_cleaning(self._get_target_for_problem_type(task_descriptions))

        # Never drop the target column, even if openml marked it ignorable.
        if target in ignore:
            ignore.remove(target)

        def row_has_missing_values(row):
            # Sparse rows store values in a dict; dense rows store them directly.
            row_values = row._values.values() if isinstance(row, SparseWithMeta) else row._values
            return "?" in row_values or "" in row_values

        source = ListSource(self._get_dataset_lines(dataset_description["file_id"], None))
        reader = ArffReader(cat_as_str=self._cat_as_str)
        drop = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
        structure = Structure([None, target])

        return Pipes.join(source, reader, drop, structure).read()

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except CobaException:
        #we don't want to clear the cache if it is an error we know about (the original raise should clear if needed)
        raise

    except Exception:
        #if something unexpected went wrong clear the cache just in case it was corrupted somehow
        self._clear_cache()
        raise