Example #1
0
    def test_sparse_with_header_drop_double_col(self):
        """Dropping two columns from sparse rows leaves only the surviving key, header included."""

        row_a = {0: 1, 1: 2, 2: 3}
        row_b = {0: 4, 1: 5, 2: 6}

        table  = [['a', 'b', 'c'], row_a, row_b]
        wanted = [['a'], {0: 1}, {0: 4}]

        self.assertEqual(wanted, list(Drop(drop_cols=[1, 2]).filter(table)))
Example #2
0
    def test_dense_with_header_drop_double_col(self):
        """Dropping two columns from dense rows leaves only the first value, header included."""

        row_a = [1, 2, 3]
        row_b = [4, 5, 6]

        table  = [['a', 'b', 'c'], row_a, row_b]
        wanted = [['a'], [1], [4]]

        self.assertEqual(wanted, list(Drop(drop_cols=[1, 2]).filter(table)))
Example #3
0
    def test_sparse_sans_header_drop_single_col(self):
        """Dropping one column from sparse rows removes only that key when no header is present."""

        row_a = {0: 1, 1: 2, 2: 3}
        row_b = {0: 4, 1: 5, 2: 6}

        table  = [None, row_a, row_b]
        wanted = [None, {0: 1, 2: 3}, {0: 4, 2: 6}]

        self.assertEqual(wanted, list(Drop(drop_cols=[1]).filter(table)))
Example #4
0
    def test_dense_sans_header_drop_single_col(self):
        """Dropping one column from dense rows removes that index when no header is present."""

        table  = [[1, 2, 3], [4, 5, 6]]
        wanted = [[1, 3], [4, 6]]

        self.assertEqual(wanted, list(Drop(drop_cols=[1]).filter(table)))
Example #5
0
    def read(self) -> Iterable[Tuple[Any, Any]]:
        """Read and parse the openml source.

        Builds a pipeline (source -> ARFF parser -> column/row drop -> structure)
        over the OpenML dataset identified by ``self._data_id`` and returns the
        iterable produced by reading that pipeline.

        Raises:
            CobaException: If the OpenML dataset has been deactivated, or for
                any other known error raised by the helper methods.

        NOTE(review): on an unexpected exception the local cache is cleared
        (see the final except clause); known CobaExceptions and KeyboardInterrupt
        deliberately leave the cache intact.
        """
        try:
            # Dataset metadata (status, file_id, etc.) fetched by id.
            dataset_description = self._get_dataset_description(self._data_id)

            if dataset_description['status'] == 'deactivated':
                raise CobaException(
                    f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
                )

            # Per-feature metadata and the task list for this dataset.
            feature_descriptions = self._get_feature_descriptions(
                self._data_id)
            task_descriptions = self._get_task_descriptions(self._data_id)

            # A feature is excluded when OpenML marks it as ignored, as a row
            # identifier, or when its type is neither numeric nor nominal.
            is_ignore = lambda r: (r['is_ignore'] == 'true' or r[
                'is_row_identifier'] == 'true' or r['data_type'] not in
                                   ['numeric', 'nominal'])

            # Cleaned names of all columns that should be dropped.
            ignore = [
                self._name_cleaning(f['name']) for f in feature_descriptions
                if is_ignore(f)
            ]
            # The label column determined from the dataset's task descriptions.
            target = self._name_cleaning(
                self._get_target_for_problem_type(task_descriptions))

            # Never drop the target column, even if OpenML flags it as ignored.
            if target in ignore: ignore.pop(ignore.index(target))

            def row_has_missing_values(row):
                # ARFF encodes missing values as "?"; "" guards empty fields.
                row_values = row._values.values() if isinstance(
                    row, SparseWithMeta) else row._values
                return "?" in row_values or "" in row_values

            # Pipeline: raw lines -> parsed ARFF rows -> rows/cols filtered ->
            # (features, label) structure split on the target column.
            source = ListSource(
                self._get_dataset_lines(dataset_description["file_id"], None))
            reader = ArffReader(cat_as_str=self._cat_as_str)
            drop = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
            structure = Structure([None, target])

            return Pipes.join(source, reader, drop, structure).read()

        except KeyboardInterrupt:
            #we don't want to clear the cache in the case of a KeyboardInterrupt
            raise

        except CobaException:
            #we don't want to clear the cache if it is an error we know about (the original raise should clear if needed)
            raise

        except Exception:
            #if something unexpected went wrong clear the cache just in case it was corrupted somehow
            self._clear_cache()
            raise