Beispiel #1
0
    def test_simple(self):
        # Two numeric attributes followed by two data rows (note the extra
        # spaces after the commas in the data rows).
        arff_lines = [
            "@relation news20",
            "@attribute a numeric",
            "@attribute B numeric",
            "@data",
            "1,  2",
            "2,  3",
        ]

        def make_source():
            # Each assertion gets its own freshly built source.
            return ArffSource(ListSource(arff_lines))

        rows = list(make_source().read())

        self.assertEqual([[1, 2], [2, 3]], rows)
        self.assertEqual({}, make_source().params)
        self.assertEqual('{},{}', str(make_source()))
Beispiel #2
0
    def from_prebuilt(name: str) -> 'Environments':
        """Build an Environments from a pre-built definition hosted on GitHub.

        Args:
            name: Directory name of the pre-built definition inside the
                mrucker/coba_prebuilds repository.

        Returns:
            The result of Environments.from_file on the downloaded definition.

        Raises:
            CobaException: If the request for the definition's index.json
                answers with a 404. The message lists the names found in the
                repository root (README.md excluded).
        """

        base_url = "https://github.com/mrucker/coba_prebuilds/blob/main"
        response = HttpSource(f"{base_url}/{name}/index.json?raw=True").read()

        if response.status_code == 404:
            # Fetch the repository root listing so the error can name the
            # definitions that do exist.
            listing_url = "https://api.github.com/repos/mrucker/coba_prebuilds/contents/"
            listing_raw = HttpSource(listing_url).read().content.decode('utf-8')

            known_names = []
            for entry in JsonDecode().filter(listing_raw):
                if entry['name'] != "README.md":
                    known_names.append(entry['name'])

            raise CobaException(
                f"The given prebuilt name, {name}, couldn't be found. Known names are: {known_names}"
            )

        definition = response.content.decode('utf-8')
        # Point "./" relative references at this definition's directory ...
        definition = definition.replace('"./', f'"{base_url}/{name}/')
        # ... and append ?raw=True to every quoted value ending in .json.
        definition = definition.replace('.json"', '.json?raw=True"')

        return Environments.from_file(ListSource([definition]))
    def test_two_environments_two_filters(self):
        # Two OpenML ids combined with two Take sizes are expected to produce
        # four environments, ordered id-major.
        json_txt = """{
            "environments" : [
                [{ "OpenmlSimulation": [150,151], "method":"foreach" }, { "Take":[10,20], "method":"foreach" }]
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertEqual(4, len(environments))

        expected = [
            {"openml": 150, "take": 10},
            {"openml": 150, "take": 20},
            {"openml": 151, "take": 10},
            {"openml": 151, "take": 20},
        ]

        for index, required in enumerate(expected):
            params = environments[index].params
            # The required pairs are merged AFTER params so they win on a key
            # clash. If params holds a different value (or lacks the key) the
            # merged dict differs from params and the assertion fails. Merging
            # them first would let params overwrite the expected values, so
            # only key presence would be verified.
            self.assertDictEqual({**params, **required}, params, f"environments[{index}]")
Beispiel #4
0
    def __init__(self, *args, **kwargs) -> None:
        """Instantiate a SupervisedSimulation.

        Two calling forms are handled:

        * Source form -- selected when a 'source' keyword is given or the
          first positional argument has a `read` attribute. The values, taken
          positionally or by keyword, are: source, label_col, label_type
          (default "C") and take.
        * In-memory form -- otherwise. The first two positional arguments are
          X and Y; label_type (default "C") is the third positional argument
          or a keyword.
        """

        def pick(position, keyword, default=None):
            # A positional value takes priority over its keyword counterpart.
            if len(args) > position:
                return args[position]
            return kwargs.get(keyword, default)

        if 'source' in kwargs or (args and hasattr(args[0], 'read')):
            source = args[0] if args else kwargs['source']
            label_col = pick(1, "label_col")
            label_type = pick(2, "label_type", "C")
            take = pick(3, "take")

            # Optional pipeline stages: sampling first, then label structuring.
            if take is not None:
                source = Pipes.join(source, Reservoir(take))
            if label_col is not None:
                source = Pipes.join(source, Structure((None, label_col)))

            params = source.params

        else:
            X, Y = args[0], args[1]
            label_type = pick(2, "label_type", "C")
            source = ListSource(list(zip(X, Y)))
            params = {"source": "[X,Y]"}

        self._label_type = label_type
        self._source = source

        # Source params come first; label_type and type are set afterwards.
        merged_params = {**params}
        merged_params["label_type"] = self._label_type
        merged_params["type"] = "SupervisedSimulation"
        self._params = merged_params
Beispiel #5
0
    def test_run(self):
        # Push the integers 0..9 through ProcessNameFilter into a ListSink.
        numbers = ListSource(list(range(10)))
        collected = ListSink()

        pipe = Pipes.join(numbers, ProcessNameFilter(), collected)
        pipe.run()

        # The first item written to the sink should be ten 'MainProcess' names.
        expected_names = ['MainProcess'] * 10
        self.assertEqual(collected.items[0], expected_names)
    def test_bad_pipe_exception(self):
        # A pipe whose second element is null is expected to be rejected with
        # a CobaException carrying a descriptive message.
        json_txt = """{
            "environments" : [
                [ {"OpenmlSimulation":150}, null ]
            ]
        }"""

        with self.assertRaises(CobaException) as raised:
            definition_file = EnvironmentDefinitionFileV1(ListSource([json_txt]))
            definition_file.read()

        message = str(raised.exception)
        self.assertIn("We were unable to construct", message)
Beispiel #7
0
    def read(self) -> Iterable[Tuple[Any, Any]]:
        """Read and parse the openml source.

        Returns:
            The result of reading a pipe that joins the dataset lines, an
            ArffReader, a Drop filter and a Structure filter built around the
            target column.

        Raises:
            CobaException: If the dataset description reports the status
                'deactivated'. Any CobaException raised along the way is
                propagated without touching the cache.
            Exception: Any other failure is re-raised after the cache has
                been cleared.
        """

        def should_ignore(feature):
            # Ignored columns: flagged as ignore, row identifiers, or any
            # data type other than numeric/nominal.
            if feature['is_ignore'] == 'true':
                return True
            if feature['is_row_identifier'] == 'true':
                return True
            return feature['data_type'] not in ['numeric', 'nominal']

        def row_has_missing_values(row):
            if isinstance(row, SparseWithMeta):
                values = row._values.values()
            else:
                values = row._values
            return "?" in values or "" in values

        try:
            description = self._get_dataset_description(self._data_id)

            if description['status'] == 'deactivated':
                raise CobaException(
                    f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
                )

            features = self._get_feature_descriptions(self._data_id)
            tasks = self._get_task_descriptions(self._data_id)

            ignore = []
            for feature in features:
                if should_ignore(feature):
                    ignore.append(self._name_cleaning(feature['name']))

            target = self._name_cleaning(self._get_target_for_problem_type(tasks))

            # The target must survive even if it matched an ignore rule.
            if target in ignore:
                ignore.remove(target)

            return Pipes.join(
                ListSource(self._get_dataset_lines(description["file_id"], None)),
                ArffReader(cat_as_str=self._cat_as_str),
                Drop(drop_cols=ignore, drop_row=row_has_missing_values),
                Structure([None, target]),
            ).read()

        except (KeyboardInterrupt, CobaException):
            # Do not clear the cache on a KeyboardInterrupt, nor on an error we
            # know about (the original raise should clear it if needed).
            raise

        except Exception:
            # Something unexpected went wrong: clear the cache in case it was
            # corrupted somehow.
            self._clear_cache()
            raise
    def test_raw_environment(self):
        # "environments" given as a bare object (not a list) should still
        # yield an OpenmlSimulation for id 150.
        json_txt = """{
            "environments" : { "OpenmlSimulation": 150 }
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertIsInstance(environments[0], OpenmlSimulation)

        params = environments[0].params
        # The expected pair is placed AFTER **params so it wins on a key
        # clash: a wrong or missing "openml" value makes the merged dict
        # differ from params. Placing it first would let params overwrite it,
        # leaving only key presence verified.
        self.assertDictEqual({**params, "openml": 150}, params)
    def test_two_variables(self):
        # Two variables are referenced: once combined in a pipe (expected to
        # give four environments) and once alone (expected to give two more).
        json_txt = """{
            "variables": {
                "$openmls": {"OpenmlSimulation": [150,151], "method":"foreach"},
                "$takes"  : {"Take":[10,20], "method":"foreach"}
            },
            "environments": [
                ["$openmls", "$takes"],
                "$openmls"
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertEqual(6, len(environments))
        self.assertIsInstance(environments[4], OpenmlSimulation)
        self.assertIsInstance(environments[5], OpenmlSimulation)

        expected = [
            {"openml": 150, "take": 10},
            {"openml": 150, "take": 20},
            {"openml": 151, "take": 10},
            {"openml": 151, "take": 20},
            {"openml": 150},
            {"openml": 151},
        ]

        for index, required in enumerate(expected):
            params = environments[index].params
            # The required pairs are merged AFTER params so they win on a key
            # clash. If params holds a different value (or lacks the key) the
            # merged dict differs from params and the assertion fails. Merging
            # them first would let params overwrite the expected values, so
            # only key presence would be verified.
            self.assertDictEqual({**params, **required}, params, f"environments[{index}]")
    def test_one_environment_one_filter(self):
        # A single simulation piped through a single Take filter.
        json_txt = """{
            "environments" : [
                [{ "OpenmlSimulation": 150 }, {"Take":10} ]
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        params = environments[0].params
        # The expected pairs are placed AFTER **params so they win on a key
        # clash: a wrong or missing value makes the merged dict differ from
        # params. Placing them first would let params overwrite them, leaving
        # only key presence verified.
        self.assertDictEqual({**params, "openml": 150, "take": 10}, params)
    def test_pipe_str(self):
        # A pipe element may be given as a bare string ("Identity") rather
        # than as an object.
        json_txt = """{
            "environments" : [
                [ {"OpenmlSimulation":150}, "Identity" ]
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertEqual(1, len(environments))

        params = environments[0].params
        # The expected pair is placed AFTER **params so it wins on a key
        # clash: a wrong or missing "openml" value makes the merged dict
        # differ from params. Placing it first would let params overwrite it,
        # leaving only key presence verified.
        self.assertDictEqual({**params, "openml": 150}, params)
Beispiel #12
0
    def test_simple(self):
        # The first line is a meta line; each following line is a label
        # followed by index:value pairs.
        manik_lines = [
            "meta line",
            "0 1:2 2:3",
            "1 1:1 2:1",
            "2 2:1",
            "1 1:1",
        ]

        def make_source():
            # Each assertion gets its own freshly built source.
            return ManikSource(ListSource(manik_lines))

        expected = [
            ({1: 2, 2: 3}, ['0']),
            ({1: 1, 2: 1}, ['1']),
            ({2: 1}, ['2']),
            ({1: 1}, ['1']),
        ]

        self.assertEqual(expected, list(make_source().read()))
        self.assertEqual({}, make_source().params)
        self.assertEqual('{},{}', str(make_source()))
Beispiel #13
0
    def test_sim_write_read_with_params_and_none_context(self):
        # Round-trip a one-interaction simulation (context None, params set)
        # through a ListSink and compare it with the original.
        sink = ListSink()

        interaction = SimulatedInteraction(None, [1, 2], rewards=[2, 3])
        original = MemorySimulation(params={'a': 1}, interactions=[interaction])

        SerializedSimulation(original).write(sink)
        restored = SerializedSimulation(ListSource(sink.items))

        self.assertEqual(original.params, restored.params)

        original_count = len(list(original.read()))
        restored_count = len(list(restored.read()))
        self.assertEqual(original_count, restored_count)

        for expected, actual in zip(original.read(), restored.read()):
            self.assertEqual(expected.context, actual.context)
            self.assertEqual(expected.actions, actual.actions)
            self.assertEqual(expected.kwargs, actual.kwargs)
Beispiel #14
0
    def test_source_reader_regression_less_than_10(self):
        # Three ARFF rows are read with "pH" as the label column and label
        # type "R"; contexts, actions and rewards are then checked.
        arff_lines = """
            @relation weather

            @attribute pH real
            @attribute temperature real
            @attribute conductivity real
            @attribute coli {2, 1}
            @attribute play {yes, no}

            @data
            8.1,27,1410,2,no
            8.2,29,1180,2,no
            8.3,27,1020,1,yes
        """.splitlines()

        source = ArffSource(ListSource(arff_lines))
        simulation = SupervisedSimulation(source, "pH", label_type="R")
        interactions = list(simulation.read())

        self.assertEqual(len(interactions), 3)

        # Contexts and the first two actions must be hashable.
        for interaction in interactions:
            hash(interaction.context)
            hash(interaction.actions[0])
            hash(interaction.actions[1])

        expected_contexts = [
            (27, 1410, (1, 0), (0, 1)),
            (29, 1180, (1, 0), (0, 1)),
            (27, 1020, (0, 1), (1, 0)),
        ]
        for position, context in enumerate(expected_contexts):
            self.assertEqual(context, interactions[position].context)

        # Every interaction is expected to offer the same three actions.
        for position in range(3):
            self.assertEqual([(0, 0, 1), (1, 0, 0), (0, 1, 0)],
                             interactions[position].actions)

        expected_rewards = [
            [0, 1, .5],
            [.5, .5, 1],
            [1, 0, .5],
        ]
        for position, rewards in enumerate(expected_rewards):
            self.assertEqual(rewards, interactions[position].kwargs["rewards"])
    def test_one_variable(self):
        # A single variable that expands to two OpenML simulations.
        json_txt = """{
            "variables"    : {"$openml_sims": {"OpenmlSimulation": [150,151], "method":"foreach"} },
            "environments" : [ "$openml_sims" ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertIsInstance(environments[0], OpenmlSimulation)
        self.assertIsInstance(environments[1], OpenmlSimulation)

        for index, openml_id in enumerate([150, 151]):
            params = environments[index].params
            # The expected pair is placed AFTER **params so it wins on a key
            # clash: a wrong or missing "openml" value makes the merged dict
            # differ from params. Placing it first would let params overwrite
            # it, leaving only key presence verified.
            self.assertDictEqual({**params, "openml": openml_id}, params, f"environments[{index}]")
    def test_two_singular_environments(self):
        # Two simulations listed individually, each without any filter.
        json_txt = """{
            "environments" : [
                {"OpenmlSimulation": 150},
                {"OpenmlSimulation": 151}
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertIsInstance(environments[0], OpenmlSimulation)
        self.assertIsInstance(environments[1], OpenmlSimulation)

        for index, openml_id in enumerate([150, 151]):
            params = environments[index].params
            # The expected pair is placed AFTER **params so it wins on a key
            # clash: a wrong or missing "openml" value makes the merged dict
            # differ from params. Placing it first would let params overwrite
            # it, leaving only key presence verified.
            self.assertDictEqual({**params, "openml": openml_id}, params, f"environments[{index}]")
Beispiel #17
0
    def test_source_reader_too_large_take_exact_min(self):
        # Only three rows exist while take is (5, 5); no interactions are
        # expected to come back.
        arff_lines = """
            @relation weather
            
            @attribute pH real
            @attribute temperature real
            @attribute conductivity real
            @attribute coli {2, 1}
            @attribute play {yes, no}
            
            @data
            8.1,27,1410,2,no
            8.2,29,1180,2,no
            8.3,27,1020,1,yes
        """.splitlines()

        source = ArffSource(ListSource(arff_lines))
        simulation = SupervisedSimulation(source, "coli", take=(5, 5))
        interactions = list(simulation.read())

        self.assertEqual(len(interactions), 0)
    def test_pipe_list(self):
        # A pipe element given as a list of filters is expected to produce one
        # environment per listed filter.
        json_txt = """{
            "environments" : [
                [ {"OpenmlSimulation":150}, [ {"Take":10}, {"Take":20} ] ]
            ]
        }"""

        environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

        self.assertEqual(2, len(environments))

        for index, take in enumerate([10, 20]):
            params = environments[index].params
            # The expected pairs are placed AFTER **params so they win on a
            # key clash: a wrong or missing value makes the merged dict differ
            # from params. Placing them first would let params overwrite
            # them, leaving only key presence verified.
            self.assertDictEqual({**params, "openml": 150, "take": take}, params, f"environments[{index}]")
Beispiel #19
0
 def test_simple(self):
     # One CSV line should be read as a single row of three strings.
     def make_source():
         # Each assertion gets its own freshly built source.
         return CsvSource(ListSource(["1,2,3"]))

     rows = list(make_source().read())

     self.assertEqual([["1", "2", "3"]], rows)
     self.assertEqual({}, make_source().params)
     self.assertEqual('{},{}', str(make_source()))
Beispiel #20
0
 def test_read_2(self):
     # A ListSource built without arguments should read as empty.
     items_read = list(ListSource().read())
     self.assertEqual([], items_read)
Beispiel #21
0
 def test_read_1(self):
     # A ListSource should read back the items it was built with, in order.
     items_read = list(ListSource(['a', 'b']).read())
     self.assertEqual(["a", 'b'], items_read)
Beispiel #22
0
    def __init__(self, log_file: Optional[str] = None, minify:bool=True) -> None:
        """Set up the source/sink pair, on disk or in memory.

        Args:
            log_file: Path handed to DiskSource and DiskSink. When falsy, a
                ListSource is used together with a ListSink built from that
                source's `items`.
            minify: Stored on the instance as `_minify`; not used here.
        """

        self._log_file = log_file
        self._minify   = minify

        if log_file:
            self._source = DiskSource(log_file)
            self._sink   = DiskSink(log_file)
        else:
            self._source = ListSource()
            # The sink receives the source's items.
            self._sink   = ListSink(self._source.items)
Beispiel #23
0
 def test_exception(self):
     # Building and running a pipe that contains ExceptionFilter (presumably
     # a filter that raises) must surface an exception to the caller.
     with self.assertRaises(Exception):
         numbers = ListSource(list(range(4)))
         pipe = Pipes.join(numbers, ExceptionFilter(), ListSink())
         pipe.run()