def test_simple(self):
    """ArffSource parses a minimal numeric ARFF document and exposes empty params."""
    arff_lines = [
        "@relation news20",
        "@attribute a numeric",
        "@attribute B numeric",
        "@data",
        "1, 2",
        "2, 3",
    ]
    make_source = lambda: ArffSource(ListSource(arff_lines))

    self.assertEqual([[1, 2], [2, 3]], list(make_source().read()))
    self.assertEqual({}, make_source().params)
    self.assertEqual('{},{}', str(make_source()))
def from_prebuilt(name: str) -> 'Environments':
    """Instantiate Environments from a pre-built definition made for diagnostics and comparisons across projects."""

    repo_url = "https://github.com/mrucker/coba_prebuilds/blob/main"
    definition_url = f"{repo_url}/{name}/index.json?raw=True"

    response = HttpSource(definition_url).read()

    if response.status_code == 404:
        # the requested prebuilt doesn't exist; list the repo root so the
        # error message can suggest the names that do
        listing_text = HttpSource(
            "https://api.github.com/repos/mrucker/coba_prebuilds/contents/"
        ).read().content.decode('utf-8')
        listing = JsonDecode().filter(listing_text)
        known_names = [
            entry['name'] for entry in listing
            if entry['name'] != "README.md"
        ]
        raise CobaException(
            f"The given prebuilt name, {name}, couldn't be found. Known names are: {known_names}"
        )

    definition_txt = response.content.decode('utf-8')
    # rewrite relative references so files nested in the definition resolve
    # against the raw repo URLs
    definition_txt = definition_txt.replace('"./', f'"{repo_url}/{name}/')
    definition_txt = definition_txt.replace('.json"', '.json?raw=True"')

    return Environments.from_file(ListSource([definition_txt]))
def test_two_environments_two_filters(self):
    """Two foreach sources crossed with two foreach filters yield 4 environments."""
    json_txt = """{ "environments" : [ [{ "OpenmlSimulation": [150,151], "method":"foreach" }, { "Take":[10,20], "method":"foreach" }] ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertEqual(4, len(environments))

    # cross product order: sources vary slowest, filters fastest
    expected_pairs = [(150, 10), (150, 20), (151, 10), (151, 20)]
    for env, (openml_id, take_n) in zip(environments, expected_pairs):
        self.assertDictEqual(
            {"openml": openml_id, "take": take_n, **env.params}, env.params)
def __init__(self, *args, **kwargs) -> None:
    """Instantiate a SupervisedSimulation.

    Two call forms are supported:
      * (source, label_col=None, label_type="C", take=None) -- the first
        argument is a readable source pipe (anything with a ``read`` method,
        or passed via the ``source`` kwarg).
      * (X, Y, label_type="C") -- explicit feature and label sequences that
        are zipped into (features, label) pairs.
    """
    # form 1: a readable source pipe was supplied
    if 'source' in kwargs or (args and hasattr(args[0], 'read')):
        source = args[0] if len(args) > 0 else kwargs['source']
        label_col = args[1] if len(args) > 1 else kwargs.get(
            "label_col", None)
        label_type = args[2] if len(args) > 2 else kwargs.get(
            "label_type", "C")
        take = args[3] if len(args) > 3 else kwargs.get("take", None)

        # optional reservoir subsample happens before restructuring
        if take is not None:
            source = Pipes.join(source, Reservoir(take))
        # split each row into (features, label) around the label column
        if label_col is not None:
            source = Pipes.join(source, Structure((None, label_col)))

        params = source.params
    else:
        # form 2: raw X/Y sequences
        X = args[0]
        Y = args[1]
        label_type = args[2] if len(args) > 2 else kwargs.get(
            "label_type", "C")

        source = ListSource(list(zip(X, Y)))
        params = {"source": "[X,Y]"}

    self._label_type = label_type
    self._source = source
    self._params = {
        **params,
        "label_type": self._label_type,
        "type": "SupervisedSimulation"
    }
def test_run(self):
    """Running a joined pipe writes the process name once per source item."""
    sink = ListSink()

    Pipes.join(ListSource(list(range(10))), ProcessNameFilter(), sink).run()

    self.assertEqual(['MainProcess'] * 10, sink.items[0])
def test_bad_pipe_exception(self):
    """A null entry inside a pipe definition raises a descriptive CobaException."""
    json_txt = """{ "environments" : [ [ {"OpenmlSimulation":150}, null ] ] }"""

    with self.assertRaises(CobaException) as caught:
        EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertIn("We were unable to construct", str(caught.exception))
def read(self) -> Iterable[Tuple[Any, Any]]:
    """Read and parse the openml source.

    Builds a pipe of ArffReader -> Drop -> Structure over the raw dataset
    lines and returns its read() iterable of (features, label) pairs.
    Raises CobaException when the dataset has been deactivated on openml.
    """
    try:
        dataset_description = self._get_dataset_description(self._data_id)

        if dataset_description['status'] == 'deactivated':
            raise CobaException(
                f"Openml {self._data_id} has been deactivated. This is often due to flags on the data."
            )

        feature_descriptions = self._get_feature_descriptions(
            self._data_id)
        task_descriptions = self._get_task_descriptions(self._data_id)

        # a feature is dropped when flagged ignore/row-identifier or when
        # its type is neither numeric nor nominal
        is_ignore = lambda r: (r['is_ignore'] == 'true' or r[
            'is_row_identifier'] == 'true' or r['data_type'] not in
                               ['numeric', 'nominal'])

        ignore = [
            self._name_cleaning(f['name']) for f in feature_descriptions
            if is_ignore(f)
        ]
        target = self._name_cleaning(
            self._get_target_for_problem_type(task_descriptions))

        # never drop the prediction target, even if it was flagged ignorable
        if target in ignore: ignore.pop(ignore.index(target))

        def row_has_missing_values(row):
            # sparse rows keep values in a dict; dense rows in a sequence
            row_values = row._values.values() if isinstance(
                row, SparseWithMeta) else row._values
            return "?" in row_values or "" in row_values

        source = ListSource(
            self._get_dataset_lines(dataset_description["file_id"], None))
        reader = ArffReader(cat_as_str=self._cat_as_str)
        drop = Drop(drop_cols=ignore, drop_row=row_has_missing_values)
        structure = Structure([None, target])

        return Pipes.join(source, reader, drop, structure).read()

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except CobaException:
        #we don't want to clear the cache if it is an error we know about (the original raise should clear if needed)
        raise

    except Exception:
        #if something unexpected went wrong clear the cache just in case it was corrupted somehow
        self._clear_cache()
        raise
def test_raw_environment(self):
    """A bare (non-list) environments object is accepted as a single environment."""
    json_txt = """{ "environments" : { "OpenmlSimulation": 150 } }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertIsInstance(environments[0], OpenmlSimulation)
    self.assertDictEqual({'openml': 150, **environments[0].params},
                         environments[0].params)
def test_two_variables(self):
    """Two foreach variables can be crossed in a pipe and also used standalone."""
    json_txt = """{ "variables": { "$openmls": {"OpenmlSimulation": [150,151], "method":"foreach"}, "$takes" : {"Take":[10,20], "method":"foreach"} }, "environments": [ ["$openmls", "$takes"], "$openmls" ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertEqual(6, len(environments))
    self.assertIsInstance(environments[4], OpenmlSimulation)
    self.assertIsInstance(environments[5], OpenmlSimulation)

    # first four come from the cross product of $openmls and $takes
    expected_pairs = [(150, 10), (150, 20), (151, 10), (151, 20)]
    for env, (openml_id, take_n) in zip(environments, expected_pairs):
        self.assertDictEqual(
            {"openml": openml_id, "take": take_n, **env.params}, env.params)

    # last two come from the bare "$openmls" entry
    self.assertDictEqual({"openml": 150, **environments[4].params},
                         environments[4].params)
    self.assertDictEqual({"openml": 151, **environments[5].params},
                         environments[5].params)
def test_one_environment_one_filter(self):
    """A [source, filter] pair produces a single filtered environment."""
    json_txt = """{ "environments" : [ [{ "OpenmlSimulation": 150 }, {"Take":10} ] ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertDictEqual(
        {"openml": 150, "take": 10, **environments[0].params},
        environments[0].params)
def test_pipe_str(self):
    """A filter given as a bare class-name string is accepted inside a pipe."""
    json_txt = """{ "environments" : [ [ {"OpenmlSimulation":150}, "Identity" ] ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertEqual(1, len(environments))
    self.assertDictEqual({"openml": 150, **environments[0].params},
                         environments[0].params)
def test_simple(self):
    """ManikSource parses sparse index:value rows, skipping the leading meta line."""
    data_lines = [
        "meta line",
        "0 1:2 2:3",
        "1 1:1 2:1",
        "2 2:1",
        "1 1:1",
    ]
    expected = [
        ({1: 2, 2: 3}, ['0']),
        ({1: 1, 2: 1}, ['1']),
        ({2: 1}, ['2']),
        ({1: 1}, ['1']),
    ]
    make_source = lambda: ManikSource(ListSource(data_lines))

    self.assertEqual(expected, list(make_source().read()))
    self.assertEqual({}, make_source().params)
    self.assertEqual('{},{}', str(make_source()))
def test_sim_write_read_with_params_and_none_context(self):
    """A simulation round-trips through serialization with params and a None context."""
    sink = ListSink()
    expected_env = MemorySimulation(
        params={'a': 1},
        interactions=[SimulatedInteraction(None, [1, 2], rewards=[2, 3])])

    SerializedSimulation(expected_env).write(sink)
    actual_env = SerializedSimulation(ListSource(sink.items))

    self.assertEqual(expected_env.params, actual_env.params)
    self.assertEqual(len(list(expected_env.read())),
                     len(list(actual_env.read())))

    # every interaction must survive the round trip field-for-field
    for expected_i, actual_i in zip(expected_env.read(), actual_env.read()):
        self.assertEqual(expected_i.context, actual_i.context)
        self.assertEqual(expected_i.actions, actual_i.actions)
        self.assertEqual(expected_i.kwargs, actual_i.kwargs)
def test_source_reader_regression_less_than_10(self):
    """A regression label column with <10 distinct labels yields graded rewards."""
    source = ArffSource(ListSource("""
        @relation weather
        @attribute pH real
        @attribute temperature real
        @attribute conductivity real
        @attribute coli {2, 1}
        @attribute play {yes, no}
        @data
        8.1,27,1410,2,no
        8.2,29,1180,2,no
        8.3,27,1020,1,yes
    """.splitlines()))

    interactions = list(
        SupervisedSimulation(source, "pH", label_type="R").read())

    self.assertEqual(3, len(interactions))

    for interaction in interactions:
        # contexts and actions must be hashable
        hash(interaction.context)
        hash(interaction.actions[0])
        hash(interaction.actions[1])

    expected_contexts = [(27, 1410, (1, 0), (0, 1)),
                         (29, 1180, (1, 0), (0, 1)),
                         (27, 1020, (0, 1), (1, 0))]
    expected_actions = [(0, 0, 1), (1, 0, 0), (0, 1, 0)]
    expected_rewards = [[0, 1, .5], [.5, .5, 1], [1, 0, .5]]

    for interaction, context, rewards in zip(interactions,
                                             expected_contexts,
                                             expected_rewards):
        self.assertEqual(context, interaction.context)
        self.assertEqual(expected_actions, interaction.actions)
        self.assertEqual(rewards, interaction.kwargs["rewards"])
def test_one_variable(self):
    """A $variable used in environments expands to its foreach definitions."""
    json_txt = """{ "variables" : {"$openml_sims": {"OpenmlSimulation": [150,151], "method":"foreach"} }, "environments" : [ "$openml_sims" ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    for index, openml_id in enumerate([150, 151]):
        self.assertIsInstance(environments[index], OpenmlSimulation)
        self.assertDictEqual(
            {"openml": openml_id, **environments[index].params},
            environments[index].params)
def test_two_singular_environments(self):
    """Two sibling environment objects each become their own environment."""
    json_txt = """{ "environments" : [ {"OpenmlSimulation": 150}, {"OpenmlSimulation": 151} ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    for index, openml_id in enumerate([150, 151]):
        self.assertIsInstance(environments[index], OpenmlSimulation)
        self.assertDictEqual(
            {"openml": openml_id, **environments[index].params},
            environments[index].params)
def test_source_reader_too_large_take_exact_min(self):
    """A (min,max) take larger than the dataset produces zero interactions."""
    source = ArffSource(ListSource("""
        @relation weather
        @attribute pH real
        @attribute temperature real
        @attribute conductivity real
        @attribute coli {2, 1}
        @attribute play {yes, no}
        @data
        8.1,27,1410,2,no
        8.2,29,1180,2,no
        8.3,27,1020,1,yes
    """.splitlines()))

    # only 3 rows exist, so a minimum take of 5 cannot be satisfied
    interactions = list(
        SupervisedSimulation(source, "coli", take=(5, 5)).read())

    self.assertEqual(0, len(interactions))
def test_pipe_list(self):
    """A list of filters inside a pipe fans out into one environment per filter."""
    json_txt = """{ "environments" : [ [ {"OpenmlSimulation":150}, [ {"Take":10}, {"Take":20} ] ] ] }"""

    environments = EnvironmentDefinitionFileV1(ListSource([json_txt])).read()

    self.assertEqual(2, len(environments))
    for env, take_n in zip(environments, [10, 20]):
        self.assertDictEqual(
            {"openml": 150, "take": take_n, **env.params}, env.params)
def test_simple(self):
    """CsvSource splits a csv line into string fields and exposes empty params."""
    make_source = lambda: CsvSource(ListSource(["1,2,3"]))

    self.assertEqual([["1", "2", "3"]], list(make_source().read()))
    self.assertEqual({}, make_source().params)
    self.assertEqual('{},{}', str(make_source()))
def test_read_2(self):
    """An empty ListSource reads back as an empty iterable."""
    empty_source = ListSource()
    self.assertEqual([], list(empty_source.read()))
def test_read_1(self):
    """A ListSource initialized with items reads them back in order."""
    source = ListSource(['a', 'b'])
    self.assertEqual(['a', 'b'], list(source.read()))
def __init__(self, log_file: Optional[str] = None, minify:bool=True) -> None:
    """Initialize with an optional log file.

    When log_file is given, reads and writes go through disk; otherwise an
    in-memory list is shared between the source and the sink.
    """
    self._log_file = log_file
    self._minify = minify

    if log_file:
        self._source = DiskSource(log_file)
        self._sink = DiskSink(log_file)
    else:
        # in-memory mode: the sink appends to the same list the source reads
        self._source = ListSource()
        self._sink = ListSink(self._source.items)
def test_exception(self):
    """An exception raised by a filter propagates out of Pipes.run."""
    pipe = Pipes.join(ListSource(list(range(4))), ExceptionFilter(), ListSink())

    with self.assertRaises(Exception):
        pipe.run()