def test_instantiated_fit_values(self):
    """Values supplied at construction should encode without a separate fit() call."""
    encoder = OneHotEncoder(fit_values=["0", "1", "2"])

    to_encode = ["0", "1", "2", "1"]
    expected_onehots = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]

    self.assertEqual(encoder.encode(to_encode), expected_onehots)
def test_error_if_unkonwn_false(self):
    """An unseen value should encode to the all-zero tuple when error_if_unknown=False.

    NOTE(review): "unkonwn" in the method name is a typo for "unknown"; the
    name is kept unchanged so test discovery/reporting stays stable.
    """
    encoder = OneHotEncoder(error_if_unknown=False).fit(["0", "1", "2"])
    try:
        actual = encoder.encode(["5"])
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not swallowed and misreported as a test failure.
        self.fail("An exception was raised when it shouldn't have been")
    self.assertEqual(actual, [(0, 0, 0)])
def test_performance_encode(self):
    """Encoding 500,000 items should comfortably finish in under one second."""
    encoder = OneHotEncoder(list(range(1000)), error_if_unknown=False)
    items = [100, 200, 300, 400, -1] * 100000

    # best-of-50 single runs; observed ~0.040s on the reference machine
    best_time = min(timeit.repeat(lambda: encoder.encode(items), repeat=50, number=1))

    self.assertLess(best_time, 1)
def assert_simulation_for_data(self, simulation, features, answers) -> None:
    """Assert that a simulation's interactions agree with the given features/answers."""
    self.assertEqual(len(simulation.interactions), len(features))

    encoded_answers = OneHotEncoder(simulation.label_set).encode(answers)

    # every label must appear in the first interaction's actions, order-free
    self.assertCountEqual(simulation.interactions[0].actions, set(encoded_answers))

    # all later interactions must share the first interaction's exact action order
    expected_actions = simulation.interactions[0].actions

    for feature, answer, interaction in zip(features, encoded_answers, simulation.interactions):
        expected_rewards = [int(action == answer) for action in interaction.actions]

        self.assertEqual(interaction.context, feature)
        self.assertSequenceEqual(interaction.actions, expected_actions)
        self.assertSequenceEqual(simulation.reward(_choices(interaction)), expected_rewards)
def test_performance_fit_values(self):
    """Constructing an encoder over 1000 fit values should be fast."""
    values = list(range(1000))

    # best-of-100 single runs; observed ~0.017s on the reference machine
    best_time = min(timeit.repeat(lambda: OneHotEncoder(values), repeat=100, number=1))

    self.assertLess(best_time, .03)
def __init__(self, features: Sequence[_C_out], labels: Sequence[Action]) -> None:
    """Instantiate a ClassificationSimulation.

    Args:
        features: The collection of features used for the original classifier problem.
        labels: The collection of labels assigned to each observation of features.
    """
    assert len(features) == len(labels), "Mismatched lengths of features and labels"

    # NOTE: set() makes label_set order non-deterministic across runs; kept
    # as-is since downstream code only relies on a consistent order per run.
    label_set = list(set(labels))

    # Build one encoder and reuse it; the original constructed two identical
    # OneHotEncoder(label_set) instances (one for actions, one for rewards).
    encoder = OneHotEncoder(label_set)

    action_set = encoder.encode(label_set)
    rewards    = encoder.encode(labels)

    interactions = [Interaction(context, action_set, i) for i, context in enumerate(features)] #type: ignore

    self.label_set = label_set

    super().__init__(interactions, rewards) #type:ignore
def test_error_if_unkonwn_true(self):
    """Encoding an unseen value must raise when error_if_unknown=True."""
    fit_values = ["1", "1", "1", "0", "0"]
    encoder = OneHotEncoder(error_if_unknown=True).fit(fit_values)

    with self.assertRaises(Exception):
        # encode() itself should raise, so the equality check never runs
        self.assertEqual(encoder.encode(["2"]), [(0)])
def test_singular_if_binary(self):
    """A two-value feature should encode to single-element tuples."""
    encoder = OneHotEncoder(singular_if_binary=True).fit(["1", "1", "1", "0", "0"])

    for value, expected in [("0", [(1, )]), ("1", [(0, )])]:
        self.assertEqual(encoder.encode([value]), expected)
def _make_unfit_encoder(
        self) -> Tuple[Encoder, Sequence[str], Sequence[str], Sequence[Any]]:
    """Return (unfit encoder, fit values, values to encode, expected encoding)."""
    unfit_encoder = OneHotEncoder()
    fit_values = ["d", "a", "b", "b", "b", "d"]
    encode_values = ["a"]
    expected_encoding = [(1, 0, 0)]
    return unfit_encoder, fit_values, encode_values, expected_encoding
def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
    """Download an openml dataset and return its cleaned (feature rows, label rows).

    Fetches the dataset description and feature-type metadata from the openml
    JSON API, builds one encoder per column, then pipes the CSV through a
    reader and labeled cleaner.

    Returns:
        A tuple of (feature rows, label rows) as lists.

    Raises:
        Exception: If the openml dataset has status 'deactivated'.
    """
    #placing some of these at the top would cause circular references
    from coba.data.pipes import Pipe
    from coba.data.encoders import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
    from coba.data.filters import CsvReader, LabeledCsvCleaner

    data_id = self._data_id
    md5_checksum = self._md5_checksum
    openml_api_key = ExecutionContext.Config.openml_api_key

    data_description_url = f'https://www.openml.org/api/v1/json/data/{data_id}'
    type_description_url = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

    # the api key is optional for public datasets; append it only when configured
    if openml_api_key is not None:
        data_description_url += f'?api_key={openml_api_key}'
        type_description_url += f'?api_key={openml_api_key}'

    descr = json.loads(''.join(HttpSource(data_description_url, '.json', None, 'descr').read()))["data_set_description"]

    if descr['status'] == 'deactivated':
        raise Exception(f"Openml {data_id} has been deactivated. This is often due to flags on the data.")

    types = json.loads(''.join(HttpSource(type_description_url, '.json', None, 'types').read()))["data_features"]["feature"]

    headers: List[str] = []      # column names, in openml feature order
    encoders: List[Encoder] = [] # one encoder per column, same order as headers
    ignored: List[bool] = []     # columns flagged is_ignore/is_row_identifier
    target: str = ""             # name of the column flagged is_target

    # pick an encoder per column from its openml-declared type; the target
    # column gets a full one-hot while other nominals may collapse to binary
    for tipe in types:
        headers.append(tipe['name'])
        ignored.append(tipe['is_ignore'] == 'true' or tipe['is_row_identifier'] == 'true')
        if tipe['is_target'] == 'true':
            target = tipe['name']
        if tipe['data_type'] == 'numeric':
            encoders.append(NumericEncoder())
        elif tipe['data_type'] == 'nominal' and tipe['is_target'] == 'false':
            encoders.append(OneHotEncoder(singular_if_binary=True))
        elif tipe['data_type'] == 'nominal' and tipe['is_target'] == 'true':
            encoders.append(OneHotEncoder())
        else:
            encoders.append(StringEncoder())

    # a numeric target means this is a regression dataset; swap in a
    # classification target (looked up separately) and one-hot encode it
    if isinstance(encoders[headers.index(target)], NumericEncoder):
        target = self._get_classification_target(data_id, openml_api_key)
        ignored[headers.index(target)] = False
        encoders[headers.index(target)] = OneHotEncoder()

    csv_url = f"http://www.openml.org/data/v1/get_csv/{descr['file_id']}"

    source = HttpSource(csv_url, ".csv", md5_checksum, f"openml {data_id}")
    reader = CsvReader()
    cleaner = LabeledCsvCleaner(target, headers, encoders, ignored, True)

    feature_rows, label_rows = Pipe.join(source, [reader, cleaner]).read()

    return list(feature_rows), list(label_rows)