def test_instantiated_fit_values(self):
    # The encoder is given its values through the constructor and is then
    # used to encode without any separate call to fit.
    encoder = OneHotEncoder(fit_values=["0", "1", "2"])

    observed = encoder.encode(["0", "1", "2", "1"])
    wanted = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]

    self.assertEqual(observed, wanted)
def test_fit(self):
    # fit is expected to hand back a fitted encoder while the encoder it
    # was called on still reports that it is not fit.
    unfit = OneHotEncoder()
    fitted = unfit.fit(["0", "1", "2"])

    self.assertEqual(False, unfit.is_fit)
    self.assertEqual(True, fitted.is_fit)

    wanted = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]
    self.assertEqual(wanted, fitted.encodes(["0", "1", "2", "1"]))
def test_ignore_missing_value(self):
    # A cell equal to missing_val ('?') is expected to come out unchanged,
    # and the second column's one-hot is only two wide (values 5 and 6).
    column_encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
    encode = Encode(column_encoders, missing_val="?")

    wanted = [
        [(1, 0, 0), '?'],
        [(0, 1, 0), (1, 0)],
        [(0, 1, 0), (0, 1)],
    ]
    rows = [[1, '?'], [2, 5], [2, 6]]

    self.assertEqual(wanted, list(encode.filter(rows)))
def test_error_if_unkonwn_false(self):
    # With error_if_unknown=False, encoding a value that was not part of
    # the fit should not raise and should yield an all-zero tuple.
    encoder = OneHotEncoder(error_if_unknown=False).fit(["0", "1", "2"])

    try:
        actual = encoder.encode(["5"])
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit still propagate instead of being reported as a failure.
        self.fail("An exception was raised when it shouldn't have been")

    self.assertEqual(actual, [(0, 0, 0)])
def test_dense_encode_onehot_with_header_and_extra_encoder(self):
    # An encoder is registered for column 2 although the rows only have
    # columns 0 and 1; the expected output contains just those two columns.
    column_encoders = {
        0: OneHotEncoder([1, 2, 3]),
        1: OneHotEncoder(),
        2: StringEncoder(),
    }
    encode = Encode(column_encoders)

    wanted = [
        [(1, 0, 0), (1, 0, 0)],
        [(0, 1, 0), (0, 1, 0)],
        [(0, 1, 0), (0, 0, 1)],
    ]
    rows = [[1, 4], [2, 5], [2, 6]]

    self.assertEqual(wanted, list(encode.filter(rows)))
def test_performance_encode(self):
    # Times encode over 500,000 values (one of which, -1, was not fit)
    # and checks the fastest of 50 single runs against a loose bound.
    encoder = OneHotEncoder(list(range(1000)), error_if_unknown=False)
    to_encode = [100, 200, 300, 400, -1] * 100000

    timings = timeit.repeat(lambda: encoder.encode(to_encode), repeat=50, number=1)
    fastest = min(timings)

    #was approximately 0.040
    self.assertLess(fastest, 1)
def test_onehot_encode_performance(self):
    # Times encodes over 500,000 values (one of which, -1, was not fit)
    # and checks the fastest of 25 single runs against the bound.
    encoder = OneHotEncoder(list(range(1000)), err_if_unknown=False)
    to_encode = [100, 200, 300, 400, -1] * 100000

    timings = timeit.repeat(lambda: encoder.encodes(to_encode), repeat=25, number=1)
    fastest = min(timings)

    #best observed 0.027
    self.assertLess(fastest, .27)
def test_sparse_encode_onehot(self):
    # Rows are dicts keyed by column index. Column 1 is absent from the
    # first row yet its expected one-hots are three wide -- presumably the
    # absent entry counts as a value of its own (confirm against Encode).
    encode = Encode({0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()})

    sparse_rows = [{0: 1}, {0: 2, 1: 5}, {0: 2, 1: 6}]
    wanted = [
        {0: (1, 0, 0)},
        {0: (0, 1, 0), 1: (0, 1, 0)},
        {0: (0, 1, 0), 1: (0, 0, 1)},
    ]

    self.assertEqual(wanted, list(encode.filter(sparse_rows)))
def actions_gen():
    # Build the action set: one tuple of gaussian draws per action when
    # actions have features, otherwise one-hot encodings of the action indexes.
    if n_action_features:
        return [
            tuple(rng.gausses(n_action_features, 0, 1))
            for _ in range(n_actions)
        ]

    return OneHotEncoder().fit_encodes(range(n_actions))
def test_performance_fit_values(self):
    # Times construction of an encoder from 1000 values and checks the
    # fastest of 100 single runs against the bound.
    fit_values = list(range(1000))

    timings = timeit.repeat(lambda: OneHotEncoder(fit_values), repeat=100, number=1)

    #was approximately 0.017
    self.assertLess(min(timings), .03)
def _determine_encoder(self, index:int, name: str, tipe: str) -> Encoder:
    """Pick the encoder for one attribute from its declared type.

    Args:
        index: The position of the attribute.
        name: The name of the attribute.
        tipe: The attribute's type text.

    Returns:
        A StringEncoder when either index or name is in self._skip_encoding
        or the type is neither numeric nor a brace-delimited value list, a
        NumericEncoder for 'numeric', 'integer' or 'real', and otherwise a
        OneHotEncoder fit on the comma separated values between the braces.
    """
    numeric = tipe in ('numeric', 'integer', 'real')
    nominal = '{' in tipe
    skipped = index in self._skip_encoding or name in self._skip_encoding

    if skipped or not (numeric or nominal):
        return StringEncoder()

    if numeric:
        return NumericEncoder()

    categories = [part.strip() for part in tipe.strip("}{").split(',')]
    return OneHotEncoder(fit_values=categories, singular_if_binary=True)
def read(self) -> Iterable[SimulatedInteraction]:
    """Turn the (features, label) items of self._source into interactions.

    When self._label_type is "R" the labels are rescaled, a set of at most
    ten label values is one-hot encoded to serve as actions, and the
    reward of an action is 1 minus the absolute difference between the
    action's value and the label. Otherwise the distinct labels themselves
    are the actions and the reward is 1 when the action equals the label
    (or is contained in a Sequence label) and 0 when it does not.

    Yields:
        One SimulatedInteraction per item, all sharing the same action list.
    """
    items = list(self._source.read())

    # This function is a generator, so the returned list is never seen by
    # the caller; the return simply ends iteration when there is no data.
    if not items: return []

    features, labels = zip(*items)

    if self._label_type == "R":
        max_n_actions = 10

        #Scale the labels so their range is 1.
        # NOTE(review): when every label is identical max_l - min_l is 0 and
        # the expression below divides by zero -- confirm callers avoid this.
        min_l, max_l = min(labels), max(labels)
        labels = [ float(l) / (max_l - min_l) - (min_l / (max_l - min_l)) for l in labels ]

        if len(labels) <= max_n_actions:
            actions = labels
        else:
            # Too many labels to use directly: ask percentile for the
            # i/(max_n_actions+1) points, i = 1..max_n_actions, instead.
            actions = percentile(labels, [ i / (max_n_actions + 1) for i in range(1, max_n_actions + 1) ])

        # Maps each one-hot encoded action back to the value it stands for.
        values = dict(zip(OneHotEncoder().fit_encodes(actions), actions))
        actions = list(values.keys())

        reward = lambda action, label: 1 - abs(values[action] - float(label ))
    else:
        #how can we tell the difference between featurized labels and multilabels????
        #for now we will assume multilabels will be passed in as arrays not tuples...
        if not isinstance(labels[0], collections.abc.Hashable):
            # Only the first label is inspected; unhashable labels are
            # flattened so that every contained item becomes an action.
            actions = list(chain.from_iterable(labels))
        else:
            actions = list(labels)

        is_label = lambda action, label: action == label
        in_multilabel = lambda action, label: isinstance( label, collections.abc.Sequence) and action in label
        reward = lambda action, label: int( is_label(action, label) or in_multilabel(action, label))

    contexts = features
    # Deduplicate and sort before shuffling with a fixed seed.
    actions = CobaRandom(1).shuffle(sorted(set(actions)))
    rewards = [[reward(action, label) for action in actions] for label in labels]

    for c, a, r in zip(contexts, repeat(actions), rewards):
        yield SimulatedInteraction(c, a, rewards=r)
def encoder(
        x: str,
        cats=categories,
        get=OneHotEncoder(categories)._onehots.__getitem__):
    """Convert one raw categorical cell into its encoded form.

    Args:
        x: The raw cell text; surrounding whitespace is stripped.
        cats: The known category values, bound once when the function is
            defined.
        get: Lookup from a category to its encoding, bound once when the
            function is defined (presumably the one-hot tuple -- confirm
            against OneHotEncoder._onehots).

    Returns:
        None for the missing marker "?", the category text itself when
        self._cat_as_str is truthy, and get(category) otherwise.

    Raises:
        CobaException: If the value, with or without one pair of matching
            surrounding quote characters, is not a known category.
    """
    x = x.strip()

    if x == "?":
        return None

    # The length is tested before indexing so that an empty cell reaches the
    # CobaException below instead of raising IndexError on x[0].
    if x not in cats and len(x) > 1 and x[0] in self._quotes and x[0] == x[-1]:
        x = x[1:-1]

    if x not in cats:
        raise CobaException(
            "We were unable to find one of the categorical values in the arff data."
        )

    return x if self._cat_as_str else get(x)
def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
    """Load an OpenML data set and split it into feature rows and labels.

    The data set description and feature types are fetched through
    self._query, an encoder is chosen for each feature from its declared
    data type, and the observations are then read as csv or arff
    (falling back to the other format if the first attempt fails).
    Responses are stored in CobaConfig.Cacher only once everything has
    loaded; on any failure other than KeyboardInterrupt the keys used so
    far are removed from the cache before the error is re-raised.

    Returns:
        A tuple of (feature rows, label column).

    Raises:
        Exception: If the data set is marked as deactivated, or if any
            step of querying, parsing or encoding fails.
    """

    #placing some of these at the top would cause circular references
    from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
    from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

    d_key = None
    t_key = None
    o_key = None

    try:
        data_id = self._data_id
        md5_checksum = self._md5_checksum

        d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
        t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

        d_bytes = self._query(d_key, "descr")
        d_object = json.loads(d_bytes.decode('utf-8'))["data_set_description"]

        if d_object['status'] == 'deactivated':
            raise Exception(
                f"Openml {data_id} has been deactivated. This is often due to flags on the data."
            )

        t_bytes = self._query(t_key, "types")
        t_object = json.loads(t_bytes.decode('utf-8'))["data_features"]["feature"]

        headers: List[str] = []
        encoders: List[Encoder] = []
        ignored: List[bool] = []
        target: str = ""

        for tipe in t_object:

            headers.append(tipe['name'].lower())
            ignored.append(tipe['is_ignore'] == 'true' or tipe['is_row_identifier'] == 'true')

            if tipe['is_target'] == 'true':
                target = tipe['name'].lower()

            if tipe['data_type'] == 'numeric':
                encoders.append(NumericEncoder())
            elif tipe['data_type'] == 'nominal':
                encoders.append(OneHotEncoder(singular_if_binary=True))
            else:
                encoders.append(StringEncoder())

        # With no declared target, or a numeric one, ask for a classification target.
        if target == "" or isinstance(encoders[headers.index(target)], NumericEncoder):
            target = self._get_classification_target(data_id)

        # The target is always kept and is left as a string.
        ignored[headers.index(target)] = False
        encoders[headers.index(target)] = StringEncoder()

        csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
        arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

        try:
            # csv is used unless only the arff file is already cached.
            if csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher:
                o_key = csv_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(CsvReader().filter(o_bytes.decode('utf-8').splitlines()))
            else:
                o_key = arff_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(
                    ArffReader(skip_encoding=[target]).filter(o_bytes.decode('utf-8').splitlines()))
        except Exception:
            # Exception rather than a bare except, so a KeyboardInterrupt
            # reaches the outer handler instead of triggering a second download.
            # Whichever format was just attempted failed, so try the other one.
            if o_key == csv_url:
                o_key = arff_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(
                    ArffReader(skip_encoding=[target]).filter(o_bytes.decode('utf-8').splitlines()))
            else:
                o_key = csv_url
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(CsvReader().filter(o_bytes.decode('utf-8').splitlines()))

        # A first row that is a 2-tuple is treated as the sparse format.
        is_sparse_data = isinstance(file_rows[0], tuple) and len(file_rows[0]) == 2

        if is_sparse_data:
            file_headers = [header.lower() for header in file_rows.pop(0)[1]]
        else:
            file_headers = [header.lower() for header in file_rows.pop(0)]

        file_cols = list(Transpose().filter(file_rows))

        for ignored_header in compress(headers, ignored):
            if ignored_header in file_headers:
                file_cols.pop(file_headers.index(ignored_header))
                file_headers.remove(ignored_header)

        file_encoders = [encoders[headers.index(file_header)] for file_header in file_headers]

        file_cols = list(Encode(file_encoders).filter(file_cols))
        label_col = file_cols.pop(file_headers.index(target))
        feature_rows = list(Transpose().filter(Flatten().filter(file_cols)))

        #we only cache after all the data has been successfully loaded
        for key, payload in [(d_key, d_bytes), (t_key, t_bytes), (o_key, o_bytes)]:
            if key not in CobaConfig.Cacher:
                CobaConfig.Cacher.put(key, payload)

        if is_sparse_data:
            # label_col is (indexes, values); rows without an entry default to '0'.
            dense_label_col = ['0'] * len(feature_rows)

            for index, value in zip(label_col[0], label_col[1]):
                dense_label_col[index] = value
        else:
            dense_label_col = list(label_col)

        return feature_rows, dense_label_col

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except Exception:
        #if something went wrong we want to clear the
        #cache just in case it was corrupted somehow
        for k in [d_key, t_key, o_key]:
            if k is not None:
                CobaConfig.Cacher.rmv(k)

        raise
def actions(index: int, context: Context) -> Sequence[Action]:
    # Neither index nor context is used: the actions are either fresh
    # gaussian draws (when actions have features) or one-hot encodings
    # of the action indexes.
    if not n_action_features:
        return OneHotEncoder().fit_encodes(range(n_actions))

    return [rng.gausses(n_action_features) for _ in range(n_actions)]
def __init__(self, n_interactions: int, n_actions: int = 10, n_context_features: int = 10, n_action_features: int = 10, n_exemplars: int = 10, kernel: Literal['linear', 'polynomial', 'exponential'] = 'exponential', degree: int = 2, gamma: float = 1, seed: int = 1) -> None:
    """Instantiate a KernelSyntheticSimulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        n_exemplars: The number of exemplar action, context pairs.
        kernel: The family of the kernel basis functions.
        degree: This argument is only relevant when using polynomial kernels.
        gamma: This argument is only relevant when using exponential kernels.
        seed: The random number seed used to generate all features, weights
            and noise in the simulation.
    """

    self._args = (n_interactions, n_actions, n_context_features, n_action_features, n_exemplars, kernel, degree, gamma, seed)

    self._n_actions = n_actions
    self._n_context_features = n_context_features
    self._n_action_features = n_action_features
    # Stored as passed in; the local n_exemplars may be reset to 0 below.
    self._n_exemplars = n_exemplars
    self._seed = seed
    self._kernel = kernel
    self._degree = degree
    self._gamma = gamma

    rng = CobaRandom(seed)

    #if there are no features then we are unable to define exemplars
    if n_action_features + n_context_features == 0: n_exemplars = 0

    feat_gen = lambda n: tuple(rng.gausses(n, 0, .75))
    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    # One exemplar list in total when actions have features, otherwise one
    # exemplar list per action.
    self._exemplars = [[ feat_gen(n_action_features + n_context_features) for _ in range(n_exemplars) ] for _ in range(1 if n_action_features else n_actions)]
    weight_count = n_actions if n_exemplars == 0 else n_exemplars
    # Each weight is 1 - 2*w for a draw w from rng.randoms.
    self._weights = [1 - 2 * w for w in rng.randoms(weight_count)]

    self._bias = 0

    if kernel == 'polynomial':
        #this ensures the dot-product between F and an exemplar is in [0,upper_bound]
        #This ensures that higher-order polynomials will remain reasonably well behaved
        # NOTE(review): divides by sum(e); confirm an exemplar cannot sum to 0.
        upper_bound = (1.5)**(1 / degree) - 1
        self._exemplars = [[[upper_bound * ee / sum(e) for ee in e] for e in E] for E in self._exemplars]

    def context(index: int) -> Context:
        return feat_gen(n_context_features) if n_context_features else None

    def actions(index: int, context: Context) -> Sequence[Action]:
        return [feat_gen(n_action_features) for _ in range(n_actions) ] if n_action_features else one_hot_acts

    def reward(index: int, context: Context, action: Action) -> float:

        # Without exemplars the action is one-hot and selects its own weight.
        if n_exemplars == 0: return self._bias + self._weights[action.index(1)]

        #handles None context
        context = context or []

        if n_action_features:
            f = list(context) + list(action)
            W = self._weights
            E = self._exemplars[0]
        else:
            # One-hot action: use the exemplar list belonging to that action.
            f = list(context)
            W = self._weights
            E = self._exemplars[action.index(1)]

        # NOTE(review): K stays unbound if kernel is none of the three names.
        if kernel == "linear": K = lambda x1, x2: self._linear_kernel(x1, x2)
        if kernel == "polynomial": K = lambda x1, x2: self._polynomial_kernel( x1, x2, self._degree)
        if kernel == "exponential": K = lambda x1, x2: self._exponential_kernel( x1, x2, self._gamma)

        return self._bias + sum([w * K(e, f) for w, e in zip(W, E)])

    # Sample the rewards of 100 generated interactions (this draws from rng)
    # and use their mean and range to set the bias and rescale the weights.
    rewards = [ reward(i, c, a) for i in range(100) for c in [context(i)] for a in actions(i, c) ]

    m = mean(rewards)
    # Falls back to 1 when all sampled rewards are equal.
    s = (max(rewards) - min(rewards)) or 1

    self._bias = 0.5 - m / s
    self._weights = [w / s for w in self._weights]

    super().__init__(n_interactions, context, actions, reward)
def test_singular_if_binary(self):
    # With singular_if_binary=True and only two distinct fit values, each
    # value is expected to encode to a single-element tuple.
    encoder = OneHotEncoder(singular_if_binary=True).fit(["1", "1", "1", "0", "0"])

    for value, one_hot in [("0", (0, )), ("1", (1, ))]:
        self.assertEqual(encoder.encode([value]), [one_hot])
def __init__(self, n_interactions: int, n_actions: int = 10, n_context_features: int = 10, n_action_features: int = 10, reward_features: Sequence[str] = ["a", "xa"], seed: int = 1) -> None:
    """Instantiate a LinearSyntheticSimulation.

    Args:
        n_interactions: The number of interactions the simulation should have.
        n_actions: The number of actions each interaction should have.
        n_context_features: The number of features each context should have.
        n_action_features: The number of features each action should have.
        reward_features: The features in the simulation's linear reward function.
        seed: The random number seed used to generate all features, weights
            and noise in the simulation.
    """
    # NOTE(review): the default reward_features list is a single shared
    # object; it is not mutated here but it is stored on self below.

    self._args = (n_interactions, n_actions, n_context_features, n_action_features, reward_features, seed)

    self._n_actions = n_actions
    self._n_context_features = n_context_features
    self._n_action_features = n_action_features
    # Stored as passed in; the local reward_features may be reduced below.
    self._reward_features = reward_features
    self._seed = seed

    # Without context features every 'x' is removed from the reward
    # features; entries left empty are dropped and duplicates collapsed.
    if not self._n_context_features:
        reward_features = list( set(filter(None, [f.replace('x', '') for f in reward_features])))

    # Likewise every 'a' is removed when there are no action features.
    if not self._n_action_features:
        reward_features = list( set(filter(None, [f.replace('a', '') for f in reward_features])))

    rng = CobaRandom(seed)
    feat_encoder = InteractionsEncoder(reward_features)

    #to try and make sure high-order polynomials are well behaved
    #we center our context and action features on 1 and give them
    #a very small amount of variance. Then, in post processing, we
    #shift and re-scale our reward to center and fill in [0,1].
    max_degree = max([len(f) for f in reward_features]) if reward_features else 1
    # Each gaussian draw is additionally multiplied by a random sign.
    feat_gen = lambda n: tuple([ g * rng.choice([1, -1]) for g in rng.gausses(n, mu=1, sigma=1 / (2 * max_degree)) ])
    one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

    # Encode all-ones inputs only to learn how many features the encoder emits.
    feature_count = len( feat_encoder.encode(x=[1] * n_context_features, a=[1] * n_action_features))
    # One weight vector in total when actions have features, else one per action.
    weight_parts = 1 if n_action_features else n_actions
    weight_count = 1 if feature_count == 0 else feature_count

    self._weights = [[1 - 2 * w for w in rng.randoms(weight_count)] for _ in range(weight_parts)]

    self._bias = 0
    self._clip = False

    def context(index: int) -> Context:
        return feat_gen(n_context_features) if n_context_features else None

    def actions(index: int, context: Context) -> Sequence[Action]:
        return [feat_gen(n_action_features) for _ in range(n_actions) ] if n_action_features else one_hot_acts

    def reward(index: int, context: Context, action: Action) -> float:

        # An empty encoding falls back to the single feature [1].
        F = feat_encoder.encode(x=context, a=action) or [1]
        # A one-hot action selects its own weight vector.
        W = self._weights[0 if n_action_features else action.index(1)]

        return self._bias + sum([w * f for w, f in zip(W, F)])

    # Sample the rewards of 100 generated interactions (this draws from rng)
    # and use their mean and range to set the bias and rescale the weights.
    rewards = [ reward(i, c, a) for i in range(100) for c in [context(i)] for a in actions(i, c) ]

    m = mean(rewards)
    # Falls back to 1 when all sampled rewards are equal.
    s = (max(rewards) - min(rewards)) or 1

    self._bias = 0.5 - m / s
    self._weights = [[w / s for w in W] for W in self._weights]
    self._clip = True

    super().__init__(n_interactions, context, actions, reward)
def test_encode_err_if_unkonwn_true(self):
    # With err_if_unknown=True, encoding a value that was not part of the
    # fit is expected to raise a CobaException.
    with self.assertRaises(CobaException):
        encoder = OneHotEncoder(err_if_unknown=True)
        fitted = encoder.fit(["1", "1", "1", "0", "0"])
        fitted.encode("2")
def test_dense_encode_mixed(self):
    # Column 0 goes through a NumericEncoder and column 1 through a
    # OneHotEncoder that has not been given any values up front.
    encode = Encode({0: NumericEncoder(), 1: OneHotEncoder()})

    rows = [[1, 4], [2, 5], [3, 5]]
    wanted = [[1, (1, 0)], [2, (0, 1)], [3, (0, 1)]]

    observed = list(encode.filter(rows))

    self.assertEqual(wanted, observed)
def test_init_values(self):
    # An encoder constructed with values should already report is_fit
    # and encode those values.
    encoder = OneHotEncoder(values=["0", "1", "2"])

    self.assertEqual(True, encoder.is_fit)

    wanted = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]
    self.assertEqual(wanted, encoder.encodes(["0", "1", "2", "1"]))
def test_encodes_sans_fit_exception(self):
    # Calling encodes on an encoder that was never fit should raise.
    with self.assertRaises(CobaException):
        unfit = OneHotEncoder()
        unfit.encodes(["0", "1", "2"])
def test_encode_sans_fit_exception(self):
    # Calling encode on an encoder that was never fit should raise.
    with self.assertRaises(CobaException):
        unfit = OneHotEncoder()
        unfit.encode("0")
def test_error_if_unkonwn_true(self):
    # With error_if_unknown=True, encoding a value that was not part of
    # the fit is expected to raise.
    encoder = OneHotEncoder(error_if_unknown=True).fit(["1", "1", "1", "0", "0"])

    # Only the encode call sits inside assertRaises. Wrapping it in an
    # assertEqual would let the AssertionError from a failed comparison
    # satisfy assertRaises(Exception), so the test could never fail.
    with self.assertRaises(Exception):
        encoder.encode(["2"])
def test_err_if_unkonwn_false(self):
    # No flag is passed to the constructor, so this relies on the encoder's
    # default handling of unknown values: "5" was not part of the fit and
    # is expected to encode to all zeros.
    encoder = OneHotEncoder().fit(["0", "1", "2"])

    self.assertEqual((0, 0, 0), encoder.encode("5"))
def _make_unfit_encoder(self) -> Tuple[Encoder, Sequence[str], Sequence[str], Sequence[Any]]:
    # Returns a fresh OneHotEncoder together with three fixed sequences --
    # presumably the values to fit on, the values to encode and the
    # expected encoding (confirm against the caller of this helper).
    fit_values = ["d", "a", "b", "b", "b", "d"]
    to_encode = ["a"]
    expected = [(0, 1, 0)]

    return OneHotEncoder(), fit_values, to_encode, expected
def test_dense_encode_onehot(self):
    # Column 0 uses an encoder given its values up front; column 1 uses
    # one that was given none.
    encode = Encode({0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()})

    rows = [[1, 4], [2, 5], [2, 6]]
    wanted = [
        [(1, 0, 0), (1, 0, 0)],
        [(0, 1, 0), (0, 1, 0)],
        [(0, 1, 0), (0, 0, 1)],
    ]

    self.assertEqual(wanted, list(encode.filter(rows)))
def test_fit_encode(self):
    # fit_encodes is called on a new encoder with the very values it is
    # asked to encode.
    raw_values = ["0", "1", "2", "1"]
    wanted = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]

    self.assertEqual(wanted, OneHotEncoder().fit_encodes(raw_values))
def test_sparse_encode_mixed(self):
    # Rows are dicts keyed by column index; column 0 holds numeric text
    # and column 1 holds values to one-hot encode.
    encode = Encode({0: NumericEncoder(), 1: OneHotEncoder()})

    sparse_rows = [{0: "1", 1: 4}, {0: "2", 1: 5}, {0: "3", 1: 5}]
    wanted = [
        {0: 1, 1: (1, 0)},
        {0: 2, 1: (0, 1)},
        {0: 3, 1: (0, 1)},
    ]

    observed = list(encode.filter(sparse_rows))

    self.assertEqual(wanted, observed)