Ejemplo n.º 1
0
    def test_instantiated_fit_values(self):
        """An encoder built with `fit_values` encodes without a separate fit call."""
        one_hot = OneHotEncoder(fit_values=["0", "1", "2"])
        encoded = one_hot.encode(["0", "1", "2", "1"])

        self.assertEqual(encoded, [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)])
Ejemplo n.º 2
0
    def test_fit(self):
        """fit() returns a fitted encoder and leaves the encoder it was called on unfit."""
        unfit = OneHotEncoder()
        fitted = unfit.fit(["0", "1", "2"])
        one_hots = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]

        self.assertEqual(False, unfit.is_fit)
        self.assertEqual(True, fitted.is_fit)
        self.assertEqual(one_hots, fitted.encodes(["0", "1", "2", "1"]))
Ejemplo n.º 3
0
 def test_ignore_missing_value(self):
     """Cells equal to `missing_val` come out of the Encode filter unchanged."""
     encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
     pipe = Encode(encoders, missing_val="?")

     rows = [[1, '?'], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), '?'],
         [(0, 1, 0), (1, 0)],
         [(0, 1, 0), (0, 1)],
     ]

     self.assertEqual(expected, list(pipe.filter(rows)))
Ejemplo n.º 4
0
    def test_error_if_unkonwn_false(self):
        """With error_if_unknown=False an unseen value encodes to all zeros without raising."""
        encoder = OneHotEncoder(error_if_unknown=False).fit(["0", "1", "2"])

        try:
            actual = encoder.encode(["5"])
        except Exception:
            # Only ordinary errors count as a test failure. KeyboardInterrupt
            # and SystemExit are not Exception subclasses, so they keep
            # propagating to the test runner instead of being reported here.
            self.fail("An exception was raised when it shouldn't have been")

        self.assertEqual(actual, [(0, 0, 0)])
Ejemplo n.º 5
0
 def test_dense_encode_onehot_with_header_and_extra_encoder(self):
     """An encoder keyed to index 2, which the two-column rows lack, does not change the output."""
     pipe = Encode({
         0: OneHotEncoder([1, 2, 3]),
         1: OneHotEncoder(),
         2: StringEncoder(),
     })

     rows = [[1, 4], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), (1, 0, 0)],
         [(0, 1, 0), (0, 1, 0)],
         [(0, 1, 0), (0, 0, 1)],
     ]

     self.assertEqual(expected, list(pipe.filter(rows)))
Ejemplo n.º 6
0
    def test_performance_encode(self):
        """Guard against large slowdowns when encoding 500,000 values in one call."""
        one_hot = OneHotEncoder(list(range(1000)), error_if_unknown=False)
        values = [100, 200, 300, 400, -1] * 100000

        # the best of the repeated runs is the least noisy measurement
        runs = timeit.repeat(lambda: one_hot.encode(values),
                             repeat=50,
                             number=1)
        fastest = min(runs)

        #was approximately 0.040
        self.assertLess(fastest, 1)
Ejemplo n.º 7
0
    def test_onehot_encode_performance(self):
        """Guard against large slowdowns when `encodes` handles 500,000 values."""
        one_hot = OneHotEncoder(list(range(1000)), err_if_unknown=False)
        values = [100, 200, 300, 400, -1] * 100000

        # the best of the repeated runs is the least noisy measurement
        runs = timeit.repeat(lambda: one_hot.encodes(values),
                             repeat=25,
                             number=1)
        fastest = min(runs)

        #best observed 0.027
        self.assertLess(fastest, .27)
Ejemplo n.º 8
0
    def test_sparse_encode_onehot(self):
        """Dict rows are encoded key by key; a key absent from a row stays absent."""
        pipe = Encode({0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()})

        sparse_rows = [{0: 1}, {0: 2, 1: 5}, {0: 2, 1: 6}]
        encoded_rows = [
            {0: (1, 0, 0)},
            {0: (0, 1, 0), 1: (0, 1, 0)},
            {0: (0, 1, 0), 1: (0, 0, 1)},
        ]

        self.assertEqual(encoded_rows, list(pipe.filter(sparse_rows)))
Ejemplo n.º 9
0
 def actions_gen():
     """Build the action set: feature tuples if actions have features, else one-hot actions.

     Reads `n_action_features`, `n_actions` and `rng` from the enclosing scope.
     """
     if n_action_features:
         # one tuple of `rng.gausses` draws per action
         return [
             tuple(rng.gausses(n_action_features, 0, 1))
             for _ in range(n_actions)
         ]

     return OneHotEncoder().fit_encodes(range(n_actions))
Ejemplo n.º 10
0
    def test_performance_fit_values(self):
        """Guard against slowdowns when constructing an encoder from 1,000 fit values."""
        values = list(range(1000))

        # the best of the repeated runs is the least noisy measurement
        runs = timeit.repeat(lambda: OneHotEncoder(values),
                             repeat=100,
                             number=1)
        fastest = min(runs)

        #was approximately 0.017
        self.assertLess(fastest, .03)
Ejemplo n.º 11
0
    def _determine_encoder(self, index: int, name: str, tipe: str) -> Encoder:
        """Choose the encoder for a column from its declared type string.

        Args:
            index: Position of the column; checked against `self._skip_encoding`.
            name: Name of the column; also checked against `self._skip_encoding`.
            tipe: The column's type declaration.

        Returns:
            A StringEncoder for skipped columns and for unrecognised types, a
            NumericEncoder for 'numeric', 'integer' or 'real', and a
            OneHotEncoder fit on the listed values when the type contains '{'.
        """

        # skipped columns win over whatever the declared type says
        if index in self._skip_encoding or name in self._skip_encoding:
            return StringEncoder()

        if tipe in ['numeric', 'integer', 'real']:
            return NumericEncoder()

        if '{' in tipe:
            # drop the surrounding braces, then split into the individual values
            categories = [v.strip() for v in tipe.strip("}{").split(',')]
            return OneHotEncoder(fit_values=categories, singular_if_binary=True)

        return StringEncoder()
Ejemplo n.º 12
0
    def read(self) -> Iterable[SimulatedInteraction]:
        """Yield one SimulatedInteraction per (features, label) item read from the source.

        When `self._label_type` is "R" the labels are rescaled so they span a
        range of 1, at most 10 actions are derived from them, and an action's
        reward is one minus its absolute distance from the interaction's label.
        For any other label type every distinct label is an action and the
        reward is 1 when the action equals the label (or is contained in a
        sequence label) and 0 otherwise.

        Every interaction shares the same action list, which is the sorted set
        of actions shuffled with a fixed-seed CobaRandom.
        """

        items = list(self._source.read())

        #this method is a generator, so returning here simply yields nothing
        if not items: return []

        features, labels = zip(*items)

        if self._label_type == "R":
            max_n_actions = 10

            #Scale the labels so their range is 1.
            min_l, max_l = min(labels), max(labels)

            #when every label is identical the range is 0; fall back to 1
            #so that the scaling below cannot divide by zero
            label_range = (max_l - min_l) or 1
            labels = [
                float(l) / label_range - (min_l / label_range) for l in labels
            ]

            if len(labels) <= max_n_actions:
                actions = labels
            else:
                actions = percentile(labels, [
                    i / (max_n_actions + 1)
                    for i in range(1, max_n_actions + 1)
                ])

            #map each one-hot action back to the label value it stands for
            values = dict(zip(OneHotEncoder().fit_encodes(actions), actions))
            actions = list(values.keys())

            reward = lambda action, label: 1 - abs(values[action] - float(label))
        else:
            #how can we tell the difference between featurized labels and multilabels????
            #for now we will assume multilabels will be passed in as arrays not tuples...
            if not isinstance(labels[0], collections.abc.Hashable):
                actions = list(chain.from_iterable(labels))
            else:
                actions = list(labels)

            is_label = lambda action, label: action == label
            in_multilabel = lambda action, label: isinstance(
                label, collections.abc.Sequence) and action in label
            reward = lambda action, label: int(
                is_label(action, label) or in_multilabel(action, label))

        contexts = features
        actions = CobaRandom(1).shuffle(sorted(set(actions)))
        rewards = [[reward(action, label) for action in actions]
                   for label in labels]

        for c, a, r in zip(contexts, repeat(actions), rewards):
            yield SimulatedInteraction(c, a, rewards=r)
Ejemplo n.º 13
0
                def encoder(
                        x: str,
                        cats=categories,
                        get=OneHotEncoder(categories)._onehots.__getitem__):
                    """Encode one raw categorical cell.

                    Args:
                        x: The raw cell text; surrounding whitespace is ignored.
                        cats: The known category values (bound once at definition time).
                        get: Lookup from a category to its one-hot value (bound once
                            at definition time).

                    Returns:
                        None for the missing-value marker "?", the category string
                        itself when `self._cat_as_str` is truthy, and otherwise
                        the one-hot value for the category.

                    Raises:
                        CobaException: If the value is not a known category, even
                            after removing one pair of matching surrounding quotes.
                    """

                    x = x.strip()

                    if x == "?":
                        return None

                    # The length check comes first so that an empty cell is never
                    # indexed; it then falls through to the CobaException below
                    # instead of failing with an IndexError.
                    if x not in cats and len(x) > 1 and x[0] in self._quotes and x[
                            0] == x[-1]:
                        x = x[1:-1]

                    if x not in cats:
                        raise CobaException(
                            "We were unable to find one of the categorical values in the arff data."
                        )

                    return x if self._cat_as_str else get(x)
Ejemplo n.º 14
0
    def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
        """Load an OpenML data set and split it into feature rows and a label column.

        The data set description, the feature types and the observations are
        fetched through `self._query`. The observations are read as CSV or ARFF
        depending on what is already in the cache, falling back to the other
        format if the first attempt fails. Responses are only written to the
        cache once everything has been parsed successfully.

        Returns:
            A pair `(feature_rows, label_col)`.

        Raises:
            Exception: If OpenML reports the data set as deactivated. Any other
                error raised while loading is re-raised after the cache entries
                for the keys used by this call have been removed.
        """

        #placing some of these at the top would cause circular references
        from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
        from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

        #the keys stay None until they are used so the error handler
        #below only clears cache entries this call may have touched
        d_key = None
        t_key = None
        o_key = None

        try:
            data_id = self._data_id
            md5_checksum = self._md5_checksum

            d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
            t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

            d_bytes = self._query(d_key, "descr")
            d_object = json.loads(
                d_bytes.decode('utf-8'))["data_set_description"]

            if d_object['status'] == 'deactivated':
                raise Exception(
                    f"Openml {data_id} has been deactivated. This is often due to flags on the data."
                )

            t_bytes = self._query(t_key, "types")
            t_object = json.loads(
                t_bytes.decode('utf-8'))["data_features"]["feature"]

            #these four are parallel: entry i describes the i-th declared feature
            headers: List[str] = []
            encoders: List[Encoder] = []
            ignored: List[bool] = []
            target: str = ""

            for tipe in t_object:

                headers.append(tipe['name'].lower())
                ignored.append(tipe['is_ignore'] == 'true'
                               or tipe['is_row_identifier'] == 'true')

                if tipe['is_target'] == 'true':
                    target = tipe['name'].lower()

                if tipe['data_type'] == 'numeric':
                    encoders.append(NumericEncoder())
                elif tipe['data_type'] == 'nominal':
                    encoders.append(OneHotEncoder(singular_if_binary=True))
                else:
                    encoders.append(StringEncoder())

            #a missing or numeric target is replaced by a classification target
            if target == "" or isinstance(encoders[headers.index(target)],
                                          NumericEncoder):
                target = self._get_classification_target(data_id)

            #the target is never ignored and is always kept as a plain string
            ignored[headers.index(target)] = False
            encoders[headers.index(target)] = StringEncoder()

            csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
            arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

            try:
                #prefer csv unless only the arff file is already cached
                if csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher:
                    o_key = csv_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(CsvReader().filter(
                        o_bytes.decode('utf-8').splitlines()))
                else:
                    o_key = arff_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(
                        ArffReader(skip_encoding=[target]).filter(
                            o_bytes.decode('utf-8').splitlines()))
            except Exception:
                #the first format failed so we try the other one. We catch
                #Exception rather than everything so that a KeyboardInterrupt
                #stops the load instead of triggering a second download.
                if o_key == csv_url:
                    o_key = arff_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(
                        ArffReader(skip_encoding=[target]).filter(
                            o_bytes.decode('utf-8').splitlines()))
                else:
                    o_key = csv_url
                    o_bytes = self._query(o_key, "obser", md5_checksum)
                    file_rows = list(CsvReader().filter(
                        o_bytes.decode('utf-8').splitlines()))

            #NOTE(review): sparse rows are recognised purely by being 2-tuples;
            #presumably (indexes, values) pairs -- confirm against the readers
            is_sparse_data = isinstance(file_rows[0], tuple) and len(
                file_rows[0]) == 2

            #the first row of the file holds the column headers
            if is_sparse_data:
                file_headers = [
                    header.lower() for header in file_rows.pop(0)[1]
                ]
            else:
                file_headers = [header.lower() for header in file_rows.pop(0)]

            file_cols = list(Transpose().filter(file_rows))

            #drop ignored columns, keeping file_cols and file_headers aligned
            for ignored_header in compress(headers, ignored):
                if ignored_header in file_headers:
                    file_cols.pop(file_headers.index(ignored_header))
                    file_headers.remove(ignored_header)

            file_encoders = [
                encoders[headers.index(file_header)]
                for file_header in file_headers
            ]

            file_cols = list(Encode(file_encoders).filter(file_cols))
            label_col = file_cols.pop(file_headers.index(target))
            feature_rows = list(Transpose().filter(
                Flatten().filter(file_cols)))

            #we only cache after all the data has been successfully loaded
            for key, payload in [(d_key, d_bytes), (t_key, t_bytes),
                                 (o_key, o_bytes)]:
                if key not in CobaConfig.Cacher:
                    CobaConfig.Cacher.put(key, payload)

            if is_sparse_data:
                #rows without an explicit label entry keep the default '0'
                dense_label_col = ['0'] * len(feature_rows)

                for index, value in zip(label_col[0], label_col[1]):
                    dense_label_col[index] = value
            else:
                dense_label_col = list(label_col)

            return feature_rows, dense_label_col

        except KeyboardInterrupt:
            #we don't want to clear the cache in the case of a KeyboardInterrupt
            raise

        except Exception:
            #if something went wrong we want to clear the
            #cache just in case it was corrupted somehow
            for k in [d_key, t_key, o_key]:
                if k is not None: CobaConfig.Cacher.rmv(k)

            raise
Ejemplo n.º 15
0
 def actions(index: int, context: Context) -> Sequence[Action]:
     """Return the actions for one interaction; `index` and `context` are not used.

     Reads `n_action_features`, `n_actions` and `rng` from the enclosing scope.
     """
     if not n_action_features:
         return OneHotEncoder().fit_encodes(range(n_actions))

     # one fresh `rng.gausses` draw per action
     return [rng.gausses(n_action_features) for _ in range(n_actions)]
Ejemplo n.º 16
0
    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 n_exemplars: int = 10,
                 kernel: Literal['linear', 'polynomial',
                                 'exponential'] = 'exponential',
                 degree: int = 2,
                 gamma: float = 1,
                 seed: int = 1) -> None:
        """Instantiate a KernelSyntheticSimulation.

        The reward of an action is a bias plus a weighted sum of kernel
        evaluations between a set of exemplars and the interaction's features.
        When there are no exemplars the reward is the bias plus one weight per
        action. After the generators are defined, the rewards of 100 sample
        interactions are used to re-scale the bias and the weights.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            n_exemplars: The number of exemplar action, context pairs.
            kernel: The family of the kernel basis functions.
            degree: This argument is only relevant when using polynomial kernels.
            gamma: This argument is only relevant when using exponential kernels.
            seed: The random number seed used to generate all features, weights and noise in the simulation.
        """

        #the constructor arguments exactly as they were passed in
        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, n_exemplars, kernel, degree, gamma,
                      seed)

        self._n_actions = n_actions
        self._n_context_features = n_context_features
        self._n_action_features = n_action_features
        #this keeps the caller's value even if the local n_exemplars is zeroed below
        self._n_exemplars = n_exemplars
        self._seed = seed
        self._kernel = kernel
        self._degree = degree
        self._gamma = gamma

        #every random draw below comes from this one generator, so the
        #order of the statements that follow determines the simulation
        rng = CobaRandom(seed)

        #if there are no features then we are unable to define exemplars
        if n_action_features + n_context_features == 0: n_exemplars = 0

        #a tuple of n draws from rng.gausses (presumably mean 0, std .75 -- confirm)
        feat_gen = lambda n: tuple(rng.gausses(n, 0, .75))
        one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

        #a single exemplar list shared by all actions when actions have
        #features, otherwise one exemplar list per action
        self._exemplars = [[
            feat_gen(n_action_features + n_context_features)
            for _ in range(n_exemplars)
        ] for _ in range(1 if n_action_features else n_actions)]
        #one weight per exemplar, or one weight per action when there are no exemplars
        weight_count = n_actions if n_exemplars == 0 else n_exemplars
        self._weights = [1 - 2 * w for w in rng.randoms(weight_count)]

        #the reward closure below reads self._bias and self._weights at call
        #time, so the re-scaling at the end changes what it returns
        self._bias = 0

        if kernel == 'polynomial':
            #this ensures the dot-product between F and an exemplar is in [0,upper_bound]
            #This ensures that higher-order polynomials will remain reasonably well behaved
            #NOTE(review): the normalisation divides by sum(e), which is not guarded against 0
            upper_bound = (1.5)**(1 / degree) - 1
            self._exemplars = [[[upper_bound * ee / sum(e) for ee in e]
                                for e in E] for E in self._exemplars]

        def context(index: int) -> Context:
            return feat_gen(n_context_features) if n_context_features else None

        def actions(index: int, context: Context) -> Sequence[Action]:
            return [feat_gen(n_action_features) for _ in range(n_actions)
                    ] if n_action_features else one_hot_acts

        def reward(index: int, context: Context, action: Action) -> float:

            #NOTE(review): action.index(1) assumes a one-hot action; if n_exemplars is 0
            #while n_action_features is not, actions are feature tuples -- confirm
            if n_exemplars == 0:
                return self._bias + self._weights[action.index(1)]

            #handles None context
            context = context or []

            if n_action_features:
                #featurized actions: context and action features are compared
                #against the single shared exemplar list
                f = list(context) + list(action)
                W = self._weights
                E = self._exemplars[0]
            else:
                #one-hot actions: the position of the 1 picks that action's exemplars
                f = list(context)
                W = self._weights
                E = self._exemplars[action.index(1)]

            #NOTE(review): K is left unbound if kernel is none of the three names below
            if kernel == "linear":
                K = lambda x1, x2: self._linear_kernel(x1, x2)
            if kernel == "polynomial":
                K = lambda x1, x2: self._polynomial_kernel(
                    x1, x2, self._degree)
            if kernel == "exponential":
                K = lambda x1, x2: self._exponential_kernel(
                    x1, x2, self._gamma)

            return self._bias + sum([w * K(e, f) for w, e in zip(W, E)])

        #rewards of 100 sample interactions, computed with a bias of 0 and the
        #raw weights; generating them draws from the same rng
        rewards = [
            reward(i, c, a) for i in range(100) for c in [context(i)]
            for a in actions(i, c)
        ]

        m = mean(rewards)
        #fall back to 1 when all sampled rewards are equal to avoid dividing by zero
        s = (max(rewards) - min(rewards)) or 1

        #dividing the weights by s and shifting by the bias gives the sampled
        #rewards a mean of 0.5 and a range of 1 (or 0 if they were all equal)
        self._bias = 0.5 - m / s
        self._weights = [w / s for w in self._weights]

        super().__init__(n_interactions, context, actions, reward)
Ejemplo n.º 17
0
    def test_singular_if_binary(self):
        """With singular_if_binary=True a two-valued fit yields one-element encodings."""
        binary = OneHotEncoder(singular_if_binary=True)
        binary = binary.fit(["1", "1", "1", "0", "0"])

        for raw, one_hot in (("0", (0, )), ("1", (1, ))):
            self.assertEqual(binary.encode([raw]), [one_hot])
Ejemplo n.º 18
0
    def __init__(self,
                 n_interactions: int,
                 n_actions: int = 10,
                 n_context_features: int = 10,
                 n_action_features: int = 10,
                 reward_features: Sequence[str] = ["a", "xa"],
                 seed: int = 1) -> None:
        """Instantiate a LinearSyntheticSimulation.

        The reward of an action is a bias plus a weighted sum of the features
        produced by an InteractionsEncoder built from `reward_features`. After
        the generators are defined, the rewards of 100 sample interactions are
        used to re-scale the bias and the weights.

        Args:
            n_interactions: The number of interactions the simulation should have.
            n_actions: The number of actions each interaction should have.
            n_context_features: The number of features each context should have.
            n_action_features: The number of features each action should have.
            reward_features: The features in the simulation's linear reward function.
            seed: The random number seed used to generate all features, weights and noise in the simulation.
        """

        #the constructor arguments exactly as they were passed in
        self._args = (n_interactions, n_actions, n_context_features,
                      n_action_features, reward_features, seed)

        self._n_actions = n_actions
        self._n_context_features = n_context_features
        self._n_action_features = n_action_features
        #the default list is shared between calls; it is only read, never mutated, here
        self._reward_features = reward_features
        self._seed = seed

        #Without context (or action) features the 'x' (or 'a') terms cannot
        #exist, so they are stripped, empty results are dropped and duplicates
        #are removed. dict.fromkeys keeps the first-seen order, whereas a set
        #of strings iterates in an order that changes with the hash seed and
        #would make the seeded simulation differ from one process to the next.
        if not self._n_context_features:
            reward_features = list(
                dict.fromkeys(
                    filter(None,
                           [f.replace('x', '') for f in reward_features])))

        if not self._n_action_features:
            reward_features = list(
                dict.fromkeys(
                    filter(None,
                           [f.replace('a', '') for f in reward_features])))

        #every random draw below comes from this one generator, so the
        #order of the statements that follow determines the simulation
        rng = CobaRandom(seed)
        feat_encoder = InteractionsEncoder(reward_features)

        #to try and make sure high-order polynomials are well behaved
        #we center our context and action features on 1 and give them
        #a very small amount of variance. Then, in post processing, we
        #shift and re-scale our reward to center and fill in [0,1].
        max_degree = max([len(f)
                          for f in reward_features]) if reward_features else 1
        feat_gen = lambda n: tuple([
            g * rng.choice([1, -1])
            for g in rng.gausses(n, mu=1, sigma=1 / (2 * max_degree))
        ])
        one_hot_acts = OneHotEncoder().fit_encodes(range(n_actions))

        #encode dummy features once just to learn how many weights are needed
        feature_count = len(
            feat_encoder.encode(x=[1] * n_context_features,
                                a=[1] * n_action_features))
        #a single weight vector when actions have features, otherwise one per action
        weight_parts = 1 if n_action_features else n_actions
        weight_count = 1 if feature_count == 0 else feature_count

        self._weights = [[1 - 2 * w for w in rng.randoms(weight_count)]
                         for _ in range(weight_parts)]

        #the reward closure below reads self._bias and self._weights at call
        #time, so the re-scaling at the end changes what it returns
        self._bias = 0
        #not read anywhere in this constructor; presumably used elsewhere in the class
        self._clip = False

        def context(index: int) -> Context:
            return feat_gen(n_context_features) if n_context_features else None

        def actions(index: int, context: Context) -> Sequence[Action]:
            return [feat_gen(n_action_features) for _ in range(n_actions)
                    ] if n_action_features else one_hot_acts

        def reward(index: int, context: Context, action: Action) -> float:

            #with no encoded features the reward reduces to bias plus one weight
            F = feat_encoder.encode(x=context, a=action) or [1]
            #one-hot actions select their own weight vector via the position of the 1
            W = self._weights[0 if n_action_features else action.index(1)]

            return self._bias + sum([w * f for w, f in zip(W, F)])

        #rewards of 100 sample interactions, computed with a bias of 0 and the
        #raw weights; generating them draws from the same rng
        rewards = [
            reward(i, c, a) for i in range(100) for c in [context(i)]
            for a in actions(i, c)
        ]

        m = mean(rewards)
        #fall back to 1 when all sampled rewards are equal to avoid dividing by zero
        s = (max(rewards) - min(rewards)) or 1

        #dividing the weights by s and shifting by the bias gives the sampled
        #rewards a mean of 0.5 and a range of 1 (or 0 if they were all equal)
        self._bias = 0.5 - m / s
        self._weights = [[w / s for w in W] for W in self._weights]
        self._clip = True

        super().__init__(n_interactions, context, actions, reward)
Ejemplo n.º 19
0
 def test_encode_err_if_unkonwn_true(self):
     """Encoding a value that was not in the fit data raises when err_if_unknown=True."""
     with self.assertRaises(CobaException):
         fitted = OneHotEncoder(err_if_unknown=True).fit(
             ["1", "1", "1", "0", "0"])
         fitted.encode("2")
Ejemplo n.º 20
0
 def test_dense_encode_mixed(self):
     """Each column of a dense row goes through the encoder registered for its index."""
     pipe = Encode({0: NumericEncoder(), 1: OneHotEncoder()})

     rows = [[1, 4], [2, 5], [3, 5]]
     expected = [[1, (1, 0)], [2, (0, 1)], [3, (0, 1)]]

     self.assertEqual(expected, list(pipe.filter(rows)))
Ejemplo n.º 21
0
 def test_init_values(self):
     """An encoder constructed with `values` is already fit and can encode."""
     one_hot = OneHotEncoder(values=["0", "1", "2"])
     expected = [(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)]

     self.assertEqual(True, one_hot.is_fit)
     self.assertEqual(expected, one_hot.encodes(["0", "1", "2", "1"]))
Ejemplo n.º 22
0
 def test_encodes_sans_fit_exception(self):
     """Calling encodes on an encoder that was never fit raises a CobaException."""
     with self.assertRaises(CobaException):
         never_fit = OneHotEncoder()
         never_fit.encodes(["0", "1", "2"])
Ejemplo n.º 23
0
 def test_encode_sans_fit_exception(self):
     """Calling encode on an encoder that was never fit raises a CobaException."""
     with self.assertRaises(CobaException):
         never_fit = OneHotEncoder()
         never_fit.encode("0")
Ejemplo n.º 24
0
    def test_error_if_unkonwn_true(self):
        """With error_if_unknown=True, encoding a value not seen during fit raises."""
        encoder = OneHotEncoder(error_if_unknown=True).fit(
            ["1", "1", "1", "0", "0"])

        # Only the encode call belongs inside this context. An assertEqual in
        # here would raise AssertionError on a mismatch, which is itself an
        # Exception, so the test would pass even when encode never raises.
        with self.assertRaises(Exception):
            encoder.encode(["2"])
Ejemplo n.º 25
0
 def test_err_if_unkonwn_false(self):
     """By default a value that was not in the fit data encodes to all zeros."""
     fitted = OneHotEncoder().fit(["0", "1", "2"])

     self.assertEqual((0, 0, 0), fitted.encode("5"))
Ejemplo n.º 26
0
 def _make_unfit_encoder(
         self
 ) -> Tuple[Encoder, Sequence[str], Sequence[str], Sequence[Any]]:
     """Return a fresh OneHotEncoder together with three fixture sequences.

     NOTE(review): the sequences are presumably the values to fit on, the
     values to encode and the expected encoding -- confirm against the caller.
     """
     first = ["d", "a", "b", "b", "b", "d"]
     second = ["a"]
     third = [(0, 1, 0)]

     return OneHotEncoder(), first, second, third
Ejemplo n.º 27
0
 def test_dense_encode_onehot(self):
     """Column 0 uses the pre-fit encoder while column 1 is fit on the data it sees."""
     pipe = Encode({0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()})

     rows = [[1, 4], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), (1, 0, 0)],
         [(0, 1, 0), (0, 1, 0)],
         [(0, 1, 0), (0, 0, 1)],
     ]

     self.assertEqual(expected, list(pipe.filter(rows)))
Ejemplo n.º 28
0
 def test_fit_encode(self):
     """fit_encodes fits on the given values and returns their encodings in one call."""
     encoded = OneHotEncoder().fit_encodes(["0", "1", "2", "1"])

     self.assertEqual([(1, 0, 0), (0, 1, 0), (0, 0, 1), (0, 1, 0)], encoded)
Ejemplo n.º 29
0
    def test_sparse_encode_mixed(self):
        """Each key of a dict row goes through the encoder registered for that key."""
        pipe = Encode({0: NumericEncoder(), 1: OneHotEncoder()})

        sparse_rows = [{0: "1", 1: 4}, {0: "2", 1: 5}, {0: "3", 1: 5}]
        encoded_rows = [
            {0: 1, 1: (1, 0)},
            {0: 2, 1: (0, 1)},
            {0: 3, 1: (0, 1)},
        ]

        self.assertEqual(encoded_rows, list(pipe.filter(sparse_rows)))