Esempio n. 1
0
    def test_instantiated_fit_values(self):
        """An encoder constructed with fit_values encodes without a separate fit call."""
        one_hot = OneHotEncoder(fit_values=["0", "1", "2"])

        encoded = one_hot.encode(["0", "1", "2", "1"])

        # one tuple per encoded value, in the same order as the input
        self.assertEqual(encoded, [
            (1, 0, 0),
            (0, 1, 0),
            (0, 0, 1),
            (0, 1, 0),
        ])
Esempio n. 2
0
    def test_error_if_unkonwn_false(self):
        """Encoding an unseen value must not raise when error_if_unknown is False.

        The unseen value is expected to encode as an all-zero tuple.
        """
        encoder = OneHotEncoder(error_if_unknown=False).fit(["0", "1", "2"])

        try:
            actual = encoder.encode(["5"])
        except Exception:
            # Catch ordinary errors only: a bare ``except:`` would also convert
            # KeyboardInterrupt/SystemExit into a misleading test failure.
            self.fail("An exception was raised when it shouldn't have been")

        self.assertEqual(actual, [(0, 0, 0)])
Esempio n. 3
0
    def test_performance_encode(self):
        """Guard against regressions in the time needed to encode a large sequence."""
        one_hot = OneHotEncoder(list(range(1000)), error_if_unknown=False)
        values = [100, 200, 300, 400, -1] * 100000

        def run_encode():
            return one_hot.encode(values)

        # take the best of 50 single-shot runs to reduce noise from the machine
        timings = timeit.repeat(run_encode, repeat=50, number=1)
        fastest = min(timings)

        #was approximately 0.040
        self.assertLess(fastest, 1)
Esempio n. 4
0
    def assert_simulation_for_data(self, simulation, features, answers) -> None:
        """Assert that a simulation is consistent with the features and answers given.

        Checks that there is one interaction per feature, that every interaction
        offers the same actions in the same order, that each interaction's context
        equals its feature, and that the reward for an action is 1 exactly when the
        action equals the one-hot encoded answer (0 otherwise).

        Args:
            simulation: The simulation under test; its ``interactions``, ``label_set``
                and ``reward`` members are used.
            features: The expected context for each interaction, in order.
            answers: The un-encoded label for each interaction, in order.
        """

        self.assertEqual(len(simulation.interactions), len(features))

        encoded_answers = OneHotEncoder(simulation.label_set).encode(answers)

        #every label has to appear among the first interaction's actions;
        #order is deliberately ignored for this check
        self.assertCountEqual(simulation.interactions[0].actions, set(encoded_answers))

        #the first interaction's actions then serve as the reference so that
        #every interaction is required to have identical actions in identical order
        reference_actions = simulation.interactions[0].actions

        for feature, answer, interaction in zip(features, encoded_answers, simulation.interactions):

            wanted_rewards = [int(action == answer) for action in interaction.actions]

            seen_context = interaction.context
            seen_actions = interaction.actions
            seen_rewards = simulation.reward(_choices(interaction))

            self.assertEqual(seen_context, feature)
            self.assertSequenceEqual(seen_actions, reference_actions)
            self.assertSequenceEqual(seen_rewards, wanted_rewards)
Esempio n. 5
0
    def test_performance_fit_values(self):
        """Guard against regressions in the time needed to construct an encoder with fit values."""
        values = list(range(1000))

        def build_encoder():
            return OneHotEncoder(values)

        # take the best of 100 single-shot runs to reduce noise from the machine
        timings = timeit.repeat(build_encoder, repeat=100, number=1)
        fastest = min(timings)

        #was approximately 0.017
        self.assertLess(fastest, .03)
Esempio n. 6
0
    def __init__(self, features: Sequence[_C_out],
                 labels: Sequence[Action]) -> None:
        """Instantiate a ClassificationSimulation.

        Each feature becomes the context of one interaction. The actions offered
        in every interaction are the one-hot encodings of the distinct labels,
        and the rewards passed to the base class are the one-hot encodings of
        the labels themselves.

        Args:
            features: The collection of features used for the original classifier problem.
            labels: The collection of labels assigned to each observation of features.

        Raises:
            AssertionError: If features and labels have different lengths.
        """

        # Raised explicitly instead of via an ``assert`` statement so the check is
        # not stripped under ``python -O``; the exception type is unchanged.
        if len(features) != len(labels):
            raise AssertionError("Mismatched lengths of features and labels")

        # NOTE: set() does not preserve the order in which labels were given.
        label_set = list(set(labels))

        # A single encoder serves both the actions and the rewards; building a
        # second one over the same label_set only repeated the same work.
        encoder = OneHotEncoder(label_set)
        action_set = encoder.encode(label_set)

        interactions = [
            Interaction(context, action_set, i)
            for i, context in enumerate(features)
        ]  #type: ignore
        rewards = encoder.encode(labels)

        self.label_set = label_set
        super().__init__(interactions, rewards)  #type:ignore
Esempio n. 7
0
    def test_error_if_unkonwn_true(self):
        """Encoding an unseen value must raise when error_if_unknown is True."""
        encoder = OneHotEncoder(error_if_unknown=True).fit(
            ["1", "1", "1", "0", "0"])

        # Only the encode call belongs inside the context manager. Wrapping it
        # in assertEqual let a failed comparison's AssertionError satisfy
        # assertRaises(Exception), so the test passed even if encode never raised.
        with self.assertRaises(Exception):
            encoder.encode(["2"])
Esempio n. 8
0
    def test_singular_if_binary(self):
        """With singular_if_binary, a two-valued fit encodes each value as a 1-tuple."""
        binary_encoder = OneHotEncoder(singular_if_binary=True)
        binary_encoder = binary_encoder.fit(["1", "1", "1", "0", "0"])

        encoded_zero = binary_encoder.encode(["0"])
        self.assertEqual(encoded_zero, [(1, )])

        encoded_one = binary_encoder.encode(["1"])
        self.assertEqual(encoded_one, [(0, )])
Esempio n. 9
0
 def _make_unfit_encoder(self) -> Tuple[Encoder, Sequence[str], Sequence[str], Sequence[Any]]:
     """Return a fresh OneHotEncoder together with three fixture sequences.

     NOTE(review): judging by the name and values, the sequences are presumably
     the values to fit on, the values to encode and the expected encoding --
     confirm against the caller.
     """
     unfit_encoder = OneHotEncoder()
     first_values = ["d", "a", "b", "b", "b", "d"]
     second_values = ["a"]
     third_values = [(1, 0, 0)]

     return unfit_encoder, first_values, second_values, third_values
Esempio n. 10
0
    def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
        """Read an OpenML data set and return its feature rows and label rows.

        The data set description and per-feature type description are requested
        from the OpenML JSON api. One encoder is picked per column from its
        declared data type, and the csv file is then read through a CsvReader and
        a LabeledCsvCleaner joined into a pipe.

        Returns:
            A tuple of (feature rows, label rows), each materialized as a list.

        Raises:
            Exception: If the description reports the data set as 'deactivated'.
        """

        #these stay local because placing some of them at the top would cause circular references
        from coba.data.pipes import Pipe
        from coba.data.encoders import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
        from coba.data.filters import CsvReader, LabeledCsvCleaner

        data_id = self._data_id
        md5_checksum = self._md5_checksum
        api_key = ExecutionContext.Config.openml_api_key

        #when an api key is configured it is appended to both description requests
        key_suffix = '' if api_key is None else f'?api_key={api_key}'

        descr_url = f'https://www.openml.org/api/v1/json/data/{data_id}' + key_suffix
        types_url = f'https://www.openml.org/api/v1/json/data/features/{data_id}' + key_suffix

        descr_text = ''.join(HttpSource(descr_url, '.json', None, 'descr').read())
        descr = json.loads(descr_text)["data_set_description"]

        if descr['status'] == 'deactivated':
            raise Exception(
                f"Openml {data_id} has been deactivated. This is often due to flags on the data."
            )

        types_text = ''.join(HttpSource(types_url, '.json', None, 'types').read())
        columns = json.loads(types_text)["data_features"]["feature"]

        headers: List[str] = []
        encoders: List[Encoder] = []
        ignored: List[bool] = []
        target: str = ""

        for column in columns:

            name = column['name']
            headers.append(name)

            #a column is dropped when flagged as ignored or as a row identifier
            ignored.append(column['is_ignore'] == 'true'
                           or column['is_row_identifier'] == 'true')

            is_target = column['is_target']
            if is_target == 'true':
                target = name

            data_type = column['data_type']

            encoder: Encoder
            if data_type == 'numeric':
                encoder = NumericEncoder()
            elif data_type == 'nominal' and is_target == 'false':
                encoder = OneHotEncoder(singular_if_binary=True)
            elif data_type == 'nominal' and is_target == 'true':
                encoder = OneHotEncoder()
            else:
                encoder = StringEncoder()

            encoders.append(encoder)

        #a numeric target is swapped for the target returned by
        #_get_classification_target, which is un-ignored and one-hot encoded
        if isinstance(encoders[headers.index(target)], NumericEncoder):
            target = self._get_classification_target(data_id, api_key)
            target_index = headers.index(target)
            ignored[target_index] = False
            encoders[target_index] = OneHotEncoder()

        csv_url = f"http://www.openml.org/data/v1/get_csv/{descr['file_id']}"

        source = HttpSource(csv_url, ".csv", md5_checksum, f"openml {data_id}")
        reader = CsvReader()
        cleaner = LabeledCsvCleaner(target, headers, encoders, ignored, True)

        pipe = Pipe.join(source, [reader, cleaner])
        feature_rows, label_rows = pipe.read()

        return list(feature_rows), list(label_rows)