예제 #1
0
 def test_ignore_missing_value(self):
     """The configured missing-value marker is passed through un-encoded.

     Column 1 contains '?', 5 and 6; the expected output keeps '?' as-is
     and one-hot encodes the remaining two values with width two.
     """
     encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
     encode = Encode(encoders, missing_val="?")

     given = [[1, '?'], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), '?'],
         [(0, 1, 0), (1, 0)],
         [(0, 1, 0), (0, 1)],
     ]

     actual = list(encode.filter(given))

     self.assertEqual(expected, actual)
예제 #2
0
    def test_encode_performance(self):
        """Encoding 6000 rows of 50 numeric columns must stay under 0.6 seconds."""

        # a single encoder instance is shared by all 50 columns
        shared_encoder = NumericEncoder()
        encoder = Encode({column: shared_encoder for column in range(50)})

        # every row is the same list object repeated 6000 times
        row = ['1.23'] * 50
        to_encode = [row] * 6000

        def encode_all():
            return list(encoder.filter(to_encode))

        # take the fastest of the repeated single-shot timings
        timings = timeit.repeat(encode_all, number=1)
        best_time = min(timings)

        #best observed 0.06
        self.assertLess(best_time, .6)
예제 #3
0
 def test_dense_encode_onehot_with_header_and_extra_encoder(self):
     """An encoder keyed to a column that is absent from the rows is harmless.

     The rows only have columns 0 and 1, yet an encoder is also supplied
     for column 2; the expected output contains just the two encoded columns.
     """
     encoders = {
         0: OneHotEncoder([1, 2, 3]),
         1: OneHotEncoder(),
         2: StringEncoder(),
     }
     encode = Encode(encoders)

     given = [[1, 4], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), (1, 0, 0)],
         [(0, 1, 0), (0, 1, 0)],
         [(0, 1, 0), (0, 0, 1)],
     ]

     actual = list(encode.filter(given))

     self.assertEqual(expected, actual)
예제 #4
0
    def test_sparse_encode_onehot(self):
        """Dict rows are one-hot encoded per key; keys missing from a row stay missing."""
        encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
        encode = Encode(encoders)

        # the first row has no entry for key 1
        given = [
            {0: 1},
            {0: 2, 1: 5},
            {0: 2, 1: 6},
        ]
        expected = [
            {0: (1, 0, 0)},
            {0: (0, 1, 0), 1: (0, 1, 0)},
            {0: (0, 1, 0), 1: (0, 0, 1)},
        ]

        actual = list(encode.filter(given))

        self.assertEqual(expected, actual)
예제 #5
0
    def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
        """Download, parse and encode the OpenML dataset given by ``self._data_id``.

        The data set description, the feature types and the observations are
        requested through ``self._query``. Observations are requested as either
        CSV or ARFF, and the other format is tried if the first attempt fails.
        The responses are only put into ``CobaConfig.Cacher`` once everything
        has been loaded and encoded successfully.

        Returns:
            A ``(feature_rows, label_col)`` tuple holding the encoded feature
            rows and the label column as a list.

        Raises:
            Exception: If OpenML reports the data set as deactivated, or for any
                error raised while querying or parsing. For an ``Exception`` the
                description, types and observation keys are removed from the
                cache before re-raising. A ``KeyboardInterrupt`` is re-raised
                without touching the cache.
        """

        #placing some of these at the top would cause circular references
        from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
        from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

        # None means the key has not been built yet, so the error handler skips it
        d_key = None
        t_key = None
        o_key = None

        try:
            data_id = self._data_id
            md5_checksum = self._md5_checksum

            d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
            t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

            d_bytes = self._query(d_key, "descr")
            d_object = json.loads(
                d_bytes.decode('utf-8'))["data_set_description"]

            if d_object['status'] == 'deactivated':
                raise Exception(
                    f"Openml {data_id} has been deactivated. This is often due to flags on the data."
                )

            t_bytes = self._query(t_key, "types")
            t_object = json.loads(
                t_bytes.decode('utf-8'))["data_features"]["feature"]

            # parallel lists: one entry per feature reported by OpenML
            headers: List[str] = []
            encoders: List[Encoder] = []
            ignored: List[bool] = []
            target: str = ""

            for tipe in t_object:

                headers.append(tipe['name'].lower())
                ignored.append(tipe['is_ignore'] == 'true'
                               or tipe['is_row_identifier'] == 'true')

                if tipe['is_target'] == 'true':
                    target = tipe['name'].lower()

                if tipe['data_type'] == 'numeric':
                    encoders.append(NumericEncoder())
                elif tipe['data_type'] == 'nominal':
                    encoders.append(OneHotEncoder(singular_if_binary=True))
                else:
                    encoders.append(StringEncoder())

            # with no declared target, or a numeric one, ask for a classification target instead
            if target == "" or isinstance(encoders[headers.index(target)],
                                          NumericEncoder):
                target = self._get_classification_target(data_id)

            # the target is never dropped and is always kept as a string label
            target_index = headers.index(target)
            ignored[target_index] = False
            encoders[target_index] = StringEncoder()

            csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
            arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

            def fetch_rows(url):
                # query one observation url and parse it with the matching reader
                raw = self._query(url, "obser", md5_checksum)

                if url == csv_url:
                    reader = CsvReader()
                else:
                    reader = ArffReader(skip_encoding=[target])

                return raw, list(reader.filter(raw.decode('utf-8').splitlines()))

            try:
                # csv is the first choice unless only the arff file is already cached
                if csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher:
                    o_key = csv_url
                else:
                    o_key = arff_url

                o_bytes, file_rows = fetch_rows(o_key)

            except Exception:
                # Only ordinary errors trigger the fallback. A bare except here would
                # also swallow KeyboardInterrupt, which the outer handler wants to see.
                o_key = arff_url if o_key == csv_url else csv_url
                o_bytes, file_rows = fetch_rows(o_key)

            # a first row that is a 2-tuple is treated as the sparse layout,
            # whose header names are stored in the second element
            is_sparse_data = isinstance(file_rows[0], tuple) and len(
                file_rows[0]) == 2

            if is_sparse_data:
                file_headers = [
                    header.lower() for header in file_rows.pop(0)[1]
                ]
            else:
                file_headers = [header.lower() for header in file_rows.pop(0)]

            file_cols = list(Transpose().filter(file_rows))

            # drop ignored columns, keeping file_headers aligned with file_cols
            for ignored_header in compress(headers, ignored):
                if ignored_header in file_headers:
                    file_cols.pop(file_headers.index(ignored_header))
                    file_headers.remove(ignored_header)

            # order the encoders the way the columns appear in the file
            file_encoders = [
                encoders[headers.index(file_header)]
                for file_header in file_headers
            ]

            file_cols = list(Encode(file_encoders).filter(file_cols))
            label_col = file_cols.pop(file_headers.index(target))
            feature_rows = list(Transpose().filter(
                Flatten().filter(file_cols)))

            #we only cache after all the data has been successfully loaded
            for key, payload in [(d_key, d_bytes), (t_key, t_bytes),
                                 (o_key, o_bytes)]:
                if key not in CobaConfig.Cacher:
                    CobaConfig.Cacher.put(key, payload)

            if is_sparse_data:
                # a sparse label column is an (indexes, values) pair; rows
                # without an explicit label default to '0'
                dense_label_col = ['0'] * len(feature_rows)

                for index, value in zip(label_col[0], label_col[1]):
                    dense_label_col[index] = value
            else:
                dense_label_col = list(label_col)

            return feature_rows, dense_label_col

        except KeyboardInterrupt:
            #we don't want to clear the cache in the case of a KeyboardInterrupt
            raise

        except Exception:
            #if something went wrong we want to clear the
            #cache just in case it was corrupted somehow
            for k in [d_key, t_key, o_key]:
                if k is not None:
                    CobaConfig.Cacher.rmv(k)

            raise
예제 #6
0
    def test_sparse_encode_mixed(self):
        """Dict rows may mix a numeric encoder and a one-hot encoder by key."""
        encoders = {0: NumericEncoder(), 1: OneHotEncoder()}
        encode = Encode(encoders)

        given = [
            {0: "1", 1: 4},
            {0: "2", 1: 5},
            {0: "3", 1: 5},
        ]
        expected = [
            {0: 1, 1: (1, 0)},
            {0: 2, 1: (0, 1)},
            {0: 3, 1: (0, 1)},
        ]

        actual = list(encode.filter(given))

        self.assertEqual(expected, actual)
예제 #7
0
    def test_sparse_encode_numeric(self):
        """String values in dict rows are converted to numbers for every key."""
        encoders = {0: NumericEncoder(), 1: NumericEncoder()}
        encode = Encode(encoders)

        given = [
            {0: "1", 1: "4"},
            {0: "2", 1: "5"},
            {0: "3", 1: "6"},
        ]
        expected = [
            {0: 1, 1: 4},
            {0: 2, 1: 5},
            {0: 3, 1: 6},
        ]

        actual = list(encode.filter(given))

        self.assertEqual(expected, actual)
예제 #8
0
 def test_dense_encode_mixed(self):
     """List rows may mix a numeric encoder and a one-hot encoder by position."""
     encoders = {0: NumericEncoder(), 1: OneHotEncoder()}
     encode = Encode(encoders)

     given = [[1, 4], [2, 5], [3, 5]]
     expected = [
         [1, (1, 0)],
         [2, (0, 1)],
         [3, (0, 1)],
     ]

     actual = list(encode.filter(given))

     self.assertEqual(expected, actual)
예제 #9
0
 def test_dense_encode_onehot(self):
     """Both columns of list rows are one-hot encoded.

     Column 0 uses an encoder built with the values [1, 2, 3]; column 1
     uses an encoder built without values and sees 4, 5 and 6.
     """
     encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
     encode = Encode(encoders)

     given = [[1, 4], [2, 5], [2, 6]]
     expected = [
         [(1, 0, 0), (1, 0, 0)],
         [(0, 1, 0), (0, 1, 0)],
         [(0, 1, 0), (0, 0, 1)],
     ]

     actual = list(encode.filter(given))

     self.assertEqual(expected, actual)
예제 #10
0
 def test_dense_encode_numeric(self):
     """String values in list rows are converted to numbers for every column."""
     encoders = {0: NumericEncoder(), 1: NumericEncoder()}
     encode = Encode(encoders)

     given = [["1", "2"], ["4", "5"]]
     expected = [[1, 2], [4, 5]]

     actual = list(encode.filter(given))

     self.assertEqual(expected, actual)
예제 #11
0
 def test_encode_empty(self):
     """Filtering an empty input yields an empty output."""
     encode = Encode({0: NumericEncoder()})

     no_rows = []
     actual = list(encode.filter(no_rows))

     self.assertEqual([], actual)