def test_ignore_missing_value(self):
    """Check that a cell equal to ``missing_val`` is left as-is by ``Encode``.

    The expected one-hot width for column 1 is two, i.e. the ``'?'`` cell
    is not expected to become one of that column's categories.
    """
    column_encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
    encode = Encode(column_encoders, missing_val="?")

    rows = [[1, '?'], [2, 5], [2, 6]]
    expected = [
        [(1, 0, 0), '?'],
        [(0, 1, 0), (1, 0)],
        [(0, 1, 0), (0, 1)],
    ]

    actual = list(encode.filter(rows))

    self.assertEqual(expected, actual)
def test_encode_performance(self):
    """Check that encoding 6000 rows of 50 numeric cells stays under 0.6s.

    The fastest of the ``timeit.repeat`` runs is the one compared against
    the threshold.
    """
    # A single NumericEncoder instance is shared by all 50 columns.
    shared_encoder = NumericEncoder()
    encoder = Encode({column: shared_encoder for column in range(50)})

    to_encode = [['1.23'] * 50] * 6000

    def encode_all():
        return list(encoder.filter(to_encode))

    run_times = timeit.repeat(encode_all, number=1)
    best_time = min(run_times)

    #best observed 0.06
    self.assertLess(best_time, .6)
def test_dense_encode_onehot_with_header_and_extra_encoder(self):
    """Check dense one-hot encoding when an encoder has no matching column.

    Three encoders are registered but the rows only have two columns; the
    expected output is the same as if only the first two were given.
    """
    column_encoders = {
        0: OneHotEncoder([1, 2, 3]),
        1: OneHotEncoder(),
        2: StringEncoder(),
    }
    encode = Encode(column_encoders)

    rows = [[1, 4], [2, 5], [2, 6]]
    expected = [
        [(1, 0, 0), (1, 0, 0)],
        [(0, 1, 0), (0, 1, 0)],
        [(0, 1, 0), (0, 0, 1)],
    ]

    actual = list(encode.filter(rows))

    self.assertEqual(expected, actual)
def test_sparse_encode_onehot(self):
    """Check one-hot encoding of dict rows, including a row missing key 1."""
    column_encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
    encode = Encode(column_encoders)

    given = [
        {0: 1},
        {0: 2, 1: 5},
        {0: 2, 1: 6},
    ]
    expected = [
        {0: (1, 0, 0)},
        {0: (0, 1, 0), 1: (0, 1, 0)},
        {0: (0, 1, 0), 1: (0, 0, 1)},
    ]

    actual = list(encode.filter(given))

    self.assertEqual(expected, actual)
def read(self) -> Tuple[Sequence[Sequence[Any]], Sequence[Any]]:
    """Load this OpenML data set and return its encoded features and labels.

    The data set description and feature types are fetched via
    ``self._query`` and used to choose one encoder per column. The
    observations are then read as CSV or ARFF, falling back to the other
    format if the first attempt fails. Ignored columns are dropped, the
    remaining columns are encoded, and the target column is split off.

    Returns:
        A ``(feature_rows, label_col)`` tuple. ``feature_rows`` is the list
        of encoded, flattened and transposed feature columns, and
        ``label_col`` is a list holding the target column's values.

    Raises:
        Exception: If the data set is flagged as deactivated, or if querying
            or parsing fails. For any error other than ``KeyboardInterrupt``
            the keys used so far are removed from ``CobaConfig.Cacher``
            before the error is re-raised.
    """
    #placing some of these at the top would cause circular references
    from coba.encodings import Encoder, NumericEncoder, OneHotEncoder, StringEncoder
    from coba.pipes import ArffReader, CsvReader, Encode, Flatten, Transpose

    d_key = None
    t_key = None
    o_key = None

    try:
        data_id = self._data_id
        md5_checksum = self._md5_checksum

        d_key = f'https://www.openml.org/api/v1/json/data/{data_id}'
        t_key = f'https://www.openml.org/api/v1/json/data/features/{data_id}'

        d_bytes = self._query(d_key, "descr")
        d_object = json.loads(d_bytes.decode('utf-8'))["data_set_description"]

        if d_object['status'] == 'deactivated':
            raise Exception(
                f"Openml {data_id} has been deactivated. This is often due to flags on the data."
            )

        t_bytes = self._query(t_key, "types")
        t_object = json.loads(t_bytes.decode('utf-8'))["data_features"]["feature"]

        headers: List[str] = []
        encoders: List[Encoder] = []
        ignored: List[bool] = []
        target: str = ""

        for tipe in t_object:
            headers.append(tipe['name'].lower())
            ignored.append(tipe['is_ignore'] == 'true' or tipe['is_row_identifier'] == 'true')

            if tipe['is_target'] == 'true':
                target = tipe['name'].lower()

            if tipe['data_type'] == 'numeric':
                encoders.append(NumericEncoder())
            elif tipe['data_type'] == 'nominal':
                encoders.append(OneHotEncoder(singular_if_binary=True))
            else:
                encoders.append(StringEncoder())

        # With no declared target, or a numeric one, ask for a classification target.
        if target == "" or isinstance(encoders[headers.index(target)], NumericEncoder):
            target = self._get_classification_target(data_id)

        # The target column is always kept and is encoded as a plain string.
        ignored[headers.index(target)] = False
        encoders[headers.index(target)] = StringEncoder()

        csv_url = f"http://www.openml.org/data/v1/get_csv/{d_object['file_id']}"
        arff_url = f"http://www.openml.org/data/v1/download/{d_object['file_id']}"

        # Each source pairs a url with a factory for the reader that parses it.
        sources = [
            (csv_url, CsvReader),
            (arff_url, lambda: ArffReader(skip_encoding=[target])),
        ]

        # CSV goes first unless only the ARFF file is already in the cache.
        if not (csv_url in CobaConfig.Cacher or arff_url not in CobaConfig.Cacher):
            sources.reverse()

        last_key = sources[-1][0]

        for o_key, make_reader in sources:
            try:
                o_bytes = self._query(o_key, "obser", md5_checksum)
                file_rows = list(make_reader().filter(o_bytes.decode('utf-8').splitlines()))
                break
            except Exception:
                # Only ordinary errors trigger the fallback to the other format;
                # KeyboardInterrupt/SystemExit propagate immediately.
                if o_key == last_key:
                    raise

        # A first row that is a 2-tuple is treated as the sparse layout,
        # with the header names held in its second item.
        is_sparse_data = isinstance(file_rows[0], tuple) and len(file_rows[0]) == 2

        if is_sparse_data:
            file_headers = [header.lower() for header in file_rows.pop(0)[1]]
        else:
            file_headers = [header.lower() for header in file_rows.pop(0)]

        file_cols = list(Transpose().filter(file_rows))

        for ignored_header in compress(headers, ignored):
            if ignored_header in file_headers:
                file_cols.pop(file_headers.index(ignored_header))
                file_headers.remove(ignored_header)

        file_encoders = [encoders[headers.index(file_header)] for file_header in file_headers]

        file_cols = list(Encode(file_encoders).filter(file_cols))
        label_col = file_cols.pop(file_headers.index(target))
        feature_rows = list(Transpose().filter(Flatten().filter(file_cols)))

        #we only cache after all the data has been successfully loaded
        for key, payload in [(d_key, d_bytes), (t_key, t_bytes), (o_key, o_bytes)]:
            if key not in CobaConfig.Cacher:
                CobaConfig.Cacher.put(key, payload)

        if is_sparse_data:
            # Sparse labels come as (indexes, values); unlisted rows get '0'.
            dense_label_col = ['0'] * len(feature_rows)

            for index, value in zip(label_col[0], label_col[1]):
                dense_label_col[index] = value
        else:
            dense_label_col = list(label_col)

        return feature_rows, dense_label_col

    except KeyboardInterrupt:
        #we don't want to clear the cache in the case of a KeyboardInterrupt
        raise

    except Exception:
        #if something went wrong we want to clear the
        #cache just in case it was corrupted somehow
        for k in [d_key, t_key, o_key]:
            if k is not None:
                CobaConfig.Cacher.rmv(k)

        raise
def test_sparse_encode_mixed(self):
    """Check dict rows encoded with a numeric column and a one-hot column."""
    column_encoders = {0: NumericEncoder(), 1: OneHotEncoder()}
    encode = Encode(column_encoders)

    given = [
        {0: "1", 1: 4},
        {0: "2", 1: 5},
        {0: "3", 1: 5},
    ]
    expected = [
        {0: 1, 1: (1, 0)},
        {0: 2, 1: (0, 1)},
        {0: 3, 1: (0, 1)},
    ]

    actual = list(encode.filter(given))

    self.assertEqual(expected, actual)
def test_sparse_encode_numeric(self):
    """Check that string cells in dict rows are encoded to numbers."""
    column_encoders = {0: NumericEncoder(), 1: NumericEncoder()}
    encode = Encode(column_encoders)

    given = [
        {0: "1", 1: "4"},
        {0: "2", 1: "5"},
        {0: "3", 1: "6"},
    ]
    expected = [
        {0: 1, 1: 4},
        {0: 2, 1: 5},
        {0: 3, 1: 6},
    ]

    actual = list(encode.filter(given))

    self.assertEqual(expected, actual)
def test_dense_encode_mixed(self):
    """Check list rows encoded with a numeric column and a one-hot column."""
    column_encoders = {0: NumericEncoder(), 1: OneHotEncoder()}
    encode = Encode(column_encoders)

    rows = [[1, 4], [2, 5], [3, 5]]
    expected = [
        [1, (1, 0)],
        [2, (0, 1)],
        [3, (0, 1)],
    ]

    actual = list(encode.filter(rows))

    self.assertEqual(expected, actual)
def test_dense_encode_onehot(self):
    """Check one-hot encoding of list rows.

    Column 0 is given its values up front; column 1 is constructed without
    any, and the expected output has one position per value seen in it.
    """
    column_encoders = {0: OneHotEncoder([1, 2, 3]), 1: OneHotEncoder()}
    encode = Encode(column_encoders)

    rows = [[1, 4], [2, 5], [2, 6]]
    expected = [
        [(1, 0, 0), (1, 0, 0)],
        [(0, 1, 0), (0, 1, 0)],
        [(0, 1, 0), (0, 0, 1)],
    ]

    actual = list(encode.filter(rows))

    self.assertEqual(expected, actual)
def test_dense_encode_numeric(self):
    """Check that string cells in list rows are encoded to numbers."""
    column_encoders = {0: NumericEncoder(), 1: NumericEncoder()}
    encode = Encode(column_encoders)

    rows = [["1", "2"], ["4", "5"]]
    expected = [[1, 2], [4, 5]]

    actual = list(encode.filter(rows))

    self.assertEqual(expected, actual)
def test_encode_empty(self):
    """Check that encoding an empty input yields no rows."""
    encode = Encode({0: NumericEncoder()})

    actual = list(encode.filter([]))

    self.assertEqual([], actual)