def _read_url_files(url, data=None, file_dictionary=None, file_elements=None):
    """Do a POST request to url with data, the file contents of
    file_dictionary, and file_elements sent as files."""
    data = {} if data is None else data
    data['api_key'] = config.apikey
    if file_elements is None:
        file_elements = {}
    if file_dictionary is not None:
        for key, path in file_dictionary.items():
            path = os.path.abspath(path)
            if os.path.exists(path):
                try:
                    if key == 'dataset':
                        # check that the arff file is valid
                        decoder = arff.ArffDecoder()
                        with io.open(path, encoding='utf8') as fh:
                            decoder.decode(fh, encode_nominal=True)
                except arff.ArffException:
                    raise ValueError(
                        "The file you have provided is not a valid arff file")
                file_elements[key] = open(path, 'rb')
            else:
                raise ValueError("File doesn't exist")
    # Using requests.post sets header 'Accept-encoding' automatically to
    # 'gzip,deflate'
    response = requests.post(url, data=data, files=file_elements)
    if response.status_code != 200:
        raise _parse_server_exception(response, url=url)
    if 'Content-Encoding' not in response.headers or \
            response.headers['Content-Encoding'] != 'gzip':
        warnings.warn('Received uncompressed content from OpenML for %s.' % url)
    return response.text
def read_data(self, filename):
    decoder = arff.ArffDecoder()
    with codecs.open(filename, 'rb', 'utf-8') as file_:
        dataset = decoder.decode(file_.readlines(), encode_nominal=True)
    data = dataset['data']
    return self.normalize_data(np.mat(data))
def load_arff_data(filepath, is_regression=True):
    from sklearn.model_selection import train_test_split

    with open(filepath, 'r') as f:
        decoder = arff.ArffDecoder()
        d = decoder.decode(f, encode_nominal=True)
    data = np.array(d['data'])
    X = data[:, :-1]
    y = data[:, -1]
    rng = np.random.RandomState(0)
    # TODO: Test size should be an arg
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=rng)
    n_dim = X_train.shape[1]
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    # len() is needed: np.unique returns the unique labels, not a count
    n_class = 1 if is_regression else len(np.unique(y_train))
    data = {
        'x_train': X_train,
        'y_train': y_train,
        'n_class': n_class,
        'n_dim': n_dim,
        'n_train': n_train,
        'x_test': X_test,
        'y_test': y_test,
        'n_test': n_test,
        'is_sparse': False,
    }
    return data
def scrape_data():
    # decode the .arff data and change text labels into numerical
    # (`file` is expected to be an already-opened arff file handle)
    decoder = arff.ArffDecoder()
    data = decoder.decode(file, encode_nominal=True)

    # split the raw data into data and labels
    vals = [val[0:-1] for val in data['data']]
    labels = [label[-1] for label in data['data']]
    # binarize: any non-zero label becomes 1
    for i in range(len(labels)):
        if labels[i] != 0:
            labels[i] = 1

    # split the labels and data into training and validation sets
    training_data = vals[0:int(.9 * len(vals))]
    training_labels = labels[0:int(.9 * len(vals))]
    validation_data = vals[int(.9 * len(vals)):]
    validation_labels = labels[int(.9 * len(vals)):]
    print(training_labels)

    # flatten labels with one hot encoding
    training_labels = to_categorical(training_labels, 5)
    validation_labels = to_categorical(validation_labels, 5)

    # save all arrays with numpy
    np.save('saved-files/vals', np.asarray(vals))
    np.save('saved-files/labels', np.asarray(labels))
    np.save('saved-files/training_data', np.asarray(training_data))
    np.save('saved-files/validation_data', np.asarray(validation_data))
    np.save('saved-files/training_labels', np.asarray(training_labels))
    np.save('saved-files/validation_labels', np.asarray(validation_labels))
def test_encode_adding_quotes_with_spaces(self):
    # regression test for https://github.com/renatopp/liac-arff/issues/87
    encoder = self.get_encoder()
    # \u3000 corresponds to an ideographic space. It should be treated as
    # a space.
    fixture = {
        'relation': 'name',
        'attributes': [('A', 'STRING'), ('B', 'STRING')],
        'data': [['a', 'b'], ['b\u3000e', 'a']],
    }
    expected_data = """@RELATION name

@ATTRIBUTE A STRING
@ATTRIBUTE B STRING

@DATA
a,b
'b\u3000e',a
"""
    arff_data = encoder.encode(fixture)
    self.assertEqual(arff_data, expected_data)

    decoder = arff.ArffDecoder()
    arff_object = decoder.decode(arff_data)
    self.assertEqual(arff_object['data'], fixture['data'])
def meta_train_data_transformed(request):
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {i: attribute == 'nominal'
                   for i, attribute in enumerate(attribute_types)}

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    logger = logging.getLogger('Meta')
    meta_features.helper_functions.set_value(
        "MissingValues",
        meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "NumSymbols",
        meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "ClassOccurences",
        meta_features.helper_functions["ClassOccurences"](X, y, logger),
    )

    DPP = FeatTypeSplit(feat_type={
        col: 'categorical' if category else 'numerical'
        for col, category in categorical.items()
    })
    X_transformed = DPP.fit_transform(X)

    number_numerical = np.sum(~np.array(list(categorical.values())))
    categorical_transformed = {
        i: i < (X_transformed.shape[1] - number_numerical)
        for i in range(X_transformed.shape[1])
    }

    # pre-compute values for transformed inputs
    meta_features.helper_functions.set_value(
        "PCA",
        meta_features.helper_functions["PCA"](X_transformed, y, logger),
    )
    meta_features.helper_functions.set_value(
        "Skewnesses",
        meta_features.helper_functions["Skewnesses"](
            X_transformed, y, logger, categorical_transformed),
    )
    meta_features.helper_functions.set_value(
        "Kurtosisses",
        meta_features.helper_functions["Kurtosisses"](
            X_transformed, y, logger, categorical_transformed),
    )

    if request.param == 'numpy':
        return X_transformed, y, categorical_transformed
    elif request.param == 'pandas':
        return pd.DataFrame(X_transformed), y, categorical_transformed
    else:
        raise ValueError(request.param)
def return_arff(self):
    filename = self.directory
    with io.open(filename) as fh:
        decoder = arff.ArffDecoder()
        return decoder.decode(fh, encode_nominal=True, return_type=arff.DENSE)
def read_data(self, filename):
    decoder = arff.ArffDecoder()
    with codecs.open(filename, 'rb', 'utf-8') as file_:
        dataset = decoder.decode(file_.readlines(), encode_nominal=True)
    self.__data = dataset['data']
    if self.__data is not None and self.__data[0] is not None:
        self.__dim_size = len(self.__data[0])
def setUp(self):
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X)
    imp = Imputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    center = not scipy.sparse.isspmatrix(X_transformed)
    standard_scaler = StandardScaler(with_mean=center)
    X_transformed = standard_scaler.fit_transform(X_transformed)
    X_transformed = X_transformed.todense()

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))
def load_arff_data(filename):
    with open(filename) as f:
        decoder = arff.ArffDecoder()
        arff_obj = decoder.decode(f, encode_nominal=True)
    # feat_num = len([v for v in arff_obj['attributes'] if v[0] != 'class'])
    data = np.array(arff_obj['data'])
    X = data[:, :-1]
    y = data[:, -1]
    return X, y
def arff2df(filepath):
    decoder = arff.ArffDecoder()
    with open(filepath) as arff_file:
        decoded_arff = decoder.decode(arff_file, return_type=arff.LOD)
    data = decoded_arff['data']
    column_names = [attribute[0] for attribute in decoded_arff['attributes']]
    df = pd.DataFrame.from_records(data, columns=list(range(len(column_names))))
    df = df.fillna(0)
    df.columns = column_names
    return df
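# For reference, a sketch of liac-arff's return_type options ('example.arff'
# is an assumed file name, not part of the original code):
#   arff.DENSE -> 'data' is a list of rows (the default)
#   arff.COO   -> 'data' is a (values, rows, cols) triplet for sparse ARFF
#   arff.LOD   -> 'data' is a list of {column_index: value} dicts, which is
#                 why arff2df above can feed it to DataFrame.from_records
import arff

with open('example.arff') as fh:
    decoded = arff.ArffDecoder().decode(fh, return_type=arff.LOD)
first_row = decoded['data'][0]  # e.g. {0: 5.1, 1: 3.5, ...}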
def load_arff_data(filepath):
    with open(filepath, 'r') as f:
        decoder = arff.ArffDecoder()
        d = decoder.decode(f, encode_nominal=True)
    # tvas: We are assuming the target/dependent is the last column
    data = np.array(d['data'])
    X = data[:, :-1]
    y = data[:, -1]
    return X, y
def read_data(self, filename):
    """
    Read data from file.

    :param filename: filename
    :return: normalized data
    """
    decoder = arff.ArffDecoder()
    with codecs.open(filename, 'rb', 'utf-8') as file_:
        dataset = decoder.decode(file_.readlines(), encode_nominal=True)
    data = dataset['data']
    return self.normalize_data(np.mat(data))
def sparse_data():
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {i: attribute == 'nominal'
                   for i, attribute in enumerate(attribute_types)}

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X = sparse.csr_matrix(X_sparse)

    mf = meta_features.metafeatures
    helpers = meta_features.helper_functions
    logger = logging.getLogger()

    # Precompute some helper functions
    helpers.set_value(
        "MissingValues",
        helpers["MissingValues"](X, y, logger, categorical),
    )
    mf.set_value(
        "NumberOfMissingValues",
        mf["NumberOfMissingValues"](X, y, logger, categorical),
    )
    helpers.set_value(
        "NumSymbols",
        helpers["NumSymbols"](X, y, logger, categorical),
    )
    helpers.set_value(
        "ClassOccurences",
        helpers["ClassOccurences"](X, y, logger),
    )
    return X, y, categorical
def test_date(self):
    file_ = '''@RELATION employee
@ATTRIBUTE Name STRING
@ATTRIBUTE start_date DATE
@ATTRIBUTE end_date DATE '%Y/%m/%dT%H:%M:%S'
@ATTRIBUTE simple_date DATE '%Y/%m/%d'
@DATA
Lulu,'2011-05-20T12:34:56','2014/06/21T12:34:56','2018/03/04'
Daisy,'2012-09-30T12:34:56','2015/11/21T12:34:56','2018/03/04'
Brie,'2013-05-01T12:34:56','2016/12/21T12:34:56','2018/03/04'
'''
    decoder = arff.ArffDecoder()
    d = decoder.decode(file_, encode_nominal=True)
    reconstituted = arff.dumps(d)

    decoder2 = arff.ArffDecoder()
    d2 = decoder2.decode(reconstituted, encode_nominal=True)
    self.assertEqual(d['data'][1][1], d2['data'][1][1])
    self.assertEqual(d['data'][1][2], d2['data'][1][2])
    self.assertEqual(d['data'][1][3], d2['data'][1][3])
    self.assertEqual(d['data'][2][1], d2['data'][2][1])
    self.assertEqual(d['data'][2][2], d2['data'][2][2])
    self.assertEqual(d['data'][2][3], d2['data'][2][3])
def retrieve_class_labels_for_dataset(self, dataset):
    """Reads the dataset's arff to determine the class labels and returns
    them. If the task has no class labels (for example a regression
    problem), it returns None."""
    # TODO improve performance, currently reads the whole file
    # Should make a method that only reads the attributes
    arffFileName = dataset.data_file
    with open(arffFileName) as fh:
        arffData = arff.ArffDecoder().decode(fh)

    dataAttributes = dict(arffData['attributes'])
    if 'class' in dataAttributes:
        return dataAttributes['class']
    else:
        return None
def read_data(self, filename):
    """
    Read data from file.

    :param filename: Name of the file to read
    :return: no return
    """
    decoder = arff.ArffDecoder()
    with codecs.open(filename, 'rb', 'utf-8') as file_:
        dataset = decoder.decode(file_.readlines(), encode_nominal=True)
    self.__data = dataset['data']
    if self.__data is not None and self.__data[0] is not None:
        self.__dim_size = len(self.__data[0])
def test_get_online_dataset_arff(self):
    dataset_id = 100  # Australian
    # lazy loading is not used because the arff file is checked
    dataset = openml.datasets.get_dataset(dataset_id)
    decoder = arff.ArffDecoder()
    # check that the arff from the dataset is the same
    # as the arff from the _get_arff function
    d_format = dataset.format.lower()
    self.assertEqual(
        dataset._get_arff(d_format),
        decoder.decode(
            _get_online_dataset_arff(dataset_id),
            encode_nominal=True,
            return_type=arff.DENSE if d_format == 'arff' else arff.COO),
        "ARFF files are not equal")
def scrape_data():
    # decode the arff data and convert the text labels to binary
    # (`file` is expected to be an already-opened arff file handle)
    decoder = arff.ArffDecoder()
    data = decoder.decode(file, encode_nominal=True)

    # split the raw data into data and labels
    vals = [val[0:-1] for val in data['data']]
    labels = [label[-1] for label in data['data']]
    # binarize: any non-zero label becomes 1
    for i in range(len(labels)):
        if labels[i] != 0:
            labels[i] = 1

    # split the labels and data into training and validation sets
    training_data = vals[0:int(.9 * len(vals))]
    training_labels = labels[0:int(.9 * len(vals))]
    validation_data = vals[int(.9 * len(vals)):]
    validation_labels = labels[int(.9 * len(vals)):]

    # standardize the training and validation data separately
    a = np.asarray(training_data, dtype=float)
    scaler = preprocessing.StandardScaler().fit(a)
    training_data = scaler.transform(a)

    b = np.asarray(validation_data, dtype=float)
    scaler = preprocessing.StandardScaler().fit(b)
    validation_data = scaler.transform(b)

    # print(training_labels)
    # convert the class vectors to one-hot encoding
    training_labels = to_categorical(training_labels, 5)
    validation_labels = to_categorical(validation_labels, 5)

    # save all arrays with numpy
    np.save('saved-files/vals', np.asarray(vals))
    np.save('saved-files/labels', np.asarray(labels))
    np.save('saved-files/training_data', np.asarray(training_data))
    np.save('saved-files/validation_data', np.asarray(validation_data))
    np.save('saved-files/training_labels', np.asarray(training_labels))
    np.save('saved-files/validation_labels', np.asarray(validation_labels))
def meta_train_data(request):
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {i: attribute == 'nominal'
                   for i, attribute in enumerate(attribute_types)}

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    logger = logging.getLogger('Meta')
    meta_features.helper_functions.set_value(
        "MissingValues",
        meta_features.helper_functions["MissingValues"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "NumSymbols",
        meta_features.helper_functions["NumSymbols"](X, y, logger, categorical),
    )
    meta_features.helper_functions.set_value(
        "ClassOccurences",
        meta_features.helper_functions["ClassOccurences"](X, y, logger),
    )

    if request.param == 'numpy':
        return X, y, categorical
    elif request.param == 'pandas':
        return pd.DataFrame(X), y, categorical
    else:
        raise ValueError(request.param)
def publish(self):
    """Publish the dataset on the OpenML server.

    Upload the dataset description and dataset content to openml.

    Returns
    -------
    dataset_id: int
        Id of the dataset uploaded to the server.
    """
    file_elements = {'description': self._to_xml()}

    # the arff dataset string is available
    if self._dataset is not None:
        file_elements['dataset'] = self._dataset
    else:
        # the path to the arff dataset is given
        if self.data_file is not None:
            path = os.path.abspath(self.data_file)
            if os.path.exists(path):
                try:
                    with io.open(path, encoding='utf8') as fh:
                        # check if arff is valid
                        decoder = arff.ArffDecoder()
                        decoder.decode(fh, encode_nominal=True)
                except arff.ArffException:
                    raise ValueError("The file you have provided is not "
                                     "a valid arff file.")

                with open(path, 'rb') as fp:
                    file_elements['dataset'] = fp.read()
        else:
            if self.url is None:
                raise ValueError("No url/path to the data file was given")

    return_value = openml._api_calls._perform_api_call(
        "data/", 'post', file_elements=file_elements,
    )
    response = xmltodict.parse(return_value)
    self.dataset_id = int(response['oml:upload_data_set']['oml:id'])
    return self.dataset_id
def detect_dialect(self):
    # The arff package loads an arff file into a dict with the keys:
    #   description (description of dataset)
    #   relation (name of dataset)
    #   attributes (list of tuples with name and type of attribute)
    #   data (list with the data rows)
    decoder = arff.ArffDecoder()
    if self.contents is None:
        with open(self.path, 'r') as file:
            weka = decoder.decode(file)
    else:
        weka = decoder.decode(self.decoded_contents)
        # The decoded contents are no longer needed and should not waste memory
        self.decoded_contents = None
    self.name = weka['relation']
    self.description = weka['description']
    # Attribute types are either 'REAL', 'INTEGER', 'NUMERIC', or a list of
    # values for nominal attributes
    self.attributes = weka['attributes']
    self.data = weka['data']
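# A minimal illustration of the dict structure described above; the arff
# string here is an assumption for illustration, not part of the original code.
import arff

example = """@RELATION weather
@ATTRIBUTE temperature NUMERIC
@ATTRIBUTE outlook {sunny,rainy}
@DATA
21.5,sunny
"""
weka = arff.ArffDecoder().decode(example)
# weka['relation']    == 'weather'
# weka['description'] == ''   (no leading % comment lines in this file)
# weka['attributes']  == [('temperature', 'NUMERIC'), ('outlook', ['sunny', 'rainy'])]
# weka['data']        == [[21.5, 'sunny']]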
def _get_file_elements(self) -> Dict:
    """ Adds the 'dataset' to file elements. """
    file_elements = {}
    path = None if self.data_file is None else os.path.abspath(self.data_file)

    if self._dataset is not None:
        file_elements['dataset'] = self._dataset
    elif path is not None and os.path.exists(path):
        with open(path, 'rb') as fp:
            file_elements['dataset'] = fp.read()
        try:
            dataset_utf8 = str(file_elements['dataset'], 'utf8')
            arff.ArffDecoder().decode(dataset_utf8, encode_nominal=True)
        except arff.ArffException:
            raise ValueError(
                "The file you have provided is not a valid arff file.")
    elif self.url is None:
        raise ValueError("No valid url/path to the data file was given.")
    return file_elements
def _load_arff(filename, target):
    with open(filename) as fh:
        decoder = arff.ArffDecoder()
        arff_object = decoder.decode(fh, encode_nominal=True)

    dataset_name = arff_object['relation']
    attributes = arff_object['attributes']
    data = arff_object['data']
    if isinstance(data, list):
        data = np.array(data)
    elif isinstance(data, tuple):
        data = sparse.coo_matrix(data)
    else:
        raise ValueError('arff returned unknown data format of type %s'
                         % str(type(data)))

    target_attribute = -1
    for i, attribute in enumerate(attributes):
        if attribute[0] == target:
            target_attribute = i
            break
    if target_attribute < 0:
        raise ValueError(
            'Target feature %s not found. Available features '
            'are: %s' % (target, str([attribute[0] for attribute in attributes])))

    y = data[:, target_attribute]
    X = data[:, np.arange(data.shape[1]) != target_attribute]

    # Do not add the target to the feat_type list
    feat_type = [
        'Categorical' if type(attribute[1]) in (list, tuple) else 'Numerical'
        for attribute in attributes[:-1]
    ]
    return X, y, dataset_name, feat_type
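# Hypothetical usage of the loader above; the file name and target column
# are assumptions for illustration, not part of the original code.
X, y, dataset_name, feat_type = _load_arff('iris.arff', target='class')
print(dataset_name, X.shape, y.shape, feat_type)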
def retrieve_class_labels(self, target_name='class'):
    """Reads the dataset's arff to determine the class labels.

    If the task has no class labels (for example a regression problem),
    it returns None. Necessary because the data returned by get_data
    only contains the indices of the classes, while OpenML needs the
    real classname when uploading the results of a run.

    Parameters
    ----------
    target_name : str
        Name of the target attribute

    Returns
    -------
    list
    """
    # TODO improve performance, currently reads the whole file
    # Should make a method that only reads the attributes
    arffFileName = self.data_file

    if self.format.lower() == 'arff':
        return_type = arff.DENSE
    elif self.format.lower() == 'sparse_arff':
        return_type = arff.COO
    else:
        raise ValueError('Unknown data format %s' % self.format)

    with io.open(arffFileName, encoding='utf8') as fh:
        arffData = arff.ArffDecoder().decode(fh, return_type=return_type)

    dataAttributes = dict(arffData['attributes'])
    if target_name in dataAttributes:
        return dataAttributes[target_name]
    else:
        return None
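# One way to address the TODO above: parse only the header. This is a
# sketch, not the OpenML implementation; _read_attributes_only is a
# hypothetical helper. It stops at the @DATA marker, so the decoder never
# sees the (potentially large) data section.
def _read_attributes_only(arff_path):
    header_lines = []
    with io.open(arff_path, encoding='utf8') as fh:
        for line in fh:
            header_lines.append(line)
            if line.strip().upper().startswith('@DATA'):
                break
    # decode the header followed by an empty data section
    decoded = arff.ArffDecoder().decode(''.join(header_lines))
    return decoded['attributes']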
def sparse_data_transformed():
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    categorical = {i: attribute == 'nominal'
                   for i, attribute in enumerate(attribute_types)}

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = FeatTypeSplit(feat_type={
        col: 'categorical' if category else 'numerical'
        for col, category in categorical.items()
    })
    X_transformed = ohe.fit_transform(X_sparse.copy())
    imp = SimpleImputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler(with_mean=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(list(categorical.values())))
    categorical_transformed = {
        i: i < (X_transformed.shape[1] - number_numerical)
        for i in range(X_transformed.shape[1])
    }

    X = X_sparse
    mf = meta_features.metafeatures
    helpers = meta_features.helper_functions
    logger = logging.getLogger()

    # Precompute some helper functions
    helpers.set_value(
        "PCA",
        helpers["PCA"](X_transformed, y, logger),
    )
    helpers.set_value(
        "MissingValues",
        helpers["MissingValues"](X, y, logger, categorical),
    )
    mf.set_value(
        "NumberOfMissingValues",
        mf["NumberOfMissingValues"](X, y, logger, categorical),
    )
    helpers.set_value(
        "NumSymbols",
        helpers["NumSymbols"](X, y, logger, categorical),
    )
    helpers.set_value(
        "ClassOccurences",
        helpers["ClassOccurences"](X, y, logger),
    )
    helpers.set_value(
        "Skewnesses",
        helpers["Skewnesses"](X_transformed, y, logger, categorical_transformed),
    )
    helpers.set_value(
        "Kurtosisses",
        helpers["Kurtosisses"](X_transformed, y, logger, categorical_transformed),
    )
    return X_transformed, y, categorical_transformed
def get_decoder(self):
    return arff.ArffDecoder()
def get_decoder(self, conversors):
    decoder = arff.ArffDecoder()
    # test helper: injects conversors through the decoder's private attribute
    decoder._conversors = conversors
    return decoder
    model.fit(x_train, y_train)
    predictionsLR = model.predict(x_test)
    a1 = accuracy_score(y_test, predictionsLR)
    return a1


# In[155]:

df_all = pd.DataFrame([])
files_to_read = [
    ("Combined1.arff", "FileName"),
    ("Combined2.arff", "FileName"),
    ("Combined3.arff", "FileName"),
    ("Combined5a.arff", "FileName"),
    ("Combined5b.arff", "FileName"),
]
for (file, file_data) in files_to_read:
    with open(file) as f:
        decoder = arff.ArffDecoder()
        datadictionary = decoder.decode(f, encode_nominal=True, return_type=arff.LOD)
        data = datadictionary['data']
    df1 = pd.DataFrame(data)
    # df[3094] = np.where(df[3094]==1.0, 'human', 'worm')
    df1 = df1.replace(-np.inf, np.nan)
    df1.fillna(df1.mean(), inplace=True)
    b = extract_metafeature(df1)
    b = pd.DataFrame(b)
    df_all = df_all.append(b)
print(df_all)


# In[156]:
def setUp(self):
    self.cwd = os.getcwd()
    tests_dir = __file__
    os.chdir(os.path.dirname(tests_dir))

    decoder = arff.ArffDecoder()
    with open(os.path.join("datasets", "dataset.arff")) as fh:
        dataset = decoder.decode(fh, encode_nominal=True)

    # -1 because the last attribute is the class
    self.attribute_types = [
        'nominal' if isinstance(type_, list) else 'numeric'
        for name, type_ in dataset['attributes'][:-1]
    ]
    self.categorical = [attribute == 'nominal'
                        for attribute in self.attribute_types]

    data = np.array(dataset['data'], dtype=np.float64)
    X = data[:, :-1]
    y = data[:, -1].reshape((-1,))

    # First, swap NaNs and zeros, because when converting an encoded
    # dense matrix to sparse, the values which are encoded to zero are lost
    X_sparse = X.copy()
    NaNs = ~np.isfinite(X_sparse)
    X_sparse[NaNs] = 0
    X_sparse = sparse.csr_matrix(X_sparse)

    ohe = OneHotEncoder(self.categorical)
    X_transformed = ohe.fit_transform(X_sparse.copy())
    imp = SimpleImputer(copy=False)
    X_transformed = imp.fit_transform(X_transformed)
    standard_scaler = StandardScaler(with_mean=False)
    X_transformed = standard_scaler.fit_transform(X_transformed)

    # Transform the array which indicates the categorical metafeatures
    number_numerical = np.sum(~np.array(self.categorical))
    categorical_transformed = \
        [True] * (X_transformed.shape[1] - number_numerical) + \
        [False] * number_numerical
    self.categorical_transformed = categorical_transformed

    self.X = X_sparse
    self.X_transformed = X_transformed
    self.y = y
    self.mf = meta_features.metafeatures
    self.helpers = meta_features.helper_functions

    # Precompute some helper functions
    self.helpers.set_value(
        "PCA", self.helpers["PCA"](self.X_transformed, self.y))
    self.helpers.set_value(
        "MissingValues",
        self.helpers["MissingValues"](self.X, self.y, self.categorical))
    self.mf.set_value(
        "NumberOfMissingValues",
        self.mf["NumberOfMissingValues"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "NumSymbols",
        self.helpers["NumSymbols"](self.X, self.y, self.categorical))
    self.helpers.set_value(
        "ClassOccurences",
        self.helpers["ClassOccurences"](self.X, self.y))
    self.helpers.set_value(
        "Skewnesses",
        self.helpers["Skewnesses"](self.X_transformed, self.y,
                                   self.categorical_transformed))
    self.helpers.set_value(
        "Kurtosisses",
        self.helpers["Kurtosisses"](self.X_transformed, self.y,
                                    self.categorical_transformed))