def _parse_dataset_id(raw):
    """Convert *raw* into an integer dataset id without executing code.

    Strings are parsed as Python literals only (e.g. ``"31"``, ``" 31 "``,
    ``"31.0"``); anything that is not a plain literal is rejected. Non-string
    values are handed straight to ``int``.

    Raises:
        ValueError: if *raw* cannot be interpreted as an integer.
    """
    import ast  # local import keeps this block self-contained

    try:
        value = ast.literal_eval(raw.strip()) if isinstance(raw, str) else raw
        return int(value)
    except (ValueError, SyntaxError, TypeError, OverflowError) as err:
        raise ValueError('Invalid OpenML dataset id: {!r}'.format(raw)) from err


def get_dataset(id):
    """Fetch an OpenML dataset and build a matching ``DatasetConfig``.

    Args:
        id: OpenML dataset id, either an integer or its literal string form.
            (The name shadows the builtin but is kept for caller
            compatibility.)

    Returns:
        A tuple ``(X, y, dataset_conf)`` where ``X`` and ``y`` are the
        features and target returned by ``ds.get_data`` and ``dataset_conf``
        carries the dataset format, target column and a timestamped name.
        ``(None, None, None)`` is returned when the dataset reports more than
        100000 missing values or has more than 500000 columns.

    Raises:
        ValueError: if ``id`` is not a valid integer literal.
    """
    # Parse the id once, before any network access. The previous
    # ``int(eval(id))`` executed arbitrary expressions and failed on ints.
    dataset_id = _parse_dataset_id(id)

    ds = openml.datasets.get_dataset(dataset_id)
    X, y, _, _ = ds.get_data(
        dataset_format='dataframe',
        target=ds.default_target_attribute,
    )

    # Skip datasets that are too sparse or too wide to process.
    if ds.qualities['NumberOfMissingValues'] > 100000 or X.shape[1] > 500000:
        return None, None, None

    # The config keeps the id exactly as the caller supplied it.
    dataset_conf = DatasetConfig({'openml': id, 'train_path': None})
    dataset_conf.format = ds.format
    dataset_conf.class_column = ds.default_target_attribute
    # The timestamp suffix makes the name unique per load.
    dataset_conf.name = '{}_{}_{}'.format(ds.name, dataset_id, time.time())
    return X, y, dataset_conf
def load_openml(dataset_conf: DatasetConfig) -> pd.DataFrame:
    """Load the OpenML dataset referenced by ``dataset_conf.openml``.

    The features and target returned by OpenML are joined column-wise into a
    single frame. As a side effect, ``dataset_conf`` is updated in place with
    the dataset's format, its default target attribute as ``class_column``,
    and a timestamped ``name``.

    Args:
        dataset_conf: configuration whose ``openml`` attribute identifies the
            dataset to fetch.

    Returns:
        The concatenation of the feature columns and the target column.
    """
    openml_id = dataset_conf.openml
    LOGGER.info("Loading openml dataset {}".format(openml_id))

    dataset = openml.datasets.get_dataset(openml_id)
    target_name = dataset.default_target_attribute
    features, target, _, _ = dataset.get_data(
        dataset_format='dataframe',
        target=target_name,
    )
    frame = pd.concat([features, target], axis=1)

    # Bring the configuration in line with what OpenML actually reports.
    dataset_conf.format = dataset.format
    dataset_conf.class_column = dataset.default_target_attribute
    dataset_conf.name = '{}_{}_{}'.format(
        dataset.name, dataset_conf.openml, time.time()
    )

    return frame