Ejemplo n.º 1
0
def get_dataset(id):
    """Fetch an OpenML dataset and build a ``DatasetConfig`` describing it.

    Args:
        id: OpenML dataset identifier. Either an integer, or a string/bytes
            value holding a Python numeric literal (e.g. ``"31"``). The name
            shadows the builtin ``id`` but is kept so existing callers that
            pass it by keyword keep working.

    Returns:
        A tuple ``(X, y, dataset_conf)`` where ``X`` and ``y`` are the
        features and target returned by ``ds.get_data`` in dataframe format,
        and ``dataset_conf`` is the populated ``DatasetConfig``. Returns
        ``(None, None, None)`` when the dataset reports more than 100000
        missing values or has more than 500000 feature columns.

    Raises:
        ValueError, SyntaxError: If a string ``id`` is not a valid Python
            literal.
    """
    import ast

    # SECURITY: the original called eval() on the identifier, which executes
    # arbitrary code if the value comes from outside the program.
    # ast.literal_eval only accepts literals, so numeric strings still parse
    # while expressions and calls are rejected.
    literal = id.decode() if isinstance(id, bytes) else id
    if isinstance(literal, str):
        dataset_id = int(ast.literal_eval(literal))
    else:
        # Already numeric (eval() would have raised TypeError here).
        dataset_id = int(literal)

    ds = openml.datasets.get_dataset(dataset_id)
    X, y, _categorical_indicator, _attribute_names = ds.get_data(
        dataset_format='dataframe',
        target=ds.default_target_attribute
    )

    # Skip datasets that exceed the missing-value or column-count limits.
    if ds.qualities['NumberOfMissingValues'] > 100000 or X.shape[1] > 500000:
        return None, None, None

    # The config keeps the identifier exactly as the caller supplied it.
    dataset_conf = DatasetConfig({'openml': id, 'train_path': None})
    dataset_conf.format = ds.format
    dataset_conf.class_column = ds.default_target_attribute
    # The current timestamp is appended to the generated name.
    dataset_conf.name = '{}_{}_{}'.format(ds.name, dataset_id, time.time())
    return X, y, dataset_conf
Ejemplo n.º 2
0
def load_openml(dataset_conf: DatasetConfig) -> pd.DataFrame:
    """Download the OpenML dataset named by ``dataset_conf.openml``.

    The features and the default target attribute are fetched in dataframe
    format and joined column-wise into a single frame. As a side effect,
    ``dataset_conf`` is updated in place with the dataset's format, its
    class column and a generated name.

    Args:
        dataset_conf: Configuration whose ``openml`` attribute identifies the
            dataset to load; its ``format``, ``class_column`` and ``name``
            attributes are overwritten.

    Returns:
        A DataFrame with the feature columns followed by the target.
    """
    openml_id = dataset_conf.openml
    LOGGER.info("Loading openml dataset {}".format(openml_id))

    dataset = openml.datasets.get_dataset(openml_id)
    target_name = dataset.default_target_attribute
    # Only the features and target are needed; the remaining two elements of
    # the returned tuple are discarded.
    features, target, _, _ = dataset.get_data(
        dataset_format='dataframe',
        target=target_name,
    )

    # Fix configuration
    dataset_conf.format = dataset.format
    dataset_conf.class_column = dataset.default_target_attribute
    name_parts = (dataset.name, dataset_conf.openml, time.time())
    dataset_conf.name = '{}_{}_{}'.format(*name_parts)

    return pd.concat([features, target], axis=1)