Example #1
def get_task(task_id):
    """Download the OpenML task for a given task ID.

    Parameters
    ----------
    task_id : int
        The OpenML task id.
    """
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
        raise ValueError("Task ID is neither an Integer nor can be "
                         "cast to an Integer.")

    tid_cache_dir = _create_task_cache_dir(task_id)

    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % task_id,
            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
    ):
        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            class_labels = dataset.retrieve_class_labels(task.target_name)
            task.class_labels = class_labels
            task.download_split()

        except Exception as e:
            _remove_task_cache_dir(tid_cache_dir)
            raise e

    return task
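Every example on this page follows the same idiom: wrap the critical section in a with block over a named, file-based interprocess lock. Below is a minimal, self-contained sketch of just that idiom, assuming only that oslo.concurrency is installed; the lock name and directory are placeholders, not part of the OpenML code above.

import os
import tempfile

from oslo_concurrency import lockutils

# Placeholder lock directory; the examples on this page derive it from
# the OpenML cache directory instead.
lock_dir = os.path.join(tempfile.gettempdir(), 'locks')
os.makedirs(lock_dir, exist_ok=True)

# external_lock returns an interprocess file lock; entering the context
# manager blocks until no other process holds a lock with the same name.
with lockutils.external_lock(name='demo.get_task:%d' % 42,
                             lock_path=lock_dir):
    print('lock held; at most one process runs this block at a time')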
Example #2
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in settings.py
    upon first start. Safe to use, e.g. when multiple Python interpreters
    serve the dashboard Django application (e.g. in a mod_wsgi + daemonized
    environment).  Also checks if file permissions are set correctly and
    throws an exception if not.
    """
    abspath = os.path.abspath(key_file)
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            with open(key_file, 'w') as f:
                f.write(key)
            os.umask(old_umask)
        else:
            if (os.stat(key_file).st_mode & 0o777) != 0o600:
                raise FilePermissionError("Insecure key file permissions!")
            with open(key_file, 'r') as f:
                key = f.readline()
        return key
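The os.umask(0o177) dance above is what guarantees the 0600 permissions that the else branch later checks for. A standalone sketch of that technique, assuming the file does not already exist (the file name is made up for the demonstration):

import os

# A umask of 0o177 clears the group/other bits and the owner execute bit,
# so open(..., 'w') creates the file with mode 0o666 & ~0o177 == 0o600.
old_umask = os.umask(0o177)
try:
    with open('demo.secret', 'w') as f:  # hypothetical file name
        f.write('example-key')
finally:
    os.umask(old_umask)  # always restore the previous umask

assert (os.stat('demo.secret').st_mode & 0o777) == 0o600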
Example #3
def get_task(task_id):
    """Download the OpenML task for a given task ID.
    Parameters
    ----------
    task_id : int
        The OpenML task id.
    """
    task_id = int(task_id)

    with lockutils.external_lock(
            name='task.functions.get_task:%d' % task_id,
            lock_path=openml.utils._create_lockfiles_dir(),
    ):
        tid_cache_dir = openml.utils._create_cache_directory_for_id(
            TASKS_CACHE_DIR_NAME, task_id,
        )

        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            # Clustering tasks do not have class labels
            # and do not offer download_split
            if isinstance(task, OpenMLSupervisedTask):
                task.download_split()
                if isinstance(task, OpenMLClassificationTask):
                    task.class_labels = \
                        dataset.retrieve_class_labels(task.target_name)
        except Exception as e:
            openml.utils._remove_cache_dir_for_id(
                TASKS_CACHE_DIR_NAME,
                tid_cache_dir,
            )
            raise e

    return task
Example #4
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in settings.py
    upon first start. Safe to use, e.g. when multiple Python interpreters
    serve the dashboard Django application (e.g. in a mod_wsgi + daemonized
    environment).  Also checks if file permissions are set correctly and
    throws an exception if not.
    """
    abspath = os.path.abspath(key_file)
    # check, if key_file already exists
    # if yes, then just read and return key
    if os.path.exists(key_file):
        key = read_from_file(key_file)
        return key

    # otherwise, first lock to make sure only one process
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            with open(key_file, 'w') as f:
                f.write(key)
            os.umask(old_umask)
        else:
            key = read_from_file(key_file)
        return key
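Unlike Example #2, this version reads the key without taking the lock at all on the common path and only locks when the file is missing; the second os.path.exists check inside the lock is essential, because another process may have created the file between the first check and the lock acquisition. The pattern in schematic form, with hypothetical helper names:

import os

from oslo_concurrency import lockutils


def get_or_create(path, create, read):
    # Fast path: once the file exists it is never rewritten, so reading
    # it without the lock is safe.
    if os.path.exists(path):
        return read(path)
    abspath = os.path.abspath(path)
    with lockutils.external_lock(path + '.lock',
                                 lock_path=os.path.dirname(abspath)):
        # Re-check under the lock: we may have lost the race.
        if not os.path.exists(path):
            create(path)
        return read(path)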
Example #5
def get_task(task_id):
    """Download the OpenML task for a given task ID.

    Parameters
    ----------
    task_id : int
        The OpenML task id.
    """
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
        raise ValueError("Task ID is neither an Integer nor can be "
                         "cast to an Integer.")

    tid_cache_dir = openml.utils._create_cache_directory_for_id(
        TASKS_CACHE_DIR_NAME,
        task_id,
    )

    with lockutils.external_lock(
            name='task.functions.get_task:%d' % task_id,
            lock_path=openml.utils._create_lockfiles_dir(),
    ):
        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            class_labels = dataset.retrieve_class_labels(task.target_name)
            task.class_labels = class_labels
            task.download_split()
        except Exception as e:
            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME,
                                                  tid_cache_dir)
            raise e

    return task
Example #6
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in settings.py
    upon first start. Safe to use, e.g. when multiple Python interpreters
    serve the dashboard Django application (e.g. in a mod_wsgi + daemonized
    environment).  Also checks if file permissions are set correctly and
    throws an exception if not.
    """
    abspath = os.path.abspath(key_file)
    # check, if key_file already exists
    # if yes, then just read and return key
    if os.path.exists(key_file):
        key = read_from_file(key_file)
        return key

    # otherwise, first lock to make sure only one process
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            with open(key_file, 'w') as f:
                f.write(key)
            os.umask(old_umask)
        else:
            key = read_from_file(key_file)
        return key
Example #7
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in settings.py
    upon first start. Safe to use, e.g. when multiple Python interpreters
    serve the dashboard Django application (e.g. in a mod_wsgi + daemonized
    environment).  Also checks if file permissions are set correctly and
    throws an exception if not.
    """
    abspath = os.path.abspath(key_file)
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            with open(key_file, 'w') as f:
                f.write(key)
            os.umask(old_umask)
        else:
            if (os.stat(key_file).st_mode & 0o777) != 0o600:
                raise FilePermissionError("Insecure key file permissions!")
            with open(key_file, 'r') as f:
                key = f.readline()
        return key
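The check in the else branch masks the permission bits out of st_mode and compares against 0o600 (owner read/write only). The same check in isolation, with the project-specific FilePermissionError replaced by a built-in:

import os

st_mode = os.stat('.secret_key').st_mode  # assumes the key file exists
if (st_mode & 0o777) != 0o600:  # keep only the rwx bits of user/group/other
    raise RuntimeError('Insecure key file permissions!')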
Example #8
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset."""
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")

    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % dataset_id,
            lock_path=_create_lockfiles_dir(),
    ):
        did_cache_dir = _create_cache_directory_for_id(
            DATASETS_CACHE_DIR_NAME,
            dataset_id,
        )

        try:
            remove_dataset_cache = True
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
            remove_dataset_cache = False
        except OpenMLServerException as e:
            # if there was an exception, check if the user had access to the dataset
            if e.code == 112:
                six.raise_from(PrivateDatasetError(e.message), None)
            else:
                raise e
        finally:
            if remove_dataset_cache:
                _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
                                         did_cache_dir)

        dataset = _create_dataset_from_description(description, features,
                                                   qualities, arff_file)
    return dataset
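The remove_dataset_cache flag plus the finally block implement cleanup-on-failure: the flag is flipped to False only if every download step succeeded, so the partially filled cache directory is removed on any exception, including the re-raised PrivateDatasetError. The skeleton of that pattern, with hypothetical stand-in functions:

def download_everything():  # hypothetical stand-in for the steps above
    raise IOError('simulated failure')

def remove_partial_state():  # hypothetical cleanup
    print('removing partial cache')

cleanup = True
try:
    download_everything()
    cleanup = False  # reached only on full success
finally:
    if cleanup:
        remove_partial_state()  # runs on any failure; the exception then propagates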
Example #9
def _remove_pickle_files(self):
    cache_dir = self.static_cache_dir
    for did in ['-1', '2']:
        with lockutils.external_lock(
                name='datasets.functions.get_dataset:%s' % did,
                lock_path=os.path.join(openml.config.get_cache_directory(), 'locks'),
        ):
            pickle_path = os.path.join(cache_dir, 'datasets', did,
                                       'dataset.pkl')
            try:
                os.remove(pickle_path)
            except OSError:
                pass
Example #11
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset."""
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")

    with lockutils.external_lock(
        name='datasets.functions.get_dataset:%d' % dataset_id,
        lock_path=_create_lockfiles_dir(),
    ):
        did_cache_dir = _create_cache_directory_for_id(
            DATASETS_CACHE_DIR_NAME, dataset_id,
        )

        try:
            remove_dataset_cache = True
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
            remove_dataset_cache = False
        except OpenMLServerException as e:
            # if there was an exception, check if the user had access to the dataset
            if e.code == 112:
                six.raise_from(PrivateDatasetError(e.message), None)
            else:
                raise e
        finally:
            if remove_dataset_cache:
                _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME, did_cache_dir)

        dataset = _create_dataset_from_description(
            description, features, qualities, arff_file
        )
    return dataset
Example #12
    def setUp(self):
        # This cache directory is checked in to git to simulate a populated
        # cache
        self.maxDiff = None
        self.static_cache_dir = None
        static_cache_dir = os.path.dirname(
            os.path.abspath(inspect.getfile(self.__class__)))
        static_cache_dir = os.path.abspath(os.path.join(
            static_cache_dir, '..'))
        content = os.listdir(static_cache_dir)
        if 'files' in content:
            self.static_cache_dir = os.path.join(static_cache_dir, 'files')

        if self.static_cache_dir is None:
            raise ValueError('Cannot find test cache dir!')

        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        tmp_dir_name = self.id()
        self.workdir = os.path.join(workdir, tmp_dir_name)
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        # amueller's read/write key that he will throw away later
        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
        self.production_server = "https://openml.org/api/v1/xml"
        self.test_server = "https://test.openml.org/api/v1/xml"
        openml.config.cache_directory = None

        openml.config.server = self.test_server
        openml.config.avoid_duplicate_runs = False

        openml.config.cache_directory = self.workdir

        # If we're on travis, we save the api key in the config file to allow
        # the notebook tests to read them.
        if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
            with lockutils.external_lock('config', lock_path=self.workdir):
                with open(openml.config.config_file, 'w') as fh:
                    fh.write('apikey = %s' % openml.config.apikey)

        # Increase the number of retries to avoid spurious server failures
        self.connection_n_retries = openml.config.connection_n_retries
        openml.config.connection_n_retries = 10
Example #13
def _remove_pickle_files(self):
    cache_dir = self.static_cache_dir
    for did in ['-1', '2']:
        with lockutils.external_lock(
                name='datasets.functions.get_dataset:%s' % did,
                lock_path=os.path.join(openml.config.get_cache_directory(),
                                       'locks'),
        ):
            pickle_path = os.path.join(cache_dir, 'datasets', did,
                                       'dataset.pkl')
            try:
                os.remove(pickle_path)
            except OSError:
                # FileNotFoundError is a subclass of OSError, so OSError
                # alone already covers a missing pickle file.
                pass
Example #14
    def setUp(self):
        # This cache directory is checked in to git to simulate a populated
        # cache
        self.maxDiff = None
        self.static_cache_dir = None
        static_cache_dir = os.path.dirname(os.path.abspath(inspect.getfile(self.__class__)))
        static_cache_dir = os.path.abspath(os.path.join(static_cache_dir, '..'))
        content = os.listdir(static_cache_dir)
        if 'files' in content:
            self.static_cache_dir = os.path.join(static_cache_dir, 'files')

        if self.static_cache_dir is None:
            raise ValueError('Cannot find test cache dir!')

        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        tmp_dir_name = self.id()
        self.workdir = os.path.join(workdir, tmp_dir_name)
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        # amueller's read/write key that he will throw away later
        openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
        self.production_server = "https://openml.org/api/v1/xml"
        self.test_server = "https://test.openml.org/api/v1/xml"
        openml.config.cache_directory = None

        openml.config.server = self.test_server
        openml.config.avoid_duplicate_runs = False

        openml.config.cache_directory = self.workdir

        # If we're on travis, we save the api key in the config file to allow
        # the notebook tests to read them.
        if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
            with lockutils.external_lock('config', lock_path=self.workdir):
                with open(openml.config.config_file, 'w') as fh:
                    fh.write('apikey = %s' % openml.config.apikey)

        # Increase the number of retries to avoid spurious server failures
        self.connection_n_retries = openml.config.connection_n_retries
        openml.config.connection_n_retries = 10
Example #15
def safe_func(*args, **kwargs):
    # Lock directories use the id that is passed as either positional or
    # keyword argument.
    id_parameters = [
        parameter_name for parameter_name in kwargs
        if "_id" in parameter_name
    ]
    if len(id_parameters) == 1:
        id_ = kwargs[id_parameters[0]]
    elif len(args) > 0:
        id_ = args[0]
    else:
        raise RuntimeError(
            "An id must be specified for {}, was passed: ({}, {}).".
            format(func.__name__, args, kwargs))
    # The [7:] gets rid of the 'openml.' prefix
    lock_name = "{}.{}:{}".format(func.__module__[7:], func.__name__, id_)
    with lockutils.external_lock(name=lock_name,
                                 lock_path=_create_lockfiles_dir()):
        return func(*args, **kwargs)
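safe_func above is the inner wrapper of a locking decorator; func and _create_lockfiles_dir come from the enclosing scope, which this page does not show. A self-contained sketch of how such a decorator could look (the decorator name and the lock directory are assumptions, not the OpenML API):

import functools
import os
import tempfile

from oslo_concurrency import lockutils


def locked_by_id(func):  # hypothetical decorator name
    @functools.wraps(func)
    def safe_func(*args, **kwargs):
        # The id comes from the first positional argument, or from the
        # single keyword argument whose name contains '_id'.
        id_kwargs = [name for name in kwargs if '_id' in name]
        if len(id_kwargs) == 1:
            id_ = kwargs[id_kwargs[0]]
        elif args:
            id_ = args[0]
        else:
            raise RuntimeError('An id must be specified for %s, was passed: '
                               '(%s, %s).' % (func.__name__, args, kwargs))
        lock_name = '%s.%s:%s' % (func.__module__, func.__name__, id_)
        lock_dir = os.path.join(tempfile.gettempdir(), 'locks')  # placeholder
        os.makedirs(lock_dir, exist_ok=True)
        with lockutils.external_lock(name=lock_name, lock_path=lock_dir):
            return func(*args, **kwargs)
    return safe_func


@locked_by_id
def get_thing(thing_id):
    return thing_id  # at most one process works on a given id at a time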
Example #16
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset."""
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")

    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % dataset_id,
            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
    ):
        did_cache_dir = _create_dataset_cache_directory(dataset_id)

        try:
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            # TODO not used yet, figure out what to do with this...
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
        except Exception as e:
            _remove_dataset_cache_dir(did_cache_dir)
            raise e

        dataset = _create_dataset_from_description(description, features,
                                                   qualities, arff_file)
    return dataset
Example #17
    def setUp(self, n_levels: int = 1):
        """Setup variables and temporary directories.

        In particular, this methods:

        * creates a temporary working directory
        * figures out a path to a few static test files
        * set the default server to be the test server
        * set a static API key for the test server
        * increases the maximal number of retries

        Parameters
        ----------
        n_levels : int
            Number of nested directories the test is in. Necessary to resolve the path to the
            ``files`` directory, which is located directly under the ``tests`` directory.
        """

        # This cache directory is checked in to git to simulate a populated
        # cache
        self.maxDiff = None
        self.static_cache_dir = None
        abspath_this_file = os.path.abspath(inspect.getfile(self.__class__))
        static_cache_dir = os.path.dirname(abspath_this_file)
        for _ in range(n_levels):
            static_cache_dir = os.path.abspath(
                os.path.join(static_cache_dir, '..'))
        content = os.listdir(static_cache_dir)
        if 'files' in content:
            self.static_cache_dir = os.path.join(static_cache_dir, 'files')

        if self.static_cache_dir is None:
            raise ValueError(
                'Cannot find test cache dir, expected it to be {}!'.format(
                    static_cache_dir))

        self.cwd = os.getcwd()
        workdir = os.path.dirname(os.path.abspath(__file__))
        tmp_dir_name = self.id()
        self.workdir = os.path.join(workdir, tmp_dir_name)
        shutil.rmtree(self.workdir, ignore_errors=True)

        os.mkdir(self.workdir)
        os.chdir(self.workdir)

        self.cached = True
        openml.config.apikey = TestBase.apikey
        self.production_server = "https://openml.org/api/v1/xml"
        openml.config.server = TestBase.test_server
        openml.config.avoid_duplicate_runs = False
        openml.config.cache_directory = self.workdir

        # If we're on travis, we save the api key in the config file to allow
        # the notebook tests to read them.
        if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
            with lockutils.external_lock('config', lock_path=self.workdir):
                with open(openml.config.config_file, 'w') as fh:
                    fh.write('apikey = %s' % openml.config.apikey)

        # Increase the number of retries to avoid spurious server failures
        self.connection_n_retries = openml.config.connection_n_retries
        openml.config.connection_n_retries = 10