def get_task(task_id):
    """Download the OpenML task for a given task ID.

    Parameters
    ----------
    task_id : int
        The OpenML task id.

    Returns
    -------
    task
        The downloaded task, with class labels and splits attached.

    Raises
    ------
    ValueError
        If ``task_id`` cannot be cast to an ``int``.
    """
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except``: only conversion failures should
        # be translated into this ValueError.
        raise ValueError("Task ID is neither an Integer nor can be "
                         "cast to an Integer.")
    tid_cache_dir = _create_task_cache_dir(task_id)
    # Serialize concurrent downloads of the same task across processes.
    with lockutils.external_lock(
            name='datasets.functions.get_task:%d' % task_id,
            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
    ):
        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            class_labels = dataset.retrieve_class_labels(task.target_name)
            task.class_labels = class_labels
            task.download_split()
        except Exception as e:
            # Leave no partially-populated cache directory behind.
            _remove_task_cache_dir(tid_cache_dir)
            raise e
    return task
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in
    settings.py upon first start. Safe to use, i.e. when multiple Python
    interpreters serve the dashboard Django application (e.g. in a
    mod_wsgi + daemonized environment).

    Also checks if file permissions are set correctly and throws an
    exception if not.
    """
    abspath = os.path.abspath(key_file)
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            try:
                with open(key_file, 'w') as f:
                    f.write(key)
            finally:
                # Restore the umask even if writing fails; previously an
                # exception here left the process umask clobbered.
                os.umask(old_umask)
        else:
            if (os.stat(key_file).st_mode & 0o777) != 0o600:
                raise FilePermissionError("Insecure key file permissions!")
            with open(key_file, 'r') as f:
                key = f.readline()
    return key
def get_task(task_id):
    """Download the OpenML task for a given task ID.

    Parameters
    ----------
    task_id : int
        The OpenML task id.

    Returns
    -------
    task
        The downloaded task; splits and class labels are attached for the
        task types that provide them.

    Raises
    ------
    ValueError
        If ``task_id`` cannot be cast to an ``int``.
    """
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
        # Guard added for consistency with the other task/dataset getters,
        # which translate conversion failures into a descriptive ValueError.
        raise ValueError("Task ID is neither an Integer nor can be "
                         "cast to an Integer.")
    with lockutils.external_lock(
            name='task.functions.get_task:%d' % task_id,
            lock_path=openml.utils._create_lockfiles_dir(),
    ):
        tid_cache_dir = openml.utils._create_cache_directory_for_id(
            TASKS_CACHE_DIR_NAME, task_id,
        )
        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            # Clustering tasks do not have class labels
            # and do not offer download_split
            if isinstance(task, OpenMLSupervisedTask):
                task.download_split()
                if isinstance(task, OpenMLClassificationTask):
                    task.class_labels = \
                        dataset.retrieve_class_labels(task.target_name)
        except Exception as e:
            # Leave no partially-populated cache directory behind.
            openml.utils._remove_cache_dir_for_id(
                TASKS_CACHE_DIR_NAME, tid_cache_dir,
            )
            raise e
    return task
def generate_or_read_from_file(key_file='.secret_key', key_length=64):
    """Multiprocess-safe secret key file generator.

    Useful to replace the default (and thus unsafe) SECRET_KEY in
    settings.py upon first start. Safe to use, i.e. when multiple Python
    interpreters serve the dashboard Django application (e.g. in a
    mod_wsgi + daemonized environment).

    Also checks if file permissions are set correctly and throws an
    exception if not.
    """
    abspath = os.path.abspath(key_file)
    # check, if key_file already exists
    # if yes, then just read and return key
    if os.path.exists(key_file):
        key = read_from_file(key_file)
        return key
    # otherwise, first lock to make sure only one process
    lock = lockutils.external_lock(key_file + ".lock",
                                   lock_path=os.path.dirname(abspath))
    with lock:
        if not os.path.exists(key_file):
            key = generate_key(key_length)
            old_umask = os.umask(0o177)  # Use '0600' file permissions
            try:
                with open(key_file, 'w') as f:
                    f.write(key)
            finally:
                # Restore the umask even if writing fails; previously an
                # exception here left the process umask clobbered.
                os.umask(old_umask)
        else:
            # Another process created the file while we waited for the lock.
            key = read_from_file(key_file)
    return key
def get_task(task_id):
    """Download the OpenML task for a given task ID.

    Parameters
    ----------
    task_id : int
        The OpenML task id.

    Returns
    -------
    task
        The downloaded task, with class labels and splits attached.

    Raises
    ------
    ValueError
        If ``task_id`` cannot be cast to an ``int``.
    """
    try:
        task_id = int(task_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except``: only conversion failures should
        # be translated into this ValueError.
        raise ValueError("Task ID is neither an Integer nor can be "
                         "cast to an Integer.")
    tid_cache_dir = openml.utils._create_cache_directory_for_id(
        TASKS_CACHE_DIR_NAME, task_id,
    )
    # Serialize concurrent downloads of the same task across processes.
    with lockutils.external_lock(
            name='task.functions.get_task:%d' % task_id,
            lock_path=openml.utils._create_lockfiles_dir(),
    ):
        try:
            task = _get_task_description(task_id)
            dataset = get_dataset(task.dataset_id)
            class_labels = dataset.retrieve_class_labels(task.target_name)
            task.class_labels = class_labels
            task.download_split()
        except Exception as e:
            # Leave no partially-populated cache directory behind.
            openml.utils._remove_cache_dir_for_id(TASKS_CACHE_DIR_NAME,
                                                  tid_cache_dir)
            raise e
    return task
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset.

    Raises
    ------
    ValueError
        If ``dataset_id`` cannot be cast to an ``int``.
    """
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except``: only conversion failures should
        # be translated into this ValueError.
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")
    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % dataset_id,
            lock_path=_create_lockfiles_dir(),
    ):
        did_cache_dir = _create_cache_directory_for_id(
            DATASETS_CACHE_DIR_NAME, dataset_id,
        )
        try:
            remove_dataset_cache = True
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
            remove_dataset_cache = False
        except OpenMLServerException as e:
            # if there was an exception, check if the user had access to the
            # dataset
            if e.code == 112:
                six.raise_from(PrivateDatasetError(e.message), None)
            else:
                raise e
        finally:
            # Drop the cache dir unless every artifact was fetched.
            if remove_dataset_cache:
                _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
                                         did_cache_dir)
        dataset = _create_dataset_from_description(description, features,
                                                   qualities, arff_file)
    return dataset
def _remove_pickle_files(self):
    """Delete cached dataset pickles for the fixed test dataset ids."""
    cache_dir = self.static_cache_dir
    for did in ['-1', '2']:
        # Take the same lock get_dataset() uses so the pickle is never
        # removed while another process is writing it.
        with lockutils.external_lock(
                name='datasets.functions.get_dataset:%s' % did,
                lock_path=os.path.join(openml.config.get_cache_directory(),
                                       'locks'),
        ):
            pickle_path = os.path.join(cache_dir, 'datasets', did,
                                       'dataset.pkl')
            try:
                os.remove(pickle_path)
            except OSError:
                # Narrowed from a bare ``except``: removal is best-effort
                # (the pickle may not exist), so only OS errors are ignored.
                pass
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset.

    Raises
    ------
    ValueError
        If ``dataset_id`` cannot be cast to an ``int``.
    """
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except``: only conversion failures should
        # be translated into this ValueError.
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")
    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % dataset_id,
            lock_path=_create_lockfiles_dir(),
    ):
        did_cache_dir = _create_cache_directory_for_id(
            DATASETS_CACHE_DIR_NAME, dataset_id,
        )
        try:
            remove_dataset_cache = True
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
            remove_dataset_cache = False
        except OpenMLServerException as e:
            # if there was an exception, check if the user had access to the
            # dataset
            if e.code == 112:
                six.raise_from(PrivateDatasetError(e.message), None)
            else:
                raise e
        finally:
            # Drop the cache dir unless every artifact was fetched.
            if remove_dataset_cache:
                _remove_cache_dir_for_id(DATASETS_CACHE_DIR_NAME,
                                         did_cache_dir)
        dataset = _create_dataset_from_description(
            description, features, qualities, arff_file
        )
    return dataset
def setUp(self):
    """Create a fresh per-test working directory and point openml at the
    test server with a static API key."""
    # This cache directory is checked in to git to simulate a populated
    # cache
    self.maxDiff = None
    self.static_cache_dir = None
    static_cache_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(self.__class__)))
    static_cache_dir = os.path.abspath(os.path.join(
        static_cache_dir, '..'))
    content = os.listdir(static_cache_dir)
    if 'files' in content:
        self.static_cache_dir = os.path.join(static_cache_dir, 'files')
    if self.static_cache_dir is None:
        raise ValueError('Cannot find test cache dir!')
    self.cwd = os.getcwd()
    workdir = os.path.dirname(os.path.abspath(__file__))
    tmp_dir_name = self.id()
    self.workdir = os.path.join(workdir, tmp_dir_name)
    # ignore_errors replaces the former bare ``try/except: pass``; the
    # directory may simply not exist yet.
    shutil.rmtree(self.workdir, ignore_errors=True)
    os.mkdir(self.workdir)
    os.chdir(self.workdir)
    self.cached = True
    # amueller's read/write key that he will throw away later
    openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
    self.production_server = "https://openml.org/api/v1/xml"
    self.test_server = "https://test.openml.org/api/v1/xml"
    openml.config.cache_directory = None
    openml.config.server = self.test_server
    openml.config.avoid_duplicate_runs = False
    openml.config.cache_directory = self.workdir
    # If we're on travis, we save the api key in the config file to allow
    # the notebook tests to read them.
    if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
        with lockutils.external_lock('config', lock_path=self.workdir):
            with open(openml.config.config_file, 'w') as fh:
                fh.write('apikey = %s' % openml.config.apikey)
    # Increase the number of retries to avoid spurious server failures
    self.connection_n_retries = openml.config.connection_n_retries
    openml.config.connection_n_retries = 10
def _remove_pickle_files(self):
    """Delete cached dataset pickles for the fixed test dataset ids."""
    cache_dir = self.static_cache_dir
    for did in ['-1', '2']:
        # Take the same lock get_dataset() uses so the pickle is never
        # removed while another process is writing it.
        with lockutils.external_lock(
                name='datasets.functions.get_dataset:%s' % did,
                lock_path=os.path.join(openml.config.get_cache_directory(),
                                       'locks'),
        ):
            pickle_path = os.path.join(cache_dir, 'datasets', did,
                                       'dataset.pkl')
            try:
                os.remove(pickle_path)
            except OSError:
                # FileNotFoundError is a subclass of OSError, so listing
                # both was redundant. Removal is best-effort: the pickle
                # may not have been created by a previous run.
                pass
def setUp(self):
    """Create a fresh per-test working directory and point openml at the
    test server with a static API key."""
    # This cache directory is checked in to git to simulate a populated
    # cache
    self.maxDiff = None
    self.static_cache_dir = None
    static_cache_dir = os.path.dirname(
        os.path.abspath(inspect.getfile(self.__class__)))
    static_cache_dir = os.path.abspath(
        os.path.join(static_cache_dir, '..'))
    content = os.listdir(static_cache_dir)
    if 'files' in content:
        self.static_cache_dir = os.path.join(static_cache_dir, 'files')
    if self.static_cache_dir is None:
        raise ValueError('Cannot find test cache dir!')
    self.cwd = os.getcwd()
    workdir = os.path.dirname(os.path.abspath(__file__))
    tmp_dir_name = self.id()
    self.workdir = os.path.join(workdir, tmp_dir_name)
    # ignore_errors replaces the former bare ``try/except: pass``; the
    # directory may simply not exist yet.
    shutil.rmtree(self.workdir, ignore_errors=True)
    os.mkdir(self.workdir)
    os.chdir(self.workdir)
    self.cached = True
    # amueller's read/write key that he will throw away later
    openml.config.apikey = "610344db6388d9ba34f6db45a3cf71de"
    self.production_server = "https://openml.org/api/v1/xml"
    self.test_server = "https://test.openml.org/api/v1/xml"
    openml.config.cache_directory = None
    openml.config.server = self.test_server
    openml.config.avoid_duplicate_runs = False
    openml.config.cache_directory = self.workdir
    # If we're on travis, we save the api key in the config file to allow
    # the notebook tests to read them.
    if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
        with lockutils.external_lock('config', lock_path=self.workdir):
            with open(openml.config.config_file, 'w') as fh:
                fh.write('apikey = %s' % openml.config.apikey)
    # Increase the number of retries to avoid spurious server failures
    self.connection_n_retries = openml.config.connection_n_retries
    openml.config.connection_n_retries = 10
def safe_func(*args, **kwargs):
    """Acquire a per-id external lock around the wrapped ``func`` call.

    The id is taken from the single keyword argument whose name contains
    ``"_id"``, or, failing that, from the first positional argument.
    """
    id_kwargs = [name for name in kwargs if "_id" in name]
    if len(id_kwargs) == 1:
        entity_id = kwargs[id_kwargs[0]]
    elif args:
        entity_id = args[0]
    else:
        raise RuntimeError(
            "An id must be specified for {}, was passed: ({}, {}).".
            format(func.__name__, args, kwargs))
    # The [7:] gets rid of the 'openml.' prefix
    lock_name = "{}.{}:{}".format(
        func.__module__[7:], func.__name__, entity_id)
    with lockutils.external_lock(name=lock_name,
                                 lock_path=_create_lockfiles_dir()):
        return func(*args, **kwargs)
def get_dataset(dataset_id):
    """Download a dataset.

    TODO: explain caching!

    This function is thread/multiprocessing safe.

    Parameters
    ----------
    dataset_id : int
        Dataset ID of the dataset to download

    Returns
    -------
    dataset : :class:`openml.OpenMLDataset`
        The downloaded dataset.

    Raises
    ------
    ValueError
        If ``dataset_id`` cannot be cast to an ``int``.
    """
    try:
        dataset_id = int(dataset_id)
    except (ValueError, TypeError):
        # Narrowed from a bare ``except``: only conversion failures should
        # be translated into this ValueError.
        raise ValueError("Dataset ID is neither an Integer nor can be "
                         "cast to an Integer.")
    with lockutils.external_lock(
            name='datasets.functions.get_dataset:%d' % dataset_id,
            lock_path=os.path.join(config.get_cache_directory(), 'locks'),
    ):
        did_cache_dir = _create_dataset_cache_directory(dataset_id)
        try:
            description = _get_dataset_description(did_cache_dir, dataset_id)
            arff_file = _get_dataset_arff(did_cache_dir, description)
            features = _get_dataset_features(did_cache_dir, dataset_id)
            # TODO not used yet, figure out what to do with this...
            qualities = _get_dataset_qualities(did_cache_dir, dataset_id)
        except Exception as e:
            # Leave no partially-populated cache directory behind.
            _remove_dataset_cache_dir(did_cache_dir)
            raise e
        dataset = _create_dataset_from_description(description, features,
                                                   qualities, arff_file)
    return dataset
def setUp(self, n_levels: int = 1):
    """Setup variables and temporary directories.

    In particular, this methods:
    * creates a temporary working directory
    * figures out a path to a few static test files
    * set the default server to be the test server
    * set a static API key for the test server
    * increases the maximal number of retries

    Parameters
    ----------
    n_levels : int
        Number of nested directories the test is in. Necessary to resolve
        the path to the ``files`` directory, which is located directly
        under the ``tests`` directory.
    """
    # This cache directory is checked in to git to simulate a populated
    # cache
    self.maxDiff = None
    self.static_cache_dir = None
    abspath_this_file = os.path.abspath(inspect.getfile(self.__class__))
    static_cache_dir = os.path.dirname(abspath_this_file)
    # Walk up n_levels directories to reach the ``tests`` directory,
    # which contains the checked-in ``files`` cache.
    for _ in range(n_levels):
        static_cache_dir = os.path.abspath(
            os.path.join(static_cache_dir, '..'))
    content = os.listdir(static_cache_dir)
    if 'files' in content:
        self.static_cache_dir = os.path.join(static_cache_dir, 'files')
    if self.static_cache_dir is None:
        raise ValueError(
            'Cannot find test cache dir, expected it to be {}!'.format(
                static_cache_dir))
    self.cwd = os.getcwd()
    workdir = os.path.dirname(os.path.abspath(__file__))
    # self.id() is the unittest test id, unique per test method, so each
    # test gets its own scratch directory.
    tmp_dir_name = self.id()
    self.workdir = os.path.join(workdir, tmp_dir_name)
    shutil.rmtree(self.workdir, ignore_errors=True)
    os.mkdir(self.workdir)
    os.chdir(self.workdir)
    self.cached = True
    openml.config.apikey = TestBase.apikey
    self.production_server = "https://openml.org/api/v1/xml"
    openml.config.server = TestBase.test_server
    openml.config.avoid_duplicate_runs = False
    openml.config.cache_directory = self.workdir
    # If we're on travis, we save the api key in the config file to allow
    # the notebook tests to read them.
    if os.environ.get('TRAVIS') or os.environ.get('APPVEYOR'):
        # Lock so that parallel test processes do not write the config
        # file at the same time.
        with lockutils.external_lock('config', lock_path=self.workdir):
            with open(openml.config.config_file, 'w') as fh:
                fh.write('apikey = %s' % openml.config.apikey)
    # Increase the number of retries to avoid spurious server failures
    self.connection_n_retries = openml.config.connection_n_retries
    openml.config.connection_n_retries = 10