# Standard-library imports used by the code below; package-internal names
# (parse_dataset_key, LocalDataset, RestApiError,
# move_cache_dir_to_backup_dir) are assumed to be imported elsewhere in
# the module.
import shutil
from datetime import datetime
from os import path
from warnings import filterwarnings, warn


def load_dataset(self, dataset_key, force_update=False):
    """
    Load a dataset from the local filesystem, downloading it from
    data.world first, if necessary.

    This function returns an object of type `LocalDataset`. The object
    allows access to metadata via its `describe()` method and to all the
    data via three properties `raw_data`, `tables` and `dataframes`, all
    of which are mappings (dict-like structures).

    Parameters
    ----------
    dataset_key : str
        Dataset identifier, in the form of owner/id or of a url
    force_update : bool
        Flag, indicating if a new copy of the dataset should be
        downloaded, replacing any previously downloaded copy

    Returns
    -------
    LocalDataset
        The object representing the dataset

    Raises
    ------
    RestApiError
        If a server error occurs
    """
    owner_id, dataset_id = parse_dataset_key(dataset_key)
    cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                          'latest')

    backup_dir = None
    if path.isdir(cache_dir) and force_update:
        # Move the existing copy aside so it can be restored if the
        # fresh download fails
        backup_dir = path.join(self._config.cache_dir, owner_id,
                               dataset_id, 'backup')
        if path.isdir(backup_dir):
            shutil.rmtree(backup_dir)
        shutil.move(cache_dir, backup_dir)

    descriptor_file = path.join(cache_dir, 'datapackage.json')
    if not path.isfile(descriptor_file):
        try:
            descriptor_file = self.api_client.download_datapackage(
                dataset_key, cache_dir)
        except RestApiError as e:
            if backup_dir is not None:
                shutil.move(backup_dir, cache_dir)
                warn('Unable to download datapackage ({}). '
                     'Loading previously saved version.'.format(e.reason))
            else:
                raise

    if backup_dir is not None:
        shutil.rmtree(backup_dir, ignore_errors=True)

    return LocalDataset(descriptor_file)
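# Usage sketch, assuming `client` is an object exposing the method above
# and the dataset key is purely illustrative:
#
#     dataset = client.load_dataset('ownerid/datasetid')
#     dataset.describe()    # dataset-level metadata
#     dataset.raw_data      # mapping of file names to raw contents
#     dataset.tables        # mapping of table names to tabular data
#     dataset.dataframes    # mapping of table names to dataframes
#
#     # Discard any cached copy and download a fresh one:
#     dataset = client.load_dataset('ownerid/datasetid', force_update=True)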
# Test fixture: builds a LocalDataset from a deliberately broken
# datapackage descriptor.
def simpsons_broken_dataset(self, simpsons_broken_descriptor_path):
    return LocalDataset(simpsons_broken_descriptor_path)
def load_dataset(self, dataset_key, force_update=False, auto_update=False):
    """Load a dataset from the local filesystem, downloading it from
    data.world first, if necessary.

    This function returns an object of type `LocalDataset`. The object
    allows access to metadata via its `describe()` method and to all the
    data via three properties `raw_data`, `tables` and `dataframes`, all
    of which are mappings (dict-like structures).

    :param dataset_key: Dataset identifier, in the form of owner/id or
        of a url
    :type dataset_key: str
    :param force_update: Flag, indicating if a new copy of the dataset
        should be downloaded, replacing any previously downloaded copy
        (Default value = False)
    :type force_update: bool
    :param auto_update: Flag, indicating that the dataset should be
        updated to the latest version if a newer one is available
        (Default value = False)
    :type auto_update: bool
    :returns: The object representing the dataset
    :rtype: LocalDataset
    :raises RestApiError: If a server error occurs
    """
    owner_id, dataset_id = parse_dataset_key(dataset_key)
    cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                          'latest')

    backup_dir = None
    if path.isdir(cache_dir) and force_update:
        backup_dir = path.join(self._config.cache_dir, owner_id,
                               dataset_id, 'backup')
        move_cache_dir_to_backup_dir(backup_dir, cache_dir)

    descriptor_file = path.join(cache_dir, 'datapackage.json')
    if not path.isfile(descriptor_file):
        try:
            descriptor_file = self.api_client.download_datapackage(
                dataset_key, cache_dir)
        except RestApiError as e:
            if backup_dir is not None:
                shutil.move(backup_dir, cache_dir)
                warn('Unable to download datapackage ({}). '
                     'Loading previously saved version.'.format(e.reason))
            else:
                raise
    else:
        # A cached copy exists; check whether it is outdated
        try:
            dataset_info = self.api_client.get_dataset(dataset_key)
        except RestApiError:
            # Server is unreachable; fall back to the cached copy
            return LocalDataset(descriptor_file)

        last_modified = datetime.strptime(dataset_info['updated'],
                                          '%Y-%m-%dT%H:%M:%S.%fZ')
        if (last_modified > datetime.utcfromtimestamp(
                path.getmtime(str(descriptor_file)))):
            if auto_update:
                try:
                    backup_dir = path.join(self._config.cache_dir,
                                           owner_id, dataset_id, 'backup')
                    move_cache_dir_to_backup_dir(backup_dir, cache_dir)
                    descriptor_file = self.api_client. \
                        download_datapackage(dataset_key, cache_dir)
                except RestApiError as e:
                    if backup_dir is not None:
                        shutil.move(backup_dir, cache_dir)
                        warn('Unable to auto update datapackage ({}). '
                             'Loading previously saved version.'
                             .format(e.reason))
                    else:
                        raise
            else:
                filterwarnings('always',
                               message='You are using an outdated copy')
                warn('You are using an outdated copy of {}. '
                     'If you wish to use the latest version, call this '
                     'function with the argument '
                     'auto_update=True or '
                     'force_update=True'.format(dataset_key))

    if backup_dir is not None:
        shutil.rmtree(backup_dir, ignore_errors=True)

    return LocalDataset(descriptor_file)
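# Usage sketch for the update flags, with the same illustrative `client`
# and dataset key as above:
#
#     # Default: warn if the cached copy is older than the server copy
#     dataset = client.load_dataset('ownerid/datasetid')
#
#     # Replace an outdated cached copy with the latest version
#     dataset = client.load_dataset('ownerid/datasetid', auto_update=True)
#
#     # Always re-download, regardless of freshness
#     dataset = client.load_dataset('ownerid/datasetid', force_update=True)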