Beispiel #1
0
    def load_dataset(self, dataset_key, force_update=False):
        """
        Return the locally cached copy of a dataset, fetching it from
        data.world first when no cached copy is available.

        The result is a `LocalDataset` built from the cached
        ``datapackage.json`` descriptor. Metadata is available through its
        `describe()` method, and the data through the three dict-like
        properties `raw_data`, `tables` and `dataframes`.

        With ``force_update=True`` an existing cached copy is moved aside to
        a ``backup`` directory before downloading. If the download then fails
        with a `RestApiError`, the backup is moved back into place and a
        warning is issued instead of raising.

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id or of a url
        force_update : bool
            Flag, indicating if a new copy of the dataset should be downloaded
            replacing any previously downloaded copy

        Returns
        -------
        LocalDataset
            The object representing the dataset

        Raises
        ------
        RestApiError
            If a server error occurs and there is no previously cached copy
            to fall back on
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        cache_root = self._config.cache_dir
        cache_dir = path.join(cache_root, owner_id, dataset_id, 'latest')

        # backup_dir stays None unless an existing cached copy is moved aside.
        backup_dir = None
        if force_update and path.isdir(cache_dir):
            backup_dir = path.join(cache_root, owner_id, dataset_id, 'backup')
            # Drop any leftover backup from an earlier run before moving.
            if path.isdir(backup_dir):
                shutil.rmtree(backup_dir)
            shutil.move(cache_dir, backup_dir)

        descriptor_file = path.join(cache_dir, 'datapackage.json')
        descriptor_missing = not path.isfile(descriptor_file)
        if descriptor_missing:
            try:
                descriptor_file = self.api_client.download_datapackage(
                    dataset_key, cache_dir)
            except RestApiError as error:
                # Without a backup there is nothing to fall back on.
                if backup_dir is None:
                    raise
                shutil.move(backup_dir, cache_dir)
                warn('Unable to download datapackage ({}). '
                     'Loading previously saved version.'.format(error.reason))

        if backup_dir is not None:
            # The backup may already be gone if it was restored above,
            # hence ignore_errors.
            shutil.rmtree(backup_dir, ignore_errors=True)

        return LocalDataset(descriptor_file)
Beispiel #2
0
 def simpsons_broken_dataset(self, simpsons_broken_descriptor_path):
     """Build a `LocalDataset` from the given descriptor path.

     NOTE(review): the name suggests this is a test fixture for a
     deliberately broken descriptor -- confirm against whatever supplies
     ``simpsons_broken_descriptor_path``.
     """
     broken_dataset = LocalDataset(simpsons_broken_descriptor_path)
     return broken_dataset
Beispiel #3
0
 def simpsons_datapackage(self, simpsons_descriptor_path):
     """Create a `Package` from the descriptor path and sanitize its
     resources.

     Only resources whose descriptor contains a ``'schema'`` entry are
     passed to `LocalDataset._sanitize_resource`; the others are left
     untouched. The package itself is returned.
     """
     package = Package(descriptor=simpsons_descriptor_path)
     # Lazy filter: each resource is checked right before it is sanitized.
     resources_with_schema = (
         resource for resource in package.resources
         if 'schema' in resource.descriptor
     )
     for resource in resources_with_schema:
         LocalDataset._sanitize_resource(resource)
     return package
    def load_dataset(self, dataset_key, force_update=False, auto_update=False):
        """Load a dataset from the local filesystem, downloading it from
        data.world first, if necessary.

        This function returns an object of type `LocalDataset`. The object
        allows access to metadata via its `describe()` method and to all the
        data via three properties `raw_data`, `tables` and `dataframes`, all
        of which are mappings (dict-like structures).

        When a cached descriptor already exists, the dataset's ``updated``
        timestamp is requested from the server and compared with the
        modification time of the cached ``datapackage.json``. If the server
        copy is newer, the dataset is either re-downloaded
        (``auto_update=True``) or a warning about the outdated copy is
        issued. If that request fails with a `RestApiError`, the cached copy
        is returned as is.

        :param dataset_key: Dataset identifier, in the form of owner/id or of
            a url
        :type dataset_key: str
        :param force_update: Flag, indicating if a new copy of the dataset
            should be downloaded replacing any previously downloaded copy
            (Default value = False)
        :type force_update: bool
        :param auto_update: Flag, indicating that dataset be updated to the
            latest version (Default value = False)
        :type auto_update: bool
        :returns: The object representing the dataset
        :rtype: LocalDataset
        :raises RestApiError: If a server error occurs and there is no
            previously cached copy to fall back on
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                              'latest')
        # backup_dir stays None unless an existing cached copy is moved aside.
        backup_dir = None
        if path.isdir(cache_dir) and force_update:
            backup_dir = path.join(self._config.cache_dir, owner_id,
                                   dataset_id, 'backup')
            move_cache_dir_to_backup_dir(backup_dir, cache_dir)

        descriptor_file = path.join(cache_dir, 'datapackage.json')
        if not path.isfile(descriptor_file):
            try:
                descriptor_file = self.api_client.download_datapackage(
                    dataset_key, cache_dir)
            except RestApiError as e:
                # Without a backup there is nothing to fall back on.
                if backup_dir is None:
                    raise
                shutil.move(backup_dir, cache_dir)
                warn('Unable to download datapackage ({}). '
                     'Loading previously saved version.'.format(e.reason))
        else:
            try:
                dataset_info = self.api_client.get_dataset(dataset_key)
            except RestApiError:
                # Server unreachable: serve the cached copy unchanged.
                return LocalDataset(descriptor_file)

            # NOTE(review): assumes 'updated' always carries fractional
            # seconds and a trailing 'Z' (UTC) -- confirm against the API.
            last_modified = datetime.strptime(dataset_info['updated'],
                                              '%Y-%m-%dT%H:%M:%S.%fZ')
            cached_at = datetime.utcfromtimestamp(
                path.getmtime(str(descriptor_file)))
            if last_modified > cached_at:
                if auto_update:
                    # Assigned before the try so the handler can restore
                    # from it unconditionally.
                    backup_dir = path.join(self._config.cache_dir,
                                           owner_id, dataset_id,
                                           'backup')
                    try:
                        move_cache_dir_to_backup_dir(backup_dir,
                                                     cache_dir)
                        descriptor_file = self.api_client. \
                            download_datapackage(dataset_key, cache_dir)
                    except RestApiError as e:
                        shutil.move(backup_dir, cache_dir)
                        warn('Unable to auto update datapackage ({}). '
                             'Loading previously saved version.'
                             .format(e.reason))
                else:
                    # Make sure the outdated-copy warning is shown on every
                    # call, not just the first one.
                    filterwarnings('always',
                                   message='You are using an outdated copy')
                    warn('You are using an outdated copy of {}. '
                         'If you wish to use the latest version, call this '
                         'function with the argument '
                         'auto_update=True or '
                         'force_update=True'.format(dataset_key))

        if backup_dir is not None:
            # The backup may already be gone if it was restored above,
            # hence ignore_errors.
            shutil.rmtree(backup_dir, ignore_errors=True)

        return LocalDataset(descriptor_file)