Example #1
    def sync_files(self, dataset_key):
        """
        Trigger synchronization process to update all dataset files linked to
        source URLs.

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.sync_files('username/test-dataset')  # doctest: +SKIP
        """
        try:
            self._datasets_api.sync(*(parse_dataset_key(dataset_key)))
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #2
    def upload_file(self, dataset_key, name, file_metadata={}, **kwargs):
        """Upload one file to a dataset

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param name: Name/path for the file stored in the local filesystem
        :type name: str
        :param expand_archives: Boolean value indicating whether archive files
            should be expanded upon upload
        :type expand_archives: bool, optional
        :param file_metadata: Dict keyed by file name, where each value holds
            the file description, labels and source URLs to add or update
        :type file_metadata: dict, optional
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.upload_file(
        ...     'username/test-dataset',
        ...     'example.csv')  # doctest: +SKIP
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._uploads_api.upload_file(owner_id, dataset_id, name, **kwargs)
            if file_metadata:
                self.update_dataset(dataset_key, files=file_metadata)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #3
    def sparql(self,
               dataset_key,
               query,
               desired_mimetype='application/sparql-results+json',
               **kwargs):
        """Executes SPARQL queries against a dataset via POST

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param query: SPARQL query
        :type query: str
        :param desired_mimetype: Response format requested via the Accept
            header (Default value = 'application/sparql-results+json')
        :type desired_mimetype: str, optional
        :returns: file object that can be used in file parsers and
            data handling modules.
        :rtype: file object
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.sparql(
        ...     'username/test-dataset', query)  # doctest: +SKIP
        """
        api_client = self._build_api_client(
            default_mimetype_header_accept=desired_mimetype)
        sparql_api = kwargs.get('sparql_api_mock',
                                _swagger.SparqlApi(api_client))
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            return sparql_api.sparql_post(owner_id, dataset_id, query,
                                          **kwargs)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #4
    def upload_files(self, dataset_key, files):
        """Upload dataset files

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id
        files : list of str
            The list of names/paths for files stored in the local filesystem

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.upload_files(
        ...     'username/test-dataset',
        ...     ['/my/local/example.csv'])  # doctest: +SKIP
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._uploads_api.upload_files(owner_id, dataset_id, files)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #5
    def delete_files(self, dataset_key, names):
        """Delete dataset file(s)

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id
        names : list of str
            The list of names for files to be deleted

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.delete_files(
        ...     'username/test-dataset', ['example.csv'])  # doctest: +SKIP
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._datasets_api.delete_files_and_sync_sources(
                owner_id, dataset_id, names)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #6
    def get_dataset(self, dataset_key):
        """Retrieve an existing dataset definition

        This method retrieves metadata about an existing dataset.

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id

        Returns
        -------
        dict
            Dataset definition, with all attributes

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> intro_dataset = api_client.get_dataset(
        ...     'jonloyens/an-intro-to-dataworld-dataset')
        >>> intro_dataset['title']
        'An Intro to data.world Dataset'
        """
        try:
            return self._datasets_api.get_dataset(
                *(parse_dataset_key(dataset_key))).to_dict()
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #7
    def append_records(self,
                       dataset_key,
                       stream_id,
                       body,
                       provided_mimetype='application/json',
                       **kwargs):
        """Append records to a stream.

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param stream_id: Stream unique identifier.
        :type stream_id: str
        :param body: Object body
        :type body: obj
        :param provided_mimetype: Content-Type header describing the request
            body (Default value = 'application/json')
        :type provided_mimetype: str, optional
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.append_records(
        ...     'username/test-dataset', 'streamId',
        ...     {'content': 'content'})  # doctest: +SKIP
        """
        api_client = self._build_api_client(
            default_mimetype_header_content_type=provided_mimetype)
        streams_api = kwargs.get('streams_api_mock',
                                 _swagger.StreamsApi(api_client))
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            return streams_api.append_records(owner_id, dataset_id, stream_id,
                                              body, **kwargs)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #8
    def load_dataset(self, dataset_key, force_update=False):
        """
        Load a dataset from the local filesystem, downloading it from
        data.world first, if necessary.

        This function returns an object of type `LocalDataset`. The object
        allows access to metadata via its `describe()` method and to all the
        data via three properties `raw_data`, `tables` and `dataframes`, all
        of which are mappings (dict-like structures).

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id or of a url
        force_update : bool
            Flag, indicating if a new copy of the dataset should be downloaded
            replacing any previously downloaded copy

        Returns
        -------
        LocalDataset
            The object representing the dataset

        Raises
        ------
        RestApiError
            If a server error occurs
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                              'latest')

        backup_dir = None
        if path.isdir(cache_dir) and force_update:
            backup_dir = path.join(self._config.cache_dir, owner_id,
                                   dataset_id, 'backup')
            if path.isdir(backup_dir):
                shutil.rmtree(backup_dir)
            shutil.move(cache_dir, backup_dir)

        descriptor_file = path.join(cache_dir, 'datapackage.json')
        if not path.isfile(descriptor_file):
            try:
                descriptor_file = self.api_client.download_datapackage(
                    dataset_key, cache_dir)
            except RestApiError as e:
                if backup_dir is not None:
                    shutil.move(backup_dir, cache_dir)
                    warn('Unable to download datapackage ({}). '
                         'Loading previously saved version.'.format(e.reason))
                else:
                    raise

        if backup_dir is not None:
            shutil.rmtree(backup_dir, ignore_errors=True)

        return LocalDataset(descriptor_file)
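A minimal usage sketch for load_dataset (not part of the original docstring), assuming the package-level entry point exposed by datadotworld; the dataset key is the same one used in other examples here:

import datadotworld as dw

# Download the dataset (or reuse a previously cached copy), then inspect
# it through the mappings described in the docstring above.
dataset = dw.load_dataset('jonloyens/an-intro-to-dataworld-dataset')
print(dataset.describe())          # datapackage metadata
print(list(dataset.dataframes))    # names of tabular resources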
Example #9
    def query(self, dataset_key, query, query_type="sql", parameters=None):
        """Query an existing dataset

        :param dataset_key: Dataset identifier, in the form of owner/id or of
            a url
        :type dataset_key: str
        :param query: SQL or SPARQL query
        :type query: str
        :param query_type: The type of the query. Must be either 'sql' or
            'sparql'. (Default value = "sql")
        :type query_type: {'sql', 'sparql'}, optional
        :param parameters: parameters to the query - if SPARQL query, this
            should be a dict containing named parameters; if SQL query, then
            this should be a list containing positional parameters.
            Boolean values will be converted to xsd:boolean, Integer values to
            xsd:integer, and other Numeric values to xsd:decimal. Anything
            else is treated as a String literal (Default value = None)
        :type parameters: query parameters, optional
        :returns: Object containing the results of the query
        :rtype: Results
        :raises RuntimeError: If a server error occurs
        """
        # TODO Move network request to RestApiClient
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        params = {"query": query}
        if parameters and query_type == "sparql":
            # if SPARQL, then the parameters should be a Mapping containing
            # named parameters
            params["parameters"] = ",".join([
                "{}={}".format(k, convert_to_sparql_literal(parameters[k]))
                for k in parameters.keys()
            ])
        elif parameters and query_type == "sql":
            # if SQL, then the parameters should be an array with positional
            # parameters, need to unwind them to $data_world_paramN for each
            # 0-indexed position N
            parameters = {
                "$data_world_param{}".format(i): x
                for i, x in enumerate(parameters)
            }
            params["parameters"] = ",".join([
                "{}={}".format(k, convert_to_sparql_literal(parameters[k]))
                for k in parameters.keys()
            ])
        url = "{0}://{1}/{2}/{3}/{4}".format(self._protocol, self._query_host,
                                             query_type, owner_id, dataset_id)
        headers = {
            'User-Agent': _user_agent(),
            'Accept': 'application/sparql-results+json',
            'Authorization': 'Bearer {0}'.format(self._config.auth_token)
        }
        response = requests.get(url, params=params, headers=headers)
        if response.status_code == 200:
            return QueryResults(response.json())
        raise RuntimeError('Error executing query: {}'.format(
            response.content))
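A hedged sketch of the parameter unwinding above in use, assuming the package-level query helper; the table name is assumed from data.world's intro dataset, and positional SQL parameters are written as ? placeholders, mapped to $data_world_param0, $data_world_param1, ... by the code above:

import datadotworld as dw

# The single ? placeholder is bound to $data_world_param0 by the
# unwinding logic above.
results = dw.query('jonloyens/an-intro-to-dataworld-dataset',
                   'SELECT * FROM DataDotWorldBBallStats WHERE Name = ?',
                   parameters=['Jon Loyens'])
print(results.dataframe)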
Example #10
 def put_request(body):
     # Worker defined inside a method (hence the captured `self`): PUT the
     # body to the uploads endpoint and report the response via a queue.
     ownerid, datasetid = parse_dataset_key(self._dataset_key)
     response = requests.put(
         "{}/uploads/{}/{}/files/{}".format(self._api_host, ownerid,
                                            datasetid, self._file_name),
         data=body,
         headers={
             'User-Agent': self._user_agent,
             'Authorization':
             'Bearer {}'.format(self._config.auth_token)
         })
     self._response_queue.put(response)
Example #11
 def _open_for_read(self):
     """open the file in read mode"""
     ownerid, datasetid = parse_dataset_key(self._dataset_key)
     response = requests.get(
         '{}/file_download/{}/{}/{}'.format(
             self._query_host, ownerid, datasetid, self._file_name),
         headers={
             'User-Agent': self._user_agent,
             'Authorization': 'Bearer {}'.format(
                 self._config.auth_token)
         }, stream=True)
     try:
         response.raise_for_status()
     except Exception as e:
         raise RestApiError(cause=e)
     self._read_response = response
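Since _open_for_read stores a streamed response (stream=True), a consumer is expected to drain it incrementally rather than load it whole. A minimal, self-contained sketch of that pattern with the requests library; the function name is illustrative, not part of the client:

def read_in_chunks(response, chunk_size=8192):
    # Yield the body of a streamed requests.Response piece by piece,
    # keeping memory usage flat for large downloads.
    for chunk in response.iter_content(chunk_size=chunk_size):
        yield chunk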
Example #12
    def replace_dataset(self, dataset_key, **kwargs):
        """Replace an existing dataset

        *This method will completely overwrite an existing dataset.*

        :param description: Dataset description
        :type description: str, optional
        :param summary: Dataset summary markdown
        :type summary: str, optional
        :param tags: Dataset tags
        :type tags: list, optional
        :param license: Dataset license
        :type license: {'Public Domain', 'PDDL', 'CC-0', 'CC-BY', 'ODC-BY',
            'CC-BY-SA', 'ODC-ODbL', 'CC BY-NC', 'CC BY-NC-SA', 'Other'}
        :param visibility: Dataset visibility
        :type visibility: {'OPEN', 'PRIVATE'}
        :param files: File names and source URLs to add or update
        :type files: dict, optional
        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param **kwargs:
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.replace_dataset(
        ...    'username/test-dataset',
        ...    visibility='PRIVATE', license='Public Domain',
        ...    description='A better description')  # doctest: +SKIP
        """
        request = self.__build_dataset_obj(
            lambda: _swagger.DatasetPutRequest(),
            lambda name, url, expand_archive, description, labels: _swagger.
            FileCreateRequest(name=name,
                              source=_swagger.FileSourceCreateRequest(
                                  url=url, expand_archive=expand_archive),
                              description=description,
                              labels=labels), kwargs)

        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._datasets_api.replace_dataset(owner_id, dataset_id, request)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #13
    def replace_dataset(self, dataset_key, **kwargs):
        """Replace an existing dataset

        *This method will completely overwrite an existing dataset.*

        Parameters
        ----------
        description : str, optional
            Dataset description
        summary : str, optional
            Dataset summary markdown
        tags : list, optional
            Dataset tags
        license : {'Public Domain', 'PDDL', 'CC-0', 'CC-BY', 'ODC-BY',
                   'CC-BY-SA', 'ODC-ODbL', 'CC BY-NC', 'CC BY-NC-SA', 'Other'}
            Dataset license
        visibility : {'OPEN', 'PRIVATE'}
            Dataset visibility
        files : dict, optional
            File names and source URLs to add or update

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.replace_dataset(
        ...    'username/test-dataset',
        ...    visibility='PRIVATE', license='Public Domain',
        ...    description='A better description')  # doctest: +SKIP
        """
        request = self.__build_dataset_obj(
            lambda: _swagger.DatasetPutRequest(),
            lambda name, url: _swagger.FileCreateRequest(
                name=name, source=_swagger.FileSourceCreateRequest(url=url)),
            kwargs)

        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._datasets_api.replace_dataset(owner_id, dataset_id, request)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #14
    def download_dataset(self, dataset_key):
        """Return a .zip containing all files within the dataset as uploaded.

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :returns: .zip file containing all files within the dataset
        :rtype: file object
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.download_dataset(
        ...     'username/test-dataset')  # doctest: +SKIP
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            return self._download_api.download_dataset(owner_id, dataset_id)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #15
    def add_files_via_url(self, dataset_key, files={}):
        """Add or update dataset files linked to source URLs

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param files: Dict containing the name of files and metadata
            Uses file name as a dict containing File description, labels and
            source URLs to add or update (Default value = {})
            *description and labels are optional.*
        :type files: dict
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> url = 'http://www.acme.inc/example.csv'
        >>> api_client = dw.api_client()
        >>> api_client.add_files_via_url(
        ...    'username/test-dataset',
        ...    {'example.csv': {
        ...         'url': url,
        ...         'labels': ['raw data'],
        ...         'description': 'file description'}})  # doctest: +SKIP
        """
        file_requests = [
            _swagger.FileCreateOrUpdateRequest(
                name=file_name,
                source=_swagger.FileSourceCreateOrUpdateRequest(
                    url=file_info['url'],
                    expand_archive=file_info.get('expand_archive', False)),
                description=file_info.get('description'),
                labels=file_info.get('labels'),
            ) for file_name, file_info in files.items()
        ]
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._datasets_api.add_files_by_source(
                owner_id, dataset_id,
                _swagger.FileBatchUpdateRequest(files=file_requests))
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #16
    def delete_dataset(self, dataset_key):
        """Deletes a dataset and all associated data

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> del_dataset = api_client.delete_dataset(
        ...     'jonloyens/an-intro-to-dataworld-dataset')  # doctest: +SKIP
        >>> del_dataset.message  # doctest: +SKIP
        'Dataset has been successfully deleted.'
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            return self._datasets_api.delete_dataset(owner_id, dataset_id)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #17
    def add_files_via_url(self, dataset_key, files={}):
        """Add or update dataset files linked to source URLs

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id
        files : dict
            File names and source URLs to add or update

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> url = 'http://www.acme.inc/example.csv'
        >>> api_client = dw.api_client()
        >>> api_client.add_files_via_url(
        ...    'username/test-dataset',
        ...    {'example.csv': url})  # doctest: +SKIP
        """
        file_requests = [
            _swagger.FileCreateOrUpdateRequest(
                name=name,
                source=_swagger.FileSourceCreateOrUpdateRequest(url=url))
            for name, url in files.items()
        ]

        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            self._datasets_api.add_files_by_source(
                owner_id, dataset_id,
                _swagger.FileBatchUpdateRequest(files=file_requests))
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #18
    def download_file(self, dataset_key, file):
        """Return a file within the dataset as uploaded.

        :param dataset_key: Dataset identifier, in the form of owner/id
        :type dataset_key: str
        :param file: File path to be returned
        :type file: str
        :returns: file in which the data was uploaded
        :rtype: file object
        :raises RestApiException: If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> api_client.download_file(
        ...     'username/test-dataset',
        ...     '/my/local/example.csv')  # doctest: +SKIP
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        try:
            return self._download_api.download_file(owner_id, dataset_id, file)
        except _swagger.rest.ApiException as e:
            raise RestApiError(cause=e)
Example #19
    def query(self, dataset_key, query, query_type="sql"):
        """Query an existing dataset

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id or of a url
        query : str
            SQL or SPARQL query
        query_type : {'sql', 'sparql'}, optional
            The type of the query. Must be either 'sql' or 'sparql'.

        Returns
        -------
        Results
            Object containing the results of the query

        Raises
        ------
        RuntimeError
            If a server error occurs
        """
        # TODO Move network request to RestApiClient
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        params = {"query": query}
        url = "{0}://{1}/{2}/{3}/{4}".format(self._protocol, self._query_host,
                                             query_type, owner_id, dataset_id)
        headers = {
            'User-Agent': _user_agent(),
            'Accept': 'application/sparql-results+json',
            'Authorization': 'Bearer {0}'.format(self._config.auth_token)
        }
        response = requests.get(url, params=params, headers=headers)
        if response.status_code == 200:
            return QueryResults(response.json())
        raise RuntimeError('Error executing query: {}'.format(
            response.content))
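This variant has no docstring example; a sketch follows, assuming the package-level query helper and the QueryResults dataframe property (both present in the datadotworld client, but verify against your version), with the intro dataset's table name assumed:

import datadotworld as dw

# SQL is the default query_type; 'sparql' selects the SPARQL endpoint.
results = dw.query('jonloyens/an-intro-to-dataworld-dataset',
                   'SELECT * FROM DataDotWorldBBallStats')
print(results.dataframe.head())

sparql_results = dw.query('jonloyens/an-intro-to-dataworld-dataset',
                          'SELECT * WHERE { ?s ?p ?o } LIMIT 10',
                          query_type='sparql')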
Example #20
    def load_dataset(self, dataset_key, force_update=False, auto_update=False):
        """Load a dataset from the local filesystem, downloading it from
        data.world first, if necessary.

        This function returns an object of type `LocalDataset`. The object
        allows access to metadata via its `describe()` method and to all the
        data via three properties `raw_data`, `tables` and `dataframes`, all
        of which are mappings (dict-like structures).

        :param dataset_key: Dataset identifier, in the form of owner/id or of
            a url
        :type dataset_key: str
        :param force_update: Flag, indicating if a new copy of the dataset
            should be downloaded replacing any previously downloaded copy
            (Default value = False)
        :type force_update: bool
        :param auto_update: Flag, indicating that dataset be updated to the
            latest version
        :type auto_update: bool
        :returns: The object representing the dataset
        :rtype: LocalDataset
        :raises RestApiError: If a server error occurs
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                              'latest')
        backup_dir = None
        if path.isdir(cache_dir) and force_update:
            backup_dir = path.join(self._config.cache_dir, owner_id,
                                   dataset_id, 'backup')
            move_cache_dir_to_backup_dir(backup_dir, cache_dir)

        descriptor_file = path.join(cache_dir, 'datapackage.json')
        if not path.isfile(descriptor_file):
            try:
                descriptor_file = self.api_client.download_datapackage(
                    dataset_key, cache_dir)
            except RestApiError as e:
                if backup_dir is not None:
                    shutil.move(backup_dir, cache_dir)
                    warn('Unable to download datapackage ({}). '
                         'Loading previously saved version.'.format(e.reason))
                else:
                    raise
        else:
            try:
                dataset_info = self.api_client.get_dataset(dataset_key)
            except RestApiError:
                return LocalDataset(descriptor_file)

            last_modified = datetime.strptime(dataset_info['updated'],
                                              '%Y-%m-%dT%H:%M:%S.%fZ')
            if (last_modified > datetime.utcfromtimestamp(
                    path.getmtime(str(descriptor_file)))):
                if auto_update:
                    try:
                        backup_dir = path.join(self._config.cache_dir,
                                               owner_id, dataset_id,
                                               'backup')
                        move_cache_dir_to_backup_dir(backup_dir,
                                                     cache_dir)
                        descriptor_file = self.api_client. \
                            download_datapackage(dataset_key, cache_dir)
                    except RestApiError as e:
                        if backup_dir is not None:
                            shutil.move(backup_dir, cache_dir)
                            warn('Unable to auto update datapackage ({}). '
                                 'Loading previously saved version.'
                                 .format(e.reason))
                        else:
                            raise
                else:
                    filterwarnings('always',
                                   message='You are using an outdated copy')
                    warn('You are using an outdated copy of {}. '
                         'If you wish to use the latest version, call this '
                         'function with the argument '
                         'auto_update=True or '
                         'force_update=True'.format(dataset_key))

        if backup_dir is not None:
            shutil.rmtree(backup_dir, ignore_errors=True)

        return LocalDataset(descriptor_file)
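A short sketch of the two update flags in action, assuming the package-level load_dataset wrapper; 'owner/dataset' is a placeholder key:

import datadotworld as dw

dataset = dw.load_dataset('owner/dataset')                     # reuse cache if present
dataset = dw.load_dataset('owner/dataset', auto_update=True)   # re-download only if stale
dataset = dw.load_dataset('owner/dataset', force_update=True)  # always re-download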
Example #21
    def download_datapackage(self, dataset_key, dest_dir):
        """
        Download and unzip a dataset's datapackage

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id
        dest_dir : str or path
            Directory under which datapackage should be saved

        Returns
        -------
        path
            Location of the datapackage descriptor (datapackage.json) in the
            local filesystem

        Raises
        ------
        RestApiException
            If a server error occurs

        Examples
        --------
        >>> import datadotworld as dw
        >>> api_client = dw.api_client()
        >>> datapackage_descriptor = api_client.download_datapackage(
        ...     'jonloyens/an-intro-to-dataworld-dataset', '/tmp/test')
        >>> datapackage_descriptor
        '/tmp/test/datapackage.json'
        """
        if path.isdir(dest_dir):
            raise ValueError('dest_dir must be a new directory, '
                             'but {} already exists'.format(dest_dir))

        owner_id, dataset_id = parse_dataset_key(dataset_key)
        url = "{0}://{1}/datapackage/{2}/{3}".format(self._protocol,
                                                     self._download_host,
                                                     owner_id, dataset_id)
        headers = {
            'User-Agent': _user_agent(),
            'Authorization': 'Bearer {0}'.format(self._config.auth_token)
        }

        try:
            response = requests.get(url, headers=headers, stream=True)
            response.raise_for_status()
        except requests.RequestException as e:
            raise RestApiError(cause=e)

        unzip_dir = path.join(self._config.tmp_dir, str(uuid.uuid4()))
        os.makedirs(unzip_dir)

        zip_file = path.join(unzip_dir, 'dataset.zip')

        with open(zip_file, 'wb') as f:
            for data in response.iter_content(chunk_size=4096):
                f.write(data)

        zip_obj = zipfile.ZipFile(zip_file)
        zip_obj.extractall(path=unzip_dir)

        # Find where datapackage.json is within expanded files
        unzipped_descriptor = glob.glob(
            '{}/**/datapackage.json'.format(unzip_dir))
        if not unzipped_descriptor:
            raise RuntimeError(
                'Zip file did not contain a datapackage manifest.')

        unzipped_dir = path.dirname(unzipped_descriptor[0])

        shutil.move(unzipped_dir, dest_dir)
        shutil.rmtree(unzip_dir, ignore_errors=True)

        return path.join(dest_dir, 'datapackage.json')
Example #22
    def query(self, dataset_key, query, query_type="sql", parameters=None):
        """Query an existing dataset

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id or of a url
        query : str
            SQL or SPARQL query
        query_type : {'sql', 'sparql'}, optional
            The type of the query. Must be either 'sql' or 'sparql'.
        parameters : query parameters, optional
            parameters to the query - if SPARQL query, this should be a dict
            containing named parameters, if SQL query, then this should be a
            list containing positional parameters.  Boolean values will be
            converted to xsd:boolean, Integer values to xsd:integer, and other
            Numeric values to xsd:decimal. Anything else is treated as a
            String literal

        Returns
        -------
        Results
            Object containing the results of the query

        Raises
        ------
        RuntimeError
            If a server error occurs
        """
        # TODO Move network request to RestApiClient
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        params = {
            "query": query
        }
        if parameters and query_type == "sparql":
            # if SPARQL, then the parameters should be a Mapping containing
            # named parameters
            params["parameters"] = ",".join(
                ["{}={}".format(k, convert_to_sparql_literal(parameters[k]))
                 for k in parameters.keys()])
        elif parameters and query_type == "sql":
            # if SQL, then the parameters should be an array with positional
            # parameters, need to unwind them to $data_world_paramN for each
            # 0-indexed position N
            parameters = {"$data_world_param{}".format(i): x
                          for i, x in enumerate(parameters)}
            params["parameters"] = ",".join(["{}={}".format(
                k, convert_to_sparql_literal(parameters[k]))
                                             for k in parameters.keys()])
        url = "{0}://{1}/{2}/{3}/{4}".format(self._protocol, self._query_host,
                                             query_type, owner_id, dataset_id)
        headers = {
            'User-Agent': _user_agent(),
            'Accept': 'application/sparql-results+json',
            'Authorization': 'Bearer {0}'.format(self._config.auth_token)
        }
        response = requests.get(url, params=params, headers=headers)
        if response.status_code == 200:
            return QueryResults(response.json())
        raise RuntimeError(
            'Error executing query: {}'.format(response.content))
Example #23
def test_parse_dataset_key():
    path_owner, path_id = util.parse_dataset_key('owner/dataset')
    assert_that(path_owner, equal_to('owner'))
    assert_that(path_id, equal_to('dataset'))
Example #24
def test_parse_dataset_key_with_url():
    url_owner, url_id = util.parse_dataset_key(
        'https://data.world/owner/dataset')
    assert_that(url_owner, equal_to('owner'))
    assert_that(url_id, equal_to('dataset'))
Example #25
    def load_dataset(self, dataset_key, force_update=False):
        """
        Load a dataset from the local filesystem, downloading it from
        data.world first, if necessary.

        This function returns an object of type `LocalDataset`. The object
        allows access to metadata via its `describe()` method and to all the
        data via three properties `raw_data`, `tables` and `dataframes`, all
        of which are mappings (dict-like structures).

        Parameters
        ----------
        dataset_key : str
            Dataset identifier, in the form of owner/id or of a url
        force_update : bool
            Flag, indicating if a new copy of the dataset should be downloaded
            replacing any previously downloaded copy

        Returns
        -------
        LocalDataset
            The object representing the dataset

        Raises
        ------
        RestApiError
            If a server error occurs
        """
        owner_id, dataset_id = parse_dataset_key(dataset_key)
        cache_dir = path.join(self._config.cache_dir, owner_id, dataset_id,
                              'latest')

        backup_dir = None
        if path.isdir(cache_dir) and force_update:
            backup_dir = path.join(self._config.cache_dir, owner_id,
                                   dataset_id, 'backup')
            if path.isdir(backup_dir):
                shutil.rmtree(backup_dir)
            shutil.move(cache_dir, backup_dir)

        descriptor_file = path.join(cache_dir, 'datapackage.json')
        if not path.isfile(descriptor_file):
            try:
                descriptor_file = self.api_client.download_datapackage(
                    dataset_key, cache_dir)
            except RestApiError as e:
                if backup_dir is not None:
                    shutil.move(backup_dir, cache_dir)
                    warn('Unable to download datapackage ({}). '
                         'Loading previously saved version.'.format(e.reason))
                else:
                    raise
        else:
            try:
                dataset_info = self.api_client.get_dataset(dataset_key)
                last_modified = datetime.strptime(dataset_info['updated'],
                                                  '%Y-%m-%dT%H:%M:%S.%fZ')
                if (last_modified > datetime.utcfromtimestamp(
                        path.getmtime(str(descriptor_file)))):
                    warn('You are using an outdated copy of {}. '
                         'If you wish to use the latest version, call this '
                         'function with the argument '
                         'force_update=True'.format(dataset_key))
            except RestApiError:
                # Not a critical step
                pass

        if backup_dir is not None:
            shutil.rmtree(backup_dir, ignore_errors=True)

        return LocalDataset(descriptor_file)
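Because this older variant only warns when the cached copy is stale, a caller who never wants stale data can promote that warning to an error; a minimal sketch using the standard warnings module, with a placeholder key:

import warnings

import datadotworld as dw

# Escalate the "outdated copy" warning into an exception so stale data
# is never used silently.
with warnings.catch_warnings():
    warnings.simplefilter('error')
    dataset = dw.load_dataset('owner/dataset')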