Code example #1
    def _read_from_hdx(self, object_type, value, fieldname='id',
                       action=None, **kwargs):
        # type: (str, str, str, Optional[str], Any) -> Tuple[bool, Union[Dict, str]]
        """Makes a read call to HDX passing in given parameter.

        Args:
            object_type (str): Description of HDX object type (for messages)
            value (str): Value of HDX field
            fieldname (str): HDX field name. Defaults to id.
            action (Optional[str]): Replacement CKAN action url to use. Defaults to None.
            **kwargs: Other fields to pass to CKAN.

        Returns:
            Tuple[bool, Union[Dict, str]]: (True/False, HDX object metadata/Error)
        """
        if not fieldname:
            raise HDXError('Empty %s field name!' % object_type)
        if action is None:
            action = self.actions()['show']
        data = {fieldname: value}
        data.update(kwargs)
        try:
            result = self.configuration.call_remoteckan(action, data)
            return True, result
        except NotFound:
            return False, '%s=%s: not found!' % (fieldname, value)
        except Exception as e:
            raisefrom(HDXError, 'Failed when trying to read: %s=%s! (POST)' % (fieldname, value), e)
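Every snippet on this page routes failures through a three-argument raisefrom(exc_type, message, cause) helper. As an illustration of the pattern (a minimal sketch, not necessarily the library's actual implementation), a Python 3 version can simply build the wrapper exception and chain the original one:

    def raisefrom(exc_type, message, cause):
        # type: (type, str, BaseException) -> None
        """Raise exc_type(message) with the original exception attached as its cause (sketch)."""
        # The chained exception ends up in __cause__, so tracebacks show both the
        # HDX/download-specific error and the underlying CKAN/requests error.
        raise exc_type(message) from cause

Called as raisefrom(HDXError, 'Failed when trying to read: ...', e), callers see a library-specific exception while the original error stays attached for debugging.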
Code example #2
    def stream_file(self, url, folder=None):
        # type: (str, Optional[str]) -> str
        """Stream file from url and store in provided folder or temporary folder if no folder supplied.
        Must call setup_streaming_download method first.

        Args:
            url (str): URL to download
            folder (Optional[str]): Folder to download it to. Defaults to None (temporary folder).

        Returns:
            str: Path of downloaded file

        """
        path = self.get_path_for_url(url, folder)
        f = None
        try:
            f = open(path, 'wb')
            for chunk in self.response.iter_content(chunk_size=10240):
                if chunk:  # filter out keep-alive new chunks
                    f.write(chunk)
                    f.flush()
            return f.name
        except Exception as e:
            raisefrom(DownloadError, 'Download of %s failed in retrieval of stream!' % url, e)
        finally:
            if f:
                f.close()
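stream_file assumes self.response was prepared by an earlier setup call. The same chunked write can be sketched with plain requests; the helper name, chunk size and temporary-folder fallback below are illustrative choices, not part of the library:

    import os
    import tempfile

    import requests

    def stream_to_file(url, folder=None, chunk_size=10240):
        """Download url in chunks and return the path of the saved file (illustrative sketch)."""
        folder = folder or tempfile.gettempdir()
        filename = os.path.basename(url.split('?')[0]) or 'download.bin'
        path = os.path.join(folder, filename)
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
        return path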
Code example #3
    def _write_to_hdx(self, action, data, id_field_name=None, file_to_upload=None):
        # type: (str, Dict, Optional[str], Optional[str]) -> Dict
        """Creates or updates an HDX object in HDX and return HDX object metadata dict

        Args:
            action (str): Action to perform eg. 'create', 'update'
            data (Dict): Data to write to HDX
            id_field_name (Optional[str]): Name of field containing HDX object identifier. Defaults to None.
            file_to_upload (Optional[str]): File to upload to HDX. Defaults to None.

        Returns:
            Dict: HDX object metadata
        """
        file = None
        try:
            if file_to_upload:
                file = open(file_to_upload, 'rb')
                files = [('upload', file)]
            else:
                files = None
            return self.configuration.call_remoteckan(self.actions()[action], data, files=files)
        except Exception as e:
            if id_field_name:
                idstr = ' %s' % data[id_field_name]
            else:
                idstr = ''
            raisefrom(HDXError, 'Failed when trying to %s%s! (POST)' % (action, idstr), e)
        finally:
            if file_to_upload and file:
                file.close()
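The files=[('upload', file)] argument passed to call_remoteckan has the same shape requests uses for multipart uploads. A hedged sketch of that shape with plain requests (the URL argument and the 'upload' field name here are placeholders, not the real CKAN contract):

    import requests

    def post_with_optional_upload(url, data, file_to_upload=None):
        """POST form data, attaching a file as multipart content when one is given (sketch)."""
        if file_to_upload:
            with open(file_to_upload, 'rb') as f:
                # a list of (fieldname, fileobj) tuples becomes a multipart/form-data body
                response = requests.post(url, data=data, files=[('upload', f)])
        else:
            response = requests.post(url, data=data)
        response.raise_for_status()
        return response.json()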
Code example #4
    def get_tabular_stream(self, url, **kwargs):
        # type: (str, Any) -> tabulator.Stream
        """Get Tabulator stream.

        Args:
            url (str): URL to download
            **kwargs:
                headers (Union[int, List[int], List[str]]): Number of row(s) containing headers or list of headers
                file_type (Optional[str]): Type of file. Defaults to inferring.
                delimiter (Optional[str]): Delimiter used for values in each row. Defaults to inferring.

        Returns:
            tabulator.Stream: Tabulator Stream object

        """
        self.close_response()
        file_type = kwargs.get('file_type')
        if file_type is not None:
            kwargs['format'] = file_type
            del kwargs['file_type']
        try:
            self.response = tabulator.Stream(url, **kwargs)
            self.response.open()
            return self.response
        except TabulatorException as e:
            raisefrom(DownloadError,
                      'Getting tabular stream for %s failed!' % url, e)
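get_tabular_stream only renames the friendlier file_type keyword to the format parameter tabulator expects before opening the stream. Standalone tabulator usage looks roughly like the sketch below (data.csv is an assumed example file):

    from tabulator import Stream

    # headers=1 treats the first row as the header row
    with Stream('data.csv', headers=1, format='csv') as stream:
        print(stream.headers)           # column names from the header row
        rows = stream.read(keyed=True)  # list of dicts keyed by header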
Code example #5
    def setup_stream(self, url, timeout=None):
        # type: (str, Optional[float]) -> None
        """Setup streaming download from provided url

        Args:
            url (str): URL to download
            timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).


        """
        self.response = None
        try:
            self.response = self.session.get(url, stream=True, timeout=timeout)
            self.response.raise_for_status()
        except Exception as e:
            raisefrom(DownloadError, 'Setup of Streaming Download of %s failed!' % url, e)
Code example #6
    def validlocations(self):
        # type: () -> List[Dict]
        """
        Return valid locations

        Returns:
            List[Dict]: Valid locations

        """
        try:
            return self._validlocationsfn()
        except (AttributeError, TypeError) as e:
            raisefrom(
                ConfigurationError,
                'There is no valid locations function set up! Use Configuration.create(**kwargs)',
                e)
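validlocations simply defers to a callable that Configuration.create is expected to install; the AttributeError/TypeError guard catches the case where nothing, or something non-callable, was set. A toy, standalone version of the pattern (class and names are illustrative only):

    class LocationsHolder(object):
        def __init__(self, validlocationsfn=None):
            # validlocationsfn should be a zero-argument callable returning List[Dict]
            self._validlocationsfn = validlocationsfn

        def validlocations(self):
            try:
                return self._validlocationsfn()
            except (AttributeError, TypeError) as e:
                raise ValueError('No valid locations function set up!') from e

    holder = LocationsHolder(lambda: [{'name': 'afg'}, {'name': 'pak'}])
    print(holder.validlocations())  # [{'name': 'afg'}, {'name': 'pak'}]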
Code example #7
    def download(self, url, timeout=None):
        # type: (str, Optional[float]) -> requests.Response
        """Download url

        Args:
            url (str): URL to download
            timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).

        Returns:
            requests.Response: Response

        """
        try:
            self.response = self.session.get(url, timeout=timeout)
            self.response.raise_for_status()
        except Exception as e:
            raisefrom(DownloadError, 'Download of %s failed!' % url, e)
        return self.response
Code example #8
    def hash_stream(self, url):
        # type: (str) -> str
        """Stream file from url and hash it using MD5. Must call setup_streaming_download method first.

        Args:
            url (str): URL to download

        Returns:
            str: MD5 hash of file

        """
        md5hash = hashlib.md5()
        try:
            for chunk in self.response.iter_content(chunk_size=10240):
                if chunk:  # filter out keep-alive new chunks
                    md5hash.update(chunk)
            return md5hash.hexdigest()
        except Exception as e:
            raisefrom(DownloadError, 'Download of %s failed in retrieval of stream!' % url, e)
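hash_stream likewise reuses the response prepared earlier. Hashing a download without writing it to disk can be sketched directly with hashlib and requests:

    import hashlib

    import requests

    def md5_of_url(url, chunk_size=10240):
        """Stream url and return the MD5 hex digest of its content (illustrative sketch)."""
        md5hash = hashlib.md5()
        with requests.get(url, stream=True, timeout=30) as response:
            response.raise_for_status()
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:  # filter out keep-alive chunks
                    md5hash.update(chunk)
        return md5hash.hexdigest()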
Code example #9
    def setup(self,
              url,
              stream=True,
              post=False,
              parameters=None,
              timeout=None):
        # type: (str, bool, bool, Optional[Dict], Optional[float]) -> requests.Response
        """Setup download from provided url returning the response

        Args:
            url (str): URL to download
            stream (bool): Whether to stream download. Defaults to True.
            post (bool): Whether to use POST instead of GET. Defaults to False.
            parameters (Optional[Dict]): Parameters to pass. Defaults to None.
            timeout (Optional[float]): Timeout for connecting to URL. Defaults to None (no timeout).

        Returns:
            requests.Response: requests.Response object

        """
        self.close_response()
        self.response = None
        try:
            if post:
                full_url, parameters = self.get_url_params_for_post(
                    url, parameters)
                self.response = self.session.post(full_url,
                                                  data=parameters,
                                                  stream=stream,
                                                  timeout=timeout)
            else:
                self.response = self.session.get(self.get_url_for_get(
                    url, parameters),
                                                 stream=stream,
                                                 timeout=timeout)
            self.response.raise_for_status()
        except Exception as e:
            raisefrom(DownloadError,
                      'Setup of Streaming Download of %s failed!' % url, e)
        return self.response
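setup chooses between GET and POST and relies on get_url_params_for_post/get_url_for_get to fold extra parameters into the request. With a bare requests.Session the same branch looks roughly as below (parameter handling is simplified, those helper methods are library-specific, and the example URL is a placeholder):

    import requests

    def open_response(session, url, parameters=None, post=False, stream=True, timeout=None):
        """Issue a GET or POST and return the response after raise_for_status (sketch)."""
        if post:
            response = session.post(url, data=parameters, stream=stream, timeout=timeout)
        else:
            response = session.get(url, params=parameters, stream=stream, timeout=timeout)
        response.raise_for_status()
        return response

    session = requests.Session()
    response = open_response(session, 'https://example.com/data.csv')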
Code example #10
    @staticmethod
    def _parse_date(dataset_date, date_format):
        # type: (str, Optional[str]) -> datetime
        """Parse dataset date from string using specified format. If no format is supplied, the function will guess.
        For unambiguous formats, this should be fine.

        Args:
            dataset_date (str): Dataset date string
            date_format (Optional[str]): Date format. If None is given, will attempt to guess. Defaults to None.

        Returns:
            datetime.datetime
        """
        if date_format is None:
            try:
                return parser.parse(dataset_date)
            except (ValueError, OverflowError) as e:
                raisefrom(HDXError, 'Invalid dataset date!', e)
        else:
            try:
                return datetime.strptime(dataset_date, date_format)
            except ValueError as e:
                raisefrom(HDXError, 'Invalid dataset date!', e)
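The two branches correspond to dateutil's guessing parser and an explicit strptime format; both examples below use illustrative date strings:

    from datetime import datetime

    from dateutil import parser

    # No format supplied: dateutil guesses, which is fine for unambiguous strings
    parser.parse('2007-03-11')                    # datetime(2007, 3, 11, 0, 0)

    # Explicit format: strptime is strict and raises ValueError on a mismatch
    datetime.strptime('11/03/2007', '%d/%m/%Y')   # datetime(2007, 3, 11, 0, 0)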
Code example #11
    def create_datastore(self, schema=None, primary_key=None,
                         delete_first=0, path=None):
        # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
        """For csvs, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
        all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

        Args:
            schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
            primary_key (Optional[str]): Primary key of schema. Defaults to None.
            delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
            path (Optional[str]): Local path to file that was uploaded. Defaults to None.

        Returns:
            None
        """
        if delete_first == 0:
            pass
        elif delete_first == 1:
            self.delete_datastore()
        elif delete_first == 2:
            if primary_key is None:
                self.delete_datastore()
        else:
            raise HDXError('delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)')
        if path is None:
            # Download the resource
            url, path = self.download()
            delete_after_download = True
        else:
            url = self.data.get('url', None)
            if not url:
                raise HDXError('No URL to download!')
            delete_after_download = False

        zip_path = None
        stream = None
        try:
            extension = splitext(path)[1]
            if extension.lower() == '.zip':
                zip_file = zipfile.ZipFile(path)
                filename = zip_file.namelist()[0]
                tempdir = gettempdir()
                zip_file.extract(filename, tempdir)
                zip_path = path
                path = join(tempdir, filename)

            def convert_to_text(extended_rows):
                for number, headers, row in extended_rows:
                    for i, val in enumerate(row):
                        row[i] = str(val)
                    yield (number, headers, row)

            stream = Stream(path, headers=1, post_parse=[convert_to_text], bytes_sample_size=1000000)
            stream.open()
            nonefieldname = False
            if schema is None:
                schema = list()
                for fieldname in stream.headers:
                    if fieldname is not None:
                        schema.append({'id': fieldname, 'type': 'text'})
                    else:
                        nonefieldname = True
            data = {'resource_id': self.data['id'], 'force': True, 'fields': schema, 'primary_key': primary_key}
            self._write_to_hdx('datastore_create', data, 'resource_id')
            if primary_key is None:
                method = 'insert'
            else:
                method = 'upsert'
            logger.debug('Uploading data from %s to datastore' % url)
            offset = 0
            chunksize = 100
            rowset = stream.read(keyed=True, limit=chunksize)
            while len(rowset) != 0:
                if nonefieldname:
                    for row in rowset:
                        del row[None]
                data = {'resource_id': self.data['id'], 'force': True, 'method': method, 'records': rowset}
                self._write_to_hdx('datastore_upsert', data, 'resource_id')
                rowset = stream.read(keyed=True, limit=chunksize)
                logger.debug('Uploading: %s' % offset)
                offset += chunksize
        except Exception as e:
            raisefrom(HDXError, 'Upload to datastore of %s failed!' % url, e)
        finally:
            if stream:
                stream.close()
            if delete_after_download:
                unlink(path)
                if zip_path:
                    unlink(zip_path)
            else:
                if zip_path:
                    unlink(path)  # i.e. we keep the zip but remove the extracted file
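The upload loop reads chunksize rows at a time and repeatedly calls datastore_upsert, so memory use stays flat however large the file is. The batching itself is independent of tabulator and HDX and can be sketched generically:

    def batched(rows, size=100):
        """Yield lists of at most `size` items from any iterable of rows (sketch)."""
        batch = []
        for row in rows:
            batch.append(row)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:  # trailing partial batch
            yield batch

    # each batch could then become the 'records' payload of one datastore_upsert call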
Code example #12
File: resource.py  Project: OCHA-DAP/hdx-python-api
    def create_datastore(self, schema=None, primary_key=None,
                         delete_first=0, path=None):
        # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
        """For tabular data, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
        all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

        Args:
            schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
            primary_key (Optional[str]): Primary key of schema. Defaults to None.
            delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
            path (Optional[str]): Local path to file that was uploaded. Defaults to None.

        Returns:
            None
        """
        if delete_first == 0:
            pass
        elif delete_first == 1:
            self.delete_datastore()
        elif delete_first == 2:
            if primary_key is None:
                self.delete_datastore()
        else:
            raise HDXError('delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)')
        if path is None:
            # Download the resource
            url, path = self.download()
            delete_after_download = True
        else:
            url = path
            delete_after_download = False

        def convert_to_text(extended_rows):
            for number, headers, row in extended_rows:
                for i, val in enumerate(row):
                    row[i] = str(val)
                yield (number, headers, row)

        with Download(full_agent=self.configuration.get_user_agent()) as downloader:
            try:
                stream = downloader.get_tabular_stream(path, headers=1, post_parse=[convert_to_text],
                                                       bytes_sample_size=1000000)
                nonefieldname = False
                if schema is None:
                    schema = list()
                    for fieldname in stream.headers:
                        if fieldname is not None:
                            schema.append({'id': fieldname, 'type': 'text'})
                        else:
                            nonefieldname = True
                data = {'resource_id': self.data['id'], 'force': True, 'fields': schema, 'primary_key': primary_key}
                self._write_to_hdx('datastore_create', data, 'resource_id')
                if primary_key is None:
                    method = 'insert'
                else:
                    method = 'upsert'
                logger.debug('Uploading data from %s to datastore' % url)
                offset = 0
                chunksize = 100
                rowset = stream.read(keyed=True, limit=chunksize)
                while len(rowset) != 0:
                    if nonefieldname:
                        for row in rowset:
                            del row[None]
                    data = {'resource_id': self.data['id'], 'force': True, 'method': method, 'records': rowset}
                    self._write_to_hdx('datastore_upsert', data, 'resource_id')
                    rowset = stream.read(keyed=True, limit=chunksize)
                    logger.debug('Uploading: %s' % offset)
                    offset += chunksize
            except Exception as e:
                raisefrom(HDXError, 'Upload to datastore of %s failed!' % url, e)
            finally:
                if delete_after_download:
                    remove(path)
Code example #13
File: resource.py  Project: EmmaArnold/hdx-python-api
    def create_datastore(self,
                         schema=None,
                         primary_key=None,
                         delete_first=0,
                         path=None):
        # type: (Optional[List[Dict]], Optional[str], int, Optional[str]) -> None
        """For tabular data, create a resource in the HDX datastore which enables data preview in HDX. If no schema is provided
        all fields are assumed to be text. If path is not supplied, the file is first downloaded from HDX.

        Args:
            schema (List[Dict]): List of fields and types of form {'id': 'FIELD', 'type': 'TYPE'}. Defaults to None.
            primary_key (Optional[str]): Primary key of schema. Defaults to None.
            delete_first (int): Delete datastore before creation. 0 = No, 1 = Yes, 2 = If no primary key. Defaults to 0.
            path (Optional[str]): Local path to file that was uploaded. Defaults to None.

        Returns:
            None
        """
        if delete_first == 0:
            pass
        elif delete_first == 1:
            self.delete_datastore()
        elif delete_first == 2:
            if primary_key is None:
                self.delete_datastore()
        else:
            raise HDXError(
                'delete_first must be 0, 1 or 2! (0 = No, 1 = Yes, 2 = Delete if no primary key)'
            )
        if path is None:
            # Download the resource
            url, path = self.download()
            delete_after_download = True
        else:
            url = path
            delete_after_download = False

        def convert_to_text(extended_rows):
            for number, headers, row in extended_rows:
                for i, val in enumerate(row):
                    row[i] = str(val)
                yield (number, headers, row)

        with Download(
                full_agent=self.configuration.get_user_agent()) as downloader:
            try:
                stream = downloader.get_tabular_stream(
                    path,
                    headers=1,
                    post_parse=[convert_to_text],
                    bytes_sample_size=1000000)
                nonefieldname = False
                if schema is None:
                    schema = list()
                    for fieldname in stream.headers:
                        if fieldname is not None:
                            schema.append({'id': fieldname, 'type': 'text'})
                        else:
                            nonefieldname = True
                data = {
                    'resource_id': self.data['id'],
                    'force': True,
                    'fields': schema,
                    'primary_key': primary_key
                }
                self._write_to_hdx('datastore_create', data, 'resource_id')
                if primary_key is None:
                    method = 'insert'
                else:
                    method = 'upsert'
                logger.debug('Uploading data from %s to datastore' % url)
                offset = 0
                chunksize = 100
                rowset = stream.read(keyed=True, limit=chunksize)
                while len(rowset) != 0:
                    if nonefieldname:
                        for row in rowset:
                            del row[None]
                    data = {
                        'resource_id': self.data['id'],
                        'force': True,
                        'method': method,
                        'records': rowset
                    }
                    self._write_to_hdx('datastore_upsert', data, 'resource_id')
                    rowset = stream.read(keyed=True, limit=chunksize)
                    logger.debug('Uploading: %s' % offset)
                    offset += chunksize
            except Exception as e:
                raisefrom(HDXError, 'Upload to datastore of %s failed!' % url,
                          e)
            finally:
                if delete_after_download:
                    remove(path)