Ejemplo n.º 1
0
def check_response(response,
                   request_url,
                   who,
                   good_status=(201, 200),
                   ignore_no_success=False):
    """
    Checks the response and raises exceptions if something went terribly wrong

    :param response: The HTTP response to check. It must provide
                     ``status_code``, ``reason``, ``text`` and ``json()``
    :param request_url: The URL that was requested; it is attached to any
                        error raised and used in the generic error message
    :param who: A short name that indicates where the error occurred
                (for example "CKAN")
    :param good_status: Status codes that should not raise an exception
    :param ignore_no_success: If true, a response with a bad status code is
                              tolerated unless its JSON body carries a truthy
                              "success" value. A body that is not valid JSON
                              still raises.
    :raises HTTPError: if the response has no status code, or has a status
                       code outside ``good_status`` that is not tolerated
                       by ``ignore_no_success``

    """
    if not response.status_code:
        raise HTTPError(
            'Xloader received an HTTP response with no status code',
            status_code=None,
            request_url=request_url,
            response=response.text)

    if response.status_code in good_status:
        return

    def generic_message():
        # Built lazily so that response.reason is only read on the paths
        # that actually report the generic message.
        template = ('{who} bad response. Status code: {code} {reason}. '
                    'At: {url}.')
        return template.format(who=who,
                               code=response.status_code,
                               reason=response.reason,
                               url=request_url)

    # Only the JSON decoding is guarded: a ValueError here means the body
    # is not JSON, so there is no structured error detail to extract.
    try:
        json_response = response.json()
    except ValueError:
        raise HTTPError(generic_message(),
                        status_code=response.status_code,
                        request_url=request_url,
                        response=response.text)

    if ignore_no_success:
        # A non-dict body (e.g. a JSON list) cannot carry a "success" flag,
        # so it is treated the same as a dict without one instead of
        # failing with AttributeError on ``.get``.
        reports_success = (isinstance(json_response, dict)
                           and json_response.get('success'))
        if not reports_success:
            return

    # Prefer the error message supplied in the body; fall back to the
    # generic message when the body lacks the expected structure.
    try:
        message = json_response["error"]["message"]
    except (KeyError, TypeError):
        message = generic_message()
    raise HTTPError(message,
                    status_code=response.status_code,
                    request_url=request_url,
                    response=response.text)
Ejemplo n.º 2
0
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get file from private resources
            headers['Authorization'] = api_key

        response = requests.get(resource.get('url'),
                                headers=headers,
                                timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # status code error
        logger.error('HTTP error: {}'.format(error))
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=error.response.status_code,
            request_url=resource.get('url'),
            response=error)
    except requests.exceptions.Timeout:
        logger.error('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError(
            'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.error('URL error: {}'.format(err_message))
        raise HTTPError(message=err_message,
                        status_code=None,
                        request_url=resource.get('url'),
Ejemplo n.º 3
0
     line_count = 0
     m = hashlib.md5()
     for line in response.iter_lines(CHUNK_SIZE):
         tmp_file.write(line + '\n')
         m.update(line)
         length += len(line)
         line_count += 1
         if length > MAX_CONTENT_LENGTH or line_count >= MAX_EXCERPT_LINES:
             break
     data['datastore_contains_all_records_of_source_file'] = False
 except requests.exceptions.HTTPError as error:
     # status code error
     logger.debug('HTTP error: {}'.format(error))
     raise HTTPError(
         "Xloader received a bad HTTP response when trying to download "
         "the data file",
         status_code=error.response.status_code,
         request_url=url,
         response=error)
 except requests.exceptions.Timeout:
     logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
     raise JobError(
         'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT))
 except requests.exceptions.RequestException as e:
     try:
         err_message = str(e.reason)
     except AttributeError:
         err_message = str(e)
     logger.warning('URL error: {}'.format(err_message))
     raise HTTPError(message=err_message,
                     status_code=None,
                     request_url=url,
Ejemplo n.º 4
0
def _download_resource_data(resource, data, api_key, logger):
    '''Downloads the resource['url'] as a tempfile.

    :param resource: resource (i.e. metadata) dict (from the job dict)
    :param data: job dict - may be written to during this function
    :param api_key: CKAN api key - needed to obtain resources that are private
    :param logger: logger that receives progress, warning and error messages
    :returns: tuple of (tmp_file, file_hash). tmp_file is rewound to its
        start. file_hash is the MD5 hex digest of the bytes fed to the
        hasher: the downloaded chunks, or for an excerpt the lines without
        their line terminators.
    :raises JobError: if the URL scheme is not http, https or ftp, if the
        data is too large and excerpts are disabled (MAX_EXCERPT_LINES <= 0),
        or if the connection times out
    :raises HTTPError: if the server returns a bad HTTP response or the
        request fails for another reason

    If the download is bigger than MAX_CONTENT_LENGTH then it just downloads
    an excerpt (of MAX_EXCERPT_LINES) for preview, and flags it by setting
    data['datastore_contains_all_records_of_source_file'] = False
    which will be saved to the resource later on.
    '''
    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0
    m = hashlib.md5()
    cl = None

    headers = {}
    if resource.get('url_type') == 'upload':
        # If this is an uploaded file to CKAN, authenticate the request,
        # otherwise we won't get file from private resources
        headers['Authorization'] = api_key

    # The outer try translates request failures. It wraps the excerpt
    # fallback as well, so an error during the second download is reported
    # the same way as one during the first.
    try:
        try:
            response = get_response(url, headers)

            cl = response.headers.get('content-length')
            if cl and int(cl) > MAX_CONTENT_LENGTH:
                raise DataTooBigError()

            # download the file to a tempfile on disk
            for chunk in response.iter_content(CHUNK_SIZE):
                length += len(chunk)
                # content-length may be absent or wrong, so the size is
                # also enforced while streaming
                if length > MAX_CONTENT_LENGTH:
                    raise DataTooBigError()
                tmp_file.write(chunk)
                m.update(chunk)
            data['datastore_contains_all_records_of_source_file'] = True

        except DataTooBigError:
            # discard the partial download before deciding what to do
            tmp_file.close()
            message = 'Data too large to load into Datastore: ' \
                '{cl} bytes > max {max_cl} bytes.' \
                .format(cl=cl or length, max_cl=MAX_CONTENT_LENGTH)
            logger.warning(message)
            if MAX_EXCERPT_LINES <= 0:
                raise JobError(message)
            logger.info('Loading excerpt of ~{max_lines} lines to '
                        'DataStore.'
                        .format(max_lines=MAX_EXCERPT_LINES))
            # start again from scratch: new tempfile, new request, new hash
            tmp_file = get_tmp_file(url)
            response = get_response(url, headers)
            length = 0
            line_count = 0
            m = hashlib.md5()
            for line in response.iter_lines(CHUNK_SIZE):
                # a bytes newline, so this also works on Python 3
                # (assumes iter_lines yields bytes, as it does for a
                # requests response by default - TODO confirm get_response)
                tmp_file.write(line + b'\n')
                m.update(line)
                length += len(line)
                line_count += 1
                if length > MAX_CONTENT_LENGTH \
                        or line_count >= MAX_EXCERPT_LINES:
                    break
            data['datastore_contains_all_records_of_source_file'] = False

    except requests.exceptions.HTTPError as error:
        # status code error
        tmp_file.close()
        logger.debug('HTTP error: {}'.format(error))
        raise HTTPError(
            "Xloader received a bad HTTP response when trying to download "
            "the data file", status_code=error.response.status_code,
            request_url=url, response=error)
    except requests.exceptions.Timeout:
        tmp_file.close()
        logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError('Connection timed out after {}s'.format(
                       DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        tmp_file.close()
        # some request exceptions carry a ``reason``; prefer it when present
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.warning('URL error: {}'.format(err_message))
        raise HTTPError(
            message=err_message, status_code=None,
            request_url=url, response=None)

    logger.info('Downloaded ok - %s', printable_file_size(length))
    file_hash = m.hexdigest()
    # rewind so the caller can read the file from the beginning
    tmp_file.seek(0)
    return tmp_file, file_hash