Example #1
def validate_input(input):
    # Especially validate metadata which is provided by the user
    if 'metadata' not in input:
        raise JobError('Metadata missing')

    data = input['metadata']

    if 'resource_id' not in data:
        raise JobError('No id provided.')
    if 'ckan_url' not in data:
        raise JobError('No ckan_url provided.')
    if not input.get('api_key'):
        raise JobError('No CKAN API key provided')
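
A quick usage sketch (illustrative only: the job_input dict below is a hypothetical payload built to satisfy the checks above; validate_input and JobError come from the example itself):

job_input = {
    'metadata': {
        'resource_id': 'some-resource-id',
        'ckan_url': 'https://demo.ckan.org',
    },
    'api_key': 'some-api-key',
}

try:
    validate_input(job_input)  # passes: all required keys are present
except JobError as e:
    print('Invalid job payload: {}'.format(e))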
Example #2
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status

    (datapusher called this function 'push_to_datastore')
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set up logging to the db
    handler = StoringHandler(job_id, input)
    level = logging.DEBUG
    handler.setLevel(level)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    except JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
Example #3
        resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError('Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0
    m = hashlib.md5()
    cl = None
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get files from private resources
            headers['Authorization'] = api_key

        response = get_response(url, headers)
    except JobError as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource = get_resource(resource_id, ckan_url, api_key)

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError('Only http, https, and ftp resources may be fetched.')

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get files from private resources
            headers['Authorization'] = api_key

        response = requests.get(
            url,
            headers=headers,
            timeout=DOWNLOAD_TIMEOUT,
            verify=SSL_VERIFY)
Example #5
        response = requests.get(resource.get('url'),
                                headers=headers,
                                timeout=DOWNLOAD_TIMEOUT)
        response.raise_for_status()
    except requests.exceptions.HTTPError as error:
        # status code error
        logger.error('HTTP error: {}'.format(error))
        raise HTTPError(
            "DataPusher received a bad HTTP response when trying to download "
            "the data file",
            status_code=error.response.status_code,
            request_url=resource.get('url'),
            response=error)
    except requests.exceptions.Timeout:
        logger.error('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError(
            'Connection timed out after {}s'.format(DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.error('URL error: {}'.format(err_message))
        raise HTTPError(message=err_message,
                        status_code=None,
                        request_url=resource.get('url'),
                        response=None)
    logger.info('Downloaded ok')

    cl = response.headers.get('content-length')
    if cl and int(cl) > MAX_CONTENT_LENGTH:
        error_msg = 'Resource too large to download: {cl} > max ({max_cl}).'\
            .format(cl=cl, max_cl=MAX_CONTENT_LENGTH)
        raise JobError(error_msg)
Example #6
def _download_resource_data(resource, data, api_key, logger):
    '''Downloads the resource['url'] as a tempfile.

    :param resource: resource (i.e. metadata) dict (from the job dict)
    :param data: job dict - may be written to during this function
    :param api_key: CKAN api key - needed to obtain resources that are private
    :param logger:

    If the download is bigger than MAX_CONTENT_LENGTH then it just downloads
    an excerpt (of MAX_EXCERPT_LINES) for preview, and flags it by setting
    data['datastore_contains_all_records_of_source_file'] = False
    which will be saved to the resource later on.
    '''
    # check scheme
    url = resource.get('url')
    scheme = urlparse.urlsplit(url).scheme
    if scheme not in ('http', 'https', 'ftp'):
        raise JobError(
            'Only http, https, and ftp resources may be fetched.'
        )

    # fetch the resource data
    logger.info('Fetching from: {0}'.format(url))
    tmp_file = get_tmp_file(url)
    length = 0
    m = hashlib.md5()
    cl = None
    try:
        headers = {}
        if resource.get('url_type') == 'upload':
            # If this is an uploaded file to CKAN, authenticate the request,
            # otherwise we won't get files from private resources
            headers['Authorization'] = api_key

        response = get_response(url, headers)

        cl = response.headers.get('content-length')
        if cl and int(cl) > MAX_CONTENT_LENGTH:
            raise DataTooBigError()

        # download the file to a tempfile on disk
        for chunk in response.iter_content(CHUNK_SIZE):
            length += len(chunk)
            if length > MAX_CONTENT_LENGTH:
                raise DataTooBigError
            tmp_file.write(chunk)
            m.update(chunk)
        data['datastore_contains_all_records_of_source_file'] = True

    except DataTooBigError:
        tmp_file.close()
        message = 'Data too large to load into Datastore: ' \
            '{cl} bytes > max {max_cl} bytes.' \
            .format(cl=cl or length, max_cl=MAX_CONTENT_LENGTH)
        logger.warning(message)
        if MAX_EXCERPT_LINES <= 0:
            raise JobError(message)
        logger.info('Loading excerpt of ~{max_lines} lines to '
                    'DataStore.'
                    .format(max_lines=MAX_EXCERPT_LINES))
        tmp_file = get_tmp_file(url)
        response = get_response(url, headers)
        length = 0
        line_count = 0
        m = hashlib.md5()
        for line in response.iter_lines(CHUNK_SIZE):
            tmp_file.write(line + b'\n')
            m.update(line)
            length += len(line)
            line_count += 1
            if length > MAX_CONTENT_LENGTH or line_count >= MAX_EXCERPT_LINES:
                break
        data['datastore_contains_all_records_of_source_file'] = False
    except requests.exceptions.HTTPError as error:
        # status code error
        logger.debug('HTTP error: {}'.format(error))
        raise HTTPError(
            "Xloader received a bad HTTP response when trying to download "
            "the data file", status_code=error.response.status_code,
            request_url=url, response=error)
    except requests.exceptions.Timeout:
        logger.warning('URL time out after {0}s'.format(DOWNLOAD_TIMEOUT))
        raise JobError('Connection timed out after {}s'.format(
                       DOWNLOAD_TIMEOUT))
    except requests.exceptions.RequestException as e:
        try:
            err_message = str(e.reason)
        except AttributeError:
            err_message = str(e)
        logger.warning('URL error: {}'.format(err_message))
        raise HTTPError(
            message=err_message, status_code=None,
            request_url=url, response=None)

    logger.info('Downloaded ok - %s', printable_file_size(length))
    file_hash = m.hexdigest()
    tmp_file.seek(0)
    return tmp_file, file_hash
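
get_tmp_file is used throughout these examples but never shown. A minimal sketch of what it might look like (an assumption for illustration, not necessarily the project's actual helper):

import tempfile

def get_tmp_file(url):
    # Derive a filename suffix from the URL, dropping any query string/fragment
    filename = url.split('/')[-1].split('#')[0].split('?')[0]
    # NamedTemporaryFile opens in binary mode ('w+b'), which matches the
    # byte chunks written by response.iter_content() above
    return tempfile.NamedTemporaryFile(suffix=filename)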
Example #7
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status

    (datapusher called this function 'push_to_datastore')
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set up logging to the db
    handler = StoringHandler(job_id, input)
    level = logging.DEBUG
    handler.setLevel(level)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    except (JobError, ObjectNotFound) as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # download resource
    tmp_file, file_hash = _download_resource_data(resource, data, api_key,
                                                  logger)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info('Ignoring resource - the file hash hasn\'t changed: '
                    '{hash}.'.format(hash=file_hash))
        return
    logger.info('File hash: {}'.format(file_hash))
    resource['hash'] = file_hash

    def direct_load():
        fields = loader.load_csv(
            tmp_file.name,
            resource_id=resource['id'],
            resource_alias=resource['name'],
            mimetype=resource.get('format'),
            logger=logger)
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        job_dict['status'] = 'running_but_viewable'
        callback_xloader_hook(result_url=input['result_url'],
                              api_key=api_key,
                              job_dict=job_dict)
        logger.info('Data now available to users: {}'.format(resource_ckan_url))
        loader.create_column_indexes(
            fields=fields,
            resource_id=resource['id'],
            logger=logger)
        update_resource(resource={'id': resource['id'], 'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(resource['hash']))

    def messytables_load():
        try:
            loader.load_table(tmp_file.name,
                              resource_id=resource['id'],
                              resource_alias=resource['name'],
                              mimetype=resource.get('format'),
                              logger=logger)
        except JobError as e:
            logger.error('Error during messytables load: {}'.format(e))
            raise
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        logger.info('Finished loading with messytables')
        update_resource(resource={'id': resource['id'], 'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(resource['hash']))

    # Load it
    logger.info('Loading CSV')
    just_load_with_messytables = asbool(config.get(
        'ckanext.xloader.just_load_with_messytables', False))
    logger.info("'Just load with messytables' mode is: {}".format(
        just_load_with_messytables))
    try:
        if just_load_with_messytables:
            messytables_load()
        else:
            try:
                direct_load()
            except JobError as e:
                logger.warning('Load using COPY failed: {}'.format(e))
                logger.info('Trying again with messytables')
                messytables_load()
    except FileCouldNotBeLoadedError as e:
        logger.warning('Loading excerpt for this format not supported.')
        logger.error('Loading file raised an error: {}'.format(e))
        raise JobError('Loading file raised an error: {}'.format(e))

    tmp_file.close()

    logger.info('Express Load completed')
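
These job functions run inside a background worker: get_current_job() above comes from the rq library. In CKAN itself the job is normally enqueued through CKAN's background-jobs machinery, but a bare-bones RQ sketch of how such a function could be enqueued looks like this (the queue name and payload are illustrative assumptions):

from redis import Redis
from rq import Queue

queue = Queue('xloader', connection=Redis())
# job_input is a payload of the shape checked by validate_input();
# the second argument is the job_dict the function updates as it runs
queue.enqueue(xloader_data_into_datastore_, job_input, {'status': 'pending'})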