Example #1
def run_asynchronous_job(job, job_id, job_key, input):
    if not scheduler.running:
        scheduler.start()
    # Record the job as pending; a duplicate job_id violates the table's
    # uniqueness constraint and surfaces as an IntegrityError.
    try:
        db.add_pending_job(job_id, job_key, **input)
    except sa.exc.IntegrityError:
        error_string = 'job_id {} already exists'.format(job_id)
        return json.dumps({"error": error_string}), 409, headers

    # Queue the job on the background scheduler; RunNowTrigger fires
    # immediately rather than on a cron-style schedule.
    scheduler.add_job(RunNowTrigger(), job, [job_id, input], None)

    return job_status(job_id=job_id, show_job_key=True, ignore_auth=True)
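
Every example on this page treats db.add_pending_job as an insert that fails loudly when the job_id already exists. The db module itself isn't shown here, so the sketch below is a hypothetical stand-in, assuming a SQLAlchemy table with job_id as the primary key; the real module's schema and session handling will differ.

import json

import sqlalchemy as sa

# Hypothetical stand-in for the db module used above; the table name and
# columns are assumptions for illustration, not the real schema.
metadata = sa.MetaData()
jobs_table = sa.Table(
    'jobs', metadata,
    # The primary key is what makes a duplicate job_id raise IntegrityError.
    sa.Column('job_id', sa.UnicodeText, primary_key=True),
    sa.Column('job_key', sa.UnicodeText),
    sa.Column('data', sa.UnicodeText),
)

engine = sa.create_engine('sqlite://')
metadata.create_all(engine)


def add_pending_job(job_id, job_key=None, **kwargs):
    '''Insert a pending job row; raises sa.exc.IntegrityError if the
    job_id already exists.'''
    with engine.begin() as conn:
        conn.execute(jobs_table.insert().values(
            job_id=job_id,
            job_key=job_key,
            data=json.dumps(kwargs),
        ))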
Example #2
def run_synchronous_job(job, job_id, job_key, input):
    # A duplicate job_id violates the table's uniqueness constraint,
    # so report a 409 Conflict rather than running the job twice.
    try:
        db.add_pending_job(job_id, job_key, **input)
    except sa.exc.IntegrityError:
        error_string = 'job_id {} already exists'.format(job_id)
        return json.dumps({"error": error_string}), 409, headers
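
The json.dumps(...), 409, headers return value is the Flask convention of returning a (body, status, headers) tuple from a view; the headers object comes from the surrounding module and isn't shown. A minimal, self-contained version of the same 409 Conflict response (the route and names here are illustrative only):

import json

from flask import Flask

app = Flask(__name__)
headers = {'Content-Type': 'application/json'}


@app.route('/job/<job_id>', methods=['POST'])
def submit_job(job_id):
    # Stand-in duplicate check; the snippets above catch
    # sa.exc.IntegrityError from db.add_pending_job instead.
    if job_id == 'existing-job':
        error_string = 'job_id {} already exists'.format(job_id)
        # Flask unpacks a (body, status, headers) tuple into a response.
        return json.dumps({"error": error_string}), 409, headers
    return json.dumps({"status": "pending"}), 200, headers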
Example #3
def xloader_data_into_datastore_(input, job_dict):
    '''This function:
    * downloads the resource (metadata) from CKAN
    * downloads the data
    * calls the loader to load the data into DataStore
    * calls back to CKAN with the new status

    (datapusher called this function 'push_to_datastore')
    '''
    job_id = get_current_job().id
    db.init(config)

    # Store details of the job in the db
    try:
        db.add_pending_job(job_id, **input)
    except sa.exc.IntegrityError:
        raise JobError('job_id {} already exists'.format(job_id))

    # Set up logging to the db
    handler = StoringHandler(job_id, input)
    handler.setLevel(logging.DEBUG)
    logger = logging.getLogger(job_id)
    handler.setFormatter(logging.Formatter('%(message)s'))
    logger.addHandler(handler)
    # also show logs on stderr
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)

    validate_input(input)

    data = input['metadata']

    ckan_url = data['ckan_url']
    resource_id = data['resource_id']
    api_key = input.get('api_key')

    try:
        resource, dataset = get_resource_and_dataset(resource_id)
    except (JobError, ObjectNotFound) as e:
        # try again in 5 seconds just in case CKAN is slow at adding resource
        time.sleep(5)
        resource, dataset = get_resource_and_dataset(resource_id)
    resource_ckan_url = '/dataset/{}/resource/{}' \
        .format(dataset['name'], resource['id'])
    logger.info('Express Load starting: {}'.format(resource_ckan_url))

    # check if the resource url_type is a datastore
    if resource.get('url_type') == 'datastore':
        logger.info('Ignoring resource - url_type=datastore - dump files are '
                    'managed with the Datastore API')
        return

    # download resource
    tmp_file, file_hash = _download_resource_data(resource, data, api_key,
                                                  logger)

    if (resource.get('hash') == file_hash
            and not data.get('ignore_hash')):
        logger.info('Ignoring resource - the file hash hasn\'t changed: '
                    '{hash}.'.format(hash=file_hash))
        return
    logger.info('File hash: {}'.format(file_hash))
    resource['hash'] = file_hash

    def direct_load():
        fields = loader.load_csv(
            tmp_file.name,
            resource_id=resource['id'],
            resource_alias=resource['name'],
            mimetype=resource.get('format'),
            logger=logger)
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        job_dict['status'] = 'running_but_viewable'
        callback_xloader_hook(result_url=input['result_url'],
                              api_key=api_key,
                              job_dict=job_dict)
        logger.info('Data now available to users: {}'.format(resource_ckan_url))
        loader.create_column_indexes(
            fields=fields,
            resource_id=resource['id'],
            logger=logger)
        update_resource(resource={'id': resource['id'], 'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(resource['hash']))

    def messytables_load():
        try:
            loader.load_table(tmp_file.name,
                              resource_id=resource['id'],
                              resource_alias=resource['name'],
                              mimetype=resource.get('format'),
                              logger=logger)
        except JobError as e:
            logger.error('Error during messytables load: {}'.format(e))
            raise
        loader.calculate_record_count(
            resource_id=resource['id'], logger=logger)
        set_datastore_active(data, resource, logger)
        logger.info('Finished loading with messytables')
        update_resource(resource={'id': resource['id'], 'hash': resource['hash']},
                        patch_only=True)
        logger.info('File Hash updated for resource: {}'.format(resource['hash']))

    # Load it
    logger.info('Loading CSV')
    just_load_with_messytables = asbool(config.get(
        'ckanext.xloader.just_load_with_messytables', False))
    logger.info("'Just load with messytables' mode is: {}".format(
        just_load_with_messytables))
    try:
        if just_load_with_messytables:
            messytables_load()
        else:
            try:
                direct_load()
            except JobError as e:
                logger.warning('Load using COPY failed: {}'.format(e))
                logger.info('Trying again with messytables')
                messytables_load()
    except FileCouldNotBeLoadedError as e:
        logger.warning('Loading excerpt for this format not supported.')
        logger.error('Loading file raised an error: {}'.format(e))
        raise JobError('Loading file raised an error: {}'.format(e))

    tmp_file.close()

    logger.info('Express Load completed')
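
The per-job logger above fans out to a StoringHandler, which persists records to the database keyed by job_id, plus a StreamHandler for stderr. StoringHandler itself isn't shown on this page; any stand-in only needs to subclass logging.Handler and implement emit. A minimal sketch, with an in-memory list in place of the real database table:

import logging


class StoringHandler(logging.Handler):
    '''Hypothetical stand-in: keeps formatted records in memory, where
    the real handler writes them to a logs table in the database.'''

    def __init__(self, job_id, input):
        super().__init__()
        self.job_id = job_id
        self.input = input
        self.records = []

    def emit(self, record):
        # self.format() applies whatever Formatter was set on the
        # handler ('%(message)s' in the example above).
        self.records.append((self.job_id, self.format(record)))


# Usage mirroring the set-up in the example:
handler = StoringHandler('job-123', {})
handler.setFormatter(logging.Formatter('%(message)s'))
logger = logging.getLogger('job-123')
logger.addHandler(handler)
logger.setLevel(logging.DEBUG)
logger.info('Express Load starting')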