Example #1
def _check_for_existing_jobs(context, source_id):
    '''
    Given a source id, checks if there are jobs for this source
    with status 'New' or 'Running'

    :rtype: bool
    '''
    data_dict = {'source_id': source_id, 'status': u'New'}
    exist_new = harvest_job_list(context, data_dict)
    data_dict = {'source_id': source_id, 'status': u'Running'}
    exist_running = harvest_job_list(context, data_dict)
    exist = len(exist_new + exist_running) > 0

    return exist
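A minimal sketch of how this helper could guard job creation, reusing names from the other examples on this page; the wrapper function itself is hypothetical:

def _create_job_if_idle(context, source_id):
    # Refuse to queue duplicate work for a source that already has a
    # 'New' or 'Running' job (uses the helper above).
    if _check_for_existing_jobs(context, source_id):
        raise HarvestError('There already is an unrun job for this source')
    return harvest_job_create(context, {'source_id': source_id})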
Example #2
def harvest_job_create_all(context, data_dict):
    log.info('Harvest job create all: %r', data_dict)
    check_access('harvest_job_create_all', context, data_dict)

    data_dict.update({'only_active': True})

    # Get all active sources
    sources = harvest_source_list(context, data_dict)
    jobs = []
    # Create a new job for each, if there isn't already one
    for source in sources:
        data_dict = {
            'source_id': source['id'],
            'status': u'New'
        }

        exists = harvest_job_list(context, data_dict)
        if len(exists):
            continue

        job = harvest_job_create(context, {'source_id': source['id']})
        jobs.append(job)

    log.info('Created jobs for %i harvest sources', len(jobs))
    return jobs
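A small usage sketch, assuming a CKAN install with ckanext-harvest enabled and this function registered as the 'harvest_job_create_all' action:

import ckan.plugins.toolkit as toolkit

# Run as the site user so check_access passes; get_site_user is a core
# CKAN action.
site_user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
context = {'user': site_user['name']}

# Queue a job for every active source that does not already have one.
new_jobs = toolkit.get_action('harvest_job_create_all')(context, {})
print('Created %d new jobs' % len(new_jobs))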
Example #3
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s', source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {
        'source_id': source_id,
        'status': u'New'
    }
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
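A hedged sketch of creating a single job and handling the two failure modes raised above; the import path for HarvestError is an assumption based on a typical ckanext-harvest layout:

import ckan.plugins.toolkit as toolkit
from ckanext.harvest.logic import HarvestError  # assumed import path

def queue_job(source_id, user_name):
    context = {'user': user_name}
    try:
        return toolkit.get_action('harvest_job_create')(
            context, {'source_id': source_id})
    except toolkit.ObjectNotFound:
        # No harvest source with that id
        return None
    except HarvestError:
        # Source is inactive, or an unrun job already exists
        return None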
Example #4
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    log.info('Number of jobs: %i', len(jobs))
    sent_jobs = []
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        return sent_jobs # i.e. []
        # Do not raise an exception as that will cause cron (which runs
        # this) to produce an error email.

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()

    # Record the running in harvest_status
    log.info('%i jobs sent to the gather queue to be harvested', len(sent_jobs))

    return sent_jobs
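Note the design choice in this variant: when there is nothing to do it returns an empty list instead of raising, so a cron-driven caller stays silent on idle runs. A sketch of such a caller, with an assumed site-user context and assuming the function is registered as the 'harvest_jobs_run' action:

import ckan.plugins.toolkit as toolkit

def run_pending_harvest_jobs():
    # Build a context for the site user (assumed setup, not from the example)
    site_user = toolkit.get_action('get_site_user')({'ignore_auth': True}, {})
    context = {'user': site_user['name']}
    sent = toolkit.get_action('harvest_jobs_run')(context, {})
    if sent:
        print('Sent %d jobs to the gather queue' % len(sent))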
Example #5
def harvest_job_create_all(context, data_dict):
    log.info('Harvest job create all: %r', data_dict)
    check_access('harvest_job_create_all', context, data_dict)

    data_dict.update({'only_active': True})

    # Get all active sources
    sources = harvest_source_list(context, data_dict)
    jobs = []
    # Create a new job for each, if there isn't already one
    for source in sources:
        data_dict = {
            'source_id': source['id'],
            'status': u'New'
        }

        exists = harvest_job_list(context, data_dict)
        if len(exists):
            continue

        job = harvest_job_create(context, {'source_id': source['id']})
        jobs.append(job)

    log.info('Created jobs for %i harvest sources', len(jobs))
    return jobs
Example #6
def harvest_job_create(context, data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warn('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warn('Harvest job cannot be created for inactive source %s',
                 source_id)
        raise HarvestError('Can not create jobs on inactive sources')

    # Check if there already is an unrun job for this source
    data_dict = {'source_id': source_id, 'status': u'New'}
    exists = harvest_job_list(context, data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists,
                 source_id)
        raise HarvestError('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source

    job.save()
    log.info('Harvest job saved %s', job.id)
    return harvest_job_dictize(job, context)
Example #7
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #8
def _check_for_existing_jobs(context, source_id):
    '''
    Given a source id, checks if there are jobs for this source
    with status 'New' or 'Running'

    :rtype: bool
    '''
    data_dict = {
        'source_id': source_id,
        'status': u'New'
    }
    exist_new = harvest_job_list(context, data_dict)
    data_dict = {
        'source_id': source_id,
        'status': u'Running'
    }
    exist_running = harvest_job_list(context, data_dict)
    exist = len(exist_new + exist_running) > 0

    return exist
Example #9
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    log.info('Number of jobs: %i', len(jobs))
    sent_jobs = []
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        return sent_jobs  # i.e. []
        # Do not raise an exception as that will cause cron (which runs
        # this) to produce an error email.

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()

    # Record the running in harvest_status
    log.info('%i jobs sent to the gather queue to be harvested',
             len(sent_jobs))

    return sent_jobs
Example #10
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
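For illustration only, a hedged distillation of the rule applied above: a running job counts as finished once gathering has ended and no harvest object remains in a state other than COMPLETE or ERROR. It assumes the same HarvestObject model and SQLAlchemy session as the example:

from sqlalchemy import and_

def _job_is_finished(session, job):
    if not job['gather_finished']:
        return False
    # Count objects still being fetched or imported
    in_progress = session.query(HarvestObject.id) \
        .filter(HarvestObject.harvest_job_id == job['id']) \
        .filter(and_(HarvestObject.state != u'COMPLETE',
                     HarvestObject.state != u'ERROR')) \
        .count()
    return in_progress == 0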
Example #11
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                       and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
Example #12
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #13
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set, check whether the duration of the job is
    longer than that timeout and, if so, mark the job as finished, as there is
    probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored'))\
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
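An isolated sketch of the timeout check used in this variant: harvest_job_list returns 'created' as a string, so it is parsed before being compared against the configured number of minutes. The values in the last line are illustrative only:

import datetime

def job_timed_out(created_str, timeout_minutes):
    created = datetime.datetime.strptime(created_str,
                                         '%Y-%m-%d %H:%M:%S.%f')
    age = datetime.datetime.now() - created
    return age > datetime.timedelta(minutes=int(timeout_minutes))

# e.g. job_timed_out('2024-01-01 10:00:00.000000', 1440)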
Example #14
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"), (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #15
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Example #16
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set, check whether the duration of the job is
    longer than that timeout and, if so, mark the job as finished, as there is
    probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(and_((HarvestObject.state != u'COMPLETE'),
                                     (HarvestObject.state != u'ERROR'))) \
                        .count()

                if num_objects_in_progress == 0:

                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug('  (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
Example #17
def distributed_harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)

    check_access('distributed_harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)
    routing_key = data_dict.get('gather_routing_key', None)
    exchange_name = data_dict.get('exchange_name', None)
    fetch_routing_key = data_dict.get('fetch_routing_key', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_distributed_gather_publisher(exchange_name, routing_key)
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({
                'harvest_job_id': job['id'],
                'exchange_name': exchange_name,
                'fetch_routing_key': fetch_routing_key
            })
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
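A hedged sketch of how the extra routing parameters flow into this distributed variant; the exchange and routing key names are placeholders, the context shape is an assumption, and the matching auth function is assumed to be registered:

import ckan.model as model

context = {'model': model, 'session': model.Session, 'user': 'harvest'}
sent = distributed_harvest_jobs_run(context, {
    'exchange_name': 'ckan.harvest',          # placeholder
    'gather_routing_key': 'harvest.gather',   # placeholder
    'fetch_routing_key': 'harvest.fetch',     # placeholder
})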
Example #18
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(and_((HarvestObject.state != u'COMPLETE'),
                                       (HarvestObject.state != u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id == job['id']) \
                          .filter(HarvestObject.import_finished != None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
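A hedged sketch of the two-pass source configuration this variant reacts to: a source whose JSON config carries "datajson_collection": "parents_run" gets a follow-up job and the flag is flipped to "children_run"; any other value clears the key. The key and values come from the code above:

import json

source_config = json.loads('{"datajson_collection": "parents_run"}')
if source_config.get('datajson_collection') == 'parents_run':
    # First (parents) run finished: flag the next run as the children run
    source_config['datajson_collection'] = 'children_run'
else:
    # After the children run (or any other value), reset the key
    source_config.pop('datajson_collection', None)
print(json.dumps(source_config))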
Example #19
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job.
    If you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility