Beispiel #1
0
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise NoNewHarvestJobError('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Beispiel #2
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if config.get('ckan.harvest.status_mail') == 'errors' \
                       and status['last_job']['stats']['errored']:
                        subject, body = prepare_error_mail(
                            context, job_obj.source_id, status,
                            'emails/error_email.txt')

                        log.info('Sending error mail')
                        send_mail(context, job_obj.source.id, subject, body)

                    if config.get('ckan.harvest.status_mail') == 'all':
                        subject, body = prepare_summary_mail(
                            context, job_obj.source.id, status,
                            'emails/summary_email.txt')

                        log.info('Sending summary email')
                        send_mail(context, job_obj.source.id, subject, body)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
Beispiel #3
0
def harvest_jobs_run(context, data_dict):
    log.info("Harvest job run: %r", data_dict)
    check_access("harvest_jobs_run", context, data_dict)

    session = context["session"]

    source_id = data_dict.get("source_id", None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context["return_objects"] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"Running"})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job["gather_finished"]:
                objects = (
                    session.query(HarvestObject.id)
                    .filter(HarvestObject.harvest_job_id == job["id"])
                    .filter(and_((HarvestObject.state != u"COMPLETE"), (HarvestObject.state != u"ERROR")))
                    .order_by(HarvestObject.import_finished.desc())
                )

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job["id"])
                    job_obj.status = u"Finished"

                    last_object = (
                        session.query(HarvestObject)
                        .filter(HarvestObject.harvest_job_id == job["id"])
                        .filter(HarvestObject.import_finished != None)
                        .order_by(HarvestObject.import_finished.desc())
                        .first()
                    )
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if "extras_as_string" in context:
                        del context["extras_as_string"]
                    context.update({"validate": False, "ignore_auth": True})
                    package_dict = logic.get_action("package_show")(context, {"id": job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {"source_id": source_id, "status": u"New"})
    if len(jobs) == 0:
        log.info("No new harvest jobs.")
        raise Exception("There are no new harvesting jobs")

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context["detailed"] = False
        source = harvest_source_show(context, {"id": job["source_id"]})
        if source["active"]:
            job_obj = HarvestJob.get(job["id"])
            job_obj.status = job["status"] = u"Running"
            job_obj.save()
            publisher.send({"harvest_job_id": job["id"]})
            log.info("Sent job %s to the gather queue" % job["id"])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Beispiel #4
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckanext.harvest.timeout is set:
    Check if the duration of the job is longer than ckanext.harvest.timeout, 
    then mark that job as finished as there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored'))\
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
Beispiel #5
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckanext.harvest.timeout is set:
    Check if the duration of the job is longer than ckanext.harvest.timeout,
    then mark that job as finished as there is probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.created)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(and_((HarvestObject.state != u'COMPLETE'),
                                     (HarvestObject.state != u'ERROR'))) \
                        .count()

                if num_objects_in_progress == 0:

                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug('  (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
Beispiel #6
0
def harvest_jobs_run(context,data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run',context,data_dict)

    session = context['session']

    source_id = data_dict.get('source_id',None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context,{'source_id':source_id,'status':u'Running'})
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string'in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(context,
                            {'id': job_obj.source.id})

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context,{'id':job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Beispiel #7
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(
        context, {'source_id': source_id, 'status': u'Running'})
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {'id': job_obj.source.id})
                else:
                    log.debug('Ongoing job:%s source:%s',
                              job['id'], job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
Beispiel #8
0
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to send do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if job['gather_finished']:
                objects = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(HarvestObject.import_finished != None) \
                        .order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()
                    log.info('Marking job as finished: %s', job_obj)
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])

    # resubmit old redis tasks
    resubmit_jobs()

    return []  # merely for backwards compatibility
Beispiel #9
0
    def test_raise_child_error_and_retry(self):
        """ if a harvest job for a child fails because 
            parent still not exists we need to ensure
            this job will be retried. 
            This test emulate the case we harvest children first
            (e.g. if we have several active queues).
            Just for CKAN 2.8 env"""

        # start harvest process with gather to create harvest objects
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        self.run_gather(url=url)
        assert_equal(len(self.harvest_objects), 3)

        # create a publisher to send this objects to the fetch queue
        publisher = queue.get_fetch_publisher()

        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            assert_equal(ho.state, 'WAITING')
            log.info('HO: {}\n\tCurrent: {}'.format(ho_data['identifier'],
                                                    ho.current))
            assert_equal(ho.retry_times, 0)
            publisher.send({'harvest_object_id': ho.id})
            log.info('Harvest object sent to the fetch queue {} as {}'.format(
                ho_data['identifier'], ho.id))

        publisher.close()

        # run fetch for elements in the wrong order (first a child, the a parent)

        class FakeMethod(object):
            ''' This is to act like the method returned by AMQP'''
            def __init__(self, message):
                self.delivery_tag = message

        # get the fetch
        consumer_fetch = queue.get_fetch_consumer()
        qname = queue.get_fetch_queue_name()

        # first a child and assert to get an error
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id})
        r0 = FakeMethod(r2)
        with assert_raises(ParentNotHarvestedException):
            queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[1].retry_times, 1)
        assert_equal(self.harvest_objects[1].state, "ERROR")

        # run the parent later, like in a different queue
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id})
        r0 = FakeMethod(r2)
        queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[0].retry_times, 1)
        assert_equal(self.harvest_objects[0].state, "COMPLETE")

        # Check status on harvest objects
        # We expect one child with error, parent ok and second child still waiting
        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            idf = ho_data['identifier']
            log.info(
                '\nHO2: {}\n\tState: {}\n\tCurrent: {}\n\tGathered {}'.format(
                    idf, ho.state, ho.current, ho.gathered))
            if idf == 'OPM-ERround-0001':
                assert_equal(ho.state, 'COMPLETE')
            elif idf == 'OPM-ERround-0001-AWOL':
                assert_equal(ho.state, 'ERROR')
                ho_awol_id = ho.id
            elif idf == 'OPM-ERround-0001-Retire':
                assert_equal(ho.state, 'WAITING')
                ho_retire_id = ho.id
            else:
                raise Exception('Unexpected identifier: "{}"'.format(idf))

        # resubmit jobs and objects as harvest_jobs_run does
        # we expect the errored harvest object is in this queue
        queue.resubmit_jobs()
        queue.resubmit_objects()

        # iterate over the fetch consumer queue again and check pending harvest objects
        harvest_objects = []
        while True:
            method, header, body = consumer_fetch.basic_get(queue=qname)
            if body is None:
                break

            body_data = json.loads(body)
            ho_id = body_data.get('harvest_object_id', None)
            log.info('Adding ho_id {}'.format(ho_id))
            if ho_id is not None:
                ho = harvest_model.HarvestObject.get(ho_id)
                if ho is not None:
                    harvest_objects.append(ho)
                    content = json.loads(ho.content)
                    log.info('Harvest object found {}: {} '.format(
                        content['identifier'], ho.state))
                else:
                    log.info('Harvest object not found {}'.format(ho_id))

        ho_ids = [ho.id for ho in harvest_objects]

        # Now, we expect the waiting child and the errored one to be in the fetch queue

        log.info('Searching wainting object "Retire ID"')
        assert_in(ho_retire_id, ho_ids)

        log.info('Searching errored object "Awol ID"')
        assert_in(ho_awol_id, ho_ids)
Beispiel #10
0
def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()

                    # recreate job for datajson collection or the like.
                    source = job_obj.source
                    source_config = json.loads(source.config or '{}')
                    datajson_collection = source_config.get(
                        'datajson_collection')
                    if datajson_collection == 'parents_run':
                        new_job = HarvestJob()
                        new_job.source = source
                        new_job.save()
                        source_config['datajson_collection'] = 'children_run'
                        source.config = json.dumps(source_config)
                        source.save()
                    elif datajson_collection:
                        # reset the key if 'children_run', or anything.
                        source_config.pop("datajson_collection", None)
                        source.config = json.dumps(source_config)
                        source.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
Beispiel #11
0
def distributed_harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)

    check_access('distributed_harvest_jobs_run', context, data_dict)

    session = context['session']

    source_id = data_dict.get('source_id', None)
    routing_key = data_dict.get('gather_routing_key', None)
    exchange_name = data_dict.get('exchange_name', None)
    fetch_routing_key = data_dict.get('fetch_routing_key', None)

    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        package_index = PackageSearchIndex()
        for job in jobs:
            if job['gather_finished']:
                objects = session.query(HarvestObject.id) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(and_((HarvestObject.state!=u'COMPLETE'),
                                       (HarvestObject.state!=u'ERROR'))) \
                          .order_by(HarvestObject.import_finished.desc())

                if objects.count() == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'

                    last_object = session.query(HarvestObject) \
                          .filter(HarvestObject.harvest_job_id==job['id']) \
                          .filter(HarvestObject.import_finished!=None) \
                          .order_by(HarvestObject.import_finished.desc()) \
                          .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    job_obj.save()
                    # Reindex the harvest source dataset so it has the latest
                    # status
                    if 'extras_as_string' in context:
                        del context['extras_as_string']
                    context.update({'validate': False, 'ignore_auth': True})
                    package_dict = logic.get_action('package_show')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if package_dict:
                        package_index.index_package(package_dict)

    # resubmit old redis tasks
    resubmit_jobs()

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'New'
    })
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job to the gather queue
    publisher = get_distributed_gather_publisher(exchange_name, routing_key)
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source_id']})
        if source['active']:
            job_obj = HarvestJob.get(job['id'])
            job_obj.status = job['status'] = u'Running'
            job_obj.save()
            publisher.send({
                'harvest_job_id': job['id'],
                'exchange_name': exchange_name,
                'fetch_routing_key': fetch_routing_key
            })
            log.info('Sent job %s to the gather queue' % job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs