Example 1
def _save_gather_error(self, message, job):
    '''
    Helper function to create an error during the gather stage.
    '''
    err = HarvestGatherError(message=message, job=job)
    err.save()
    log.error(message)
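
In a harvester plugin this helper is typically called from `gather_stage` when the remote source cannot be processed. A minimal sketch, assuming a hypothetical `_fetch_source` helper and record format, and relying on the convention (visible in the `gather_callback` examples below) that a non-list return value marks the gather stage as failed:

    def gather_stage(self, harvest_job):
        # Sketch only: _fetch_source and record['id'] are assumptions.
        try:
            records = self._fetch_source(harvest_job.source.url)
        except IOError as e:
            self._save_gather_error(
                'Unable to read source %s: %s' % (harvest_job.source.url, e),
                harvest_job)
            return None  # gather_callback treats a non-list result as failure
        ids = []
        for record in records:
            # One HarvestObject per remote record, as in the examples below
            obj = HarvestObject(guid=record['id'], job=harvest_job)
            obj.save()
            ids.append(obj.id)
        return ids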
Example 2
 def _save_gather_error(self, message, job):
     err = HarvestGatherError(message=message, job=job)
     try:
         err.save()
     except InvalidRequestError:
         Session.rollback()
         err.save()
     finally:
         log.error(message)
Example 3
def gather_callback(message_data, message):
    try:
        id = message_data['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)

        # Get rid of any old session state that may still be around. This is
        # a simple alternative to creating a new session for this callback.
        model.Session.expire_all()

        # Get a publisher for the fetch queue
        publisher = get_fetch_publisher()

        try:
            job = HarvestJob.get(id)
            if not job:
                log.error('Harvest job does not exist: %s' % id)
                return

            # Send the harvest job to the plugins that implement
            # the Harvester interface, only if the source type
            # matches
            harvester_found = False
            for harvester in PluginImplementations(IHarvester):
                if harvester.info()['name'] == job.source.type:
                    harvester_found = True
                    # Get a list of harvest object ids from the plugin
                    job.gather_started = datetime.datetime.now()
                    harvest_object_ids = harvester.gather_stage(job)
                    job.gather_finished = datetime.datetime.now()
                    job.save()
                    log.debug("Received from plugin's gather_stage: %r"
                              % harvest_object_ids)
                    if harvest_object_ids and len(harvest_object_ids) > 0:
                        for id in harvest_object_ids:
                            # Send the id to the fetch queue
                            publisher.send({'harvest_object_id': id})
                            log.debug('Sent object %s to the fetch queue' % id)

            if not harvester_found:
                msg = 'No harvester could be found for source type %s' % job.source.type
                err = HarvestGatherError(message=msg, job=job)
                err.save()
                log.error(msg)

            job.status = u'Finished'
            job.save()

        finally:
            publisher.close()

    except KeyError:
        log.error('No harvest job id received')
    finally:
        message.ack()
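
The `(message_data, message)` signature above matches a carrot/kombu-style consumer, which decodes the payload before invoking the callback. A sketch of how such a callback might be registered, assuming kombu; the queue name and broker URL are illustrative:

    from kombu import Connection, Consumer, Queue

    gather_queue = Queue('ckan.harvest.gather')

    with Connection('amqp://guest:guest@localhost//') as conn:
        # kombu passes (decoded_body, message) to each callback,
        # matching gather_callback's signature.
        with Consumer(conn.channel(), [gather_queue],
                      callbacks=[gather_callback], accept=['json']):
            while True:
                conn.drain_events()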
Example 4
    def test_error_mail_sent(self, mock_mailer_mail_recipient):
        context, harvest_source, job = self._create_harvest_source_and_job_if_not_existing()

        # create a HarvestGatherError
        job_model = HarvestJob.get(job['id'])
        msg = 'System error - No harvester could be found for source type %s' % job_model.source.type
        err = HarvestGatherError(message=msg, job=job_model)
        err.save()

        status = toolkit.get_action('harvest_source_show_status')(context, {'id': harvest_source['id']})

        send_error_mail(
            context,
            harvest_source['id'],
            status
        )

        assert_equal(1, status['last_job']['stats']['errored'])
        assert mock_mailer_mail_recipient.called
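
The injected `mock_mailer_mail_recipient` argument implies the test method is decorated with `mock.patch`. The patch target below is an assumption (CKAN sends notification mail through `ckan.lib.mailer`), not taken from this listing:

    from mock import patch

    @patch('ckan.lib.mailer.mail_recipient')
    def test_error_mail_sent(self, mock_mailer_mail_recipient):
        ...  # body as shown above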
Example 5
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                    len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)


def get_harvester(harvest_source_type):
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == harvest_source_type:
            return harvester


def gather_stage(harvester, job):
Example 6
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    try:
        job = HarvestJob.get(id)
    except sqlalchemy.exc.DatabaseError:
        # Occasionally we see: sqlalchemy.exc.OperationalError
        # "SSL connection has been closed unexpectedly"
        # or DatabaseError "connection timed out"
        log.exception('Connection Error during gather of job %s', id)
        # By not sending the ack, it will be retried later.
        # Try to clear the issue with a remove.
        model.Session.remove()
        return
    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester = get_harvester(job.source.type)

    if harvester:
        try:
            harvest_object_ids = gather_stage(harvester, job)
        except (Exception, KeyboardInterrupt):
            channel.basic_ack(method.delivery_tag)
            raise

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                    len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
        for id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
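
This variant's `(channel, method, header, body)` signature matches a pika blocking-connection consumer. A sketch of the wiring, with broker parameters and queue name assumed:

    import pika

    connection = pika.BlockingConnection(pika.ConnectionParameters('localhost'))
    channel = connection.channel()
    channel.queue_declare(queue='ckan.harvest.gather', durable=True)
    # auto_ack defaults to False, so the explicit basic_ack calls inside
    # gather_callback control acknowledgement (and retry on failure).
    channel.basic_consume(queue='ckan.harvest.gather',
                          on_message_callback=gather_callback)
    channel.start_consuming()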
Example 7
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set and a job has been running for longer
    than that many minutes, the job is marked as finished, as there is
    probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            if timeout:
                created = datetime.datetime.strptime(job['created'],
                                                     '%Y-%m-%d %H:%M:%S.%f')
                now = datetime.datetime.now()
                if now - created > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job timeout: %s is taking longer than %s minutes' % (
                        job['id'], timeout)
                    log.error(msg)

                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                           .filter(HarvestObject.harvest_job_id == job['id']) \
                           .filter(and_((HarvestObject.state != u'COMPLETE'),
                                        (HarvestObject.state != u'ERROR'))) \
                           .count()

                if num_objects_in_progress == 0:
                    job_obj = HarvestJob.get(job['id'])
                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    if toolkit.asbool(config.get('ckan.harvest.status_mail.errored'))\
                            and (status['last_job']['stats']['errored']):
                        send_error_mail(context, job_obj.source.id, status)
                else:
                    log.debug('Ongoing job:%s source:%s', job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    return []  # merely for backwards compatibility
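
As the docstring says, the action is meant to be triggered every few minutes, typically by cron through the harvester CLI. A minimal sketch of invoking it from extension code instead; the context is simplified, since CKAN's `get_action` wrapper fills in `model` and `session` defaults:

    from ckan.plugins import toolkit

    # Check all sources; pass {'source_id': ...} to limit to one source.
    toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})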
Example 8
def gather_callback(channel, method, header, body):
    try:
        id = json.loads(body)['harvest_job_id']
        log.debug('Received harvest job id: %s' % id)
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(id)

    if not job:
        log.error('Harvest job does not exist: %s' % id)
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester_found = False
    for harvester in PluginImplementations(IHarvester):
        if harvester.info()['name'] == job.source.type:
            harvester_found = True
            # Get a list of harvest object ids from the plugin
            job.gather_started = datetime.datetime.utcnow()

            try:
                harvest_object_ids = harvester.gather_stage(job)
            except (Exception, KeyboardInterrupt):
                channel.basic_ack(method.delivery_tag)
                harvest_objects = model.Session.query(HarvestObject).filter_by(
                    harvest_job_id=job.id
                )
                for harvest_object in harvest_objects:
                    model.Session.delete(harvest_object)
                model.Session.commit()
                raise
            finally:
                job.gather_finished = datetime.datetime.utcnow()
                job.save()

            if not isinstance(harvest_object_ids, list):
                log.error('Gather stage failed')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            if len(harvest_object_ids) == 0:
                log.info('No harvest objects to fetch')
                publisher.close()
                channel.basic_ack(method.delivery_tag)
                return False

            log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                        len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
            for id in harvest_object_ids:
                # Send the id to the fetch queue
                publisher.send({'harvest_object_id': id})
            log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    if not harvester_found:
        msg = 'No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
Example 9
            .order_by("gather_finished desc").first()
        # We thought about using the document's modified date to see if it is
        # unchanged from the previous harvest, but it's hard to tell if the
        # previous harvest was not successful due to whatever reason, so don't
        # skip the doc because of its modified date.

        # We create a new HarvestObject for each inv:Dataset within the
        # Inventory document
        ids = []
        harvested_identifiers = set()
        for dataset_node in doc.dataset_nodes():
            dataset = doc.dataset_to_dict(dataset_node)

            if dataset['identifier'] in harvested_identifiers:
                HarvestGatherError.create(
                    'Dataset with duplicate identifier "%s" - discarding'
                    % dataset['identifier'], harvest_job)
                continue
            harvested_identifiers.add(dataset['identifier'])

            guid = self.build_guid(doc_metadata['identifier'], dataset['identifier'])
            # Use the most recent modification date out of the doc and dataset,
            # since they might have forgotten to enter or update the dataset
            # date.
            dataset_last_modified = dataset['modified'] or doc_last_modified
            if dataset_last_modified and doc_last_modified:
                dataset_last_modified = max(dataset_last_modified, doc_last_modified)
            if previous:
                # object may be in the previous harvest, or an older one
                existing_object = model.Session.query(HarvestObject)\
                                       .filter_by(guid=guid)\
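
Unlike the earlier examples, this harvester records errors through `HarvestGatherError.create`. The classmethod itself is not shown in this listing; presumably it wraps the same save/rollback/log sequence as the `_save_gather_error` helper in Example 2, along these lines:

    @classmethod
    def create(cls, message, job):
        # Sketch: save the error, retrying once after a rollback if the
        # session is in a failed state, and always log the message.
        err = cls(message=message, job=job)
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)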
Example 10
def harvest_jobs_run(context, data_dict):
    '''
    Runs scheduled jobs, checks if any jobs need marking as finished, and
    resubmits queue items if needed.

    If ckan.harvest.timeout is set and a job has been running for longer
    than that many minutes, the job is marked as finished, as there is
    probably an underlying issue with the harvest process.

    This should be called every few minutes (e.g. by a cron), or else jobs
    will never show as finished.

    This used to also 'run' new jobs created by the web UI, putting them onto
    the gather queue, but now this is done by default when you create a job. If
    you need to do this explicitly, then use
    ``harvest_send_job_to_gather_queue``.

    :param source_id: the id of the harvest source, if you just want to check
                      for its finished jobs (optional)
    :type source_id: string
    '''
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)
    timeout = config.get('ckan.harvest.timeout')

    session = context['session']

    source_id = data_dict.get('source_id')

    # Scheduled jobs
    if not source_id:
        _make_scheduled_jobs(context, data_dict)

    context['return_objects'] = False

    # Flag finished jobs as such
    jobs = harvest_job_list(context, {
        'source_id': source_id,
        'status': u'Running'
    })
    if len(jobs):
        for job in jobs:
            job_obj = HarvestJob.get(job['id'])
            if timeout:
                last_time = job_obj.get_last_action_time()
                now = datetime.datetime.utcnow()
                if now - last_time > datetime.timedelta(minutes=int(timeout)):
                    msg = 'Job {} timeout ({} minutes)\n'.format(
                        job_obj.id, timeout)
                    msg += '\tJob created: {}\n'.format(job_obj.created)
                    msg += '\tJob gather finished: {}\n'.format(
                        job_obj.gather_finished)
                    msg += '\tJob last action time: {}\n'.format(last_time)

                    job_obj.status = u'Finished'
                    job_obj.finished = now
                    job_obj.save()

                    err = HarvestGatherError(message=msg, job=job_obj)
                    err.save()
                    log.info('Marking job as finished due to error: %s %s',
                             job_obj.source.url, job_obj.id)
                    continue

            if job['gather_finished']:
                num_objects_in_progress = \
                    session.query(HarvestObject.id) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(and_((HarvestObject.state != u'COMPLETE'),
                                     (HarvestObject.state != u'ERROR'))) \
                        .count()

                if num_objects_in_progress == 0:

                    job_obj.status = u'Finished'
                    log.info('Marking job as finished %s %s',
                             job_obj.source.url, job_obj.id)

                    # save the time of finish, according to the last running
                    # object
                    last_object = session.query(HarvestObject) \
                        .filter(HarvestObject.harvest_job_id == job['id']) \
                        .filter(
                        HarvestObject.import_finished != None  # noqa: E711
                    ).order_by(HarvestObject.import_finished.desc()) \
                        .first()
                    if last_object:
                        job_obj.finished = last_object.import_finished
                    else:
                        job_obj.finished = job['gather_finished']
                    job_obj.save()

                    # Reindex the harvest source dataset so it has the latest
                    # status
                    get_action('harvest_source_reindex')(
                        context, {
                            'id': job_obj.source.id
                        })

                    status = get_action('harvest_source_show_status')(
                        context, {
                            'id': job_obj.source.id
                        })

                    notify_all = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.all'))
                    notify_errors = toolkit.asbool(
                        config.get('ckan.harvest.status_mail.errored'))
                    last_job_errors = status['last_job']['stats'].get(
                        'errored', 0)
                    log.debug(
                        'Notifications: All:{} On error:{} Errors:{}'.format(
                            notify_all, notify_errors, last_job_errors))

                    if last_job_errors > 0 and (notify_all or notify_errors):
                        # send_error_mail_ncar(context, job_obj)
                        # get_mail_extra_vars(context, job_obj.source.id, status)
                        send_error_email(context, job_obj.source.id, status)
                    elif notify_all:
                        send_summary_email(context, job_obj.source.id, status)
                else:
                    log.debug('%d Ongoing jobs for %s (source:%s)',
                              num_objects_in_progress, job['id'],
                              job['source_id'])
    log.debug('No jobs to send to the gather queue')

    # Resubmit old redis tasks
    resubmit_jobs()

    # Resubmit pending objects missing from Redis
    resubmit_objects()

    # log.debug('Start of commit and close')
    # session.commit()
    # log.debug('  (Start of close)')
    # session.close()
    # log.debug('End of commit and close')

    return []  # merely for backwards compatibility
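
This variant computes the timeout from `job_obj.get_last_action_time()` rather than the job's creation time, so long-running but still-active jobs are not cut off. The method is not shown in this listing; a plausible implementation, offered only as a sketch, would return the most recent of the job's timestamps:

    def get_last_action_time(self):
        # Sketch, not taken from this listing: latest of job creation,
        # gather finish, and the newest object's import time.
        last_time = self.created
        if self.gather_finished and self.gather_finished > last_time:
            last_time = self.gather_finished
        last_object = Session.query(HarvestObject) \
            .filter(HarvestObject.harvest_job_id == self.id) \
            .filter(
                HarvestObject.import_finished != None  # noqa: E711
            ).order_by(HarvestObject.import_finished.desc()).first()
        if last_object and last_object.import_finished > last_time:
            last_time = last_object.import_finished
        return last_time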
Example 11
    def gather_stage(self, harvest_job):
        '''
        Analyze the source: return a list of IDs and create one
        HarvestObject per dataset.
        '''
        logger.info('Starts Gather SIU Transp')
        # load paths
        self.set_paths()
        self.siu_data_lib.get_query_files()

        # basic things you'll need
        self.source = harvest_job.source
        self.source_config = json.loads(self.source.config)

        # allow to get config from URL
        # Sample: https://raw.githubusercontent.com/avdata99/ckan-env/develop/docs/full_config.json
        config_from_url = self.source_config.get('from_url', None)
        if config_from_url is not None:
            logger.info('Updating config from URL')
            response = requests.get(config_from_url)
            update_config = response.json()
            self.source_config.update(update_config)

        self.siu_data_lib.base_url = self.source.url
        self.siu_data_lib.username = self.source_config['username']
        self.siu_data_lib.password = self.source_config['password']
        
        # ####################################
        # get previous harvested packages
        pfr = self.get_packages_for_source(harvest_source_id=self.source.id)
        prev_names = [pkg['name'] for pkg in pfr['results']]
        logger.info('Get previous harvested objects {}'.format(prev_names))
        # TODO
        # ####################################
        
        object_ids = []  # list of IDs to process; returned by this function
        
        self.source_dataset = get_harvest_source(self.source.id)
        owner_org = self.source_dataset.get('owner_org')
        logger.info('Gather SIU Transp to ORG {}'.format(owner_org))
        
        # Iterate over each query to obtain different datasets
        # Each file in siu_transp_data/queries will generate multiple datasets to publish

        report = []  # summary of all results
        logger.info('Iter files')
        
        # check whether the config asks to override metadata in each file's datasets
        override = self.source_config.get('override', {})
        logger.info("General override {}".format(override))
            
        for qf in self.siu_data_lib.query_files:
            only_files = self.source_config.get('only_files', None)
            query_file_name = qf.split('/')[-1]
            if only_files is not None:
                if query_file_name not in only_files:
                    logger.info('Skipping file by config {}'.format(query_file_name))
                    continue
            
            logger.info('Gather SIU Transp FILE {}'.format(qf))
            stqf = SIUTranspQueryFile(portal=self.siu_data_lib, path=qf)
            # open to read query params
            stqf.open()
            # request all data
            stqf.request_all(results_folder_path=self.results_folder_path)
            for err in stqf.errors:
                hgerr = HarvestGatherError(message=err, job=harvest_job)
                hgerr.save()


            # ====== Prepare dict to override datasets metadata ============
            override_this = override.get(query_file_name, {})
            logger.info("To override {}: {}".format(query_file_name, override_this))
            
            # extras need to be {"key": "extra name", "value": "extra value"}
            extras = override_this.get('extras', {})
            new_extras = []
            for extra_key, extra_value in extras.iteritems():
                logger.info("Override extra found {}: {}".format(extra_key, extra_value))
                if not isinstance(extra_value, str):
                    extra_value = str(extra_value)
                new_extras.append({"key": extra_key, "value": extra_value})
            
            if len(new_extras) > 0:
                override_this['extras'] = new_extras

            # tags need to be {"name": "tag name"}
            tags = override_this.get('tags', [])
            new_tags = []
            for tag in tags:
                logger.info("Override tag found {}".format(unicode(tag).encode("utf-8")))
                new_tags.append({"name": tag})
            
            if len(new_tags) > 0:
                override_this['tags'] = new_tags

            # groups need to be {"name": "group name"}
            groups = override_this.get('groups', [])
            new_groups = []
            for group in groups:
                logger.info("Override group found {}".format(group))
                # create the group if it does not exist yet; errors are logged and skipped
                context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
                try:
                    p.toolkit.get_action('group_create')(context, {"name": group})
                except Exception as e:
                    logger.error('Error creating group (skipped) {}: {}'.format(group, e))
                    
                new_groups.append({"name": group})
            
            if len(new_groups) > 0:
                override_this['groups'] = new_groups

            # ================================
                
            report += stqf.requests
            for dataset in stqf.datasets:
                if dataset['name'] in prev_names:
                    action = 'update'
                    # leave this list just with packages to remove
                    prev_names.remove(dataset['name'])
                else:
                    action = 'create'
                logger.info('Dataset {} to {}'.format(dataset['name'], action))
                ho_dict = {
                    'title': dataset['title'],
                    'name': dataset['name'],
                    'owner_org': owner_org,
                    'notes': dataset['notes'],
                    'tags': dataset['tags'],
                    'resources': dataset['resources'],
                    'action': action
                }

                # apply any configured metadata overrides
                ho_dict.update(override_this)
                logger.info("Overridden ho_dict {}".format(ho_dict))
                    

                # Each harvest object will be passed to other stages in harvest process
                obj = HarvestObject(guid=dataset['name'],
                                    job=harvest_job,
                                    content=json.dumps(ho_dict))
                obj.save()
                logger.info('Objects ID appends {}'.format(obj.id))
                object_ids.append(obj.id)

        # TODO compare with previous harvested data to remove dataset no more at harvest source

        # final summary
        logger.info('REQUESTS: \n{}'.format('\n\t'.join(report)))
        return object_ids
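
The gather stage above takes everything it needs from the source's JSON config: `username`, `password`, an optional `from_url` to merge remote settings, `only_files` to restrict which query files run, and per-file `override` blocks for extras, tags and groups. A sketch of a config this code would accept, with illustrative values:

    {
        "username": "siu-api-user",
        "password": "secret",
        "only_files": ["presupuesto.json"],
        "override": {
            "presupuesto.json": {
                "extras": {"source_system": "SIU"},
                "tags": ["transparencia"],
                "groups": ["presupuesto"]
            }
        }
    }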