Esempio n. 1
0
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            assert redis.dbsize() > num_keys

            queue.purge_queues()

            assert redis.get('ckanext-harvest:some-random-key') == 'foobar'
            assert redis.dbsize() == num_keys
            assert redis.llen(queue.get_gather_routing_key()) == 0
            assert redis.llen(queue.get_fetch_routing_key()) == 0
        finally:
            redis.delete('ckanext-harvest:some-random-key')
Esempio n. 2
0
    def test_redis_corrupt(self, mock_log_error):
        '''
        Test that corrupt Redis doesn't stop harvest process and still processes other jobs.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key-2', 'foobar')

            # make sure queues/exchanges are created first and are empty
            gather_consumer = queue.get_gather_consumer()
            fetch_consumer = queue.get_fetch_consumer()
            gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
            fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

            # Create some fake jobs and objects with no harvest_job_id
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': None})
            h_obj_id = str(uuid.uuid4())
            fetch_publisher.send({'harvest_object_id': h_obj_id})

            # Create some fake objects
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            _, _, body = next(
                fetch_consumer.consume(queue.get_fetch_queue_name()))

            json_obj = json.loads(body)
            assert json_obj['harvest_object_id'] == h_obj_id

            assert mock_log_error.call_count == 1
            args, _ = mock_log_error.call_args_list[0]
            if six.PY2:
                assert "cannot concatenate 'str' and 'NoneType' objects" in args[
                    1]
            else:
                assert "must be str, not NoneType" in str(args[1])

        finally:
            redis.delete('ckanext-harvest:some-random-key-2')
Esempio n. 3
0
def gather_consumer():
    import logging
    from ckanext.harvest.queue import (
        get_gather_consumer,
        gather_callback,
        get_gather_queue_name,
    )

    logging.getLogger("amqplib").setLevel(logging.INFO)
    consumer = get_gather_consumer()
    for method, header, body in consumer.consume(
            queue=get_gather_queue_name()):
        gather_callback(consumer, method, header, body)
Esempio n. 4
0
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            raise SkipTest()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            ok_(redis.dbsize() > num_keys)

            queue.purge_queues()

            assert_equal(redis.get('ckanext-harvest:some-random-key'),
                         'foobar')
            assert_equal(redis.dbsize(), num_keys)
            assert_equal(redis.llen(queue.get_gather_routing_key()), 0)
            assert_equal(redis.llen(queue.get_fetch_routing_key()), 0)
        finally:
            redis.delete('ckanext-harvest:some-random-key')
Esempio n. 5
0
    def test_01_basic_harvester(self):

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }

        source_dict = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': 'basic_test',
            'source_type': 'test',
        }

        harvest_source = logic.get_action('harvest_source_create')(context,
                                                                   source_dict)

        assert harvest_source['source_type'] == 'test', harvest_source
        assert harvest_source['url'] == 'basic_test', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'

        assert len(model.Session.query(HarvestObject).all()) == 3
        assert len(model.Session.query(HarvestObjectExtra).all()) == 1

        # do three times as three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(
            current=True).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[1].state == 'COMPLETE'
        assert all_objects[1].report_status == 'added'
        assert all_objects[2].state == 'COMPLETE'
        assert all_objects[2].report_status == 'added'

        # fire run again to check if job is set to Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {
            'added': 3,
            'updated': 0,
            'not modified': 0,
            'errored': 0,
            'deleted': 0
        }

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 3,
            'updated': 0,
            'not modified': 0,
            'errored': 0,
            'deleted': 0
        }
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # Second run
        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']
        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='added').all()
        assert len(all_objects) == 3

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='updated').all()
        assert len(all_objects) == 2

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='deleted').all()
        assert len(all_objects) == 1

        # run to make sure job is marked as finshed
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })
        assert harvest_job['stats'] == {
            'added': 0,
            'updated': 2,
            'not modified': 0,
            'errored': 0,
            'deleted': 1
        }

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 0,
            'updated': 2,
            'not modified': 0,
            'errored': 0,
            'deleted': 1
        }
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2
Esempio n. 6
0
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True
        }
        self.admin_user = get_action('get_site_user')(context, {})

        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            if len(self.args) > 2:
                self.create_harvest_source()
            else:
                self.show_harvest_source()
        elif cmd == 'rmsource':
            self.remove_harvest_source()
        elif cmd == 'clearsource':
            self.clear_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'job_abort':
            self.job_abort()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'run_test':
            self.run_test_harvest()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import (get_gather_consumer,
                                               gather_callback,
                                               get_gather_queue_name)
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(
                    queue=get_gather_queue_name()):
                gather_callback(consumer, method, header, body)
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import (get_fetch_consumer,
                                               fetch_callback,
                                               get_fetch_queue_name)
            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(
                    queue=get_fetch_queue_name()):
                fetch_callback(consumer, method, header, body)
        elif cmd == 'purge_queues':
            from ckanext.harvest.queue import purge_queues
            purge_queues()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'reindex':
            self.reindex()
        elif cmd == 'clean_harvest_log':
            self.clean_harvest_log()
        else:
            print 'Command %s not recognized' % cmd
Esempio n. 7
0
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {'model':model,'session':model.Session,'ignore_auth':True}
        self.admin_user = get_action('get_site_user')(context,{})


        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            if len(self.args) > 2:
                self.create_harvest_source()
            else:
                self.show_harvest_source()
        elif cmd == 'rmsource':
            self.remove_harvest_source()
        elif cmd == 'clearsource':
            self.clear_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'job_abort':
            self.job_abort()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'run_test':
            self.run_test_harvest()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import (get_gather_consumer,
                gather_callback, get_gather_queue_name)
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(queue=get_gather_queue_name()):
                gather_callback(consumer, method, header, body)
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import (get_fetch_consumer, fetch_callback,
                get_fetch_queue_name)
            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(queue=get_fetch_queue_name()):
               fetch_callback(consumer, method, header, body)
        elif cmd == 'purge_queues':
            from ckanext.harvest.queue import purge_queues
            purge_queues()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'reindex':
            self.reindex()
        else:
            print 'Command %s not recognized' % cmd
Esempio n. 8
0
    def test_01_basic_harvester(self):

        if config.get('ckan.harvest.mq.type') == 'redis':
            # make sure that there are no old elements in the redis db
            redis = queue.get_connection()
            redis.flushdb()

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

        # do three times as three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[1].state == 'COMPLETE'
        assert all_objects[1].report_status == 'added'
        assert all_objects[2].state == 'COMPLETE'
        assert all_objects[2].report_status == 'added'

        # fire run again to check if job is set to Finished
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # Second run
        harvest_job = toolkit.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']
        assert toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        # pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
        assert len(all_objects) == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
        assert len(all_objects) == 2

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
        assert len(all_objects) == 1

        # run to make sure job is marked as finshed
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert harvest_job['stats'] == {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2
Esempio n. 9
0
    def test_resubmit_objects(self):
        '''
        Test that only harvest objects re-submitted which were not be present in the redis fetch queue.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        fetch_routing_key = queue.get_fetch_routing_key()
        redis.flushdb()
        try:
            # make sure queues/exchanges are created first and are empty
            consumer = queue.get_gather_consumer()
            consumer_fetch = queue.get_fetch_consumer()
            consumer.queue_purge(queue=queue.get_gather_queue_name())
            consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

            user = toolkit.get_action('get_site_user')(
                {'model': model, 'ignore_auth': True}, {}
            )['name']

            context = {'model': model, 'session': model.Session,
                       'user': user, 'api_version': 3, 'ignore_auth': True}

            harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

            assert redis.llen(fetch_routing_key) == 3

            # do only one time for the first harvest object
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

            count = model.Session.query(model.Package) \
                .filter(model.Package.type == 'dataset') \
                .count()
            assert count == 1

            all_objects = model.Session.query(HarvestObject).order_by(HarvestObject.state.asc()).all()
            assert len(all_objects) == 3
            assert all_objects[0].state == 'COMPLETE'
            assert all_objects[0].report_status == 'added'
            assert all_objects[0].current is True
            assert all_objects[1].state == 'WAITING'
            assert all_objects[1].current is False
            assert all_objects[2].state == 'WAITING'
            assert all_objects[2].current is False

            assert len(redis.keys(fetch_routing_key + ':*')) == 0
            assert redis.llen(fetch_routing_key) == 2

            # Remove one object from redis that should be re-sent to the fetch queue
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert len(fetch_queue_items) == 1
            harvest_object_id = reply[2]
            assert fetch_queue_items[0] != harvest_object_id

            queue.resubmit_objects()

            assert redis.llen(fetch_routing_key) == 2
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert harvest_object_id in fetch_queue_items
            assert redis.dbsize() == 1
        finally:
            redis.flushdb()
Esempio n. 10
0
    def test_01_basic_harvester(self):

        ### make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())


        user = logic.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        source_dict = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': 'basic_test',
            'source_type': 'test',
        }

        harvest_source = logic.get_action('harvest_source_create')(
            context,
            source_dict
        )

        assert harvest_source['source_type'] == 'test', harvest_source
        assert harvest_source['url'] == 'basic_test', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        ## pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'


        assert len(model.Session.query(HarvestObject).all()) == 3
        assert len(model.Session.query(HarvestObjectExtra).all()) == 1

        ## do three times as three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
                .filter(model.Package.type=='dataset') \
                .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

        assert_equal(len(all_objects), 3)
        assert_equal(all_objects[0].state, 'COMPLETE')
        assert_equal(all_objects[0].report_status, 'added')
        assert_equal(all_objects[1].state, 'COMPLETE')
        assert_equal(all_objects[1].report_status, 'added')
        assert_equal(all_objects[2].state, 'COMPLETE')
        assert_equal(all_objects[2].report_status, 'added')

        ## fire run again to check if job is set to Finished
        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id':harvest_source['id']}
        )

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert_equal(harvest_job['status'], u'Finished')
        assert_equal(harvest_job['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0})

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0})
        assert_equal(harvest_source_dict['status']['total_datasets'], 3)
        assert_equal(harvest_source_dict['status']['job_count'], 1)


        ########### Second run ########################
        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']
        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        ## pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
                .filter(model.Package.type=='dataset') \
                .count()
        assert_equal(count, 3)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
        assert_equal(len(all_objects), 3)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
        assert_equal(len(all_objects), 2)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
        assert_equal(len(all_objects), 1)

        # run to make sure job is marked as finshed
        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id':harvest_source['id']}
        )

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert_equal(harvest_job['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1})

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1})
        assert_equal(harvest_source_dict['status']['total_datasets'], 2)
        assert_equal(harvest_source_dict['status']['job_count'], 2)
Esempio n. 11
0
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {"model": model, "session": model.Session, "ignore_auth": True}
        self.admin_user = get_action("get_site_user")(context, {})

        print ""

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == "source":
            if len(self.args) > 2:
                self.create_harvest_source()
            else:
                self.show_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == "clearsource":
            self.clear_harvest_source()
        elif cmd == "clearsource_history":
            self.clear_harvest_source_history()
        elif cmd == "sources":
            self.list_harvest_sources()
        elif cmd == "job":
            self.create_harvest_job()
        elif cmd == "jobs":
            self.list_harvest_jobs()
        elif cmd == "job_abort":
            self.job_abort()
        elif cmd == "run":
            self.run_harvester()
        elif cmd == "run_test":
            self.run_test_harvest()
        elif cmd == "gather_consumer":
            import logging
            from ckanext.harvest.queue import get_gather_consumer, gather_callback, get_gather_queue_name

            logging.getLogger("amqplib").setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(queue=get_gather_queue_name()):
                gather_callback(consumer, method, header, body)
        elif cmd == "fetch_consumer":
            import logging

            logging.getLogger("amqplib").setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer, fetch_callback, get_fetch_queue_name

            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(queue=get_fetch_queue_name()):
                fetch_callback(consumer, method, header, body)
        elif cmd == "purge_queues":
            self.purge_queues()
        elif cmd == "initdb":
            self.initdb()
        elif cmd == "import":
            self.initdb()
            self.import_stage()
        elif cmd == "job-all":
            self.create_harvest_job_all()
        elif cmd == "harvesters-info":
            harvesters_info = get_action("harvesters_info_show")()
            pprint(harvesters_info)
        elif cmd == "reindex":
            self.reindex()
        elif cmd == "clean_harvest_log":
            self.clean_harvest_log()
        else:
            print "Command %s not recognized" % cmd
Esempio n. 12
0
    def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
            self):

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }

        source_dict = {
            'title': 'Test Job Finished',
            'name': 'test-job-finished',
            'url': 'basic_test_1',
            'source_type': 'test-nose',
        }

        harvest_source = logic.get_action('harvest_source_create')(context,
                                                                   source_dict)

        assert harvest_source['source_type'] == 'test-nose', harvest_source
        assert harvest_source['url'] == 'basic_test_1', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop on item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'

        # artificially set the job to finished to simulate a job abort or timeout
        job_obj = HarvestJob.get(harvest_job['id'])
        job_obj.status = 'Finished'
        job_obj.save()

        original_dataset_count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()

        # do three times as three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'ERROR'
        assert all_objects[1].state == 'ERROR'
        assert all_objects[2].state == 'ERROR'

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == original_dataset_count

        # fire run again to check if job is set to Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })

        assert_equal(harvest_job['status'], u'Finished')
        assert_equal(
            harvest_job['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert_equal(
            harvest_source_dict['status']['last_job']['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })
        assert_equal(harvest_source_dict['status']['total_datasets'], 0)
        assert_equal(harvest_source_dict['status']['job_count'], 1)