Example #1
    def run_gather(self, url, config_str='{}'):

        self.source = HarvestSourceObj(url=url,
                                       owner_org=self.org['id'],
                                       config=config_str)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids
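A hedged usage sketch for this helper: the URL and assertions are illustrative (not taken from this example), and it assumes a mock data.json endpoint like the ones served in the integration tests below.

    def test_gather_smoke(self):
        # illustrative URL; any served data.json endpoint works
        url = 'http://127.0.0.1:8959/collection-1-parent-2-children.data.json'
        obj_ids = self.run_gather(url)
        # on success we expect no gather errors and one id per harvest object
        assert len(self.job.gather_errors) == 0
        assert obj_ids == [ho.id for ho in self.harvest_objects]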
Example #2
    def run_gather(self, url):
        source = CSWHarvestSourceObj(url=url,
                                     owner_org=self.organization['id'])
        job = HarvestJobObj(source=source)

        self.harvester = GeoDataGovCSWHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        if len(job.gather_errors) > 0:
            raise Exception(job.gather_errors[0])

        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids
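Unlike Example #1, this variant fails fast by raising on the first gather error. A sketch of asserting that behaviour, assuming the suite runs under pytest and using an illustrative unreachable URL:

    def test_gather_unreachable_source_raises(self):
        import pytest  # assumption: pytest is available in the test env
        # illustrative dead endpoint; gather_stage should record an error
        with pytest.raises(Exception):
            self.run_gather('http://127.0.0.1:1/no-such-csw')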
Example #3
    def run_gather(self, url, source_config='{}'):

        source = SIUHarvestSourceObj(url=url,
                                     owner_org='test-org',
                                     config=source_config)

        log.info('Created source {}'.format(source))
        self.job = HarvestJobObj(source=source)
        self.harvester = SIUTransparenciaHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)

        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids
Example #4
    def run_gather(self, url, source_config):

        sc = json.loads(source_config)
        existing_profiles = [v.name for v in all_validators]
        log.info('Existing validator profiles: {}'.format(existing_profiles))
        source = WafCollectionHarvestSourceObj(url=url,
                                               owner_org='test-org',
                                               # config=source_config,
                                               **sc)
        job = HarvestJobObj(source=source)

        self.harvester = WAFCollectionHarvester()
        
        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        if len(job.gather_errors) > 0:
            raise Exception(job.gather_errors[0])
        
        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids
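Note that the JSON config string is unpacked into keyword arguments of the source factory instead of being passed as config. A minimal illustration of that mapping; the config key shown is an assumption, not taken from this example:

import json

source_config = '{"validator_profiles": ["iso19139ngdc"]}'  # illustrative
sc = json.loads(source_config)
# WafCollectionHarvestSourceObj(url=url, owner_org='test-org', **sc)
# is then equivalent to:
# WafCollectionHarvestSourceObj(url=url, owner_org='test-org',
#                               validator_profiles=['iso19139ngdc'])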
Example #5
    def run_gather(self, url, source_config):

        sc = json.loads(source_config)

        source = WafHarvestSourceObj(url=url,
                                     owner_org=self.organization['id'],
                                     config=source_config,
                                     **sc)

        log.info('Created source {}'.format(source))
        self.job = HarvestJobObj(source=source)
        self.harvester = GeoDataGovWAFHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        if len(self.job.gather_errors) > 0:
            raise Exception(self.job.gather_errors[0])

        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        # this is a list of HarvestObject IDs, one per dataset
        return obj_ids
Example #6
    def run_source(self, url):
        source = HarvestSourceObj(url=url)
        job = HarvestJobObj(source=source)

        harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = harvester.gather_stage(job)
        log.info('job.gather_errors=%s', job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        harvest_object = harvest_model.HarvestObject.get(obj_ids[0])
        log.info('ho guid=%s', harvest_object.guid)
        log.info('ho content=%s', harvest_object.content)

        # fetch stage
        log.info('FETCHING %s', url)
        result = harvester.fetch_stage(harvest_object)

        log.info('ho errors=%s', harvest_object.errors)
        log.info('result 1=%s', result)

        # import stage
        log.info('IMPORTING %s', url)
        result = harvester.import_stage(harvest_object)

        log.info('ho errors 2=%s', harvest_object.errors)
        log.info('result 2=%s', result)
        log.info('ho pkg id=%s', harvest_object.package_id)
        dataset = model.Package.get(harvest_object.package_id)
        if dataset:
            log.info('dataset name=%s', dataset.name)
        errors = harvest_object.errors

        return harvest_object, result, dataset, errors
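A hedged sketch of driving this helper from a test; the URL is illustrative, and the unpacking assumes the gather stage found at least one dataset (run_source returns None otherwise):

    def test_single_dataset_roundtrip(self):
        # illustrative URL served by a mock data.json endpoint
        url = 'http://127.0.0.1:8959/single-parent.data.json'
        harvest_object, result, dataset, errors = self.run_source(url)
        assert result is True
        assert errors == []
        assert dataset is not None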
Example #7
class TestIntegrationDataJSONHarvester23(object):
    """Integration tests using a complete CKAN 2.3 harvest stack. Unlike unit tests,
    these tests are only run on a complete CKAN 2.3 stack."""
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        cls.mock_port = 8960
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # Reset the DB and harvest tables; the mock data.json server is started in setup_class
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()

        if p.toolkit.check_ckan_version(min_version='2.8.0'):
            raise SkipTest('Just for CKAN 2.3')

    def run_gather(self, url):
        self.source = HarvestSourceObj(url=url)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url):
        self.run_gather(url)
        self.run_fetch()
        datasets = self.run_import()

        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order to the forst place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        expected_obj_ids = ['OPM-ERround-0001']

        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parent are beeing harvested first.
            When we harvest a child the parent must exists
            data.json from: https://www.opm.gov/data.json """

        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(obj_ids), 1)

        self.run_fetch()
        datasets = self.run_import()

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 1)
        titles = ['Employee Relations Roundtables']

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(child_counter, 0)

        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(obj_ids), 2)

        self.run_fetch()
        datasets = self.run_import()

        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 2)

        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ with CKAN 2.3 we divide the harvest job for collection in two steps:
            (one for parents and a second one for children).
            After finish tha parent job a new job is created for children
            """
        def ps(context, data):
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        # just for CKAN 2.3
        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        # in CKAN 2.3 we expect a new job for this source and also a change in the source config

        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and do the after-job tasks (in CKAN 2.3 this creates a new job for the children)
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        assert_equal(len(jobs), 2)
        # The old harvester goes from parents_run to children_run (a second job for the children)
        assert_equal(source_config.get('datajson_collection'), 'children_run')

        return datasets

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """

        datasets = self.get_datasets_from_2_collection()
        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(len(datasets), 2)

    def test_parent_child_counts(self):
        """ Test count for parent and children """

        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        # at CKAN 2.3 with GSA ckanext-harvest fork we expect just parents
        # after "parents_run" a new job will be raised for children
        assert_equal(child_counter, 0)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov """
        new_extras = {}
        for e in extras:
            k = e[0]
            v = e[1]
            if k == 'extras_rollup':
                extras_rollup_dict = json.loads(v)
                for rk, rv in extras_rollup_dict.items():
                    new_extras[rk] = rv
            else:
                new_extras[e[0]] = e[1]

        return new_extras
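For reference, a sketch of what fix_extras consumes and produces; the values are illustrative, only the extras_rollup key (rolled up by ckanext-geodatagov) comes from the code above:

extras = [
    ('identifier', 'OPM-ERround-0001'),
    ('extras_rollup', '{"collection_metadata": "true", "publisher": "OPM"}'),
]
# fix_extras(extras) flattens the rolled-up JSON into plain extras:
# {'identifier': 'OPM-ERround-0001',
#  'collection_metadata': 'true',
#  'publisher': 'OPM'}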
Example #8
class TestIntegrationDataJSONHarvester28(object):
    """Integration tests using a complete CKAN 2.8+ harvest stack. Unlike unit tests,
    these tests are only run on a complete CKAN 2.8 stack."""
    @classmethod
    def setup_class(cls):
        log.info('Starting mock http server')
        cls.mock_port = 8959
        mock_datajson_source.serve(cls.mock_port)

    @classmethod
    def setup(cls):
        # Reset the DB and harvest tables; the mock data.json server is started in setup_class
        reset_db()
        harvest_model.setup()
        cls.user = Sysadmin()
        cls.org = Organization()

        if not p.toolkit.check_ckan_version(min_version='2.8.0'):
            raise SkipTest('Just for CKAN 2.8')

    def run_gather(self, url, config_str='{}'):

        self.source = HarvestSourceObj(url=url,
                                       owner_org=self.org['id'],
                                       config=config_str)
        self.job = HarvestJobObj(source=self.source)

        self.harvester = DataJsonHarvester()

        # gather stage
        log.info('GATHERING %s', url)
        obj_ids = self.harvester.gather_stage(self.job)
        log.info('job.gather_errors=%s', self.job.gather_errors)
        log.info('obj_ids=%s', obj_ids)
        if obj_ids is None or len(obj_ids) == 0:
            # nothing to see
            return

        self.harvest_objects = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            log.info('ho guid=%s', harvest_object.guid)
            log.info('ho content=%s', harvest_object.content)
            self.harvest_objects.append(harvest_object)

        return obj_ids

    def run_fetch(self):
        # fetch stage

        for harvest_object in self.harvest_objects:
            log.info('FETCHING %s' % harvest_object.id)
            result = self.harvester.fetch_stage(harvest_object)

            log.info('ho errors=%s', harvest_object.errors)
            log.info('result 1=%s', result)
            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors

    def run_import(self, objects=None):
        # import stage
        datasets = []

        # allow running just a subset of the objects
        if objects is None:
            # default is all objects in the right order
            objects = self.harvest_objects
        else:
            log.info('Import custom list {}'.format(objects))

        for harvest_object in objects:
            log.info('IMPORTING %s' % harvest_object.id)
            result = self.harvester.import_stage(harvest_object)

            log.info('ho errors 2=%s', harvest_object.errors)
            log.info('result 2=%s', result)

            if not result:
                log.error(
                    'Dataset not imported: {}. Errors: {}. Content: {}'.format(
                        harvest_object.package_id, harvest_object.errors,
                        harvest_object.content))

            if len(harvest_object.errors) > 0:
                self.errors = harvest_object.errors
                harvest_object.state = "ERROR"
            else:
                harvest_object.state = "COMPLETE"
            harvest_object.save()

            log.info('ho pkg id=%s', harvest_object.package_id)
            dataset = model.Package.get(harvest_object.package_id)
            if dataset:
                datasets.append(dataset)
                log.info('dataset name=%s', dataset.name)

        return datasets

    def run_source(self, url, config_str='{}'):
        self.run_gather(url, config_str)
        self.run_fetch()
        datasets = self.run_import()

        return datasets

    def test_datajson_collection(self):
        """ harvest from a source with a parent in the second place
            We expect the gather stage to re-order to the forst place """
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        identifiers = []
        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            identifiers.append(content['identifier'])

        # We always expect the parent to be the first on the list
        expected_obj_ids = [
            'OPM-ERround-0001', 'OPM-ERround-0001-AWOL',
            'OPM-ERround-0001-Retire'
        ]
        assert_equal(expected_obj_ids, identifiers)

    def test_harvesting_parent_child_collections(self):
        """ Test that parent are beeing harvested first.
            When we harvest a child the parent must exists
            data.json from: https://www.opm.gov/data.json """

        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        assert_equal(len(obj_ids), 3)

        self.run_fetch()
        datasets = self.run_import()

        assert_equal(len(datasets), 3)
        titles = [
            'Linking Employee Relations and Retirement', 'Addressing AWOL',
            'Employee Relations Roundtables'
        ]

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            assert dataset.title in titles
            extras = self.fix_extras(dataset.extras.items())

            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            is_child = extras.get('collection_package_id', None) is not None

            log.info('Harvested dataset {} {} {}'.format(
                dataset.title, is_parent, is_child))

            if dataset.title == 'Employee Relations Roundtables':
                assert_equal(is_parent, True)
                assert_equal(is_child, False)
                parent_counter += 1
            else:
                assert_equal(is_child, True)
                assert_equal(is_parent, False)
                child_counter += 1

        assert_equal(child_counter, 2)
        assert_equal(parent_counter, 1)

    def get_datasets_from_2_collection(self):
        url = 'http://127.0.0.1:%s/collection-2-parent-4-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)

        assert_equal(len(obj_ids), 6)

        self.run_fetch()
        datasets = self.run_import()
        assert_equal(len(datasets), 6)
        return datasets

    @patch('ckanext.harvest.logic.action.update.harvest_source_show')
    def test_new_job_created(self, mock_harvest_source_show):
        """ with CKAN 2.3 we divide the harvest job for collection in two steps:
            (one for parents and a second one for children).
            After finish tha parent job a new job is created for children
            """
        def ps(context, data):
            return {
                u'id': self.source.id,
                u'title': self.source.title,
                u'state': u'active',
                u'type': u'harvest',
                u'source_type': self.source.type,
                u'active': False,
                u'name': u'test_source_0',
                u'url': self.source.url,
                u'extras': []
            }

        # just for CKAN 2.3
        mock_harvest_source_show.side_effect = ps

        datasets = self.get_datasets_from_2_collection()

        # with CKAN 2.8 we expect no new job for this source: the single job should end Finished

        context = {
            'model': model,
            'user': self.user['name'],
            'session': model.Session
        }

        # fake job status before final RUN command.
        self.job.status = u'Running'
        self.job.gather_finished = datetime.utcnow()
        self.job.save()

        # mark finished and do the after-job tasks (with CKAN 2.8 no new children job is expected)
        p.toolkit.get_action('harvest_jobs_run')(context, {
            'source_id': self.source.id
        })

        jobs = harvest_model.HarvestJob.filter(source=self.source).all()
        source_config = json.loads(self.source.config or '{}')

        assert_equal(len(jobs), 1)
        assert_equal(jobs[0].status, 'Finished')

        return datasets

    def test_datasets_count(self):
        """ test we harvest the right amount of datasets """

        datasets = self.get_datasets_from_2_collection()
        assert_equal(len(datasets), 6)

    def fix_extras(self, extras):
        """ fix extras rolled up at geodatagov """
        new_extras = {}
        for e in extras:
            k = e[0]
            v = e[1]
            if k == 'extras_rollup':
                extras_rollup_dict = json.loads(v)
                for rk, rv in extras_rollup_dict.items():
                    new_extras[rk] = rv
            else:
                new_extras[e[0]] = e[1]

        return new_extras

    def test_parent_child_counts(self):
        """ Test count for parent and children """

        datasets = self.get_datasets_from_2_collection()

        parent_counter = 0
        child_counter = 0

        for dataset in datasets:
            extras = self.fix_extras(dataset.extras.items())
            is_parent = extras.get('collection_metadata',
                                   'false').lower() == 'true'
            parent_package_id = extras.get('collection_package_id', None)
            is_child = parent_package_id is not None

            if is_parent:
                parent_counter += 1
            elif is_child:
                child_counter += 1

        assert_equal(parent_counter, 2)
        assert_equal(child_counter, 4)

    def test_raise_child_error_and_retry(self):
        """ if a harvest job for a child fails because 
            parent still not exists we need to ensure
            this job will be retried. 
            This test emulate the case we harvest children first
            (e.g. if we have several active queues).
            Just for CKAN 2.8 env"""

        # start harvest process with gather to create harvest objects
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        self.run_gather(url=url)
        assert_equal(len(self.harvest_objects), 3)

        # create a publisher to send these objects to the fetch queue
        publisher = queue.get_fetch_publisher()

        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            assert_equal(ho.state, 'WAITING')
            log.info('HO: {}\n\tCurrent: {}'.format(ho_data['identifier'],
                                                    ho.current))
            assert_equal(ho.retry_times, 0)
            publisher.send({'harvest_object_id': ho.id})
            log.info('Harvest object sent to the fetch queue {} as {}'.format(
                ho_data['identifier'], ho.id))

        publisher.close()

        # run fetch for elements in the wrong order (first a child, then a parent)

        class FakeMethod(object):
            ''' This is to act like the method returned by AMQP'''
            def __init__(self, message):
                self.delivery_tag = message

        # get the fetch
        consumer_fetch = queue.get_fetch_consumer()
        qname = queue.get_fetch_queue_name()

        # first a child, asserting that we get an error
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[1].id})
        r0 = FakeMethod(r2)
        with assert_raises(ParentNotHarvestedException):
            queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[1].retry_times, 1)
        assert_equal(self.harvest_objects[1].state, "ERROR")

        # run the parent later, like in a different queue
        r2 = json.dumps({"harvest_object_id": self.harvest_objects[0].id})
        r0 = FakeMethod(r2)
        queue.fetch_callback(consumer_fetch, r0, None, r2)
        assert_equal(self.harvest_objects[0].retry_times, 1)
        assert_equal(self.harvest_objects[0].state, "COMPLETE")

        # Check status on harvest objects
        # We expect one child with an error, the parent OK and the second child still waiting
        for ho in self.harvest_objects:
            ho = harvest_model.HarvestObject.get(ho.id)  # refresh
            ho_data = json.loads(ho.content)
            idf = ho_data['identifier']
            log.info(
                '\nHO2: {}\n\tState: {}\n\tCurrent: {}\n\tGathered {}'.format(
                    idf, ho.state, ho.current, ho.gathered))
            if idf == 'OPM-ERround-0001':
                assert_equal(ho.state, 'COMPLETE')
            elif idf == 'OPM-ERround-0001-AWOL':
                assert_equal(ho.state, 'ERROR')
                ho_awol_id = ho.id
            elif idf == 'OPM-ERround-0001-Retire':
                assert_equal(ho.state, 'WAITING')
                ho_retire_id = ho.id
            else:
                raise Exception('Unexpected identifier: "{}"'.format(idf))

        # resubmit jobs and objects as harvest_jobs_run does
        # we expect the errored harvest object is in this queue
        queue.resubmit_jobs()
        queue.resubmit_objects()

        # iterate over the fetch consumer queue again and check pending harvest objects
        harvest_objects = []
        while True:
            method, header, body = consumer_fetch.basic_get(queue=qname)
            if body is None:
                break

            body_data = json.loads(body)
            ho_id = body_data.get('harvest_object_id', None)
            log.info('Adding ho_id {}'.format(ho_id))
            if ho_id is not None:
                ho = harvest_model.HarvestObject.get(ho_id)
                if ho is not None:
                    harvest_objects.append(ho)
                    content = json.loads(ho.content)
                    log.info('Harvest object found {}: {} '.format(
                        content['identifier'], ho.state))
                else:
                    log.info('Harvest object not found {}'.format(ho_id))

        ho_ids = [ho.id for ho in harvest_objects]

        # Now, we expect the waiting child and the errored one to be in the fetch queue

        log.info('Searching waiting object "Retire ID"')
        assert_in(ho_retire_id, ho_ids)

        log.info('Searching errored object "Awol ID"')
        assert_in(ho_awol_id, ho_ids)

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_parent_not_harvested_exception(self, mock_get_action,
                                            mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function 
            Test for 2 parents with the same identifier. 
            Just one belongs to the right harvest source """

        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-99'  # raise error, not found
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('custom-identifier',
                                               harvest_object)

        assert mock_get_action.called

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_one_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function """

        results = {
            'count': 1,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-1'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('identifier',
                                                     harvest_object)
        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-1')

    @patch(
        'ckanext.datajson.harvester_datajson.DataJsonHarvester.get_harvest_source_id'
    )
    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_two_result(self, mock_get_action,
                                                 mock_get_harvest_source_id):
        """ unit test for is_part_of_to_package_id function 
            Test for 2 parents with the same identifier. 
            Just one belongs to the right harvest source """

        results = {
            'count': 2,
            'results': [{
                'id': 'pkg-1',
                'name': 'dataset-1',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }, {
                'id': 'pkg-2',
                'name': 'dataset-2',
                'extras': [{
                    'key': 'identifier',
                    'value': 'custom-identifier'
                }]
            }]
        }

        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: results
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action
        mock_get_harvest_source_id.side_effect = lambda package_id: 'hsi-{}'.format(
            package_id)

        harvest_source = Mock()
        harvest_source.id = 'hsi-pkg-2'
        harvest_object = Mock()
        harvest_object.source = harvest_source

        harvester = DataJsonHarvester()
        dataset = harvester.is_part_of_to_package_id('custom-identifier',
                                                     harvest_object)
        assert mock_get_action.called
        assert_equal(dataset['name'], 'dataset-2')
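The disambiguation rule these mocks exercise, condensed into a hedged pure-Python illustration (the function and parameter names here are assumptions, not the harvester's actual internals):

def pick_parent(results, harvest_source_id, get_harvest_source_id):
    # among datasets sharing an identifier, keep the one whose
    # harvest source matches the harvest object's source
    for pkg in results['results']:
        if get_harvest_source_id(pkg['id']) == harvest_source_id:
            return pkg
    raise ParentNotHarvestedException()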

    @patch('ckan.plugins.toolkit.get_action')
    def test_is_part_of_to_package_id_fail_no_results(self, mock_get_action):
        """ unit test for is_part_of_to_package_id function """
        def get_action(action_name):
            # CKAN 2.8 has the "mock_action" decorator but it is not available in CKAN 2.3
            if action_name == 'package_search':
                return lambda ctx, data: {'count': 0}
            elif action_name == 'get_site_user':
                return lambda ctx, data: {'name': 'default'}

        mock_get_action.side_effect = get_action

        harvester = DataJsonHarvester()
        with assert_raises(ParentNotHarvestedException):
            harvester.is_part_of_to_package_id('identifier', None)

    def test_datajson_is_part_of_package_id(self):
        url = 'http://127.0.0.1:%s/collection-1-parent-2-children.data.json' % self.mock_port
        obj_ids = self.run_gather(url=url)
        self.run_fetch()
        self.run_import()

        for obj_id in obj_ids:
            harvest_object = harvest_model.HarvestObject.get(obj_id)
            content = json.loads(harvest_object.content)
            # get the dataset with this identifier only if it is a parent in a collection
            if content['identifier'] == 'OPM-ERround-0001':
                dataset = self.harvester.is_part_of_to_package_id(
                    content['identifier'], harvest_object)
                assert_equal(dataset['title'],
                             'Employee Relations Roundtables')

            if content['identifier'] in [
                    'OPM-ERround-0001-AWOL', 'OPM-ERround-0001-Retire'
            ]:
                with assert_raises(ParentNotHarvestedException):
                    self.harvester.is_part_of_to_package_id(
                        content['identifier'], harvest_object)

        with assert_raises(ParentNotHarvestedException):
            self.harvester.is_part_of_to_package_id('bad identifier',
                                                    harvest_object)

    def test_datajson_non_federal(self):
        """ validate we get the coinfig we sent """
        url = 'http://127.0.0.1:%s/ny' % self.mock_port
        config = '{"validator_schema": "non-federal", "private_datasets": "False", "default_groups": "local"}'
        self.run_source(url, config)

        source_config = self.harvester.load_config(self.source)
        # includes default values (filters and defaults)
        expected_config = {
            'defaults': {},
            'filters': {},
            'validator_schema': 'non-federal',
            'default_groups': 'local',
            'private_datasets': 'False'
        }
        assert_equal(source_config, expected_config)
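A sketch of the merge behaviour the final assertion relies on, assuming load_config only injects empty 'defaults' and 'filters' keys when they are absent:

import json

raw = json.loads('{"validator_schema": "non-federal"}')  # illustrative
merged = dict({'defaults': {}, 'filters': {}}, **raw)
assert merged == {'defaults': {}, 'filters': {},
                  'validator_schema': 'non-federal'}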