Example #1
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            assert redis.dbsize() > num_keys

            queue.purge_queues()

            assert redis.get('ckanext-harvest:some-random-key') == 'foobar'
            assert redis.dbsize() == num_keys
            assert redis.llen(queue.get_gather_routing_key()) == 0
            assert redis.llen(queue.get_fetch_routing_key()) == 0
        finally:
            redis.delete('ckanext-harvest:some-random-key')
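The property being tested is that purge_queues() empties only the harvest routing keys while unrelated keys survive. A minimal illustrative sketch of a Redis-backed purge under that contract (not the actual ckanext.harvest.queue implementation), reusing only the connection and routing-key helpers that appear above:

from ckanext.harvest import queue


def purge_redis_queues_sketch():
    # Illustrative only: delete just the two harvest routing keys and leave
    # every other key (e.g. 'ckanext-harvest:some-random-key') untouched.
    redis = queue.get_connection()
    redis.delete(queue.get_gather_routing_key())
    redis.delete(queue.get_fetch_routing_key())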
Example #2
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True
        }
        self.admin_user = get_action('get_site_user')(context, {})

        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            self.create_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import get_gather_consumer
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            logging.getLogger('ckan.cli').info(
                'Now going to wait on the gather queue...')
            consumer.wait()
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer
            consumer = get_fetch_consumer()
            logging.getLogger('ckan.cli').info(
                'Now going to wait on the fetch queue...')
            consumer.wait()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'job-run':
            self.job_run()
        else:
            print 'Command %s not recognized' % cmd
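The long if/elif chain above can also be written as a dispatch table; a sketch of the idea, where cli stands in for the command object and the handler names are taken from the example (this is not how the extension itself implements it):

def dispatch(cli, cmd):
    # Map each CLI sub-command to the handler used in the example above.
    handlers = {
        'source': cli.create_harvest_source,
        'rmsource': cli.remove_harvest_source,
        'sources': cli.list_harvest_sources,
        'job': cli.create_harvest_job,
        'jobs': cli.list_harvest_jobs,
        'run': cli.run_harvester,
        'initdb': cli.initdb,
        'job-all': cli.create_harvest_job_all,
        'job-run': cli.job_run,
    }
    handler = handlers.get(cmd)
    if handler is None:
        print('Command %s not recognized' % cmd)
        return
    handler()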
Example #3
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {'model':model,'session':model.Session,'ignore_auth':True}
        self.admin_user = get_action('get_site_user')(context,{})


        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            self.create_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import get_gather_consumer, gather_callback
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(queue='ckan.harvest.gather'):
                gather_callback(consumer, method, header, body)
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer, fetch_callback
            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(queue='ckan.harvest.fetch'):
                fetch_callback(consumer, method, header, body)
        elif cmd == 'purge_queues':
            from ckanext.harvest.queue import purge_queues
            purge_queues()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'reindex':
            self.reindex()
        else:
            print 'Command %s not recognized' % cmd
Example #4
File: harvester.py Project: tbalaz/test
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {'model':model,'session':model.Session,'ignore_auth':True}
        self.admin_user = get_action('get_site_user')(context,{})


        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            self.create_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import get_gather_consumer
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            logging.getLogger('ckan.cli').info('Now going to wait on the gather queue...')
            consumer.wait()
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer
            consumer = get_fetch_consumer()
            logging.getLogger('ckan.cli').info('Now going to wait on the fetch queue...')
            consumer.wait()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'job-run':
            self.job_run()
        else:
            print 'Command %s not recognized' % cmd
Example #5
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {"model": model, "session": model.Session, "ignore_auth": True}
        self.admin_user = get_action("get_site_user")(context, {})

        print ""

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == "source":
            self.create_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == "sources":
            self.list_harvest_sources()
        elif cmd == "job":
            self.create_harvest_job()
        elif cmd == "jobs":
            self.list_harvest_jobs()
        elif cmd == "run":
            self.run_harvester()
        elif cmd == "gather_consumer":
            import logging
            from ckanext.harvest.queue import get_gather_consumer

            logging.getLogger("amqplib").setLevel(logging.INFO)
            consumer = get_gather_consumer()
            consumer.wait()
        elif cmd == "fetch_consumer":
            import logging

            logging.getLogger("amqplib").setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer

            consumer = get_fetch_consumer()
            consumer.wait()
        elif cmd == "initdb":
            self.initdb()
        elif cmd == "import":
            self.initdb()
            self.import_stage()
        elif cmd == "job-all":
            self.create_harvest_job_all()
        elif cmd == "harvesters-info":
            harvesters_info = get_action("harvesters_info_show")()
            pprint(harvesters_info)
        else:
            print "Command %s not recognized" % cmd
Example #6
def gather_consumer():
    import logging
    from ckanext.harvest.queue import (
        get_gather_consumer,
        gather_callback,
        get_gather_queue_name,
    )

    logging.getLogger("amqplib").setLevel(logging.INFO)
    consumer = get_gather_consumer()
    for method, header, body in consumer.consume(
            queue=get_gather_queue_name()):
        gather_callback(consumer, method, header, body)
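A fetch-side counterpart follows the same pattern; a minimal sketch, assuming the matching helpers in ckanext.harvest.queue that the other examples already use (get_fetch_consumer, fetch_callback, get_fetch_queue_name):

def fetch_consumer():
    import logging
    from ckanext.harvest.queue import (
        get_fetch_consumer,
        fetch_callback,
        get_fetch_queue_name,
    )

    logging.getLogger("amqplib").setLevel(logging.INFO)
    consumer = get_fetch_consumer()
    # Block on the fetch queue and hand each message to the fetch callback.
    for method, header, body in consumer.consume(queue=get_fetch_queue_name()):
        fetch_callback(consumer, method, header, body)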
Example #7
    def test_redis_corrupt(self, mock_log_error):
        '''
        Test that a corrupt message in Redis doesn't stop the harvest process and that other jobs are still processed.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key-2', 'foobar')

            # make sure queues/exchanges are created first and are empty
            gather_consumer = queue.get_gather_consumer()
            fetch_consumer = queue.get_fetch_consumer()
            gather_consumer.queue_purge(queue=queue.get_gather_queue_name())
            fetch_consumer.queue_purge(queue=queue.get_fetch_queue_name())

            # Create some fake jobs and objects with no harvest_job_id
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': None})
            h_obj_id = str(uuid.uuid4())
            fetch_publisher.send({'harvest_object_id': h_obj_id})

            # Create some fake objects
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            _, _, body = next(
                fetch_consumer.consume(queue.get_fetch_queue_name()))

            json_obj = json.loads(body)
            assert json_obj['harvest_object_id'] == h_obj_id

            assert mock_log_error.call_count == 1
            args, _ = mock_log_error.call_args_list[0]
            if six.PY2:
                assert "cannot concatenate 'str' and 'NoneType' objects" in args[
                    1]
            else:
                assert "must be str, not NoneType" in str(args[1])

        finally:
            redis.delete('ckanext-harvest:some-random-key-2')
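The mock_log_error argument suggests the test is decorated with mock.patch on the error logger used by the queue module; a hedged sketch of that setup using the Python 3 standard-library mock (the patch target and function name are assumptions, not taken from the source):

from unittest import mock


# Hypothetical patch target: a module-level error logger assumed to live in
# ckanext.harvest.queue. The decorator injects the mock as mock_log_error,
# which is how the test above receives that argument.
@mock.patch('ckanext.harvest.queue.log.error')
def test_redis_corrupt_sketch(mock_log_error):
    # A real test would push a bad message through the fetch consumer here
    # and then assert on mock_log_error.call_args_list, as above.
    assert mock_log_error.call_count == 0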
Example #8
    def test_redis_queue_purging(self):
        '''
        Test that Redis queue purging doesn't purge the wrong keys.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            raise SkipTest()
        redis = queue.get_connection()
        try:
            redis.set('ckanext-harvest:some-random-key', 'foobar')

            # Create some fake jobs
            gather_publisher = queue.get_gather_publisher()
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            gather_publisher.send({'harvest_job_id': str(uuid.uuid4())})
            fetch_publisher = queue.get_fetch_publisher()
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            fetch_publisher.send({'harvest_object_id': str(uuid.uuid4())})
            num_keys = redis.dbsize()

            # Create some fake objects
            gather_consumer = queue.get_gather_consumer()
            next(gather_consumer.consume(queue.get_gather_queue_name()))
            fetch_consumer = queue.get_fetch_consumer()
            next(fetch_consumer.consume(queue.get_fetch_queue_name()))

            ok_(redis.dbsize() > num_keys)

            queue.purge_queues()

            assert_equal(redis.get('ckanext-harvest:some-random-key'),
                         'foobar')
            assert_equal(redis.dbsize(), num_keys)
            assert_equal(redis.llen(queue.get_gather_routing_key()), 0)
            assert_equal(redis.llen(queue.get_fetch_routing_key()), 0)
        finally:
            redis.delete('ckanext-harvest:some-random-key')
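This is the older nose-style variant of Example #1; the helpers it relies on come from nose and are presumably imported at the top of the test module (not shown in the snippet):

# Imports assumed by the nose-style skip and assertions used above.
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal, ok_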
Example #9
    def setup_class(cls):

        h.reset_db()

        cls.gather_consumer = queue.get_gather_consumer()
        cls.fetch_consumer = queue.get_fetch_consumer()

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        # Minimal remote RDF file with pagination (1)
        # Use slashes for paginated URLs because HTTPretty won't distinguish
        # query strings
        cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf'
        cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems>
            <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage>
            <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage>
            <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        # Minimal remote RDF file with pagination (2)
        cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2'
        cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3">
              <dct:title>Example dataset 3</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4">
              <dct:title>Example dataset 4</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems>
            <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage>
            <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage>
            <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        # RDF with minimal distribution
        cls.rdf_content_with_distribution_uri = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
              <dcat:distribution>
                <dcat:Distribution rdf:about="https://data.some.org/catalog/datasets/1/resource/1">
                  <dct:title>Example resource 1</dct:title>
                  <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL>
                </dcat:Distribution>
              </dcat:distribution>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_content_with_distribution = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
              <dcat:distribution>
                <dcat:Distribution>
                  <dct:title>Example resource 1</dct:title>
                  <dcat:accessURL>http://data.some.org/download.zip</dcat:accessURL>
                </dcat:Distribution>
              </dcat:distribution>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog
        </rdf:RDF>
        '''

        #Minimal remote turtle file
        cls.ttl_mock_url = 'http://some.dcat.file.ttl'
        cls.ttl_content_type = 'text/turtle'
        cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" .
          '''
        cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
          '''
        cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "förskola", "Garduña" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "San Sebastián", "Ελλάδα" .
          '''
        cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "Utbildning, kontaktuppgifter" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "Trees, forest, shrub" .
          '''
        cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
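The fixtures defined above (the snippet is cut off by the listing) are typically served to the harvester with HTTPretty inside the individual tests; a minimal sketch of such a registration, assuming httpretty is installed (the helper name is illustrative):

import httpretty


def serve_mock_catalog(test_cls):
    # Meant to run inside an @httpretty.activate-decorated test: register the
    # minimal RDF catalogue so a request for rdf_mock_url returns rdf_content
    # with the declared content type.
    httpretty.register_uri(
        httpretty.GET,
        test_cls.rdf_mock_url,
        body=test_cls.rdf_content,
        content_type=test_cls.rdf_content_type,
    )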
Example #10
    def setup_class(cls):

        cls.gather_consumer = queue.get_gather_consumer()
        cls.fetch_consumer = queue.get_fetch_consumer()

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog
        </rdf:RDF>
        '''

        #Minimal remote turtle file
        cls.ttl_mock_url = 'http://some.dcat.file.ttl'
        cls.ttl_content_type = 'text/turtle'
        cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" .
          '''
        cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
          '''
        cls.ttl_remote_file_invalid =  '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
Example #11
    def setup_class(cls):

        h.reset_db()

        cls.gather_consumer = queue.get_gather_consumer()
        cls.fetch_consumer = queue.get_fetch_consumer()

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        # Minimal remote RDF file with pagination (1)
        # Use slashes for paginated URLs because HTTPretty won't distinguish
        # query strings
        cls.rdf_mock_url_pagination_1 = 'http://some.dcat.file.pagination.rdf'
        cls.rdf_content_pagination_1 = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems>
            <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage>
            <hydra:nextPage>http://some.dcat.file.pagination.rdf/page/2</hydra:nextPage>
            <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        # Minimal remote RDF file with pagination (2)
        cls.rdf_mock_url_pagination_2 = 'http://some.dcat.file.pagination.rdf/page/2'
        cls.rdf_content_pagination_2 = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
         xmlns:hydra="http://www.w3.org/ns/hydra/core#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/3">
              <dct:title>Example dataset 3</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/4">
              <dct:title>Example dataset 4</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        <hydra:PagedCollection rdf:about="http://some.dcat.file.pagination.rdf/page/1">
            <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">4</hydra:totalItems>
            <hydra:lastPage>http://some.dcat.file.pagination.rdf/page/2</hydra:lastPage>
            <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">2</hydra:itemsPerPage>
            <hydra:previousPage>http://some.dcat.file.pagination.rdf/page/1</hydra:previousPage>
            <hydra:firstPage>http://some.dcat.file.pagination.rdf/page/1</hydra:firstPage>
        </hydra:PagedCollection>
        </rdf:RDF>
        '''

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''

        cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog
        </rdf:RDF>
        '''

        #Minimal remote turtle file
        cls.ttl_mock_url = 'http://some.dcat.file.ttl'
        cls.ttl_content_type = 'text/turtle'
        cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" .
          '''
        cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
          '''
        cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "förskola", "Garduña" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "San Sebastián", "Ελλάδα" .
          '''
        cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "Utbildning, kontaktuppgifter" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "Trees, forest, shrub" .
          '''
        cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
Example #12
    def test_01_basic_harvester(self):

        ### make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())


        user = logic.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        source_dict = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': 'basic_test',
            'source_type': 'test',
        }

        harvest_source = logic.get_action('harvest_source_create')(
            context,
            source_dict
        )

        assert harvest_source['source_type'] == 'test', harvest_source
        assert harvest_source['url'] == 'basic_test', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        ## pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'


        assert len(model.Session.query(HarvestObject).all()) == 3
        assert len(model.Session.query(HarvestObjectExtra).all()) == 1

        ## do this three times, as there are three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
                .filter(model.Package.type=='dataset') \
                .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

        assert_equal(len(all_objects), 3)
        assert_equal(all_objects[0].state, 'COMPLETE')
        assert_equal(all_objects[0].report_status, 'added')
        assert_equal(all_objects[1].state, 'COMPLETE')
        assert_equal(all_objects[1].report_status, 'added')
        assert_equal(all_objects[2].state, 'COMPLETE')
        assert_equal(all_objects[2].report_status, 'added')

        ## fire run again to check if job is set to Finished
        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id':harvest_source['id']}
        )

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert_equal(harvest_job['status'], u'Finished')
        assert_equal(harvest_job['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0})

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0})
        assert_equal(harvest_source_dict['status']['total_datasets'], 3)
        assert_equal(harvest_source_dict['status']['job_count'], 1)


        ########### Second run ########################
        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']
        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        ## pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
                .filter(model.Package.type=='dataset') \
                .count()
        assert_equal(count, 3)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
        assert_equal(len(all_objects), 3)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
        assert_equal(len(all_objects), 2)

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
        assert_equal(len(all_objects), 1)

        # run to make sure job is marked as finished
        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id':harvest_source['id']}
        )

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert_equal(harvest_job['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1})

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1})
        assert_equal(harvest_source_dict['status']['total_datasets'], 2)
        assert_equal(harvest_source_dict['status']['job_count'], 2)
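The basic_get/fetch_callback pair is repeated three times per run above; the same loop as a small helper (a sketch, reusing only names that appear in the example):

from ckanext.harvest import queue


def drain_fetch_queue(consumer_fetch, times):
    # Pop `times` messages off the fetch queue and run the fetch callback on
    # each one, exactly as the test above does three times inline.
    for _ in range(times):
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)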
Example #13
    def setup_class(cls):

        cls.gather_consumer = queue.get_gather_consumer()
        cls.fetch_consumer = queue.get_fetch_consumer()

        # Minimal remote RDF file
        cls.rdf_mock_url = 'http://some.dcat.file.rdf'
        cls.rdf_content_type = 'application/rdf+xml'
        cls.rdf_content = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/2">
              <dct:title>Example dataset 2</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_remote_file_small = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dct="http://purl.org/dc/terms/"
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:xsd="http://www.w3.org/2001/XMLSchema#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog rdf:about="https://data.some.org/catalog">
          <dcat:dataset>
            <dcat:Dataset rdf:about="https://data.some.org/catalog/datasets/1">
              <dct:title>Example dataset 1</dct:title>
            </dcat:Dataset>
          </dcat:dataset>
        </dcat:Catalog>
        </rdf:RDF>
        '''
        cls.rdf_remote_file_invalid = '''<?xml version="1.0" encoding="utf-8" ?>
        <rdf:RDF
         xmlns:dcat="http://www.w3.org/ns/dcat#"
         xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
        <dcat:Catalog
        </rdf:RDF>
        '''

        #Minimal remote turtle file
        cls.ttl_mock_url = 'http://some.dcat.file.ttl'
        cls.ttl_content_type = 'text/turtle'
        cls.ttl_content = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" .
          '''
        cls.ttl_remote_file_small = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1>, <https://data.some.org/catalog/datasets/2> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" .
          '''
        cls.ttl_unicode_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "förskola", "Garduña" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "San Sebastián", "Ελλάδα" .
          '''
        cls.ttl_commas_in_keywords = u'''@prefix dcat: <http://www.w3.org/ns/dcat#> .
        @prefix dc: <http://purl.org/dc/terms/> .
        <https://data.some.org/catalog>
          a dcat:Catalog ;
          dcat:dataset <https://data.some.org/catalog/datasets/1> .
        <https://data.some.org/catalog/datasets/1>
          a dcat:Dataset ;
          dc:title "Example dataset 1" ;
          dcat:keyword "Utbildning, kontaktuppgifter" .
        <https://data.some.org/catalog/datasets/2>
          a dcat:Dataset ;
          dc:title "Example dataset 2" ;
          dcat:keyword "Trees, forest, shrub" .
          '''
        cls.ttl_remote_file_invalid = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
Example #14
    def setup_class(cls):
        h.reset_db()

        cls.gather_consumer = queue.get_gather_consumer()
        cls.fetch_consumer = queue.get_fetch_consumer()
Example #15
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {"model": model, "session": model.Session, "ignore_auth": True}
        self.admin_user = get_action("get_site_user")(context, {})

        print ""

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == "source":
            if len(self.args) > 2:
                self.create_harvest_source()
            else:
                self.show_harvest_source()
        elif cmd == "rmsource":
            self.remove_harvest_source()
        elif cmd == "clearsource":
            self.clear_harvest_source()
        elif cmd == "clearsource_history":
            self.clear_harvest_source_history()
        elif cmd == "sources":
            self.list_harvest_sources()
        elif cmd == "job":
            self.create_harvest_job()
        elif cmd == "jobs":
            self.list_harvest_jobs()
        elif cmd == "job_abort":
            self.job_abort()
        elif cmd == "run":
            self.run_harvester()
        elif cmd == "run_test":
            self.run_test_harvest()
        elif cmd == "gather_consumer":
            import logging
            from ckanext.harvest.queue import get_gather_consumer, gather_callback, get_gather_queue_name

            logging.getLogger("amqplib").setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(queue=get_gather_queue_name()):
                gather_callback(consumer, method, header, body)
        elif cmd == "fetch_consumer":
            import logging

            logging.getLogger("amqplib").setLevel(logging.INFO)
            from ckanext.harvest.queue import get_fetch_consumer, fetch_callback, get_fetch_queue_name

            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(queue=get_fetch_queue_name()):
                fetch_callback(consumer, method, header, body)
        elif cmd == "purge_queues":
            self.purge_queues()
        elif cmd == "initdb":
            self.initdb()
        elif cmd == "import":
            self.initdb()
            self.import_stage()
        elif cmd == "job-all":
            self.create_harvest_job_all()
        elif cmd == "harvesters-info":
            harvesters_info = get_action("harvesters_info_show")()
            pprint(harvesters_info)
        elif cmd == "reindex":
            self.reindex()
        elif cmd == "clean_harvest_log":
            self.clean_harvest_log()
        else:
            print "Command %s not recognized" % cmd
Example #16
    def command(self):
        self._load_config()

        # We'll need a sysadmin user to perform most of the actions
        # We will use the sysadmin site user (named as the site_id)
        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True
        }
        self.admin_user = get_action('get_site_user')(context, {})

        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)
        cmd = self.args[0]
        if cmd == 'source':
            if len(self.args) > 2:
                self.create_harvest_source()
            else:
                self.show_harvest_source()
        elif cmd == 'rmsource':
            self.remove_harvest_source()
        elif cmd == 'clearsource':
            self.clear_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'job_abort':
            self.job_abort()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'run_test':
            self.run_test_harvest()
        elif cmd == 'gather_consumer':
            import logging
            from ckanext.harvest.queue import (get_gather_consumer,
                                               gather_callback,
                                               get_gather_queue_name)
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            for method, header, body in consumer.consume(
                    queue=get_gather_queue_name()):
                gather_callback(consumer, method, header, body)
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            from ckanext.harvest.queue import (get_fetch_consumer,
                                               fetch_callback,
                                               get_fetch_queue_name)
            consumer = get_fetch_consumer()
            for method, header, body in consumer.consume(
                    queue=get_fetch_queue_name()):
                fetch_callback(consumer, method, header, body)
        elif cmd == 'purge_queues':
            from ckanext.harvest.queue import purge_queues
            purge_queues()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        elif cmd == 'harvesters-info':
            harvesters_info = get_action('harvesters_info_show')()
            pprint(harvesters_info)
        elif cmd == 'reindex':
            self.reindex()
        elif cmd == 'clean_harvest_log':
            self.clean_harvest_log()
        else:
            print 'Command %s not recognized' % cmd
Example #17
    def test_resubmit_objects(self):
        '''
        Test that only harvest objects that were not present in the Redis fetch queue are re-submitted.
        '''
        if config.get('ckan.harvest.mq.type') != 'redis':
            pytest.skip()
        # make sure that there are no old elements in the redis db
        redis = queue.get_connection()
        fetch_routing_key = queue.get_fetch_routing_key()
        redis.flushdb()
        try:
            # make sure queues/exchanges are created first and are empty
            consumer = queue.get_gather_consumer()
            consumer_fetch = queue.get_fetch_consumer()
            consumer.queue_purge(queue=queue.get_gather_queue_name())
            consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

            user = toolkit.get_action('get_site_user')(
                {'model': model, 'ignore_auth': True}, {}
            )['name']

            context = {'model': model, 'session': model.Session,
                       'user': user, 'api_version': 3, 'ignore_auth': True}

            harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

            assert redis.llen(fetch_routing_key) == 3

            # do this only once, for the first harvest object
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

            count = model.Session.query(model.Package) \
                .filter(model.Package.type == 'dataset') \
                .count()
            assert count == 1

            all_objects = model.Session.query(HarvestObject).order_by(HarvestObject.state.asc()).all()
            assert len(all_objects) == 3
            assert all_objects[0].state == 'COMPLETE'
            assert all_objects[0].report_status == 'added'
            assert all_objects[0].current is True
            assert all_objects[1].state == 'WAITING'
            assert all_objects[1].current is False
            assert all_objects[2].state == 'WAITING'
            assert all_objects[2].current is False

            assert len(redis.keys(fetch_routing_key + ':*')) == 0
            assert redis.llen(fetch_routing_key) == 2

            # Remove one object from Redis; it should be re-sent to the fetch queue by resubmit_objects
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert len(fetch_queue_items) == 1
            harvest_object_id = reply[2]
            assert fetch_queue_items[0] != harvest_object_id

            queue.resubmit_objects()

            assert redis.llen(fetch_routing_key) == 2
            fetch_queue_items = redis.lrange(fetch_routing_key, 0, 10)
            assert harvest_object_id in fetch_queue_items
            assert redis.dbsize() == 1
        finally:
            redis.flushdb()
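The assertions above hinge on queue.resubmit_objects() re-sending any harvest object that was popped off the Redis fetch list but never finished. A hedged sketch of that idea follows, assuming consumed-but-unfinished objects leave behind marker keys of the form '<fetch_routing_key>:<object_id>' and that queued items are JSON payloads containing the object id (both assumptions are for illustration only; Python 2-style str keys are assumed, matching the surrounding code):

# Conceptual sketch only, not the actual queue.resubmit_objects() implementation.
import json

from ckanext.harvest import queue

def resubmit_lost_objects_sketch():
    redis = queue.get_connection()
    routing_key = queue.get_fetch_routing_key()
    still_queued = redis.lrange(routing_key, 0, -1)
    # assumed marker keys: '<fetch_routing_key>:<object_id>' for consumed objects
    for key in redis.keys(routing_key + ':*'):
        object_id = key[len(routing_key) + 1:]
        if not any(object_id in item for item in still_queued):
            # the object left the list but was never acknowledged: re-send it
            redis.rpush(routing_key, json.dumps({'harvest_object_id': object_id}))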
Example #18
    def test_01_basic_harvester(self):

        if config.get('ckan.harvest.mq.type') == 'redis':
            # make sure that there are no old elements in the redis db
            redis = queue.get_connection()
            redis.flushdb()

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        harvest_source, job_id = self._create_harvest_job_and_finish_gather_stage(consumer, context)

        # fetch three times, once for each of the three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(current=True).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[1].state == 'COMPLETE'
        assert all_objects[1].report_status == 'added'
        assert all_objects[2].state == 'COMPLETE'
        assert all_objects[2].report_status == 'added'

        # trigger harvest_jobs_run again to check that the job is set to Finished
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 3, 'updated': 0, 'not modified': 0, 'errored': 0, 'deleted': 0}
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # Second run
        harvest_job = toolkit.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id'], 'run': True}
        )

        job_id = harvest_job['id']
        assert toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='added').all()
        assert len(all_objects) == 3

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='updated').all()
        assert len(all_objects) == 2

        all_objects = model.Session.query(HarvestObject).filter_by(report_status='deleted').all()
        assert len(all_objects) == 1

        # run again to make sure the job is marked as Finished
        toolkit.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        harvest_job = toolkit.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert harvest_job['stats'] == {'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}

        harvest_source_dict = toolkit.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 0, 'updated': 2, 'not modified': 0, 'errored': 0, 'deleted': 1}
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2
Example #19
    def test_01_basic_harvester(self):

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }

        source_dict = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': 'basic_test',
            'source_type': 'test',
        }

        harvest_source = logic.get_action('harvest_source_create')(context,
                                                                   source_dict)

        assert harvest_source['source_type'] == 'test', harvest_source
        assert harvest_source['url'] == 'basic_test', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'

        assert len(model.Session.query(HarvestObject).all()) == 3
        assert len(model.Session.query(HarvestObjectExtra).all()) == 1

        # fetch three times, once for each of the three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3
        all_objects = model.Session.query(HarvestObject).filter_by(
            current=True).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'COMPLETE'
        assert all_objects[0].report_status == 'added'
        assert all_objects[1].state == 'COMPLETE'
        assert all_objects[1].report_status == 'added'
        assert all_objects[2].state == 'COMPLETE'
        assert all_objects[2].report_status == 'added'

        # trigger harvest_jobs_run again to check that the job is set to Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {
            'added': 3,
            'updated': 0,
            'not modified': 0,
            'errored': 0,
            'deleted': 0
        }

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 3,
            'updated': 0,
            'not modified': 0,
            'errored': 0,
            'deleted': 0
        }
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # Second run
        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']
        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='added').all()
        assert len(all_objects) == 3

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='updated').all()
        assert len(all_objects) == 2

        all_objects = model.Session.query(HarvestObject).filter_by(
            report_status='deleted').all()
        assert len(all_objects) == 1

        # run again to make sure the job is marked as Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })
        assert harvest_job['stats'] == {
            'added': 0,
            'updated': 2,
            'not modified': 0,
            'errored': 0,
            'deleted': 1
        }

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert harvest_source_dict['status']['last_job']['stats'] == {
            'added': 0,
            'updated': 2,
            'not modified': 0,
            'errored': 0,
            'deleted': 1
        }
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2
Example #20
    def test_fetch_doesnt_process_remaining_objects_if_job_status_finished(
            self):

        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_gather_consumer()
        consumer_fetch = queue.get_fetch_consumer()
        consumer.queue_purge(queue=queue.get_gather_queue_name())
        consumer_fetch.queue_purge(queue=queue.get_fetch_queue_name())

        user = logic.get_action('get_site_user')({
            'model': model,
            'ignore_auth': True
        }, {})['name']

        context = {
            'model': model,
            'session': model.Session,
            'user': user,
            'api_version': 3,
            'ignore_auth': True
        }

        source_dict = {
            'title': 'Test Job Finished',
            'name': 'test-job-finished',
            'url': 'basic_test_1',
            'source_type': 'test-nose',
        }

        harvest_source = logic.get_action('harvest_source_create')(context,
                                                                   source_dict)

        assert harvest_source['source_type'] == 'test-nose', harvest_source
        assert harvest_source['url'] == 'basic_test_1', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context, {
                'source_id': harvest_source['id'],
                'run': True
            })

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job

        assert harvest_job['status'] == u'Running'

        assert logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })['status'] == u'Running'

        # pop one item off the queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')

        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'WAITING'
        assert all_objects[1].state == 'WAITING'
        assert all_objects[2].state == 'WAITING'

        # artificially set the job to Finished to simulate a job abort or timeout
        job_obj = HarvestJob.get(harvest_job['id'])
        job_obj.status = 'Finished'
        job_obj.save()

        original_dataset_count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()

        # fetch three times, once for each of the three harvest objects
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)
        reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
        queue.fetch_callback(consumer_fetch, *reply)

        all_objects = model.Session.query(HarvestObject).filter(
            HarvestObject.harvest_job_id == harvest_job['id']).all()

        assert len(all_objects) == 3
        assert all_objects[0].state == 'ERROR'
        assert all_objects[1].state == 'ERROR'
        assert all_objects[2].state == 'ERROR'

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == original_dataset_count

        # trigger harvest_jobs_run again to check that the job is set to Finished
        logic.get_action('harvest_jobs_run')(context, {
            'source_id': harvest_source['id']
        })

        harvest_job = logic.get_action('harvest_job_show')(context, {
            'id': job_id
        })

        assert_equal(harvest_job['status'], u'Finished')
        assert_equal(
            harvest_job['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context, {
                'id': harvest_source['id']
            })

        assert_equal(
            harvest_source_dict['status']['last_job']['stats'], {
                'added': 0,
                'updated': 0,
                'not modified': 0,
                'errored': 3,
                'deleted': 0
            })
        assert_equal(harvest_source_dict['status']['total_datasets'], 0)
        assert_equal(harvest_source_dict['status']['job_count'], 1)
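What this test pins down is a guard in the fetch stage: once a job is already 'Finished' (for example after an abort or timeout), objects still sitting on the fetch queue must not be imported and end up in the ERROR state instead. A rough, hypothetical sketch of such a guard follows; it is not the actual body of queue.fetch_callback, which does considerably more:

# Hypothetical guard illustrating the behaviour asserted above.
def skip_if_job_finished(harvest_object):
    job = HarvestJob.get(harvest_object.harvest_job_id)
    if job is not None and job.status == 'Finished':
        # the job was aborted or timed out: do not import this object
        harvest_object.state = 'ERROR'
        harvest_object.save()
        return True
    return False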