def initialize(self):
     '''Generate a list of fake identifiers to harvest'''
     # Put your real implementation here.
     # It should iterate over a remote endpoint to list the identifiers
     # to harvest, and optionally store extra data alongside each one.
     # NOTE(review): assumes a module-level `faker` instance is in scope.
     for _ in range(faker.pyint()):
         self.add_item(faker.uuid4())  # Accept kwargs to store data
Example #2
0
    def test_html_description(self):
        '''An HTML dct:description should be reduced to its plain text'''
        graph = Graph()
        subject = BNode()

        # Build the minimal dataset graph with an HTML-wrapped description.
        triples = [
            (RDF.type, DCAT.Dataset),
            (DCT.identifier, Literal(faker.uuid4())),
            (DCT.title, Literal(faker.sentence())),
            (DCT.description, Literal('<div>a description</div>')),
        ]
        for predicate, obj in triples:
            graph.add((subject, predicate, obj))

        dataset = dataset_from_rdf(graph)
        dataset.validate()

        assert isinstance(dataset, Dataset)
        assert dataset.description == 'a description'
def test_minimal_ckan_response(rmock):
    '''CKAN Harvester should accept the minimum dataset payload'''
    CKAN_URL = 'https://harvest.me/'
    API_URL = '{}api/3/action/'.format(CKAN_URL)
    PACKAGE_LIST_URL = '{}package_list'.format(API_URL)
    PACKAGE_SHOW_URL = '{}package_show'.format(API_URL)

    name = faker.unique_string()
    # Minimal package_show payload. Renamed from `json` so the local does
    # not shadow the stdlib `json` module name.
    package_show_payload = {
        'success': True,
        'result': {
            'id': faker.uuid4(),
            'name': name,
            'title': faker.sentence(),
            'maintainer': faker.name(),
            'tags': [],
            'private': False,
            'maintainer_email': faker.email(),
            'license_id': None,
            'metadata_created': faker.iso8601(),
            'organization': None,
            'metadata_modified': faker.iso8601(),
            'author': None,
            'author_email': None,
            'notes': faker.paragraph(),
            'license_title': None,
            'state': None,
            'revision_id': faker.unique_string(),
            'type': 'dataset',
            'resources': [],
            # extras is not always present so we exclude it from the minimal payload
        }
    }
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    # Mock the two CKAN API endpoints the harvester hits.
    rmock.get(PACKAGE_LIST_URL,
              json={
                  'success': True,
                  'result': [name]
              },
              status_code=200,
              headers={'Content-Type': 'application/json'})
    rmock.get(PACKAGE_SHOW_URL,
              json=package_show_payload,
              status_code=200,
              headers={'Content-Type': 'application/json'})
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == 'done'
Example #4
0
    def test_all_fields(self):
        '''A fully populated DCAT graph should map onto every Dataset field'''
        uri = 'https://test.org/dataset'
        node = URIRef(uri)
        g = Graph()

        # Renamed from `id` so the local does not shadow the builtin.
        dataset_id = faker.uuid4()
        title = faker.sentence()
        acronym = faker.word()
        description = faker.paragraph()
        tags = faker.words(nb=3)
        start = faker.past_date(start_date='-30d')
        end = faker.future_date(end_date='+30d')
        g.set((node, RDF.type, DCAT.Dataset))
        g.set((node, DCT.identifier, Literal(dataset_id)))
        g.set((node, DCT.title, Literal(title)))
        g.set((node, SKOS.altLabel, Literal(acronym)))
        g.set((node, DCT.description, Literal(description)))
        g.set((node, DCT.accrualPeriodicity, FREQ.daily))
        # Temporal coverage is expressed as a blank PeriodOfTime node.
        pot = BNode()
        g.add((node, DCT.temporal, pot))
        g.set((pot, RDF.type, DCT.PeriodOfTime))
        g.set((pot, SCHEMA.startDate, Literal(start)))
        g.set((pot, SCHEMA.endDate, Literal(end)))
        for tag in tags:
            g.add((node, DCAT.keyword, Literal(tag)))

        dataset = dataset_from_rdf(g)
        dataset.validate()

        assert isinstance(dataset, Dataset)
        assert dataset.title == title
        assert dataset.acronym == acronym
        assert dataset.description == description
        assert dataset.frequency == 'daily'
        assert set(dataset.tags) == set(tags)
        assert isinstance(dataset.temporal_coverage, db.DateRange)
        assert dataset.temporal_coverage.start == start
        assert dataset.temporal_coverage.end == end

        extras = dataset.extras
        assert 'dct:identifier' in extras
        assert extras['dct:identifier'] == dataset_id
        assert 'uri' in extras
        assert extras['uri'] == uri
    def test_all_fields(self):
        '''A fully populated DCAT graph should map onto every Dataset field'''
        # NOTE(review): a method with this exact name also appears earlier in
        # the file; inside a single class the later definition would shadow
        # the former — confirm they belong to different test classes.
        uri = 'https://test.org/dataset'
        node = URIRef(uri)
        g = Graph()

        # Renamed from `id` so the local does not shadow the builtin.
        dataset_id = faker.uuid4()
        title = faker.sentence()
        acronym = faker.word()
        description = faker.paragraph()
        tags = faker.words(nb=3)
        start = faker.past_date(start_date='-30d')
        end = faker.future_date(end_date='+30d')
        g.set((node, RDF.type, DCAT.Dataset))
        g.set((node, DCT.identifier, Literal(dataset_id)))
        g.set((node, DCT.title, Literal(title)))
        g.set((node, SKOS.altLabel, Literal(acronym)))
        g.set((node, DCT.description, Literal(description)))
        g.set((node, DCT.accrualPeriodicity, FREQ.daily))
        # Temporal coverage is expressed as a blank PeriodOfTime node.
        pot = BNode()
        g.add((node, DCT.temporal, pot))
        g.set((pot, RDF.type, DCT.PeriodOfTime))
        g.set((pot, SCHEMA.startDate, Literal(start)))
        g.set((pot, SCHEMA.endDate, Literal(end)))
        for tag in tags:
            g.add((node, DCAT.keyword, Literal(tag)))

        dataset = dataset_from_rdf(g)
        dataset.validate()

        self.assertIsInstance(dataset, Dataset)
        self.assertEqual(dataset.title, title)
        self.assertEqual(dataset.acronym, acronym)
        self.assertEqual(dataset.description, description)
        self.assertEqual(dataset.frequency, 'daily')
        self.assertEqual(set(dataset.tags), set(tags))
        self.assertIsInstance(dataset.temporal_coverage, db.DateRange)
        self.assertEqual(dataset.temporal_coverage.start, start)
        self.assertEqual(dataset.temporal_coverage.end, end)

        extras = dataset.extras
        self.assertIn('dct:identifier', extras)
        self.assertEqual(extras['dct:identifier'], dataset_id)
        self.assertIn('uri', extras)
        self.assertEqual(extras['uri'], uri)