Example #1
    def test_store_data_dedups_simple(self, source_config):
        rd1 = RawDatum.objects.store_data(source_config,
                                          FetchResult('unique', 'mydatums'))
        rd2 = RawDatum.objects.store_data(source_config,
                                          FetchResult('unique', 'mydatums'))

        assert rd1.pk == rd2.pk
        assert rd1.created is True
        assert rd2.created is False
        assert rd1.date_created == rd2.date_created
        assert rd1.date_modified < rd2.date_modified
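These snippets are pytest methods from SHARE's test suite and omit their module-level imports; `source_config` is a fixture provided by the suite. A minimal import header they appear to assume, as a sketch: the FetchResult path is confirmed by the inline import in Example #5, while the RawDatum path is an assumption.

import hashlib

from share.harvest.base import FetchResult  # path confirmed by Example #5
from share.models import RawDatum  # assumed import path for the model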
Example #2
    def test_store_data_dedups_complex(self, source_config):
        data = '{"providerUpdatedDateTime":"2016-08-25T11:37:40Z","uris":{"canonicalUri":"https://provider.domain/files/7d2792031","providerUris":["https://provider.domain/files/7d2792031"]},"contributors":[{"name":"Person1","email":"*****@*****.**"},{"name":"Person2","email":"*****@*****.**"},{"name":"Person3","email":"*****@*****.**"},{"name":"Person4","email":"*****@*****.**"}],"title":"ReducingMorbiditiesinNeonatesUndergoingMRIScannig"}'
        rd1 = RawDatum.objects.store_data(source_config,
                                          FetchResult('unique', data))
        rd2 = RawDatum.objects.store_data(source_config,
                                          FetchResult('unique', data))

        assert rd1.pk == rd2.pk
        assert rd1.created is True
        assert rd2.created is False
        assert rd1.date_modified < rd2.date_modified
        assert rd1.date_created == rd2.date_created
Example #3
    def post(self, request, *args, **kwargs):

        try:
            jsonschema.validate(request.data, schemas.v1_push_schema)
        except jsonschema.exceptions.ValidationError as error:
            raise ParseError(detail=error.message)

        try:
            prelim_data = request.data['jsonData']
        # Indexing already-parsed data raises KeyError, not ParseError
        except KeyError:
            return Response(
                'Invalid JSON - the "jsonData" field is required',
                status=status.HTTP_400_BAD_REQUEST
            )

        # store raw data, assuming you can only submit one at a time
        with transaction.atomic():
            try:
                doc_id = prelim_data['uris']['canonicalUri']
            except KeyError:
                return Response(
                    {'errors': 'Canonical URI not found in uris.', 'data': prelim_data},
                    status=status.HTTP_400_BAD_REQUEST,
                )

            config = self._get_source_config(request.user)
            raw = RawDatum.objects.store_data(config, FetchResult(
                doc_id,
                DictSerializer(pretty=False).serialize(prelim_data),
                timezone.now(),
            ))

        transformed_data = config.get_transformer().transform(raw.datum)
        data = {'data': transformed_data}
        serializer = BasicNormalizedDataSerializer(data=data, context={'request': request})

        if serializer.is_valid():
            nm_instance = serializer.save()
            async_result = disambiguate.delay(nm_instance.id)
            return Response({'task_id': async_result.id}, status=status.HTTP_202_ACCEPTED)
        return Response({'errors': serializer.errors, 'data': prelim_data}, status=status.HTTP_400_BAD_REQUEST)
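For reference, a hypothetical request body for this view, shaped after the serialized datum in Example #2 and the view's own key lookups ('jsonData', uris.canonicalUri). The actual v1_push_schema is not shown here, so treat every field as an assumption.

# Hypothetical payload; field names follow Example #2 and the view's
# lookups. The real v1_push_schema may require more or fewer fields.
payload = {
    'jsonData': {
        'providerUpdatedDateTime': '2016-08-25T11:37:40Z',
        'uris': {
            'canonicalUri': 'https://provider.domain/files/7d2792031',
            'providerUris': ['https://provider.domain/files/7d2792031'],
        },
        'contributors': [{'name': 'Person1', 'email': 'person1@example.com'}],
        'title': 'Example title',
    },
}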
Example #4
def test_swbiodiversity_transformer():
    config = SourceConfig.objects.get(label='org.swbiodiversity')
    transformer = config.get_transformer()
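    # NOTE: `data` is a module-level fixture in the original test file,
    # presumably the fetched collection-profile page body (not shown here)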
    fetch_result = FetchResult(
        'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187',
        data)
    raw_datum = RawDatum.objects.store_data(config, fetch_result)
    result = transformer.transform(raw_datum)
    assert result['@graph'][3]['@type'] == 'dataset'
    assert result['@graph'][3]['description'] == 'Sample description'
    assert result['@graph'][3]['title'] == 'A. Michael Powell Herbarium (SRSC)'
    assert result['@graph'][3]['extra']['usage_rights'] == 'CC BY-NC (Attribution-Non-Commercial)'
    assert result['@graph'][3]['extra']['access_rights'] == 'Sul Ross University'
    assert result['@graph'][3]['extra']['collection_statistics'] == {
        "(25%) georeferenced": "1,195",
        "(59%) identified to species": "2,849",
        "(61%) with images": "2,954",
        "families": "104",
        "genera": "361",
        "species": "661",
        "specimen records": "4,868",
        "total taxa (including subsp. and var.)": "762"
    }
    assert result['@graph'][4]['uri'] == 'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187'
Example #5
    def setup_ingest(self, claim_job):
        assert self.datum and self._config and not (self.raw or self.job
                                                    or self.async_task)

        # TODO get rid of FetchResult, or make it more sensical
        from share.harvest.base import FetchResult
        fetch_result = FetchResult(self.datum_id, self.datum, self.datestamp)
        self.raw = RawDatum.objects.store_data(self._config, fetch_result)
        self.job = IngestJob.schedule(self.raw, claim=claim_job)
        return self
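Taken together, these examples construct FetchResult either as (identifier, datum) or as (identifier, datum, datestamp). A sketch of both call shapes, assuming the datestamp defaults to None when omitted:

from django.utils import timezone

from share.harvest.base import FetchResult

# Two-argument form, as in the store_data tests
result = FetchResult('unique', 'mydatums')

# Three-argument form, as in Example #3 and setup_ingest above;
# the datestamp is presumably optional given the two-argument calls
result = FetchResult('unique', 'mydatums', timezone.now())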
Example #6
    def test_rawdata(self, source_config):
        work = factories.AbstractCreativeWorkFactory(
            change__change_set__normalized_data__raw=models.RawDatum.objects.store_data(
                source_config, FetchResult('unique', 'data')))
        work.change.change_set.normalized_data.delete()

        assert models.Change.objects.count() == 0
        assert models.ChangeSet.objects.count() == 0
        assert models.NormalizedData.objects.count() == 0
        assert models.AbstractCreativeWork.objects.count() == 0
Example #7
    def test_store_data(self, source_config):
        rd = RawDatum.objects.store_data(source_config,
                                         FetchResult('unique', 'mydatums'))

        assert rd.date_modified is not None
        assert rd.date_created is not None

        assert rd.datum == 'mydatums'
        assert rd.suid.identifier == 'unique'
        assert rd.suid.source_config == source_config
        assert rd.sha256 == hashlib.sha256(b'mydatums').hexdigest()
Example #8
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered,
                       superfluous, limit, ingest, django_assert_num_queries):
        assert rediscovered <= count, 'Y tho'

        fake = Factory.create()
        mock_ingest_task = mock.Mock()

        monkeypatch.setattr('share.tasks.transform', mock_ingest_task)
        source_config.harvester.get_class()._do_fetch.extend([
            (fake.sentence(), str(i * 50)) for i in range(count)
        ])
        list(
            RawDatum.objects.store_chunk(
                source_config, (FetchResult(*tup) for tup in random.sample(
                    source_config.harvester.get_class()._do_fetch,
                    rediscovered))))

        log = factories.HarvestLogFactory(source_config=source_config)

        tasks.harvest(log_id=log.id,
                      superfluous=superfluous,
                      limit=limit,
                      ingest=ingest)

        log.refresh_from_db()

        assert log.completions == 1
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.raw_data.count() == (count if limit is None or count < limit
                                        else limit)

        if limit is not None and rediscovered:
            assert RawDatum.objects.filter().count() >= rediscovered
            assert RawDatum.objects.filter().count() <= rediscovered + max(
                0, min(limit, count - rediscovered))
        else:
            assert RawDatum.objects.filter().count() == (
                count if limit is None or count < limit else limit)

        if ingest:
            if superfluous:
                assert mock_ingest_task.apply_async.call_count == min(
                    count, limit or 99999)
            elif limit is not None:
                assert mock_ingest_task.apply_async.call_count <= min(
                    limit, count)
                assert mock_ingest_task.apply_async.call_count >= min(
                    limit, count) - rediscovered
            else:
                assert mock_ingest_task.apply_async.call_count == count - rediscovered
        else:
            assert mock_ingest_task.apply_async.call_count == 0
Example #9
def test_swbiodiversity_transformer():
    config = SourceConfig.objects.get(label='org.swbiodiversity')
    transformer = config.get_transformer()
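    # NOTE: as in Example #4, `data` is a module-level fixture in the
    # original test file (not shown here)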
    fetch_result = FetchResult(
        'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187',
        data)
    raw_datum = RawDatum.objects.store_data(config, fetch_result)

    graph = transformer.transform(raw_datum)

    dataset = graph.filter_nodes(lambda n: n.type == 'dataset')[0]

    assert dataset.type == 'dataset'
    assert dataset['description'] == 'Sample description'
    assert dataset['title'] == 'A. Michael Powell Herbarium (SRSC)'
    assert dataset['extra']['usage_rights'] == 'CC BY-NC (Attribution-Non-Commercial)'
    assert dataset['extra']['access_rights'] == 'Sul Ross University'
    assert dataset['extra']['collection_statistics'] == {
        "(25%) georeferenced": "1,195",
        "(59%) identified to species": "2,849",
        "(61%) with images": "2,954",
        "families": "104",
        "genera": "361",
        "species": "661",
        "specimen records": "4,868",
        "total taxa (including subsp. and var.)": "762"
    }

    agent_relations = dataset['agent_relations']
    assert len(agent_relations) == 1
    agent = agent_relations[0]['agent']
    assert agent['given_name'] == 'Test'
    assert agent['identifiers'][0]['uri'] == 'mailto:[email protected]'

    identifiers = dataset['identifiers']
    assert len(identifiers) == 1
    assert identifiers[0]['uri'] == 'http://swbiodiversity.org/seinet/collections/misc/collprofiles.php?collid=187'
Example #10
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
        assert rediscovered <= count, 'Y tho'

        fake = Factory.create()

        source_config.harvester.get_class()._do_fetch.extend([(fake.sentence(), str(i * 50)) for i in range(count)])
        list(RawDatum.objects.store_chunk(source_config, (
            FetchResult(*tup) for tup in
            random.sample(source_config.harvester.get_class()._do_fetch, rediscovered))
        ))

        job = factories.HarvestJobFactory(source_config=source_config)

        tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)

        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == (count if limit is None or count < limit else limit)

        if limit is not None and rediscovered:
            assert RawDatum.objects.filter().count() >= rediscovered
            assert RawDatum.objects.filter().count() <= rediscovered + max(0, min(limit, count - rediscovered))
        else:
            assert RawDatum.objects.filter().count() == (count if limit is None or count < limit else limit)

        ingest_count = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
        if ingest:
            if superfluous:
                assert ingest_count == min(count, limit or 99999)
            elif limit is not None:
                assert ingest_count <= min(limit, count)
                assert ingest_count >= min(limit, count) - rediscovered
            else:
                assert ingest_count == count - rediscovered
        else:
            assert ingest_count == 0