Example #1
0
    def test_obsolete(self):
        """A job recorded against an older harvester VERSION is skipped as obsolete.

        Two jobs are created for the same source config and start date, with the
        harvester class ``VERSION`` incremented between them. The newer job is
        harvested first and must succeed at the new version; the older job must
        end up ``skipped`` with the ``obsolete`` skip reason while keeping the
        version it was created with.
        """
        config = factories.SourceConfigFactory()
        start = pendulum.parse('2017-01-01').date()

        stale_job = factories.HarvestJobFactory(
            harvester_version=config.harvester.version,
            source_config=config,
            start_date=start,
        )

        # Bump the harvester's version in between creating the two jobs
        harvester_cls = config.harvester.get_class()
        old_version = harvester_cls.VERSION
        harvester_cls.VERSION += 1
        new_version = harvester_cls.VERSION

        fresh_job = factories.HarvestJobFactory(
            harvester_version=config.harvester.version,
            source_config=config,
            start_date=start,
        )

        # Harvest the newer job before the older one
        for job in (fresh_job, stale_job):
            tasks.harvest(job_id=job.id)

        for job in (stale_job, fresh_job):
            job.refresh_from_db()

        assert fresh_job.status == HarvestJob.STATUS.succeeded
        assert fresh_job.harvester_version == new_version

        assert stale_job.status == HarvestJob.STATUS.skipped
        assert stale_job.harvester_version == old_version
        assert stale_job.error_context == HarvestJob.SkipReasons.obsolete.value
Example #2
0
    def test_autoupdate(self, completions, status, new_version, updated):
        """Harvest a job whose harvester VERSION changed after the job was created.

        The job is created while the harvester class ``VERSION`` is 1, then the
        class is switched to ``new_version`` before harvesting. ``updated`` says
        whether the job is expected to have been moved to ``new_version``.

        NOTE(review): the arguments are presumably supplied by a parametrize
        decorator that is not visible in this snippet -- confirm.
        """
        config = factories.SourceConfigFactory()

        config.harvester.get_class().VERSION = 1

        job = factories.HarvestJobFactory(
            status=status,
            completions=completions,
            harvester_version=config.harvester.version,
            source_config=config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        # Change the version only after the job has recorded the old one
        config.harvester.get_class().VERSION = new_version

        tasks.harvest(job_id=job.id)
        job.refresh_from_db()

        if not updated:
            # Only a genuinely newer version is expected to mark the job obsolete
            if new_version > 1:
                assert job.status == HarvestJob.STATUS.skipped
                assert job.error_context == HarvestJob.SkipReasons.obsolete.value
        else:
            assert job.status == HarvestJob.STATUS.succeeded

        version_matches = job.harvester_version == new_version
        assert version_matches == updated
Example #3
0
    def test_caught_up(self):
        """The scheduler returns nothing when a job already starts at the cutoff.

        The source config (``full_harvest=True``, ``earliest_date`` 2017-01-01)
        has one job for 2017-01-01..2017-01-02 and one for
        2018-01-01..2018-01-02; ``HarvestScheduler.all`` with a cutoff of
        2018-01-01 is expected to be empty.
        """
        def day(iso):
            return pendulum.parse(iso).date()

        config = factories.SourceConfigFactory(
            full_harvest=True,
            earliest_date=day('2017-01-01'),
        )

        existing_ranges = (
            ('2017-01-01', '2017-01-02'),
            ('2018-01-01', '2018-01-02'),
        )
        for start, end in existing_ranges:
            factories.HarvestJobFactory(
                source_config=config,
                start_date=day(start),
                end_date=day(end),
            )

        scheduled = HarvestScheduler(config).all(cutoff=day('2018-01-01'))
        assert len(scheduled) == 0
Example #4
0
    def test_harvest_fails(self, source_config):
        """An error raised by ``_do_fetch`` propagates and the job is marked failed.

        ``_do_fetch`` on the harvester class is given a ``side_effect`` of
        ``ValueError('In a test')`` (presumably it is a mock -- confirm against
        the ``source_config`` fixture). The harvest task must re-raise that
        error, and the job must be ``failed`` with zero completions and the
        error text recorded in ``error_context``.
        """
        harvester_cls = source_config.harvester.get_class()
        harvester_cls._do_fetch.side_effect = ValueError('In a test')

        failing_job = factories.HarvestJobFactory(source_config=source_config)

        with pytest.raises(ValueError) as raised:
            tasks.harvest(job_id=failing_job.id)

        failing_job.refresh_from_db()

        assert raised.value.args == ('In a test', )
        assert failing_job.status == HarvestJob.STATUS.failed
        assert failing_job.completions == 0
        assert 'ValueError: In a test' in failing_job.error_context
Example #5
0
    def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
        """Harvest succeeds (raises nothing) for the given config/task combination.

        A source config is built from ``source_config_kwargs`` and its job is
        harvested with ``task_kwargs``. When ``lock_config`` is truthy, a
        ``SyncedThread`` running ``source_config.acquire_lock`` is started
        before harvesting and joined afterwards.

        NOTE(review): presumably the thread holds the config's lock for the
        duration of the harvest -- confirm against ``SyncedThread``.
        """
        config = factories.SourceConfigFactory(**source_config_kwargs)
        job = factories.HarvestJobFactory(source_config=config)

        locker = None
        if lock_config:
            locker = SyncedThread(config.acquire_lock)
            locker.start()

        try:
            tasks.harvest(job_id=job.id, **task_kwargs)
        finally:
            # Always join the helper thread, even if the harvest raised
            if locker is not None:
                locker.join()
Example #6
0
    def test_handles_duplicate_values(self, monkeypatch, source_config):
        """Repeated fetch results are stored only once.

        100 generated ``(sentence, str(i * 50))`` tuples are queued three times
        over on the harvester's ``_do_fetch``; after harvesting with
        ``ingest=False`` the job must have succeeded with exactly 100 raw data.
        """
        faker = Factory.create()
        job = factories.HarvestJobFactory(source_config=source_config)

        unique_rows = [(faker.sentence(), str(i * 50)) for i in range(100)]
        # Queue every row three times
        source_config.harvester.get_class()._do_fetch.extend(unique_rows * 3)

        tasks.harvest(job_id=job.id, ingest=False)
        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == 100
Example #7
0
    def test_failure_cases(self, source_config_kwargs, task_kwargs, lock_config, exception):
        """Harvest raises ``exception`` for the given config/task combination.

        A source config is built from ``source_config_kwargs`` and its job is
        harvested with ``task_kwargs``; the call must raise ``exception``. When
        ``lock_config`` is truthy, a ``SyncedThread`` running
        ``source_config.acquire_lock`` is started first and joined afterwards.

        NOTE(review): presumably the thread holds the config's lock for the
        duration of the harvest -- confirm against ``SyncedThread``.
        """
        config = factories.SourceConfigFactory(**source_config_kwargs)
        job = factories.HarvestJobFactory(source_config=config)

        locker = None
        if lock_config:
            locker = SyncedThread(config.acquire_lock)
            locker.start()

        try:
            with pytest.raises(exception):
                tasks.harvest(job_id=job.id, **task_kwargs)
        finally:
            # Always join the helper thread, whatever the harvest did
            if locker is not None:
                locker.join()
Example #8
0
    def test_latest_date(self):
        """Jobs belonging to another source config do not affect scheduling.

        The only existing job starts on this config's ``earliest_date`` but is
        created without passing this config, so ``HarvestScheduler.all`` with a
        cutoff of 2018-01-01 is still expected to return 365 entries.
        """
        def day(iso):
            return pendulum.parse(iso).date()

        config = factories.SourceConfigFactory(
            full_harvest=True,
            earliest_date=day('2017-01-01'),
        )

        # This job's start_date equals earliest_date,
        # but it is attached to a different source_config
        factories.HarvestJobFactory(
            start_date=day('2017-01-01'),
            end_date=day('2017-01-02'),
        )

        scheduled = HarvestScheduler(config).all(cutoff=day('2018-01-01'))
        assert len(scheduled) == 365
Example #9
0
    def test_job_values(self, source_config):
        """A successful harvest records the expected bookkeeping on the job.

        The task is run through ``tasks.harvest.apply`` with an explicit task id
        and ``throw=True``; afterwards the job must carry that task id, a
        ``succeeded`` status, an empty error context, one completion, and the
        current share, harvester and source-config versions.
        """
        expected_task_id = uuid.uuid4()
        job = factories.HarvestJobFactory(source_config=source_config)

        task_kwargs = {'job_id': job.id}
        tasks.harvest.apply((), task_kwargs, task_id=str(expected_task_id), throw=True)

        job.refresh_from_db()

        # Outcome of the run
        assert job.task_id == expected_task_id
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.error_context == ''
        assert job.completions == 1

        # Config and version bookkeeping
        assert job.source_config == source_config
        assert job.share_version == settings.VERSION
        assert job.harvester_version == source_config.get_harvester().VERSION
        assert job.source_config_version == source_config.version
Example #10
0
    def test_partial_harvest_fails(self, source_config):
        """Data fetched before a mid-harvest error is kept, but the job fails.

        The harvester's ``_do_fetch`` is replaced with a generator that yields
        three documents and then raises ``ValueError('In a test')``. The error
        must propagate, the job must be ``failed`` with zero completions, and
        three raw data plus three ``created`` ingest jobs must exist.
        """
        job = factories.HarvestJobFactory(source_config=source_config)

        def _do_fetch(*args, **kwargs):
            # Three good documents, then fail partway through the harvest
            yield from (
                ('doc1', b'doc1data'),
                ('doc2', b'doc2data'),
                ('doc3', b'doc3data'),
            )
            raise ValueError('In a test')

        harvester_cls = source_config.harvester.get_class()
        harvester_cls._do_fetch = _do_fetch

        with pytest.raises(ValueError) as raised:
            tasks.harvest(job_id=job.id)

        job.refresh_from_db()

        assert job.raw_data.count() == 3
        assert raised.value.args == ('In a test', )
        assert job.status == HarvestJob.STATUS.failed
        assert job.completions == 0
        assert 'ValueError: In a test' in job.error_context

        created_ingest_jobs = IngestJob.objects.filter(status=IngestJob.STATUS.created)
        assert created_ingest_jobs.count() == 3
Example #11
0
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
        """Check raw-datum and ingest-job counts after a harvest.

        ``count`` fetch results are queued on the harvester's ``_do_fetch``;
        ``rediscovered`` of them (a random sample) are stored through
        ``RawDatum.objects.store_chunk`` before the job is created. The job is
        then harvested with the given ``superfluous``, ``limit`` and ``ingest``
        options and the resulting counts are checked.
        """
        assert rediscovered <= count, 'Y tho'

        faker = Factory.create()
        fetch_queue = source_config.harvester.get_class()._do_fetch

        fetch_queue.extend([(faker.sentence(), str(i * 50)) for i in range(count)])

        # Store a random subset up front, before the harvest runs
        already_stored = random.sample(fetch_queue, rediscovered)
        list(RawDatum.objects.store_chunk(
            source_config,
            (FetchResult(*tup) for tup in already_stored),
        ))

        job = factories.HarvestJobFactory(source_config=source_config)
        tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)
        job.refresh_from_db()

        # The number of data the job should have touched: all of them unless limit caps it
        expected_raw = count if limit is None or count < limit else limit

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == expected_raw

        total_raw = RawDatum.objects.filter().count()
        if limit is not None and rediscovered:
            # With both a limit and pre-stored data, only bounds are asserted
            assert total_raw >= rediscovered
            assert total_raw <= rediscovered + max(0, min(limit, count - rediscovered))
        else:
            assert total_raw == expected_raw

        created_ingest_jobs = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
        if not ingest:
            assert created_ingest_jobs == 0
        elif superfluous:
            assert created_ingest_jobs == min(count, limit or 99999)
        elif limit is not None:
            assert created_ingest_jobs <= min(limit, count)
            assert created_ingest_jobs >= min(limit, count) - rediscovered
        else:
            assert created_ingest_jobs == count - rediscovered
Example #12
0
    def test_handles_duplicate_values_limit(self, monkeypatch, source_config):
        """Duplicates are collapsed when harvesting with a limit.

        The fetch queue is rebuilt as ten rounds of: one fresh row repeated five
        times, followed by the same 20 padding rows. Harvesting with
        ``limit=60`` and ``ingest=False`` must succeed and leave the job with
        30 raw data.
        """
        faker = Factory.create()
        job = factories.HarvestJobFactory(source_config=source_config)

        fetch_queue = source_config.harvester.get_class()._do_fetch
        fetch_queue.clear()

        def fake_row():
            sentence = faker.sentence()
            return (sentence, sentence * 5)

        padding = [fake_row() for _ in range(20)]

        for _ in range(10):
            # One new row five times over, then the shared padding again
            fetch_queue.extend([fake_row()] * 5)
            fetch_queue.extend(padding)

        tasks.harvest(job_id=job.id, limit=60, ingest=False)
        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == 30