def test_obsolete(self):
    """Check that a job recorded under an older harvester version is skipped.

    Two jobs with the same start date are created for one source config,
    one before and one after the harvester class ``VERSION`` is incremented.
    The newer job is harvested first; the older job must then be marked
    skipped with the "obsolete" reason and keep its original version.
    """
    source_config = factories.SourceConfigFactory()
    first_day = pendulum.parse('2017-01-01').date()

    stale_job = factories.HarvestJobFactory(
        harvester_version=source_config.harvester.version,
        source_config=source_config,
        start_date=first_day,
    )

    # Bump the version on the harvester class itself so that the second job
    # is created (and harvested) against a newer version than the first.
    harvester_cls = source_config.harvester.get_class()
    old_version = harvester_cls.VERSION
    harvester_cls.VERSION += 1
    new_version = harvester_cls.VERSION

    current_job = factories.HarvestJobFactory(
        harvester_version=source_config.harvester.version,
        source_config=source_config,
        start_date=first_day,
    )

    # Order matters: the newer job runs before the older one.
    tasks.harvest(job_id=current_job.id)
    tasks.harvest(job_id=stale_job.id)

    stale_job.refresh_from_db()
    current_job.refresh_from_db()

    assert current_job.status == HarvestJob.STATUS.succeeded
    assert current_job.harvester_version == new_version

    assert stale_job.status == HarvestJob.STATUS.skipped
    assert stale_job.harvester_version == old_version
    assert stale_job.error_context == HarvestJob.SkipReasons.obsolete.value
def test_autoupdate(self, completions, status, new_version, updated):
    """Check whether a harvested job ends up on a changed harvester version.

    The job is created with the given ``status`` and ``completions`` while
    the harvester class reports ``VERSION == 1``. The class is then switched
    to ``new_version`` before the harvest runs. ``updated`` states whether
    the job is expected to carry ``new_version`` afterwards.
    """
    source_config = factories.SourceConfigFactory()
    harvester_cls = source_config.harvester.get_class()

    harvester_cls.VERSION = 1
    job = factories.HarvestJobFactory(
        status=status,
        completions=completions,
        harvester_version=source_config.harvester.version,
        source_config=source_config,
        start_date=pendulum.parse('2017-01-01').date(),
    )

    # Change the version only after the job has been recorded.
    harvester_cls.VERSION = new_version

    tasks.harvest(job_id=job.id)
    job.refresh_from_db()

    if updated:
        assert job.status == HarvestJob.STATUS.succeeded
    elif new_version > 1:
        # Not updated although the version moved on: the job must be
        # skipped as obsolete.
        assert job.status == HarvestJob.STATUS.skipped
        assert job.error_context == HarvestJob.SkipReasons.obsolete.value

    # Holds for every parameter combination, including new_version == 1.
    on_new_version = job.harvester_version == new_version
    assert on_new_version == updated
def test_caught_up(self):
    """Check that the scheduler returns nothing for the 2018-01-01 cutoff.

    The source config has ``full_harvest`` enabled with an earliest date of
    2017-01-01, and already has one job starting on that date and one
    starting on the cutoff date.
    """
    earliest = pendulum.parse('2017-01-01').date()
    cutoff = pendulum.parse('2018-01-01').date()

    source_config = factories.SourceConfigFactory(
        full_harvest=True,
        earliest_date=earliest,
    )

    # One job at the very beginning of the harvestable range ...
    factories.HarvestJobFactory(
        source_config=source_config,
        start_date=pendulum.parse('2017-01-01').date(),
        end_date=pendulum.parse('2017-01-02').date(),
    )
    # ... and one starting on the cutoff day.
    factories.HarvestJobFactory(
        source_config=source_config,
        start_date=pendulum.parse('2018-01-01').date(),
        end_date=pendulum.parse('2018-01-02').date(),
    )

    scheduled = HarvestScheduler(source_config).all(cutoff=cutoff)
    assert len(scheduled) == 0
def test_harvest_fails(self, source_config):
    """Check that an error raised by the fetch propagates and fails the job.

    The harvester's ``_do_fetch`` is configured to raise ``ValueError``.
    The task must re-raise it, and the job must be recorded as failed with
    no completions and the error text in ``error_context``.
    """
    fetch = source_config.harvester.get_class()._do_fetch
    fetch.side_effect = ValueError('In a test')
    job = factories.HarvestJobFactory(source_config=source_config)

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(job_id=job.id)

    # Reload only after the task has raised, so the persisted state is read.
    job.refresh_from_db()

    assert excinfo.value.args == ('In a test', )
    assert job.status == HarvestJob.STATUS.failed
    assert job.completions == 0
    assert 'ValueError: In a test' in job.error_context
def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
    """Check that a harvest with the given overrides runs without raising.

    ``source_config_kwargs`` configure the source config and ``task_kwargs``
    are forwarded to ``tasks.harvest``. When ``lock_config`` is truthy, a
    ``SyncedThread`` wrapping ``source_config.acquire_lock`` is started
    before the harvest and joined afterwards (presumably so that the lock is
    held by someone else during the harvest -- confirm against SyncedThread).
    """
    source_config = factories.SourceConfigFactory(**source_config_kwargs)
    job = factories.HarvestJobFactory(source_config=source_config)

    lock_thread = None
    if lock_config:
        lock_thread = SyncedThread(source_config.acquire_lock)
        lock_thread.start()

    try:
        tasks.harvest(job_id=job.id, **task_kwargs)
    finally:
        # Always join a started thread, even when the harvest raised.
        if lock_thread is not None:
            lock_thread.join()
def test_handles_duplicate_values(self, monkeypatch, source_config):
    """Check that repeated fetch results are stored only once.

    The harvester is fed 100 generated documents, each repeated three times;
    after a harvest without ingestion the job must have succeeded with
    exactly 100 raw data entries.
    """
    fake = Factory.create()
    job = factories.HarvestJobFactory(source_config=source_config)

    unique_docs = [(fake.sentence(), str(i * 50)) for i in range(100)]
    # Repeating the list yields every document three times (300 results).
    source_config.harvester.get_class()._do_fetch.extend(unique_docs * 3)

    tasks.harvest(job_id=job.id, ingest=False)
    job.refresh_from_db()

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == 100
def test_failure_cases(self, source_config_kwargs, task_kwargs, lock_config, exception):
    """Check that a harvest with the given setup raises ``exception``.

    ``source_config_kwargs`` configure the source config and ``task_kwargs``
    are forwarded to ``tasks.harvest``. When ``lock_config`` is truthy, a
    ``SyncedThread`` wrapping ``source_config.acquire_lock`` is started
    before the harvest and joined afterwards (presumably so that the lock is
    held by someone else during the harvest -- confirm against SyncedThread).
    """
    source_config = factories.SourceConfigFactory(**source_config_kwargs)
    job = factories.HarvestJobFactory(source_config=source_config)

    lock_thread = None
    if lock_config:
        lock_thread = SyncedThread(source_config.acquire_lock)
        lock_thread.start()

    try:
        with pytest.raises(exception):
            tasks.harvest(job_id=job.id, **task_kwargs)
    finally:
        # Always join a started thread, whatever the outcome of the harvest.
        if lock_thread is not None:
            lock_thread.join()
def test_latest_date(self):
    """Check that jobs of other source configs do not affect scheduling.

    The source config under test has ``full_harvest`` enabled with an
    earliest date of 2017-01-01 and no jobs of its own; the scheduler must
    return 365 entries for the 2018-01-01 cutoff.
    """
    earliest = pendulum.parse('2017-01-01').date()
    cutoff = pendulum.parse('2018-01-01').date()

    source_config = factories.SourceConfigFactory(
        full_harvest=True,
        earliest_date=earliest,
    )

    # This job starts on the earliest date, but it is created without
    # source_config and therefore belongs to a different source config.
    factories.HarvestJobFactory(
        start_date=pendulum.parse('2017-01-01').date(),
        end_date=pendulum.parse('2017-01-02').date(),
    )

    scheduled = HarvestScheduler(source_config).all(cutoff=cutoff)
    assert len(scheduled) == 365
def test_job_values(self, source_config):
    """Check the fields recorded on a job after a successful harvest.

    The task is run through ``apply`` with an explicit task id so that the
    id stored on the job can be compared against it.
    """
    expected_task_id = uuid.uuid4()
    job = factories.HarvestJobFactory(source_config=source_config)

    call_kwargs = {'job_id': job.id}
    tasks.harvest.apply((), call_kwargs, task_id=str(expected_task_id), throw=True)

    job.refresh_from_db()

    assert job.task_id == expected_task_id
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.error_context == ''
    assert job.completions == 1
    assert job.source_config == source_config
    assert job.share_version == settings.VERSION
    assert job.harvester_version == source_config.get_harvester().VERSION
    assert job.source_config_version == source_config.version
def test_partial_harvest_fails(self, source_config):
    """Check that data fetched before a mid-harvest error is kept.

    The replacement ``_do_fetch`` yields three documents and then raises
    ``ValueError``. The task must re-raise it and the job must be recorded
    as failed, while the three documents remain attached to the job and
    three ingest jobs exist in the ``created`` state.
    """
    job = factories.HarvestJobFactory(source_config=source_config)

    def _do_fetch(*args, **kwargs):
        # Hand out three documents, then fail part-way through.
        yield from (
            ('doc1', b'doc1data'),
            ('doc2', b'doc2data'),
            ('doc3', b'doc3data'),
        )
        raise ValueError('In a test')

    source_config.harvester.get_class()._do_fetch = _do_fetch

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(job_id=job.id)

    # Reload only after the task has raised, so the persisted state is read.
    job.refresh_from_db()

    assert job.raw_data.count() == 3
    assert excinfo.value.args == ('In a test', )
    assert job.status == HarvestJob.STATUS.failed
    assert job.completions == 0
    assert 'ValueError: In a test' in job.error_context

    created_ingest_jobs = IngestJob.objects.filter(status=IngestJob.STATUS.created)
    assert created_ingest_jobs.count() == 3
def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
    """Check raw data and ingest job counts produced by one harvest.

    ``count`` documents are queued on the harvester, and a random sample of
    ``rediscovered`` of them is stored as raw data before the harvest runs.
    ``superfluous``, ``limit`` and ``ingest`` are forwarded unchanged to
    ``tasks.harvest``.
    """
    assert rediscovered <= count, 'Y tho'

    fake = Factory.create()
    fetch_queue = source_config.harvester.get_class()._do_fetch
    fetch_queue.extend([(fake.sentence(), str(i * 50)) for i in range(count)])

    # Store a random subset up front so the harvest encounters known data.
    already_known = random.sample(fetch_queue, rediscovered)
    stored = RawDatum.objects.store_chunk(
        source_config,
        (FetchResult(*pair) for pair in already_known),
    )
    # Consume the result in full (presumably store_chunk is lazy -- confirm).
    list(stored)

    job = factories.HarvestJobFactory(source_config=source_config)

    tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)

    job.refresh_from_db()

    # Number of documents the job should hold: all of them unless a smaller
    # limit applies.
    expected_raw = count if limit is None or count < limit else limit

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == expected_raw

    total_raw = RawDatum.objects.filter().count()
    if limit is not None and rediscovered:
        # With a limit and a random pre-stored sample only bounds are known.
        assert total_raw >= rediscovered
        assert total_raw <= rediscovered + max(0, min(limit, count - rediscovered))
    else:
        assert total_raw == expected_raw

    ingest_count = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
    if not ingest:
        assert ingest_count == 0
    elif superfluous:
        assert ingest_count == min(count, limit or 99999)
    elif limit is not None:
        assert ingest_count <= min(limit, count)
        assert ingest_count >= min(limit, count) - rediscovered
    else:
        assert ingest_count == count - rediscovered
def test_handles_duplicate_values_limit(self, monkeypatch, source_config):
    """Check duplicate handling when the harvest is run with a limit.

    The fetch list is rebuilt from 10 documents repeated five times each
    plus a fixed set of 20 padding documents, giving 30 distinct documents.
    A harvest with ``limit=60`` and no ingestion must succeed and leave
    exactly 30 raw data entries on the job.
    """
    fake = Factory.create()
    job = factories.HarvestJobFactory(source_config=source_config)

    fetch_queue = source_config.harvester.get_class()._do_fetch
    fetch_queue.clear()

    padding = [
        (sentence, sentence * 5)
        for sentence in (fake.sentence() for _ in range(20))
    ]

    # NOTE(review): the padding is appended after every group of duplicates;
    # confirm this per-iteration placement against the original layout.
    for _ in range(10):
        sentence = fake.sentence()
        fetch_queue.extend([(sentence, sentence * 5)] * 5)
        fetch_queue.extend(padding)

    tasks.harvest(job_id=job.id, limit=60, ingest=False)
    job.refresh_from_db()

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == 30