def test_obsolete(self):
    """Expect a job recorded under an older harvester VERSION to be skipped.

    Two jobs are created for the same source config and start date, with the
    harvester class ``VERSION`` incremented in between. The newer job is
    harvested first and is expected to succeed; the older job is then
    expected to end up ``skipped`` with the ``obsolete`` skip reason while
    keeping its original version.
    """
    config = factories.SourceConfigFactory()
    harvester_cls = config.harvester.get_class()
    start_date = pendulum.parse('2017-01-01').date()

    stale_job = factories.HarvestJobFactory(
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=start_date,
    )

    # Bump the harvester version so the first job becomes out of date.
    old_version = harvester_cls.VERSION
    harvester_cls.VERSION += 1
    new_version = harvester_cls.VERSION

    current_job = factories.HarvestJobFactory(
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=start_date,
    )

    # The newer job is harvested before the stale one.
    for job in (current_job, stale_job):
        tasks.harvest(job_id=job.id)

    for job in (stale_job, current_job):
        job.refresh_from_db()

    assert current_job.status == HarvestJob.STATUS.succeeded
    assert current_job.harvester_version == new_version

    assert stale_job.status == HarvestJob.STATUS.skipped
    assert stale_job.harvester_version == old_version
    assert stale_job.error_context == HarvestJob.SkipReasons.obsolete.value
def start_harvest(context, label, start=None, end=None):
    """Harvest the first job returned for ``label`` over ``start``..``end``.

    Looks up the SourceConfig with the given label, asks a HarvestScheduler
    (created with ``claim_jobs=True``) for the parsed date range, and runs
    ``tasks.harvest`` on the first item of the result. ``context`` is not
    used by this function.
    """
    config = models.SourceConfig.objects.get(label=label)
    scheduler = HarvestScheduler(config, claim_jobs=True)
    jobs = scheduler.range(pendulum.parse(start), pendulum.parse(end))
    first_job = jobs[0]

    tasks.harvest(job_id=first_job.id)
def test_autoupdate(self, completions, status, new_version, updated):
    """Check whether a job's harvester_version follows a VERSION change.

    The job is created while the harvester class ``VERSION`` is 1; the class
    is then switched to ``new_version`` before harvesting. ``updated`` states
    whether the job is expected to end up on ``new_version``.
    """
    config = factories.SourceConfigFactory()
    harvester_cls = config.harvester.get_class()

    harvester_cls.VERSION = 1
    job = factories.HarvestJobFactory(
        status=status,
        completions=completions,
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=pendulum.parse('2017-01-01').date(),
    )
    harvester_cls.VERSION = new_version

    tasks.harvest(job_id=job.id)
    job.refresh_from_db()

    if updated:
        assert job.status == HarvestJob.STATUS.succeeded
    elif new_version > 1:
        assert job.status == HarvestJob.STATUS.skipped
        assert job.error_context == HarvestJob.SkipReasons.obsolete.value

    version_matches = job.harvester_version == new_version
    assert version_matches == updated
def start_harvest(context, label, start=None, end=None):
    """Harvest the first log returned for ``label`` over ``start``..``end``.

    Looks up the SourceConfig with the given label, asks a HarvestScheduler
    for the parsed date range, and runs ``tasks.harvest`` on the first item
    of the result. ``context`` is not used by this function.
    """
    config = models.SourceConfig.objects.get(label=label)
    scheduler = HarvestScheduler(config)
    logs = scheduler.range(pendulum.parse(start), pendulum.parse(end))
    first_log = logs[0]

    tasks.harvest(log_id=first_log.id)
def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
    """Check raw-datum and transform-task counts after a harvest.

    ``count`` fetch results are registered on the harvester class and
    ``rediscovered`` of them are stored up front. The harvest is then run
    with the given ``superfluous``/``limit``/``ingest`` options and the
    resulting counts are compared with what those options imply.
    """
    assert rediscovered <= count, 'Y tho'

    fake = Factory.create()
    mock_ingest_task = mock.Mock()
    monkeypatch.setattr('share.tasks.transform', mock_ingest_task)

    fetch_results = source_config.harvester.get_class()._do_fetch
    fetch_results.extend([(fake.sentence(), str(i * 50)) for i in range(count)])

    # Store a random subset in advance so the harvest sees them again.
    already_stored = random.sample(fetch_results, rediscovered)
    list(RawDatum.objects.store_chunk(
        source_config,
        (FetchResult(*pair) for pair in already_stored),
    ))

    log = factories.HarvestLogFactory(source_config=source_config)
    tasks.harvest(log_id=log.id, superfluous=superfluous, limit=limit, ingest=ingest)
    log.refresh_from_db()

    if limit is None or count < limit:
        expected_harvested = count
    else:
        expected_harvested = limit

    assert log.completions == 1
    assert log.status == HarvestLog.STATUS.succeeded
    assert log.raw_data.count() == expected_harvested

    total_raw = RawDatum.objects.filter().count()
    if limit is not None and rediscovered:
        # Which data were pre-stored is random, so only bounds can be checked.
        upper_bound = rediscovered + max(0, min(limit, count - rediscovered))
        assert total_raw >= rediscovered
        assert total_raw <= upper_bound
    else:
        assert total_raw == expected_harvested

    dispatched = mock_ingest_task.apply_async.call_count
    if not ingest:
        assert dispatched == 0
    elif superfluous:
        assert dispatched == min(count, limit or 99999)
    elif limit is not None:
        assert dispatched <= min(limit, count)
        assert dispatched >= min(limit, count) - rediscovered
    else:
        assert dispatched == count - rediscovered
def test_harvest_after(self, monkeypatch, now, end_date, harvest_after, should_run, source_config):
    """Check whether ``_do_fetch`` is called given ``harvest_after`` and ``now``.

    ``source_config.harvest_after`` is set and saved,
    ``django.utils.timezone.now`` is patched to return ``now``, and a job is
    scheduled for the day before ``end_date``. After ``tasks.harvest()`` the
    harvester's ``_do_fetch`` mock must have been called exactly when
    ``should_run`` is true.
    """
    monkeypatch.setattr('share.tasks.harvest.apply_async', mock.Mock())

    source_config.harvest_after = harvest_after
    source_config.save()
    monkeypatch.setattr('django.utils.timezone.now', lambda: now)

    # Patch through monkeypatch, like the two patches above, so the mock is
    # removed from the harvester class at teardown instead of lingering.
    # raising=False keeps the semantics of the plain assignment it replaces.
    monkeypatch.setattr(
        source_config.harvester.get_class(),
        '_do_fetch',
        mock.Mock(return_value=[]),
        raising=False,
    )

    HarvestScheduler(source_config).date(end_date.add(days=-1))

    tasks.harvest()

    assert source_config.harvester.get_class()._do_fetch.called == should_run
def test_handles_duplicate_values(self, monkeypatch, source_config):
    """Expect 100 raw data when 100 fetch results are each supplied three times."""
    fake = Factory.create()
    job = factories.HarvestJobFactory(source_config=source_config)

    fetch_results = source_config.harvester.get_class()._do_fetch
    distinct = [(fake.sentence(), str(i * 50)) for i in range(100)]
    # Repeat the whole list so every entry appears three times.
    fetch_results.extend(distinct * 3)

    tasks.harvest(job_id=job.id, ingest=False)
    job.refresh_from_db()

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == 100
def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
    """Run a harvest with the given task kwargs and expect it not to raise.

    When ``lock_config`` is truthy, ``source_config.acquire_lock`` is run in
    a SyncedThread that is started before the harvest and joined afterwards.
    """
    source_config = factories.SourceConfigFactory(**source_config_kwargs)
    job = factories.HarvestJobFactory(source_config=source_config)

    lock_thread = None
    if lock_config:
        lock_thread = SyncedThread(source_config.acquire_lock)
        lock_thread.start()

    try:
        tasks.harvest(job_id=job.id, **task_kwargs)
    finally:
        # Always join the thread, even when the harvest raises.
        if lock_thread is not None:
            lock_thread.join()
def test_harvest_fails(self, source_config):
    """Expect a ValueError from ``_do_fetch`` to propagate and fail the job."""
    fetch = source_config.harvester.get_class()._do_fetch
    fetch.side_effect = ValueError('In a test')
    job = factories.HarvestJobFactory(source_config=source_config)

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(job_id=job.id)

    job.refresh_from_db()

    assert excinfo.value.args == ('In a test', )
    assert job.status == HarvestJob.STATUS.failed
    assert job.completions == 0
    assert 'ValueError: In a test' in job.error_context
def test_failure_cases(self, source_config_kwargs, task_kwargs, lock_config, exception):
    """Run a harvest with the given task kwargs and expect ``exception``.

    When ``lock_config`` is truthy, ``source_config.acquire_lock`` is run in
    a SyncedThread that is started before the harvest and joined afterwards.
    """
    source_config = factories.SourceConfigFactory(**source_config_kwargs)
    job = factories.HarvestJobFactory(source_config=source_config)

    lock_thread = None
    if lock_config:
        lock_thread = SyncedThread(source_config.acquire_lock)
        lock_thread.start()

    try:
        with pytest.raises(exception):
            tasks.harvest(job_id=job.id, **task_kwargs)
    finally:
        # Always join the thread, whatever the outcome of the harvest.
        if lock_thread is not None:
            lock_thread.join()
def test_partial_harvest_fails(self, source_config):
    """Expect data fetched before a mid-harvest error to be kept.

    ``_do_fetch`` is replaced with a generator that yields three documents
    and then raises ValueError. The job is expected to fail with zero
    completions while still having three raw data and three IngestJobs in
    the ``created`` status.
    """
    job = factories.HarvestJobFactory(source_config=source_config)

    def _do_fetch(*args, **kwargs):
        yield from (
            ('doc1', b'doc1data'),
            ('doc2', b'doc2data'),
            ('doc3', b'doc3data'),
        )
        raise ValueError('In a test')

    source_config.harvester.get_class()._do_fetch = _do_fetch

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(job_id=job.id)

    job.refresh_from_db()

    assert job.raw_data.count() == 3
    assert excinfo.value.args == ('In a test', )
    assert job.status == HarvestJob.STATUS.failed
    assert job.completions == 0
    assert 'ValueError: In a test' in job.error_context

    created_ingest_jobs = IngestJob.objects.filter(status=IngestJob.STATUS.created)
    assert created_ingest_jobs.count() == 3
def test_partial_harvest_fails(self, source_config, mock_transform):
    """Expect data fetched before a mid-harvest error to be kept.

    ``_do_fetch`` is replaced with a generator that yields three documents
    and then raises ValueError. The log is expected to fail with zero
    completions while still having three raw data, and
    ``mock_transform.apply_async`` is expected to have been called three
    times.
    """
    log = factories.HarvestLogFactory(source_config=source_config)

    def _do_fetch(*args, **kwargs):
        yield from (
            ('doc1', b'doc1data'),
            ('doc2', b'doc2data'),
            ('doc3', b'doc3data'),
        )
        raise ValueError('In a test')

    source_config.harvester.get_class()._do_fetch = _do_fetch

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(log_id=log.id)

    log.refresh_from_db()

    assert log.raw_data.count() == 3
    assert excinfo.value.args == ('In a test', )
    assert log.status == HarvestLog.STATUS.failed
    assert log.completions == 0
    assert 'ValueError: In a test' in log.context

    transform_calls = mock_transform.apply_async.call_count
    assert transform_calls == 3
def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
    """Check raw-datum and IngestJob counts after a harvest.

    ``count`` fetch results are registered on the harvester class and
    ``rediscovered`` of them are stored up front. The harvest is then run
    with the given ``superfluous``/``limit``/``ingest`` options and the
    resulting counts are compared with what those options imply.
    """
    assert rediscovered <= count, 'Y tho'

    fake = Factory.create()

    fetch_results = source_config.harvester.get_class()._do_fetch
    fetch_results.extend([(fake.sentence(), str(i * 50)) for i in range(count)])

    # Store a random subset in advance so the harvest sees them again.
    already_stored = random.sample(fetch_results, rediscovered)
    list(RawDatum.objects.store_chunk(
        source_config,
        (FetchResult(*pair) for pair in already_stored),
    ))

    job = factories.HarvestJobFactory(source_config=source_config)
    tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)
    job.refresh_from_db()

    if limit is None or count < limit:
        expected_harvested = count
    else:
        expected_harvested = limit

    assert job.completions == 1
    assert job.status == HarvestJob.STATUS.succeeded
    assert job.raw_data.count() == expected_harvested

    total_raw = RawDatum.objects.filter().count()
    if limit is not None and rediscovered:
        # Which data were pre-stored is random, so only bounds can be checked.
        upper_bound = rediscovered + max(0, min(limit, count - rediscovered))
        assert total_raw >= rediscovered
        assert total_raw <= upper_bound
    else:
        assert total_raw == expected_harvested

    ingest_count = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
    if not ingest:
        assert ingest_count == 0
    elif superfluous:
        assert ingest_count == min(count, limit or 99999)
    elif limit is not None:
        assert ingest_count <= min(limit, count)
        assert ingest_count >= min(limit, count) - rediscovered
    else:
        assert ingest_count == count - rediscovered
# Test: harvest with limit=60 and ingest=False when the fetch results contain
# repeated entries.
#
# Setup visible in the code below:
#   - the harvester class's _do_fetch list is cleared;
#   - 20 "padding" 2-tuples of the form (s, s * 5) are built from fake sentences;
#   - 10 further (s, s * 5) tuples are each added to _do_fetch five times;
#   - the padding list is appended to _do_fetch.
# Expected outcome: job.completions == 1, status succeeded, and
# job.raw_data.count() == 30 (10 repeated sentences + 20 padding sentences).
#
# NOTE(review): this definition has lost its line breaks and indentation, so it
# cannot be determined from here whether `_do_fetch.extend(padding)` runs once
# after the `for i in range(10)` loop or on every iteration; both readings
# involve the same 30 distinct tuples. Confirm against the original file before
# reformatting. The `monkeypatch` argument and loop variable `i` are unused.
def test_handles_duplicate_values_limit(self, monkeypatch, source_config): fake = Factory.create() job = factories.HarvestJobFactory(source_config=source_config) source_config.harvester.get_class()._do_fetch.clear() padding = [] for _ in range(20): s = fake.sentence() padding.append((s, s * 5)) for i in range(10): s = fake.sentence() source_config.harvester.get_class()._do_fetch.extend([(s, s * 5)] * 5) source_config.harvester.get_class()._do_fetch.extend(padding) tasks.harvest(job_id=job.id, limit=60, ingest=False) job.refresh_from_db() assert job.completions == 1 assert job.status == HarvestJob.STATUS.succeeded assert job.raw_data.count() == 30
def test_not_found(self):
    """Expect ``HarvestLog.DoesNotExist`` when harvesting log id 12."""
    requested_id = 12
    expected_error = HarvestLog.DoesNotExist

    with pytest.raises(expected_error):
        tasks.harvest(log_id=requested_id)
def test_not_found(self):
    """Expect ``HarvestJob.DoesNotExist`` when harvesting job id 12."""
    requested_id = 12
    expected_error = HarvestJob.DoesNotExist

    with pytest.raises(expected_error):
        tasks.harvest(job_id=requested_id)