def test_obsolete(self):
    """A log left behind by an older harvester is skipped as obsolete.

    Once a newer-versioned log has succeeded for the same source config
    and date range, re-running the stale log should mark it skipped with
    the ``obsolete`` reason rather than harvesting again.
    """
    config = factories.SourceConfigFactory()

    stale_log = factories.HarvestLogFactory(
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=pendulum.parse('2017-01-01').date(),
    )

    # Bump the harvester's class-level VERSION to simulate a new release.
    harvester_class = config.harvester.get_class()
    old_version = harvester_class.VERSION
    harvester_class.VERSION += 1
    new_version = harvester_class.VERSION

    fresh_log = factories.HarvestLogFactory(
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=pendulum.parse('2017-01-01').date(),
    )

    # Run the up-to-date log first, then attempt the stale one.
    tasks.harvest(log_id=fresh_log.id)
    tasks.harvest(log_id=stale_log.id)

    stale_log.refresh_from_db()
    fresh_log.refresh_from_db()

    assert fresh_log.status == HarvestLog.STATUS.succeeded
    assert fresh_log.harvester_version == new_version
    assert stale_log.status == HarvestLog.STATUS.skipped
    assert stale_log.harvester_version == old_version
    assert stale_log.context == HarvestLog.SkipReasons.obsolete.value
def test_autoupdate(self, completions, status, new_version, updated):
    """Parametrized: logs re-run or get skipped after a harvester version bump.

    ``updated`` encodes whether the log is expected to be re-harvested
    under the new version; otherwise a bumped version marks it obsolete.
    """
    config = factories.SourceConfigFactory()
    config.harvester.get_class().VERSION = 1

    harvest_log = factories.HarvestLogFactory(
        status=status,
        completions=completions,
        harvester_version=config.harvester.version,
        source_config=config,
        start_date=pendulum.parse('2017-01-01').date(),
    )

    # Simulate a harvester release that happened after the log was created.
    config.harvester.get_class().VERSION = new_version

    tasks.harvest(log_id=harvest_log.id)
    harvest_log.refresh_from_db()

    if updated:
        assert harvest_log.status == HarvestLog.STATUS.succeeded
    elif new_version > 1:
        assert harvest_log.status == HarvestLog.STATUS.skipped
        assert harvest_log.context == HarvestLog.SkipReasons.obsolete.value

    # The stamped version advances exactly when the log was re-run.
    assert (harvest_log.harvester_version == new_version) == updated
def test_caught_up(self):
    """Nothing is scheduled when existing logs already reach the cutoff."""
    config = factories.SourceConfigFactory(
        full_harvest=True,
        earliest_date=pendulum.parse('2017-01-01').date(),
    )

    # Logs for this config at the start of 2017 and 2018.
    log_ranges = (
        ('2017-01-01', '2017-01-02'),
        ('2018-01-01', '2018-01-02'),
    )
    for start, end in log_ranges:
        factories.HarvestLogFactory(
            source_config=config,
            start_date=pendulum.parse(start).date(),
            end_date=pendulum.parse(end).date(),
        )

    scheduled = HarvestScheduler(config).all(
        cutoff=pendulum.parse('2018-01-01').date())
    assert len(scheduled) == 0
def test_data_flow(self, source_config, monkeypatch, count, rediscovered,
                   superfluous, limit, ingest, django_assert_num_queries):
    """Parametrized end-to-end harvest: stored raw data and dispatched tasks.

    ``count`` documents are served by the harvester, ``rediscovered`` of
    which are pre-stored so the harvest finds them again; ``limit``,
    ``superfluous`` and ``ingest`` vary the harvest call itself.
    """
    assert rediscovered <= count, 'Y tho'

    faker = Factory.create()
    transform_mock = mock.Mock()
    monkeypatch.setattr('share.tasks.transform', transform_mock)

    fetch_results = source_config.harvester.get_class()._do_fetch
    fetch_results.extend((faker.sentence(), str(i * 50)) for i in range(count))

    # Pre-store a random subset so the harvest "rediscovers" those datums.
    list(RawDatum.objects.store_chunk(
        source_config,
        (FetchResult(*tup)
         for tup in random.sample(fetch_results, rediscovered)),
    ))

    harvest_log = factories.HarvestLogFactory(source_config=source_config)
    tasks.harvest(log_id=harvest_log.id, superfluous=superfluous,
                  limit=limit, ingest=ingest)
    harvest_log.refresh_from_db()

    assert harvest_log.completions == 1
    assert harvest_log.status == HarvestLog.STATUS.succeeded

    # Number of datums the harvest itself should have touched.
    effective = count if limit is None or count < limit else limit
    assert harvest_log.raw_data.count() == effective

    if limit is not None and rediscovered:
        # With a limit in play, only bounds on the stored total are knowable.
        stored_total = RawDatum.objects.filter().count()
        assert stored_total >= rediscovered
        assert stored_total <= rediscovered + max(
            0, min(limit, count - rediscovered))
    else:
        assert RawDatum.objects.filter().count() == effective

    call_count = transform_mock.apply_async.call_count
    if not ingest:
        assert call_count == 0
    elif superfluous:
        # Superfluous re-ingests everything harvested (up to the limit).
        assert call_count == min(count, limit or 99999)
    elif limit is not None:
        # Which datums hit the limit window is random; assert bounds only.
        upper = min(limit, count)
        assert call_count <= upper
        assert call_count >= upper - rediscovered
    else:
        assert call_count == count - rediscovered
def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
    """harvest() completes under various config/task overrides, lock or not."""
    config = factories.SourceConfigFactory(**source_config_kwargs)
    harvest_log = factories.HarvestLogFactory(source_config=config)

    # Optionally hold the source config's lock in a separate thread.
    lock_thread = SyncedThread(config.acquire_lock) if lock_config else None
    if lock_thread is not None:
        lock_thread.start()
    try:
        tasks.harvest(log_id=harvest_log.id, **task_kwargs)
    finally:
        if lock_thread is not None:
            lock_thread.join()
def test_handles_duplicate_values(self, monkeypatch, source_config):
    """Repeated fetch results are deduplicated down to one raw datum each."""
    faker = Factory.create()
    harvest_log = factories.HarvestLogFactory(source_config=source_config)

    # 100 distinct documents, each yielded three times by the harvester.
    documents = [(faker.sentence(), str(i * 50)) for i in range(100)]
    source_config.harvester.get_class()._do_fetch.extend(documents * 3)

    tasks.harvest(log_id=harvest_log.id, ingest=False)
    harvest_log.refresh_from_db()

    assert harvest_log.completions == 1
    assert harvest_log.status == HarvestLog.STATUS.succeeded
    assert harvest_log.raw_data.count() == 100
def test_harvest_fails(self, source_config):
    """A harvester exception fails the log and is recorded in its context."""
    source_config.harvester.get_class()._do_fetch.side_effect = ValueError(
        'In a test')
    harvest_log = factories.HarvestLogFactory(source_config=source_config)

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(log_id=harvest_log.id)

    harvest_log.refresh_from_db()

    assert excinfo.value.args == ('In a test', )
    assert harvest_log.status == HarvestLog.STATUS.failed
    assert harvest_log.completions == 0
    # The traceback text ends up in the log's context field.
    assert 'ValueError: In a test' in harvest_log.context
def test_latest_date(self):
    """Logs belonging to other source configs do not affect scheduling."""
    config = factories.SourceConfigFactory(
        full_harvest=True,
        earliest_date=pendulum.parse('2017-01-01').date(),
    )

    # A log covering earliest_date — but for an unrelated source_config,
    # so the scheduler must still plan the full year for ours.
    factories.HarvestLogFactory(
        start_date=pendulum.parse('2017-01-01').date(),
        end_date=pendulum.parse('2017-01-02').date(),
    )

    scheduled = HarvestScheduler(config).all(
        cutoff=pendulum.parse('2018-01-01').date())
    assert len(scheduled) == 365
def test_failure_cases(self, source_config_kwargs, task_kwargs, lock_config,
                       exception):
    """harvest() raises the expected exception for bad config/task combos."""
    config = factories.SourceConfigFactory(**source_config_kwargs)
    harvest_log = factories.HarvestLogFactory(source_config=config)

    # Optionally hold the source config's lock in a separate thread.
    lock_thread = SyncedThread(config.acquire_lock) if lock_config else None
    if lock_thread is not None:
        lock_thread.start()
    try:
        with pytest.raises(exception):
            tasks.harvest(log_id=harvest_log.id, **task_kwargs)
    finally:
        if lock_thread is not None:
            lock_thread.join()
def test_log_values(self, source_config):
    """A successful harvest stamps the log with task and version metadata."""
    task_id = uuid.uuid4()
    harvest_log = factories.HarvestLogFactory(source_config=source_config)

    # Run the celery task synchronously with an explicit task id.
    tasks.harvest.apply(
        (), {'log_id': harvest_log.id}, task_id=str(task_id), throw=True)
    harvest_log.refresh_from_db()

    assert harvest_log.task_id == task_id
    assert harvest_log.status == HarvestLog.STATUS.succeeded
    assert harvest_log.context == ''
    assert harvest_log.completions == 1
    assert harvest_log.source_config == source_config
    assert harvest_log.share_version == settings.VERSION
    assert harvest_log.harvester_version == source_config.get_harvester().VERSION
    assert harvest_log.source_config_version == source_config.version
def test_partial_harvest_fails(self, source_config, mock_transform):
    """Data fetched before a mid-harvest failure is still stored and ingested."""
    harvest_log = factories.HarvestLogFactory(source_config=source_config)

    def _do_fetch(*args, **kwargs):
        # Emit three documents, then blow up mid-stream.
        yield ('doc1', b'doc1data')
        yield ('doc2', b'doc2data')
        yield ('doc3', b'doc3data')
        raise ValueError('In a test')

    source_config.harvester.get_class()._do_fetch = _do_fetch

    with pytest.raises(ValueError) as excinfo:
        tasks.harvest(log_id=harvest_log.id)
    harvest_log.refresh_from_db()

    # The three documents yielded before the failure were persisted...
    assert harvest_log.raw_data.count() == 3
    assert excinfo.value.args == ('In a test', )
    # ...but the log itself is marked failed, not completed.
    assert harvest_log.status == HarvestLog.STATUS.failed
    assert harvest_log.completions == 0
    assert 'ValueError: In a test' in harvest_log.context
    assert mock_transform.apply_async.call_count == 3
def test_handles_duplicate_values_limit(self, monkeypatch, source_config):
    """Deduplication still applies when a harvest limit is set."""
    faker = Factory.create()
    harvest_log = factories.HarvestLogFactory(source_config=source_config)

    fetch_results = source_config.harvester.get_class()._do_fetch
    fetch_results.clear()

    # 20 unique filler documents, appended after the duplicated block.
    padding = [(s, s * 5) for s in (faker.sentence() for _ in range(20))]

    # Ten distinct documents, each repeated five times (50 entries total).
    for _ in range(10):
        sentence = faker.sentence()
        fetch_results.extend([(sentence, sentence * 5)] * 5)

    fetch_results.extend(padding)

    tasks.harvest(log_id=harvest_log.id, limit=60, ingest=False)
    harvest_log.refresh_from_db()

    assert harvest_log.completions == 1
    assert harvest_log.status == HarvestLog.STATUS.succeeded
    assert harvest_log.raw_data.count() == 30