Example #1
    def test_obsolete(self):
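        # Two jobs for the same config and date: one created before the harvester
        # VERSION is bumped, one after. After both run, the newer-version job
        # succeeds and the older one is skipped as obsolete.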
        source_config = factories.SourceConfigFactory()

        hlv1 = factories.HarvestJobFactory(
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        old_version = source_config.harvester.get_class().VERSION
        source_config.harvester.get_class().VERSION += 1
        new_version = source_config.harvester.get_class().VERSION

        hlv2 = factories.HarvestJobFactory(
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        tasks.harvest(job_id=hlv2.id)
        tasks.harvest(job_id=hlv1.id)

        hlv1.refresh_from_db()
        hlv2.refresh_from_db()

        assert hlv2.status == HarvestJob.STATUS.succeeded
        assert hlv2.harvester_version == new_version

        assert hlv1.status == HarvestJob.STATUS.skipped
        assert hlv1.harvester_version == old_version
        assert hlv1.error_context == HarvestJob.SkipReasons.obsolete.value
Example #2
def start_harvest(context, label, start=None, end=None):
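    # Schedule a single harvest job for the given source config and date range,
    # then run it synchronously via the harvest task.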
    job = HarvestScheduler(models.SourceConfig.objects.get(label=label), claim_jobs=True).range(
        pendulum.parse(start),
        pendulum.parse(end),
    )[0]

    tasks.harvest(job_id=job.id)
Example #3
    def test_autoupdate(self, completions, status, new_version, updated):
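        # Pin the harvester at VERSION 1, create a job, then change VERSION.
        # Depending on the parametrized inputs, the task either auto-updates the
        # job to the new version (and succeeds) or skips it as obsolete.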
        source_config = factories.SourceConfigFactory()

        source_config.harvester.get_class().VERSION = 1

        hl = factories.HarvestJobFactory(
            status=status,
            completions=completions,
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        source_config.harvester.get_class().VERSION = new_version

        tasks.harvest(job_id=hl.id)

        hl.refresh_from_db()

        if updated:
            assert hl.status == HarvestJob.STATUS.succeeded
        elif new_version > 1:
            assert hl.status == HarvestJob.STATUS.skipped
            assert hl.error_context == HarvestJob.SkipReasons.obsolete.value

        assert (hl.harvester_version == new_version) == updated
Example #4
def start_harvest(context, label, start=None, end=None):
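    # Same helper written against the log-based API, where tasks.harvest
    # accepted a log_id rather than a job_id.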
    log = HarvestScheduler(models.SourceConfig.objects.get(label=label)).range(
        pendulum.parse(start),
        pendulum.parse(end),
    )[0]

    tasks.harvest(log_id=log.id)
Example #5
def start_harvest(context, label, start=None, end=None):
    job = HarvestScheduler(models.SourceConfig.objects.get(label=label),
                           claim_jobs=True).range(
                               pendulum.parse(start),
                               pendulum.parse(end),
                           )[0]

    tasks.harvest(job_id=job.id)
Example #6
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered,
                       superfluous, limit, ingest, django_assert_num_queries):
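        # Pre-store a random sample of the fetchable documents, then harvest and
        # check how many raw data end up stored and how many ingest (transform)
        # tasks are queued, given the superfluous/limit/ingest parameters.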
        assert rediscovered <= count, 'Y tho'

        fake = Factory.create()
        mock_ingest_task = mock.Mock()

        monkeypatch.setattr('share.tasks.transform', mock_ingest_task)
        source_config.harvester.get_class()._do_fetch.extend([
            (fake.sentence(), str(i * 50)) for i in range(count)
        ])
        list(
            RawDatum.objects.store_chunk(
                source_config, (FetchResult(*tup) for tup in random.sample(
                    source_config.harvester.get_class()._do_fetch,
                    rediscovered))))

        log = factories.HarvestLogFactory(source_config=source_config)

        tasks.harvest(log_id=log.id,
                      superfluous=superfluous,
                      limit=limit,
                      ingest=ingest)

        log.refresh_from_db()

        assert log.completions == 1
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.raw_data.count() == (count if limit is None or count < limit
                                        else limit)

        if limit is not None and rediscovered:
            assert RawDatum.objects.filter().count() >= rediscovered
            assert RawDatum.objects.filter().count() <= rediscovered + max(
                0, min(limit, count - rediscovered))
        else:
            assert RawDatum.objects.filter().count() == (
                count if limit is None or count < limit else limit)

        if ingest:
            if superfluous:
                assert mock_ingest_task.apply_async.call_count == min(
                    count, limit or 99999)
            elif limit is not None:
                assert mock_ingest_task.apply_async.call_count <= min(
                    limit, count)
                assert mock_ingest_task.apply_async.call_count >= min(
                    limit, count) - rediscovered
            else:
                assert mock_ingest_task.apply_async.call_count == count - rediscovered
        else:
            assert mock_ingest_task.apply_async.call_count == 0
Example #7
    def test_harvest_after(self, monkeypatch, now, end_date, harvest_after, should_run, source_config):
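        # tasks.harvest() with no arguments picks up the scheduled job; whether
        # the fetch actually runs depends on the config's harvest_after relative
        # to the mocked current time.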
        monkeypatch.setattr('share.tasks.harvest.apply_async', mock.Mock())

        source_config.harvest_after = harvest_after
        source_config.save()
        monkeypatch.setattr('django.utils.timezone.now', lambda: now)
        source_config.harvester.get_class()._do_fetch = mock.Mock(return_value=[])

        HarvestScheduler(source_config).date(end_date.add(days=-1))

        tasks.harvest()

        assert source_config.harvester.get_class()._do_fetch.called == should_run
Example #8
    def test_handles_duplicate_values(self, monkeypatch, source_config):
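        # 100 distinct documents fetched three times each: duplicates are
        # collapsed, leaving 100 raw data on a successfully completed job.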
        fake = Factory.create()
        job = factories.HarvestJobFactory(source_config=source_config)

        source_config.harvester.get_class()._do_fetch.extend([(fake.sentence(), str(i * 50)) for i in range(100)] * 3)

        tasks.harvest(job_id=job.id, ingest=False)

        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == 100
Example #9
    def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
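        # Parametrized combinations of source-config settings and harvest task
        # kwargs that should run without raising, optionally while another
        # thread holds the config's harvest lock.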
        source_config = factories.SourceConfigFactory(**source_config_kwargs)
        job = factories.HarvestJobFactory(source_config=source_config)

        if lock_config:
            t = SyncedThread(source_config.acquire_lock)
            t.start()

        try:
            tasks.harvest(job_id=job.id, **task_kwargs)
        finally:
            if lock_config:
                t.join()
Example #10
    def test_harvest_fails(self, source_config):
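        # A harvester that raises during fetch propagates the exception; the job
        # is marked failed with zero completions and the traceback recorded in
        # error_context.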
        source_config.harvester.get_class()._do_fetch.side_effect = ValueError('In a test')
        job = factories.HarvestJobFactory(source_config=source_config)

        with pytest.raises(ValueError) as e:
            tasks.harvest(job_id=job.id)

        job.refresh_from_db()

        assert e.value.args == ('In a test', )
        assert job.status == HarvestJob.STATUS.failed
        assert job.completions == 0
        assert 'ValueError: In a test' in job.error_context
Example #11
    def test_failure_cases(self, source_config_kwargs, task_kwargs, lock_config, exception):
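        # Parametrized combinations of source-config settings, task kwargs and a
        # held lock that are expected to raise the given exception.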
        source_config = factories.SourceConfigFactory(**source_config_kwargs)
        job = factories.HarvestJobFactory(source_config=source_config)

        if lock_config:
            t = SyncedThread(source_config.acquire_lock)
            t.start()

        try:
            with pytest.raises(exception):
                tasks.harvest(job_id=job.id, **task_kwargs)
        finally:
            if lock_config:
                t.join()
Example #12
    def test_partial_harvest_fails(self, source_config):
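        # When fetching fails partway through, the documents fetched so far are
        # still stored (and ingest jobs created for them), but the harvest job
        # itself is marked failed.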
        job = factories.HarvestJobFactory(source_config=source_config)

        def _do_fetch(*args, **kwargs):
            yield ('doc1', b'doc1data')
            yield ('doc2', b'doc2data')
            yield ('doc3', b'doc3data')
            raise ValueError('In a test')
        source_config.harvester.get_class()._do_fetch = _do_fetch

        with pytest.raises(ValueError) as e:
            tasks.harvest(job_id=job.id)

        job.refresh_from_db()

        assert job.raw_data.count() == 3
        assert e.value.args == ('In a test', )
        assert job.status == HarvestJob.STATUS.failed
        assert job.completions == 0
        assert 'ValueError: In a test' in job.error_context
        assert IngestJob.objects.filter(status=IngestJob.STATUS.created).count() == 3
Example #13
    def test_partial_harvest_fails(self, source_config, mock_transform):
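        # The same partial-failure scenario against the log-based API, counting
        # calls to the mocked transform task instead of IngestJob rows.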
        log = factories.HarvestLogFactory(source_config=source_config)

        def _do_fetch(*args, **kwargs):
            yield ('doc1', b'doc1data')
            yield ('doc2', b'doc2data')
            yield ('doc3', b'doc3data')
            raise ValueError('In a test')

        source_config.harvester.get_class()._do_fetch = _do_fetch

        with pytest.raises(ValueError) as e:
            tasks.harvest(log_id=log.id)

        log.refresh_from_db()

        assert log.raw_data.count() == 3
        assert e.value.args == ('In a test', )
        assert log.status == HarvestLog.STATUS.failed
        assert log.completions == 0
        assert 'ValueError: In a test' in log.context
        assert mock_transform.apply_async.call_count == 3
Example #14
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered, superfluous, limit, ingest, django_assert_num_queries):
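        # Job-based variant of the data-flow test above: instead of mocking the
        # transform task, it counts IngestJob rows left in the "created" state.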
        assert rediscovered <= count, 'Y tho'

        fake = Factory.create()

        source_config.harvester.get_class()._do_fetch.extend([(fake.sentence(), str(i * 50)) for i in range(count)])
        list(RawDatum.objects.store_chunk(source_config, (
            FetchResult(*tup) for tup in
            random.sample(source_config.harvester.get_class()._do_fetch, rediscovered))
        ))

        job = factories.HarvestJobFactory(source_config=source_config)

        tasks.harvest(job_id=job.id, superfluous=superfluous, limit=limit, ingest=ingest)

        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == (count if limit is None or count < limit else limit)

        if limit is not None and rediscovered:
            assert RawDatum.objects.filter().count() >= rediscovered
            assert RawDatum.objects.filter().count() <= rediscovered + max(0, min(limit, count - rediscovered))
        else:
            assert RawDatum.objects.filter().count() == (count if limit is None or count < limit else limit)

        ingest_count = IngestJob.objects.filter(status=IngestJob.STATUS.created).count()
        if ingest:
            if superfluous:
                assert ingest_count == min(count, limit or 99999)
            elif limit is not None:
                assert ingest_count <= min(limit, count)
                assert ingest_count >= min(limit, count) - rediscovered
            else:
                assert ingest_count == count - rediscovered
        else:
            assert ingest_count == 0
Example #15
    def test_handles_duplicate_values_limit(self, monkeypatch, source_config):
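        # Heavily duplicated fetch results combined with a limit: only the 30
        # unique documents are stored as raw data.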
        fake = Factory.create()
        job = factories.HarvestJobFactory(source_config=source_config)

        source_config.harvester.get_class()._do_fetch.clear()

        padding = []
        for _ in range(20):
            s = fake.sentence()
            padding.append((s, s * 5))

        for i in range(10):
            s = fake.sentence()
            source_config.harvester.get_class()._do_fetch.extend([(s, s * 5)] * 5)
            source_config.harvester.get_class()._do_fetch.extend(padding)

        tasks.harvest(job_id=job.id, limit=60, ingest=False)

        job.refresh_from_db()

        assert job.completions == 1
        assert job.status == HarvestJob.STATUS.succeeded
        assert job.raw_data.count() == 30
Example #16
    def test_not_found(self):
        with pytest.raises(HarvestLog.DoesNotExist):
            tasks.harvest(log_id=12)
Example #17
    def test_not_found(self):
        with pytest.raises(HarvestJob.DoesNotExist):
            tasks.harvest(job_id=12)