Example #1: test_obsolete
    def test_obsolete(self):
        source_config = factories.SourceConfigFactory()

        hlv1 = factories.HarvestLogFactory(
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        old_version = source_config.harvester.get_class().VERSION
        source_config.harvester.get_class().VERSION += 1
        new_version = source_config.harvester.get_class().VERSION

        hlv2 = factories.HarvestLogFactory(
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        tasks.harvest(log_id=hlv2.id)
        tasks.harvest(log_id=hlv1.id)

        hlv1.refresh_from_db()
        hlv2.refresh_from_db()

        assert hlv2.status == HarvestLog.STATUS.succeeded
        assert hlv2.harvester_version == new_version

        assert hlv1.status == HarvestLog.STATUS.skipped
        assert hlv1.harvester_version == old_version
        assert hlv1.context == HarvestLog.SkipReasons.obsolete.value
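The skip asserted above presumably comes from a harvester-version check inside the harvest task. Below is a minimal sketch of such a check, using only names that appear in the assertions (HarvestLog.STATUS, HarvestLog.SkipReasons); the function name and exact flow are assumptions, not the project's actual implementation.

    # Hypothetical sketch, not share.tasks itself: a log recorded against an older
    # harvester VERSION is marked skipped/obsolete instead of being re-run.
    def skip_if_obsolete(log, current_version):
        if log.harvester_version < current_version:
            log.status = HarvestLog.STATUS.skipped
            log.context = HarvestLog.SkipReasons.obsolete.value
            log.save(update_fields=('status', 'context'))
            return True
        return False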
Example #2: test_autoupdate
    def test_autoupdate(self, completions, status, new_version, updated):
        source_config = factories.SourceConfigFactory()

        source_config.harvester.get_class().VERSION = 1

        hl = factories.HarvestLogFactory(
            status=status,
            completions=completions,
            harvester_version=source_config.harvester.version,
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
        )

        source_config.harvester.get_class().VERSION = new_version

        tasks.harvest(log_id=hl.id)

        hl.refresh_from_db()

        if updated:
            assert hl.status == HarvestLog.STATUS.succeeded
        elif new_version > 1:
            assert hl.status == HarvestLog.STATUS.skipped
            assert hl.context == HarvestLog.SkipReasons.obsolete.value

        assert (hl.harvester_version == new_version) == updated
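The pytest.mark.parametrize decorator driving completions, status, new_version, and updated is not part of this excerpt. The cases below are a hypothetical set consistent with the assertions above; the values are illustrative, not the project's actual ones.

    # Illustrative parametrization only; the real decorator is not shown above.
    @pytest.mark.parametrize('completions, status, new_version, updated', [
        (0, HarvestLog.STATUS.created, 1, True),     # same version: the log simply runs and succeeds
        (0, HarvestLog.STATUS.created, 2, True),     # fresh log is auto-updated to the new version
        (1, HarvestLog.STATUS.succeeded, 2, False),  # finished log is skipped as obsolete instead
    ])
    def test_autoupdate(self, completions, status, new_version, updated):
        ...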
Example #3: test_caught_up
    def test_caught_up(self):
        source_config = factories.SourceConfigFactory(
            full_harvest=True,
            earliest_date=pendulum.parse('2017-01-01').date())

        factories.HarvestLogFactory(
            source_config=source_config,
            start_date=pendulum.parse('2017-01-01').date(),
            end_date=pendulum.parse('2017-01-02').date(),
        )

        factories.HarvestLogFactory(
            source_config=source_config,
            start_date=pendulum.parse('2018-01-01').date(),
            end_date=pendulum.parse('2018-01-02').date(),
        )

        assert len(
            HarvestScheduler(source_config).all(
                cutoff=pendulum.parse('2018-01-01').date())) == 0
Example #4: test_data_flow
    def test_data_flow(self, source_config, monkeypatch, count, rediscovered,
                       superfluous, limit, ingest, django_assert_num_queries):
        assert rediscovered <= count, 'Y tho'

        fake = Factory.create()
        mock_ingest_task = mock.Mock()

        monkeypatch.setattr('share.tasks.transform', mock_ingest_task)
        source_config.harvester.get_class()._do_fetch.extend([
            (fake.sentence(), str(i * 50)) for i in range(count)
        ])
        list(RawDatum.objects.store_chunk(
            source_config,
            (FetchResult(*tup) for tup in random.sample(
                source_config.harvester.get_class()._do_fetch,
                rediscovered,
            )),
        ))

        log = factories.HarvestLogFactory(source_config=source_config)

        tasks.harvest(log_id=log.id,
                      superfluous=superfluous,
                      limit=limit,
                      ingest=ingest)

        log.refresh_from_db()

        assert log.completions == 1
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.raw_data.count() == (count if limit is None or count < limit
                                        else limit)

        if limit is not None and rediscovered:
            assert RawDatum.objects.filter().count() >= rediscovered
            assert RawDatum.objects.filter().count() <= rediscovered + max(
                0, min(limit, count - rediscovered))
        else:
            assert RawDatum.objects.filter().count() == (
                count if limit is None or count < limit else limit)

        if ingest:
            if superfluous:
                assert mock_ingest_task.apply_async.call_count == min(
                    count, limit or 99999)
            elif limit is not None:
                assert mock_ingest_task.apply_async.call_count <= min(
                    limit, count)
                assert mock_ingest_task.apply_async.call_count >= min(
                    limit, count) - rediscovered
            else:
                assert mock_ingest_task.apply_async.call_count == count - rediscovered
        else:
            assert mock_ingest_task.apply_async.call_count == 0
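Several of these tests mutate source_config.harvester.get_class()._do_fetch as if it were a plain list (extend, clear, wholesale reassignment), while test_harvest_fails below sets a side_effect on it. That suggests the source_config fixture installs a list-like fetch stub on the harvester class. The sketch below is inferred from that usage, not taken from the project's actual conftest.

    # Inferred stub, not the real conftest: a mutable list the fake harvester
    # drains, plus a side_effect hook so a test can force fetching to raise.
    class FetchStub(list):
        side_effect = None

        def __call__(self, *args, **kwargs):
            if self.side_effect is not None:
                raise self.side_effect
            yield from self  # (identifier, datum) tuples, in insertion order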
Example #5: test_overrides
    def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
        source_config = factories.SourceConfigFactory(**source_config_kwargs)
        log = factories.HarvestLogFactory(source_config=source_config)

        if lock_config:
            t = SyncedThread(source_config.acquire_lock)
            t.start()

        try:
            tasks.harvest(log_id=log.id, **task_kwargs)
        finally:
            if lock_config:
                t.join()
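As with test_autoupdate, the parametrization for source_config_kwargs, task_kwargs, and lock_config is not shown. The cases below are purely illustrative; the field and keyword names (disabled, force) are assumptions about what the overrides might be, not the project's actual values.

    # Illustrative only; the real decorator and override names are not shown above.
    @pytest.mark.parametrize('source_config_kwargs, task_kwargs, lock_config', [
        ({'disabled': True}, {'force': True}, False),  # e.g. force a disabled config to harvest
        ({}, {'force': True}, True),                   # e.g. force past a lock held by another worker
    ])
    def test_overrides(self, source_config_kwargs, task_kwargs, lock_config):
        ...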
Example #6: test_handles_duplicate_values
    def test_handles_duplicate_values(self, monkeypatch, source_config):
        fake = Factory.create()
        log = factories.HarvestLogFactory(source_config=source_config)

        source_config.harvester.get_class()._do_fetch.extend(
            [(fake.sentence(), str(i * 50)) for i in range(100)] * 3)

        tasks.harvest(log_id=log.id, ingest=False)

        log.refresh_from_db()

        assert log.completions == 1
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.raw_data.count() == 100
Example #7: test_harvest_fails
    def test_harvest_fails(self, source_config):
        source_config.harvester.get_class()._do_fetch.side_effect = ValueError(
            'In a test')
        log = factories.HarvestLogFactory(source_config=source_config)

        with pytest.raises(ValueError) as e:
            tasks.harvest(log_id=log.id)

        log.refresh_from_db()

        assert e.value.args == ('In a test', )
        assert log.status == HarvestLog.STATUS.failed
        assert log.completions == 0
        assert 'ValueError: In a test' in log.context
Example #8: test_latest_date
    def test_latest_date(self):
        source_config = factories.SourceConfigFactory(
            full_harvest=True,
            earliest_date=pendulum.parse('2017-01-01').date())

        # We have a harvest log with start_date equal to earliest_date
        # but a different source_config
        factories.HarvestLogFactory(
            start_date=pendulum.parse('2017-01-01').date(),
            end_date=pendulum.parse('2017-01-02').date(),
        )

        assert len(
            HarvestScheduler(source_config).all(
                cutoff=pendulum.parse('2018-01-01').date())) == 365
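For reference, the expected count corresponds to one one-day log per day of 2017; the existing log belongs to a different source config, so it does not count as coverage for this one.

    # 365 = days from earliest_date (2017-01-01) up to, but not including, the cutoff.
    from datetime import date
    assert (date(2018, 1, 1) - date(2017, 1, 1)).days == 365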
Example #9: test_failure_cases
    def test_failure_cases(self, source_config_kwargs, task_kwargs,
                           lock_config, exception):
        source_config = factories.SourceConfigFactory(**source_config_kwargs)
        log = factories.HarvestLogFactory(source_config=source_config)

        if lock_config:
            t = SyncedThread(source_config.acquire_lock)
            t.start()

        try:
            with pytest.raises(exception):
                tasks.harvest(log_id=log.id, **task_kwargs)
        finally:
            if lock_config:
                t.join()
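SyncedThread comes from the project's test utilities and is not shown here. From its use above it appears to take the config's harvest lock on a separate thread (and database connection) and hold it until join() is called, so that the test's own tasks.harvest call runs against an already-locked config. A minimal sketch under those assumptions, treating acquire_lock as a context manager:

    # Hypothetical stand-in for the real SyncedThread test helper.
    import threading

    class SyncedThreadSketch(threading.Thread):
        def __init__(self, context):
            super().__init__()
            self._context = context
            self._release = threading.Event()

        def run(self):
            with self._context():       # e.g. take the source config's harvest lock
                self._release.wait()    # hold it until join() is called

        def join(self, timeout=None):
            self._release.set()         # let run() exit, releasing the lock
            super().join(timeout)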
Example #10: test_log_values
    def test_log_values(self, source_config):
        task_id = uuid.uuid4()
        log = factories.HarvestLogFactory(source_config=source_config)

        tasks.harvest.apply((), {'log_id': log.id},
                            task_id=str(task_id),
                            throw=True)

        log.refresh_from_db()

        assert log.task_id == task_id
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.context == ''
        assert log.completions == 1
        assert log.source_config == source_config
        assert log.share_version == settings.VERSION
        assert log.harvester_version == source_config.get_harvester().VERSION
        assert log.source_config_version == source_config.version
Example #11: test_partial_harvest_fails
    def test_partial_harvest_fails(self, source_config, mock_transform):
        log = factories.HarvestLogFactory(source_config=source_config)

        def _do_fetch(*args, **kwargs):
            yield ('doc1', b'doc1data')
            yield ('doc2', b'doc2data')
            yield ('doc3', b'doc3data')
            raise ValueError('In a test')

        source_config.harvester.get_class()._do_fetch = _do_fetch

        with pytest.raises(ValueError) as e:
            tasks.harvest(log_id=log.id)

        log.refresh_from_db()

        assert log.raw_data.count() == 3
        assert e.value.args == ('In a test', )
        assert log.status == HarvestLog.STATUS.failed
        assert log.completions == 0
        assert 'ValueError: In a test' in log.context
        assert mock_transform.apply_async.call_count == 3
Example #12: test_handles_duplicate_values_limit
    def test_handles_duplicate_values_limit(self, monkeypatch, source_config):
        fake = Factory.create()
        log = factories.HarvestLogFactory(source_config=source_config)

        source_config.harvester.get_class()._do_fetch.clear()

        padding = []
        for _ in range(20):
            s = fake.sentence()
            padding.append((s, s * 5))

        for _ in range(10):
            s = fake.sentence()
            source_config.harvester.get_class()._do_fetch.extend([(s, s * 5)] * 5)
            source_config.harvester.get_class()._do_fetch.extend(padding)

        tasks.harvest(log_id=log.id, limit=60, ingest=False)

        log.refresh_from_db()

        assert log.completions == 1
        assert log.status == HarvestLog.STATUS.succeeded
        assert log.raw_data.count() == 30