Example #1
def ingest(args, argv):
    """
    Usage: {0} ingest <source_configs>... [--superfluous] [--now]
           {0} ingest --suids <suid_ids>... [--now]

    Options:
        -i, --suids         Provide Suid IDs to ingest specifically
        -s, --superfluous   Don't skip RawDatums that already have an IngestJob
        -n, --now           Run ingest tasks synchronously for each IngestJob
    """
    suid_ids = args['<suid_ids>']
    source_configs = args['<source_configs>']
    superfluous = args.get('--superfluous')
    run_now = args['--now']

    qs = SourceUniqueIdentifier.objects.all()
    if suid_ids:
        qs = qs.filter(id__in=suid_ids)
    elif source_configs:
        qs = qs.filter(source_config__label__in=source_configs)
    else:
        raise ValueError('Need suid ids or source configs')

    if not superfluous:
        qs = qs.filter(ingest_jobs=None)

    scheduler = IngestScheduler()
    if run_now:
        for suid in qs:
            print('Ingesting {!r}...'.format(suid))
            scheduler.reingest(suid)
    else:
        jobs = scheduler.bulk_reingest(qs)
        print('Scheduled {} IngestJobs'.format(len(jobs)))
Example #2
def ingest(args, argv):
    """
    Usage: {0} ingest <source_configs>... [--superfluous] [--now]
           {0} ingest --suids <suid_ids>... [--superfluous] [--now]

    Options:
        -i, --suids         Provide Suid IDs to ingest specifically
        -s, --superfluous   Don't skip RawDatums that already have an IngestJob
        -n, --now           Run ingest tasks synchronously for each IngestJob
    """
    suid_ids = args['<suid_ids>']
    source_configs = args['<source_configs>']
    superfluous = args.get('--superfluous')
    run_now = args['--now']

    qs = SourceUniqueIdentifier.objects.all()
    if suid_ids:
        qs = qs.filter(id__in=suid_ids)
    elif source_configs:
        qs = qs.filter(source_config__label__in=source_configs)
    else:
        raise ValueError('Need suid ids or source configs')

    if not superfluous:
        qs = qs.filter(ingest_jobs=None)

    scheduler = IngestScheduler()
    if run_now:
        for suid in qs:
            print('Ingesting {!r}...'.format(suid))
            scheduler.reingest(suid)
    else:
        jobs = scheduler.bulk_reingest(qs)
        print('Scheduled {} IngestJobs'.format(len(jobs)))
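The args dict above follows docopt conventions ('<positional>' and '--flag' keys). A minimal standalone sketch of how such a dict is produced, assuming docopt and a hypothetical 'sharectl' program name (both are assumptions, not confirmed by the examples):

from docopt import docopt

# Hypothetical usage string mirroring the docstring above; 'sharectl'
# stands in for whatever {0} is formatted with.
usage = '''
Usage: sharectl ingest <source_configs>... [--superfluous] [--now]
       sharectl ingest --suids <suid_ids>... [--superfluous] [--now]
'''

args = docopt(usage, argv=['ingest', '--suids', '7', '9', '--now'])
assert args['<suid_ids>'] == ['7', '9']
assert args['--now'] is True
assert args['--superfluous'] is False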
Example #3
    def test_reingest_async(self, mock_ingest):
        raw = factories.RawDatumFactory()
        job = IngestScheduler().reingest_async(raw.suid)
        assert job.claimed
        mock_ingest.delay.assert_called_once_with(
            job_id=job.id,
            exhaust=False,
            superfluous=True,
        )
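Read together with Example #6 below, the assertions above pin down what reingest_async must do. A hypothetical reconstruction, inferred only from these tests and not from the actual SHARE source ('ingest' stands for the Celery task the tests mock):

    # Hypothetical reconstruction -- schedule a claimed, superfluous job,
    # then enqueue the ingest task asynchronously via Celery.
    def reingest_async(self, suid):
        job = self.schedule(suid, superfluous=True, claim=True)
        ingest.delay(job_id=job.id, exhaust=False, superfluous=True)
        return job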
Example #4
    def test_schedule(self, raw_ages, selected_raw, claim, prior_status,
                      superfluous, expected_status):
        suid = factories.SourceUniqueIdentifierFactory()
        raws = [
            factories.RawDatumFactory(
                suid=suid, datestamp=pendulum.now().subtract(days=days_ago))
            for days_ago in raw_ages
        ]
        expected_raw = raws[selected_raw]

        expected_job = None
        if prior_status:
            expected_job = factories.IngestJobFactory(raw=expected_raw,
                                                      status=getattr(
                                                          IngestJob.STATUS,
                                                          prior_status))

        job = IngestScheduler().schedule(suid,
                                         claim=claim,
                                         superfluous=superfluous)

        if expected_job:
            assert job.id == expected_job.id
        assert job.suid_id == suid.id
        assert job.raw_id == expected_raw.id
        assert job.status == getattr(IngestJob.STATUS, expected_status)
        assert job.claimed == claim
Example #5
    def _setup_ingest(self, claim_job):
        assert self.datum and self._config and not (self.raw or self.job
                                                    or self.async_task)

        # TODO get rid of FetchResult, or make it more sensical
        from share.harvest.base import FetchResult
        fetch_result = FetchResult(self.datum_id, self.datum, self.datestamp)
        self.raw = RawDatum.objects.store_data(self._config, fetch_result)
        self.job = IngestScheduler().schedule(self.raw.suid,
                                              self.raw.id,
                                              claim=claim_job)
Example #6
    def test_bulk_reingest(self, mock_ingest):
        with mock.patch('share.ingest.scheduler.IngestScheduler.bulk_schedule'
                        ) as mock_bulk_schedule:
            jobs = [factories.IngestJobFactory() for _ in range(10)]
            mock_bulk_schedule.return_value = jobs
            actual_jobs = IngestScheduler().bulk_reingest(
                mock.sentinel.suid_qs)

            mock_bulk_schedule.assert_called_once_with(mock.sentinel.suid_qs,
                                                       superfluous=True,
                                                       claim=True)

            assert actual_jobs is jobs
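            # a mock call compares equal to a 1-tuple containing only the
            # kwargs dict when the call was made with keyword arguments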
            assert mock_ingest.delay.call_args_list == [({
                'job_id': j.id,
                'exhaust': False,
                'superfluous': True,
            }, ) for j in actual_jobs]
Example #7
    def test_bulk_schedule(self, claim, superfluous):
        suid_specs = [
            # raw_ages, expected_raw, job_status
            ([0, 1, 2], 0, 'created'),
            ([5, 4, 2, 3], 2, 'failed'),
            ([2, 1], 1, 'succeeded'),
            ([4, 2], 1, None),
        ]
        suids = set()
        expected_jobs = set()
        for raw_ages, selected_raw, job_status in suid_specs:
            suid = factories.SourceUniqueIdentifierFactory()
            raws = [
                factories.RawDatumFactory(
                    suid=suid,
                    datestamp=pendulum.now().subtract(days=days_ago))
                for days_ago in raw_ages
            ]
            if job_status:
                job = factories.IngestJobFactory(raw=raws[selected_raw],
                                                 status=getattr(
                                                     IngestJob.STATUS,
                                                     job_status))
                expected_jobs.add(job)
            suids.add(suid)

        actual_jobs = IngestScheduler().bulk_schedule(
            SourceUniqueIdentifier.objects.all(),
            claim=claim,
            superfluous=superfluous,
        )

        assert len(actual_jobs) == len(suids)
        assert expected_jobs.issubset(actual_jobs)
        for job in actual_jobs:
            assert bool(job.claimed) == claim
            if superfluous:
                assert job.status == IngestJob.STATUS.created
Example #8
def rawdata_janitor(self, limit=500):
    """Find RawDatum that have neither NormalizedData nor IngestJob and schedule them for ingestion
    """
    count = 0

    # NOTE: Do NOT use .iterator here. It will create a temporary table and eat disk space like no other
    # the limit lets this query fit nicely in memory and actually finish executing
    # Be very careful about changing this query. If you do change it, make sure the EXPLAIN looks something like this:
    # Limit  (cost=1.13..Much Smaller Numbers)
    #   ->  Nested Loop  (cost=1.13..Big Numbers)
    #         Join Filter: (share_sourceuniqueidentifier.source_config_id = share_sourceconfig.id)
    #         ->  Nested Loop  (cost=1.13..Big Numbers)
    #               ->  Nested Loop Anti Join  (cost=0.56..Big Numbers)
    #                     ->  Seq Scan on share_rawdatum  (cost=0.00..Big Numbers)
    #                     ->  Index Only Scan using share_normalizeddata_c0e72696 on share_normalizeddata  (cost=0.56..Small Numbers)
    #                           Index Cond: (raw_id = share_rawdatum.id)

    qs = RawDatum.objects.select_related('suid__source_config').annotate(
        has_normalizedata=Exists(
            NormalizedData.objects.values('id').filter(raw=OuterRef('id'))),
        has_ingestjob=Exists(
            IngestJob.objects.values('id').filter(raw=OuterRef('id'))),
    ).exclude(no_output=True).filter(
        has_normalizedata=False,
        has_ingestjob=False,
        suid__source_config__disabled=False,
        suid__source_config__source__is_deleted=False,
    )

    for rd in qs[:limit]:
        count += 1
        logger.debug('Found unprocessed %r from %r', rd, rd.suid.source_config)
        job = IngestScheduler().schedule(rd.suid, rd.id)
        logger.info('Created job %s for %s', job, rd)
    if count:
        logger.warning('Found %d total unprocessed RawData', count)
    return count
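Before changing the query, the plan described in the comments can be checked directly: Django QuerySets support .explain() (Django 2.1+). A sketch, reusing the qs and limit from the function above:

    # Expect a Nested Loop Anti Join, not a hash join over a temp table.
    print(qs[:limit].explain())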
Example #9
    def reingest(self, request, queryset):
        IngestScheduler().bulk_reingest(queryset)
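The (self, request, queryset) signature marks this as a Django admin action. A minimal sketch of how such an action is wired up; the ModelAdmin name and registration are hypothetical, not taken from SHARE:

from django.contrib import admin

class SourceUniqueIdentifierAdmin(admin.ModelAdmin):
    actions = ['reingest']  # shows the action in the admin changelist dropdown

    def reingest(self, request, queryset):
        IngestScheduler().bulk_reingest(queryset)
    reingest.short_description = 'Re-ingest selected suids'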