def ingest(args, argv):
    """
    Usage: {0} ingest <source_configs>... [--superfluous] [--now]
           {0} ingest --suids <suid_ids>... [--superfluous] [--now]

    Options:
        -i, --suids         Provide Suid IDs to ingest specifically
        -s, --superfluous   Don't skip RawDatums that already have an IngestJob
        -n, --now           Run ingest tasks synchronously for each IngestJob
    """
    suid_ids = args['<suid_ids>']
    source_configs = args['<source_configs>']
    superfluous = args.get('--superfluous')
    run_now = args['--now']

    qs = SourceUniqueIdentifier.objects.all()
    if suid_ids:
        qs = qs.filter(id__in=suid_ids)
    elif source_configs:
        qs = qs.filter(source_config__label__in=source_configs)
    else:
        raise ValueError('Need raw ids, suid ids, or source configs')

    if not superfluous:
        qs = qs.filter(ingest_jobs=None)

    scheduler = IngestScheduler()
    if run_now:
        for suid in qs:
            print('Ingesting {!r}...'.format(suid))
            scheduler.reingest(suid)
    else:
        jobs = scheduler.bulk_reingest(qs)
        print('Scheduled {} IngestJobs'.format(len(jobs)))

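# A minimal sketch of invoking this command, assuming it is exposed through the
# project's docopt-based CLI entry point (the `{0}` placeholder in the docstring);
# the entry-point name, source label, and suid ids below are hypothetical:
#
#     sharectl ingest my_source_label --superfluous
#     sharectl ingest --suids 123 456 --now
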
def test_reingest_async(self, mock_ingest):
    raw = factories.RawDatumFactory()
    job = IngestScheduler().reingest_async(raw.suid)
    assert job.claimed
    mock_ingest.delay.assert_called_once_with(job_id=job.id, exhaust=False, superfluous=True)

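# Note: the patch supplying `mock_ingest` is not shown in this excerpt;
# presumably the test (or its class) is decorated with something like
# `@mock.patch('share.tasks.ingest')`, so that `mock_ingest.delay` captures the
# task dispatch. The exact patch target here is an assumption.
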
def test_schedule(self, raw_ages, selected_raw, claim, prior_status, superfluous, expected_status):
    suid = factories.SourceUniqueIdentifierFactory()
    raws = [
        factories.RawDatumFactory(
            suid=suid,
            datestamp=pendulum.now().subtract(days=days_ago),
        )
        for days_ago in raw_ages
    ]
    expected_raw = raws[selected_raw]
    expected_job = None
    if prior_status:
        expected_job = factories.IngestJobFactory(
            raw=expected_raw,
            status=getattr(IngestJob.STATUS, prior_status),
        )

    job = IngestScheduler().schedule(suid, claim=claim, superfluous=superfluous)

    if expected_job:
        assert job.id == expected_job.id
    assert job.suid_id == suid.id
    assert job.raw_id == expected_raw.id
    assert job.status == getattr(IngestJob.STATUS, expected_status)
    assert job.claimed == claim

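# The arguments to test_schedule are supplied by a parametrization not shown in
# this excerpt; a hypothetical sketch (cases invented purely for illustration):
#
#     @pytest.mark.parametrize('raw_ages, selected_raw, claim, prior_status, superfluous, expected_status', [
#         ([0, 1, 2], 0, True, None, False, 'created'),
#         ([2, 1], 1, False, 'failed', True, 'created'),
#     ])
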
def _setup_ingest(self, claim_job):
    assert self.datum and self._config and not (self.raw or self.job or self.async_task)

    # TODO get rid of FetchResult, or make it more sensical
    from share.harvest.base import FetchResult
    fetch_result = FetchResult(self.datum_id, self.datum, self.datestamp)
    self.raw = RawDatum.objects.store_data(self._config, fetch_result)
    self.job = IngestScheduler().schedule(self.raw.suid, self.raw.id, claim=claim_job)

def test_bulk_reingest(self, mock_ingest):
    with mock.patch('share.ingest.scheduler.IngestScheduler.bulk_schedule') as mock_bulk_schedule:
        jobs = [factories.IngestJobFactory() for _ in range(10)]
        mock_bulk_schedule.return_value = jobs

        actual_jobs = IngestScheduler().bulk_reingest(mock.sentinel.suid_qs)

        mock_bulk_schedule.assert_called_once_with(mock.sentinel.suid_qs, superfluous=True, claim=True)
        assert actual_jobs is jobs
        assert mock_ingest.delay.call_args_list == [
            ({'job_id': j.id, 'exhaust': False, 'superfluous': True},)
            for j in actual_jobs
        ]

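# mock call objects compare equal to plain tuples, and a 1-tuple containing a
# dict is interpreted as keyword arguments; each `({'job_id': ...},)` above
# therefore matches a call like `mock_ingest.delay(job_id=..., exhaust=False, superfluous=True)`.
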
def test_bulk_schedule(self, claim, superfluous):
    suid_specs = [
        # raw_ages, selected_raw, job_status
        ([0, 1, 2], 0, 'created'),
        ([5, 4, 2, 3], 2, 'failed'),
        ([2, 1], 1, 'succeeded'),
        ([4, 2], 1, None),
    ]
    suids = set()
    expected_jobs = set()
    for raw_ages, selected_raw, job_status in suid_specs:
        suid = factories.SourceUniqueIdentifierFactory()
        raws = [
            factories.RawDatumFactory(
                suid=suid,
                datestamp=pendulum.now().subtract(days=days_ago),
            )
            for days_ago in raw_ages
        ]
        if job_status:
            job = factories.IngestJobFactory(
                raw=raws[selected_raw],
                status=getattr(IngestJob.STATUS, job_status),
            )
            expected_jobs.add(job)
        suids.add(suid)

    actual_jobs = IngestScheduler().bulk_schedule(
        SourceUniqueIdentifier.objects.all(),
        claim=claim,
        superfluous=superfluous,
    )

    assert len(actual_jobs) == len(suids)
    assert expected_jobs.issubset(actual_jobs)
    for job in actual_jobs:
        assert bool(job.claimed) == claim
        if superfluous:
            assert job.status == IngestJob.STATUS.created

def rawdata_janitor(self, limit=500):
    """Find RawDatums that have neither a NormalizedData nor an IngestJob, and schedule them for ingestion"""
    count = 0

    # NOTE: Do NOT use .iterator here. It will create a temporary table and eat disk space like no other.
    # The limit lets this query fit nicely in memory and actually finish executing.
    # Be very careful about changing this query. If you do change it, make sure the EXPLAIN looks something like this:
    # Limit  (cost=1.13..Much Smaller Numbers)
    #   ->  Nested Loop  (cost=1.13..Big Numbers)
    #         Join Filter: (share_sourceuniqueidentifier.source_config_id = share_sourceconfig.id)
    #         ->  Nested Loop  (cost=1.13..Big Numbers)
    #               ->  Nested Loop Anti Join  (cost=0.56..Big Numbers)
    #                     ->  Seq Scan on share_rawdatum  (cost=0.00..Big Numbers)
    #                     ->  Index Only Scan using share_normalizeddata_c0e72696 on share_normalizeddata  (cost=0.56..Small Numbers)
    #                           Index Cond: (raw_id = share_rawdatum.id)
    qs = RawDatum.objects.select_related('suid__source_config').annotate(
        has_normalizeddata=Exists(NormalizedData.objects.values('id').filter(raw=OuterRef('id'))),
        has_ingestjob=Exists(IngestJob.objects.values('id').filter(raw=OuterRef('id'))),
    ).exclude(no_output=True).filter(
        has_normalizeddata=False,
        has_ingestjob=False,
        suid__source_config__disabled=False,
        suid__source_config__source__is_deleted=False,
    )

    for rd in qs[:limit]:
        count += 1
        logger.debug('Found unprocessed %r from %r', rd, rd.suid.source_config)
        job = IngestScheduler().schedule(rd.suid, rd.id)
        logger.info('Created job %s for %s', job, rd)
    if count:
        logger.warning('Found %d total unprocessed RawData', count)
    return count

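# One way to double-check that the query plan still has the shape described in
# the comments above (assuming Django >= 2.1, where QuerySet.explain() was added):
#
#     print(qs[:limit].explain())
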
def reingest(self, request, queryset):
    IngestScheduler().bulk_reingest(queryset)
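
# For this action to appear in the Django admin, it must be registered on the
# ModelAdmin's actions. A minimal sketch, assuming the admin is for
# SourceUniqueIdentifier (since bulk_reingest takes a suid queryset):
#
#     class SourceUniqueIdentifierAdmin(admin.ModelAdmin):
#         actions = ['reingest']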