def import_new_sushi_attempts_task():
    """
    Go over new sushi attempts that contain data and import them
    """
    with cache_based_lock('import_new_sushi_attempts_task', blocking_timeout=10):
        import_new_sushi_attempts()
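# `cache_based_lock` is the common thread of all the tasks in this module. A minimal
# sketch of such a lock, built on Django's low-level cache API, follows for
# orientation; the helper name, its parameters and the polling strategy are
# illustrative assumptions, not the actual implementation used here.
import time
from contextlib import contextmanager

from django.core.cache import cache


@contextmanager
def cache_based_lock_sketch(lock_name, blocking_timeout=None, expire=600):
    """Poll `cache.add` until the lock is acquired or `blocking_timeout` seconds pass;
    `cache.add` only sets the key when it does not exist yet, which makes it usable
    as a mutex on atomic cache backends such as Redis or Memcached."""
    deadline = None if blocking_timeout is None else time.monotonic() + blocking_timeout
    while not cache.add(lock_name, 'locked', expire):
        if deadline is not None and time.monotonic() > deadline:
            raise TimeoutError(f'Could not acquire lock "{lock_name}"')
        time.sleep(0.5)
    try:
        yield
    finally:
        cache.delete(lock_name)  # release so other workers can proceed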
def fetch_report(self, counter_report: CounterReportType, start_date, end_date,
                 fetch_attempt: 'SushiFetchAttempt' = None, use_url_lock=True) \
        -> 'SushiFetchAttempt':
    """
    :param counter_report:
    :param start_date:
    :param end_date:
    :param fetch_attempt: when provided, a new SushiFetchAttempt will not be created,
        but rather the given one updated
    :param use_url_lock: if True, a cache based lock will be used to ensure exclusive
        access to one URL
    :return:
    """
    client = self.create_sushi_client()
    fetch_m = self._fetch_report_v4 if self.counter_version == 4 else self._fetch_report_v5
    if use_url_lock:
        with cache_based_lock(self.url_lock_name):
            attempt_params = fetch_m(client, counter_report, start_date, end_date)
    else:
        attempt_params = fetch_m(client, counter_report, start_date, end_date)
    attempt_params['in_progress'] = False
    if fetch_attempt:
        for key, value in attempt_params.items():
            setattr(fetch_attempt, key, value)
        fetch_attempt.save()
        return fetch_attempt
    else:
        attempt = SushiFetchAttempt.objects.create(**attempt_params)
        return attempt
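# Hypothetical usage of `fetch_report` above, assuming it is defined on
# SushiCredentials and that CounterReportType has a `code` field; the record pks
# and field values here are made up for illustration.
import datetime

credentials = SushiCredentials.objects.get(pk=1)  # assumed existing record
report_type = CounterReportType.objects.get(code='TR')  # assumed report code
attempt = credentials.fetch_report(
    counter_report=report_type,
    start_date=datetime.date(2020, 1, 1),
    end_date=datetime.date(2020, 1, 31),
    use_url_lock=True,  # serialize access to the provider URL across workers
)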
def fetch_new_sushi_data_for_credentials_task(credentials_id: int):
    """
    Fetch sushi data for dates and platforms where they are not available - only for
    specific credentials identified by database pk
    """
    credentials = SushiCredentials.objects.get(pk=credentials_id)
    with cache_based_lock(f'fetch_new_sushi_data_task_{credentials_id}', blocking_timeout=10):
        fetch_new_sushi_data(credentials=credentials)
def process_fetch_units_wrapper(args, **kwargs):
    lock_name, fetch_units, start_date, end_date = args
    logger.debug('Going to lock fetching of %d fetch units with lock %s',
                 len(fetch_units), lock_name)
    with cache_based_lock(lock_name):
        logger.debug('Locked %s', lock_name)
        try:
            process_fetch_units(fetch_units, start_date, end_date, **kwargs)
        except Exception as exc:
            logger.error('Exception: %s', exc)
            logger.error('Traceback: %s', traceback.format_exc())
            raise exc
    logger.debug('Unlocked %s', lock_name)
def recompute_interest_by_batch(queryset=None, verbose=False):
    """
    Using `verbose` reports potential discrepancies between old and recomputed interest
    values. It requires 2 extra queries for each import batch, so it should be used
    with caution.
    """
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        # we share the lock with sync_interest_task because the two could compete for the
        # same data
        if queryset is None:
            queryset = ImportBatch.objects.filter(interest_timestamp__isnull=False)
        # report_type.superseeded_by is needed later on, so we select it here to reduce the
        # query count
        # WARNING: the following messes up the queries when they are more complex and can
        # lead to memory exhaustion - I leave it here as a memento against future attempts
        # queryset = queryset.select_related('report_type__superseeded_by', 'platform').\
        #     annotate(min_date=Min('accesslog__date'), max_date=Max('accesslog__date'))
        interest_rt = interest_report_type()
        total_count = queryset.count()
        logger.info('Going to recompute interest for %d batches', total_count)
        stats = Counter()
        for i, import_batch in enumerate(queryset.iterator()):
            # aggregate() returns None when there are no matching rows, so fall back to 0
            old_sum = (
                import_batch.accesslog_set.filter(report_type=interest_rt).aggregate(
                    sum=Sum('value')
                )['sum'] or 0
                if verbose
                else 0
            )
            stats += sync_interest_for_import_batch(import_batch, interest_rt)
            if i % 100 == 0:
                logger.info(
                    'Recomputed interest for %d out of %d batches, stats: %s',
                    i, total_count, stats,
                )
            if verbose:
                new_sum = (
                    import_batch.accesslog_set.filter(report_type=interest_rt).aggregate(
                        sum=Sum('value')
                    )['sum'] or 0
                )
                if new_sum != old_sum:
                    # guard against division by zero when the new sum is 0
                    ratio = old_sum / new_sum if new_sum else 0
                    logger.warning(
                        'Mismatched interest sum: %d vs %d (%.1f) [%s]',
                        old_sum, new_sum, ratio, import_batch,
                    )
                    stats['mismatch'] += 1
                else:
                    stats['match'] += 1
    return stats
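# Hypothetical invocation of recompute_interest_by_batch above - recompute interest
# for one platform's batches and report discrepancies; the `platform__short_name`
# filter is an assumed field name used only for illustration.
stats = recompute_interest_by_batch(
    queryset=ImportBatch.objects.filter(platform__short_name='some-platform'),
    verbose=True,
)
logger.info('Recompute finished: %s', stats)  # Counter incl. 'match'/'mismatch' keys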
def erms_sync_organizations() -> dict:
    with cache_based_lock('sync_organizations_with_erms'):
        erms = ERMS(base_url=settings.ERMS_API_URL)
        data_source, _created = DataSource.objects.get_or_create(
            short_name='ERMS', type=DataSource.TYPE_API)
        erms_orgs = erms.fetch_objects(ERMS.CLS_ORGANIZATION)
        parent_ids = set()
        internal_ids = set()
        clean_records = []
        # we first go through the records, generate a list of parents and also remove
        # records with duplicated internal_id
        for record in erms_orgs:
            internal_id = record['vals'].get('czechelib id')
            if internal_id:
                internal_id = internal_id[0]
            if internal_id and internal_id in internal_ids:
                logger.warning(f'Duplicate internal ID "{internal_id}" for {record}')
            else:
                clean_records.append(record)
                internal_ids.add(internal_id)
            if record.get('refs'):
                cb = record['refs'].get('controlled by')
                if cb and record['vals'].get('czechelib id'):
                    parent_ids.add(cb[0])
        # then we do another batch of cleaning where we filter out organizations without
        # ICO or czechelib id, but keep parents of those with czechelib id;
        # we also keep organizations which are registered in settings as master organizations
        clean_records = [
            org for org in clean_records
            if (org['vals'].get('ico') and org['vals'].get('czechelib id'))
            or org['id'] in parent_ids
            or (
                org['vals'].get('czechelib id')
                and org['vals'].get('czechelib id')[0] in settings.MASTER_ORGANIZATIONS
            )
        ]
        syncer = OrganizationSyncer(data_source)
        stats = syncer.sync_data(clean_records)
        return stats
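# An illustrative (made up) ERMS organization record in the shape the code above
# expects - 'vals' holds multi-valued attributes, 'refs' holds references to other
# records; this one would pass the filter because it has both an ICO and a
# czechelib id.
sample_record = {
    'id': 123,
    'vals': {
        'ico': ['12345678'],
        'czechelib id': ['NTK-123'],
    },
    'refs': {
        'controlled by': [42],  # id of the parent organization
    },
}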
def recompute_interest_by_batch(queryset=None):
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        # we share the lock with sync_interest_task because the two could compete for the
        # same data
        if queryset is None:
            queryset = ImportBatch.objects.filter(interest_timestamp__isnull=False)
        # report_type.superseeded_by is needed later on, so we select it here to reduce the
        # query count
        # WARNING: the following messes up the queries when they are more complex and can
        # lead to memory exhaustion - I leave it here as a memento against future attempts
        # queryset = queryset.select_related('report_type__superseeded_by', 'platform').\
        #     annotate(min_date=Min('accesslog__date'), max_date=Max('accesslog__date'))
        interest_rt = interest_report_type()
        total_count = queryset.count()
        logger.info('Going to recompute interest for %d batches', total_count)
        stats = Counter()
        for i, import_batch in enumerate(queryset.iterator()):
            stats += sync_interest_for_import_batch(import_batch, interest_rt)
            if i % 20 == 0:
                logger.debug(
                    'Recomputed interest for %d out of %d batches, stats: %s',
                    i, total_count, stats,
                )
    return stats
def sync_interest_task():
    """
    Synchronizes computed interest for import batches that were not processed yet
    """
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        sync_interest_by_import_batches()
def fetch_new_sushi_data_task():
    """
    Fetch sushi data for dates and platforms where they are not available
    """
    with cache_based_lock('fetch_new_sushi_data_task', blocking_timeout=10):
        fetch_new_sushi_data()
def retry_queued_attempts_task():
    """
    Retry downloading data for attempts that were queued
    """
    with cache_based_lock('retry_queued_attempts_task', blocking_timeout=10):
        retry_queued(sleep_interval=5)