Example #1
def import_new_sushi_attempts_task():
    """
    Go over new sushi attempts that contain data and import them
    """
    with cache_based_lock('import_new_sushi_attempts_task',
                          blocking_timeout=10):
        import_new_sushi_attempts()
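
All of the snippets here use cache_based_lock to make sure only one worker runs a given job at a time. Its implementation is not shown in these examples; the following is only a minimal sketch of how such a context manager could be built on Django's low-level cache API (cache.add is atomic on backends such as Redis or Memcached). The expire parameter and the polling loop are assumptions for illustration, not the project's actual code.

# Illustrative sketch only - NOT the project's actual implementation of cache_based_lock.
# It assumes Django's cache framework; the `expire` parameter and the 0.5 s polling
# interval are invented here for the sake of the example.
import time
from contextlib import contextmanager

from django.core.cache import cache


@contextmanager
def cache_based_lock(lock_name, blocking_timeout=None, expire=600):
    """
    Serialize access across processes by atomically adding a cache key.

    :param lock_name: cache key identifying the lock
    :param blocking_timeout: seconds to wait for the lock before giving up; None waits forever
    :param expire: cache timeout of the key, so a crashed worker cannot hold the lock forever
    """
    start = time.monotonic()
    # cache.add succeeds only when the key is not present yet, which makes it usable
    # as an atomic test-and-set primitive
    while not cache.add(lock_name, 'locked', expire):
        if blocking_timeout is not None and time.monotonic() - start > blocking_timeout:
            raise TimeoutError(f'Could not acquire lock "{lock_name}" in {blocking_timeout} s')
        time.sleep(0.5)
    try:
        yield
    finally:
        cache.delete(lock_name)

If the project talks to Redis directly, redis-py's built-in Lock (whose constructor also accepts a blocking_timeout argument) would be a natural alternative to hand-rolling a loop like this.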
Example #2
def fetch_report(self, counter_report: CounterReportType, start_date, end_date,
                 fetch_attempt: 'SushiFetchAttempt' = None, use_url_lock=True) -> \
        'SushiFetchAttempt':
    """
    :param counter_report: counter report type to fetch data for
    :param start_date: start of the requested period
    :param end_date: end of the requested period
    :param fetch_attempt: when provided, no new SushiFetchAttempt will be created; instead,
                          the given one is updated
    :param use_url_lock: if True, a cache based lock will be used to ensure exclusive access
                         to one URL
    :return: the created or updated SushiFetchAttempt
    """
    client = self.create_sushi_client()
    fetch_m = self._fetch_report_v4 if self.counter_version == 4 else self._fetch_report_v5
    if use_url_lock:
        with cache_based_lock(self.url_lock_name):
            attempt_params = fetch_m(client, counter_report, start_date, end_date)
    else:
        attempt_params = fetch_m(client, counter_report, start_date, end_date)
    attempt_params['in_progress'] = False
    if fetch_attempt:
        for key, value in attempt_params.items():
            setattr(fetch_attempt, key, value)
        fetch_attempt.save()
        return fetch_attempt
    else:
        attempt = SushiFetchAttempt.objects.create(**attempt_params)
        return attempt
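
A hypothetical invocation of fetch_report is sketched below. It assumes fetch_report is defined on SushiCredentials (which the use of self.counter_version and self.create_sushi_client() above suggests) and that the period is given as date objects; the names credentials and cr_type are invented for illustration and are not part of the example.

# Hypothetical usage - `credentials` and `cr_type` are assumed names, not part of the example.
import datetime

attempt = credentials.fetch_report(
    counter_report=cr_type,
    start_date=datetime.date(2020, 1, 1),
    end_date=datetime.date(2020, 1, 31),
    use_url_lock=True,  # serialize downloads hitting the same SUSHI URL across workers
)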
Example #3
def fetch_new_sushi_data_for_credentials_task(credentials_id: int):
    """
    Fetch sushi data for dates and platforms where they are not available - only for specific
    credentials identified by database pk
    """
    credentials = SushiCredentials.objects.get(pk=credentials_id)
    with cache_based_lock(f'fetch_new_sushi_data_task_{credentials_id}',
                          blocking_timeout=10):
        fetch_new_sushi_data(credentials=credentials)
Example #4
def process_fetch_units_wrapper(args, **kwargs):
    lock_name, fetch_units, start_date, end_date = args
    logger.debug('Going to lock fetching of %d fetch units with lock %s',
                 len(fetch_units), lock_name)
    with cache_based_lock(lock_name):
        logger.debug('Locked %s', lock_name)
        try:
            process_fetch_units(fetch_units, start_date, end_date, **kwargs)
        except Exception as exc:
            logger.error('Exception: %s', exc)
            logger.error('Traceback: %s', traceback.format_exc())
            raise exc
        logger.debug('Unlocked %s', lock_name)
Example #5
def recompute_interest_by_batch(queryset=None, verbose=False):
    """
    Using `verbose` reports potential discrepancies between old and recomputed interest values.
    It requires 2 extra queries for each import batch, so it should be used with caution.
    """
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        # we share the lock with sync_interest_task because the two could compete for the
        # same data
        if queryset is None:
            queryset = ImportBatch.objects.filter(interest_timestamp__isnull=False)
        # report_type.superseeded_by is needed later on, so we select it here to reduce the
        # query count
        # WARNING: the following messes up the queries when they are more complex and can
        #          lead to memory exhaustion - I leave it here as a memento against future attempts
        # queryset = queryset.select_related('report_type__superseeded_by', 'platform').\
        #     annotate(min_date=Min('accesslog__date'), max_date=Max('accesslog__date'))
        interest_rt = interest_report_type()
        total_count = queryset.count()
        logger.info('Going to recompute interest for %d batches', total_count)
        stats = Counter()
        for i, import_batch in enumerate(queryset.iterator()):
            old_sum = (
                import_batch.accesslog_set.filter(report_type=interest_rt).aggregate(
                    sum=Sum('value')
                )['sum']
                if verbose
                else 0
            )
            stats += sync_interest_for_import_batch(import_batch, interest_rt)
            if i % 100 == 0:
                logger.info(
                    'Recomputed interest for %d out of %d batches, stats: %s', i, total_count, stats
                )
            if verbose:
                new_sum = import_batch.accesslog_set.filter(report_type=interest_rt).aggregate(
                    sum=Sum('value')
                )['sum']
                if new_sum != old_sum:
                    logger.warning(
                        'Mismatched interest sum: %d vs %d (%.1f) [%s]',
                        old_sum,
                        new_sum,
                        old_sum / new_sum,
                        import_batch,
                    )
                    stats['mismatch'] += 1
                else:
                    stats['match'] += 1
        return stats
Example #6
def erms_sync_organizations() -> dict:
    with cache_based_lock('sync_organizations_with_erms'):
        erms = ERMS(base_url=settings.ERMS_API_URL)
        data_source, _created = DataSource.objects.get_or_create(
            short_name='ERMS', type=DataSource.TYPE_API)
        erms_orgs = erms.fetch_objects(ERMS.CLS_ORGANIZATION)
        parent_ids = set()
        internal_ids = set()
        clean_records = []
        # we first go through the records, generate a list of parents and also remove
        # records with a duplicated internal_id
        for record in erms_orgs:
            internal_id = record['vals'].get('czechelib id')
            if internal_id:
                internal_id = internal_id[0]
            if internal_id and internal_id in internal_ids:
                logger.warning(
                    f'Duplicate internal ID "{internal_id}" for {record}')
            else:
                clean_records.append(record)
            internal_ids.add(internal_id)
            if record.get('refs'):
                cb = record['refs'].get('controlled by')
                if cb and record['vals'].get('czechelib id'):
                    parent_ids.add(cb[0])
        # then we do another batch of cleaning where we
        # filter out organizations without an ICO or czechelib id, but keep parents
        # of those with a czechelib id
        # we also keep organizations that are registered in settings as master organizations
        clean_records = [
            org for org in clean_records
            if (org['vals'].get('ico') and org['vals'].get('czechelib id'))
            or org['id'] in parent_ids or (
                org['vals'].get('czechelib id') and org['vals'].get(
                    'czechelib id')[0] in settings.MASTER_ORGANIZATIONS)
        ]
        syncer = OrganizationSyncer(data_source)
        stats = syncer.sync_data(clean_records)
        return stats
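
To make the cleaning logic above easier to follow, here is a hypothetical shape of a single ERMS organization record, inferred purely from the keys the loop reads (id, vals with 'czechelib id' and 'ico', refs with 'controlled by'); all values are made up.

# Hypothetical ERMS organization record - the keys mirror what the cleaning loop reads,
# but every value is invented for illustration.
record = {
    'id': 101,
    'vals': {
        'czechelib id': ['ORG-ABC'],  # list of values; the first one is used as internal_id
        'ico': ['00000001'],
    },
    'refs': {
        'controlled by': [42],        # id of the parent organization
    },
}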
Example #7
def recompute_interest_by_batch(queryset=None):
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        # we share the lock with sync_interest_task because the two could compete for the
        # same data
        if queryset is None:
            queryset = ImportBatch.objects.filter(
                interest_timestamp__isnull=False)
        # report_type.superseeded_by is needed later on, so we select it here to reduce the
        # query count
        # WARNING: the following messes up the queries when they are more complex and can
        #          lead to memory exhaustion - I leave it here as a memento against future attempts
        # queryset = queryset.select_related('report_type__superseeded_by', 'platform').\
        #     annotate(min_date=Min('accesslog__date'), max_date=Max('accesslog__date'))
        interest_rt = interest_report_type()
        total_count = queryset.count()
        logger.info('Going to recompute interest for %d batches', total_count)
        stats = Counter()
        for i, import_batch in enumerate(queryset.iterator()):
            stats += sync_interest_for_import_batch(import_batch, interest_rt)
            if i % 20 == 0:
                logger.debug(
                    'Recomputed interest for %d out of %d batches, stats: %s',
                    i, total_count, stats)
        return stats
Example #8
def sync_interest_task():
    """
    Synchronizes computed interest for import batches that were not processed yet
    """
    with cache_based_lock('sync_interest_task', blocking_timeout=10):
        sync_interest_by_import_batches()
Example #9
def fetch_new_sushi_data_task():
    """
    Fetch sushi data for dates and platforms where they are not available
    """
    with cache_based_lock('fetch_new_sushi_data_task', blocking_timeout=10):
        fetch_new_sushi_data()
Example #10
def retry_queued_attempts_task():
    """
    Retry downloading data for attempts that were queued
    """
    with cache_based_lock('retry_queued_attempts_task', blocking_timeout=10):
        retry_queued(sleep_interval=5)