Example #1
    def _add_consent_response(self, rows):
        """
        Transforms iterable to add user consent from the consent service.

        The consent lookup makes an external API call to return consent.
        For perfromance reasons the consent amount is limited by consent_page_size.
        Due to this limitaion the iterable are sliced into chunks requesting consent for 100 rows
        at a time.
        """
        # Slice iterable into chunks
        row_chunks = slice_iterable_into_chunks(rows, self.consent_page_size)
        for chunk in row_chunks:
            """
            Loop over the chunks and extract the email and item.
            Save the item because the iterator cannot be used twice.
            """
            rows = list(chunk)
            # Perform a consent lookup on the emails via a POST request
            consent_lookups = consent.get_many([
                row['email']
                for row in rows if self._is_valid_email(row['email'])
            ])
            for row in rows:
                # Assign the contact's consent boolean to accepts_dit_email_marketing
                # and yield the modified row.
                row['accepts_dit_email_marketing'] = consent_lookups.get(
                    row['email'], False)
                yield row
Example #2
def sync_app(search_app, batch_size=None, post_batch_callback=None):
    """Syncs objects for an app to ElasticSearch in batches of batch_size."""
    model_name = search_app.es_model.__name__
    batch_size = batch_size or search_app.bulk_batch_size
    logger.info(
        f'Processing {model_name} records, using batch size {batch_size}')

    read_indices, write_index = search_app.es_model.get_read_and_write_indices()

    num_source_rows_processed = 0
    num_objects_synced = 0
    total_rows = search_app.queryset.count()
    it = search_app.queryset.values_list(
        'pk', flat=True).iterator(chunk_size=batch_size)
    batches = slice_iterable_into_chunks(it, batch_size)
    for batch in batches:
        objs = search_app.queryset.filter(pk__in=batch)

        num_actions = sync_objects(
            search_app.es_model,
            objs,
            read_indices,
            write_index,
            post_batch_callback=post_batch_callback,
        )

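        # Emit a progress log entry only when this batch pushes the running
        # total past a multiple of PROGRESS_INTERVAL.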
        emit_progress = (
            (num_source_rows_processed + num_actions) // PROGRESS_INTERVAL -
            num_source_rows_processed // PROGRESS_INTERVAL > 0)

        num_source_rows_processed += len(batch)
        num_objects_synced += num_actions

        if emit_progress:
            logger.info(
                f'{model_name} rows processed: {num_source_rows_processed}/{total_rows} '
                f'{num_source_rows_processed*100//total_rows}%',
            )

    logger.info(
        f'{model_name} rows processed: {num_source_rows_processed}/{total_rows} 100%.'
    )
    if num_source_rows_processed != num_objects_synced:
        logger.warning(
            f'{num_source_rows_processed - num_objects_synced} deleted objects detected while '
            f'syncing model {model_name}',
        )
Example #3
    def run(self, tmp_file_creator, endpoint=None):
        """Runs the synchronisation operation."""
        logger.info('Starting CH load...')
        endpoint = endpoint or settings.COMPANIESHOUSE_DOWNLOAD_URL
        ch_csv_urls = get_ch_latest_dump_file_list(endpoint)
        logger.info('Found the following Companies House CSV URLs: %s',
                    ch_csv_urls)

        for csv_url in ch_csv_urls:
            ch_company_rows = iter_ch_csv_from_url(csv_url, tmp_file_creator)

            batch_iter = slice_iterable_into_chunks(
                ch_company_rows,
                settings.BULK_INSERT_BATCH_SIZE,
            )
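            # Process the batches for this CSV file using a single database cursor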
            with connection.cursor() as cursor:
                for batch in batch_iter:
                    self._process_batch(cursor, batch)

        logger.info('Companies House load complete, %s records loaded',
                    self.count)
Example #4
def sync_app(search_app, batch_size=None, post_batch_callback=None):
    """Syncs objects for an app to ElasticSearch in batches of batch_size."""
    model_name = search_app.es_model.__name__
    batch_size = batch_size or search_app.bulk_batch_size
    logger.info(f'Processing {model_name} records, using batch size {batch_size}')

    read_indices, write_index = search_app.es_model.get_read_and_write_indices()

    rows_processed = 0
    total_rows = search_app.queryset.count()
    it = search_app.queryset.iterator(chunk_size=batch_size)
    batches = slice_iterable_into_chunks(it, batch_size)
    for batch in batches:
        num_actions = sync_objects(
            search_app.es_model,
            batch,
            read_indices,
            write_index,
            post_batch_callback=post_batch_callback,
        )

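        # Emit a progress log entry only when this batch pushes the running
        # total past a multiple of PROGRESS_INTERVAL.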
        emit_progress = (
            (rows_processed + num_actions) // PROGRESS_INTERVAL
            - rows_processed // PROGRESS_INTERVAL
            > 0
        )

        rows_processed += num_actions

        if emit_progress:
            logger.info(
                f'{model_name} rows processed: {rows_processed}/{total_rows} '
                f'{rows_processed*100//total_rows}%',
            )

    logger.info(f'{model_name} rows processed: {rows_processed}/{total_rows} 100%.')
Example #5
def test_slice_iterable_into_chunks():
    """Test slice iterable into chunks."""
    size = 2
    iterable = range(5)
    chunks = list(slice_iterable_into_chunks(iterable, size))
    assert chunks == [[0, 1], [2, 3], [4]]
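
The slice_iterable_into_chunks helper itself does not appear in these examples. A minimal sketch consistent with the test in Example #5, assuming an implementation built on itertools.islice (the real helper may differ), could look like this:

from itertools import islice


def slice_iterable_into_chunks(iterable, chunk_size):
    """Yields successive lists of up to chunk_size items from iterable."""
    iterator = iter(iterable)
    while True:
        chunk = list(islice(iterator, chunk_size))
        if not chunk:
            return
        yield chunk

Yielding lists rather than lazy slices is what lets callers use len(batch) and pk__in=batch directly, as in Example #2.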