Example #1
    async def _run(self):
        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        await self.channel.set_qos(prefetch_count=MAX_CONCURRENT_DOWNLOAD)

        await self._amqp_setup()

        # Consume profiling queue
        async for message in self.profile_queue:
            obj = msg2json(message)
            dataset_id = obj['id']
            metadata = obj['metadata']
            materialize = metadata.get('materialize', {})

            logger.info("Processing dataset %r from %r",
                        dataset_id, materialize.get('identifier'))

            # Compare materialization info with stored to know whether cache
            # should be ignored
            try:
                hit = self.es.get('datamart', dataset_id)
            except elasticsearch.NotFoundError:
                cache_invalid = True
            else:
                cache_invalid = materialize != hit['_source']['materialize']

            future = self.loop.run_in_executor(
                None,
                materialize_and_process_dataset,
                dataset_id,
                metadata,
                self.lazo_client,
                self.nominatim,
                self.profile_semaphore,
                cache_invalid,
            )

            future.add_done_callback(
                self.process_dataset_callback(
                    message, dataset_id,
                )
            )
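
Both `_run` variants above follow the same pattern: iterate over an AMQP queue, hand the blocking `materialize_and_process_dataset` call to a thread pool with `loop.run_in_executor`, and attach a per-message completion callback. Below is a minimal, self-contained sketch of that pattern; `blocking_profile`, `make_done_callback`, and the plain `asyncio.Queue` are hypothetical stand-ins for the real profiler, `process_dataset_callback`, and the aio_pika queue.

import asyncio
import time

def blocking_profile(dataset_id):
    # Hypothetical stand-in for a blocking call such as materialize_and_process_dataset
    time.sleep(0.1)
    return {'id': dataset_id, 'nb_rows': 42}

def make_done_callback(dataset_id):
    # Like process_dataset_callback: returns a callback bound to one message
    def callback(future):
        try:
            result = future.result()
            print("processed", dataset_id, result)
        except Exception as exc:
            print("failed", dataset_id, exc)
    return callback

async def consume(queue):
    loop = asyncio.get_running_loop()
    while True:
        dataset_id = await queue.get()
        if dataset_id is None:  # sentinel to stop the sketch
            break
        # Run the blocking work in the default thread pool; the event loop stays free
        future = loop.run_in_executor(None, blocking_profile, dataset_id)
        future.add_done_callback(make_done_callback(dataset_id))

async def main():
    queue = asyncio.Queue()
    for i in range(3):
        queue.put_nowait(f"dataset-{i}")
    queue.put_nowait(None)
    await consume(queue)
    # Give the executor tasks a moment to finish before the loop exits
    await asyncio.sleep(0.5)

asyncio.run(main())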
Example #2
    async def _run(self):
        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        await self.channel.set_qos(prefetch_count=MAX_CONCURRENT_DOWNLOAD)

        await self._amqp_setup()

        # Consume profiling queue
        async for message in self.profile_queue:
            obj = msg2json(message)
            dataset_id = obj['id']
            metadata = obj['metadata']
            materialize = metadata.get('materialize', {})

            logger.info("Processing dataset %r from %r",
                        dataset_id, materialize.get('identifier'))

            future = self.loop.run_in_executor(
                None,
                materialize_and_process_dataset,
                dataset_id,
                metadata,
                LazoDeleteFirst(self.lazo_client, self.es, dataset_id),
                self.nominatim,
                self.geo_data,
                self.profile_semaphore,
            )

            future.add_done_callback(
                self.process_dataset_callback(
                    message, dataset_id,
                )
            )
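
One detail both examples rely on: `loop.run_in_executor` forwards only positional arguments to the target function, which is why `dataset_id`, `metadata`, the Lazo client (or its `LazoDeleteFirst` wrapper), and the semaphore are all passed positionally. If keyword arguments were needed, the usual workaround is `functools.partial`; a small sketch with a hypothetical `profile` worker:

import asyncio
import functools

def profile(dataset_id, metadata, *, cache_invalid=False):
    # Hypothetical blocking worker that takes a keyword-only argument
    return dataset_id, metadata, cache_invalid

async def main():
    loop = asyncio.get_running_loop()
    # run_in_executor only accepts positional arguments, so keyword arguments
    # have to be bound up front with functools.partial (or a wrapper object)
    future = loop.run_in_executor(
        None,
        functools.partial(profile, "dataset-1", {"name": "demo"}, cache_invalid=True),
    )
    print(await future)

asyncio.run(main())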
Example #3
        async def coro(future):
            metadata = msg2json(message)['metadata']
            _rie = asyncio.get_event_loop().run_in_executor
            in_thread = lambda func: _rie(None, func)
            try:
                try:
                    metadata = future.result()
                    if metadata['nb_rows'] == 0:
                        logger.info(
                            "Dataset has no rows, not inserting into index: " +
                            "%r",
                            dataset_id,
                        )
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # DO delete from Lazo
                                self.lazo_client,
                            ),
                        )
                        self.es.index(
                            'pending',
                            dict(
                                status='error',
                                error="Dataset has no rows",
                                metadata=metadata,
                                date=datetime.utcnow().isoformat(),
                                source=metadata['source'],
                                materialize=metadata['materialize'],
                            ),
                            id=dataset_id,
                        )
                    else:
                        # Delete dataset if already exists in index
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # Don't delete from Lazo, we inserted during profile
                                None,
                            ),
                        )
                        # Insert results in Elasticsearch
                        body = dict(metadata,
                                    date=datetime.utcnow().isoformat() + 'Z',
                                    version=os.environ['DATAMART_VERSION'])
                        await in_thread(
                            lambda: add_dataset_to_index(self.es, dataset_id, body),
                        )

                        # Publish to RabbitMQ
                        msg = dict(
                            id=dataset_id,
                        )
                        for key in (
                            'name', 'description', 'source', 'date', 'version',
                            'types', 'nb_rows', 'nb_columns', 'materialize',
                        ):
                            if key in body:
                                msg[key] = body[key]
                        await self.datasets_exchange.publish(
                            json2msg(msg),
                            dataset_id,
                        )

                        # Remove from alternate index
                        try:
                            self.es.delete('pending', dataset_id)
                        except elasticsearch.NotFoundError:
                            pass
                except DatasetTooBig as e:
                    # Materializer reached size limit
                    if not e.limit:
                        logger.info("Dataset over size limit: %r", dataset_id)
                    elif e.actual:
                        logger.info(
                            "Dataset over size limit (%d > %d bytes): %r",
                            e.actual, e.limit,
                            dataset_id,
                        )
                    else:
                        logger.info(
                            "Dataset over size limit (%d bytes): %r",
                            e.limit, dataset_id,
                        )
                    await message.ack()
                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Dataset is too big",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                    try:
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                self.lazo_client,
                            ),
                        )
                    except elasticsearch.NotFoundError:
                        pass
                except Exception as e:
                    if isinstance(e, elasticsearch.RequestError):
                        # This is a problem with our computed metadata
                        sentry_sdk.capture_exception(e)
                        logger.exception(
                            "Error inserting dataset %r in Elasticsearch",
                            dataset_id,
                        )
                    elif isinstance(e, elasticsearch.TransportError):
                        # This is probably an issue with Elasticsearch
                        # We'll log, nack and retry
                        raise
                    else:
                        logger.warning("Error processing dataset %r",
                                       dataset_id, exc_info=True)
                    # Move message to failed queue
                    await self.channel.default_exchange.publish(
                        aio_pika.Message(message.body),
                        self.failed_queue.name,
                    )
                    # Ack anyway, retrying would probably fail again
                    await message.ack()

                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Error profiling dataset",
                            error_details=exception_details(e),
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                else:
                    await message.ack()
                    logger.info("Dataset %r processed successfully",
                                dataset_id)
            except Exception:
                await message.nack()
                raise
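
The `in_thread` helper in Example #3 exists because the Elasticsearch client is synchronous: calling it directly inside the coroutine would block the event loop, so each call is wrapped in a zero-argument lambda and pushed to the default executor. A reduced sketch of the same idea, with a hypothetical `blocking_index_call` standing in for the real client calls:

import asyncio
import time

def blocking_index_call(doc_id):
    # Hypothetical stand-in for a blocking client call such as es.index(...)
    time.sleep(0.1)
    return {'result': 'created', '_id': doc_id}

async def handler(doc_id):
    _rie = asyncio.get_running_loop().run_in_executor
    in_thread = lambda func: _rie(None, func)

    # The lambda captures the arguments, so the executor only needs a
    # zero-argument callable; awaiting the returned future yields the result
    result = await in_thread(lambda: blocking_index_call(doc_id))
    print(result)

asyncio.run(handler("dataset-1"))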
Example #4
        async def coro(future):
            metadata = msg2json(message)['metadata']
            try:
                try:
                    metadata = future.result()
                    if metadata['nb_rows'] == 0:
                        logger.info(
                            "Dataset has no rows, not inserting into index: " +
                            "%r",
                            dataset_id,
                        )
                        delete_dataset_from_index(
                            self.es,
                            dataset_id,
                            # DO delete from Lazo
                            self.lazo_client,
                        )
                        self.es.index(
                            'pending',
                            dict(
                                status='error',
                                error="Dataset has no rows",
                                metadata=metadata,
                                date=datetime.utcnow().isoformat(),
                                source=metadata['source'],
                                materialize=metadata['materialize'],
                            ),
                            id=dataset_id,
                        )
                    else:
                        # Delete dataset if already exists in index
                        delete_dataset_from_index(
                            self.es,
                            dataset_id,
                            # Don't delete from Lazo, we inserted during profile
                            None,
                        )
                        # Insert results in Elasticsearch
                        body = dict(metadata,
                                    date=datetime.utcnow().isoformat() + 'Z',
                                    version=os.environ['DATAMART_VERSION'])
                        add_dataset_to_index(self.es, dataset_id, body)

                        # Publish to RabbitMQ
                        await self.datasets_exchange.publish(
                            json2msg(dict(body, id=dataset_id)),
                            dataset_id,
                        )

                        # Remove from alternate index
                        try:
                            self.es.delete('pending', dataset_id)
                        except elasticsearch.NotFoundError:
                            pass
                except DatasetTooBig:
                    # Materializer reached size limit
                    logger.info("Dataset over size limit: %r", dataset_id)
                    message.ack()
                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Dataset is too big",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                except Exception as e:
                    if isinstance(e, elasticsearch.RequestError):
                        # This is a problem with our computed metadata
                        logger.exception(
                            "Error inserting dataset %r in Elasticsearch",
                            dataset_id,
                        )
                    elif isinstance(e, elasticsearch.TransportError):
                        # This is probably an issue with Elasticsearch
                        # We'll log, nack and retry
                        raise
                    else:
                        logger.exception("Error processing dataset %r",
                                         dataset_id)
                    # Move message to failed queue
                    await self.channel.default_exchange.publish(
                        aio_pika.Message(message.body),
                        self.failed_queue.name,
                    )
                    # Ack anyway, retrying would probably fail again
                    message.ack()

                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Error profiling dataset",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                else:
                    message.ack()
                    logger.info("Dataset %r processed successfully",
                                dataset_id)
            except Exception:
                message.nack()
                raise
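
Both callback coroutines share the same three-level error handling: success acks the message; a known, non-retryable failure is recorded (failed queue plus 'pending' error document) and acked anyway, since retrying would fail the same way; and anything re-raised (such as a transport error) falls through to the outer handler, which nacks so the broker redelivers. A stripped-down sketch of that control flow, with a hypothetical `FakeMessage` in place of aio_pika's `IncomingMessage`:

import asyncio

class FakeMessage:
    """Hypothetical stand-in for aio_pika.IncomingMessage."""
    def __init__(self, body):
        self.body = body

    async def ack(self):
        print("ack", self.body)

    async def nack(self):
        print("nack", self.body)

async def record_failure(message):
    # Stand-in for publishing to the failed queue and indexing the error document
    print("moved to failed queue:", message.body)

async def handle(message, work):
    try:
        try:
            result = await work()
        except ValueError:
            # Known, non-retryable failure: record it and ack so the broker
            # does not redeliver a message that would fail the same way again
            await record_failure(message)
            await message.ack()
        else:
            await message.ack()
            print("processed:", result)
    except Exception:
        # Anything unexpected (e.g. a re-raised transport error): nack so the
        # broker redelivers the message later
        await message.nack()
        raise

async def failing_work():
    raise ValueError("bad metadata")

async def main():
    await handle(FakeMessage(b"ok"), lambda: asyncio.sleep(0, result="done"))
    await handle(FakeMessage(b"broken"), failing_work)

asyncio.run(main())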