async def _run(self):
    connection = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    self.channel = await connection.channel()
    await self.channel.set_qos(prefetch_count=MAX_CONCURRENT_DOWNLOAD)

    await self._amqp_setup()

    # Consume profiling queue
    async for message in self.profile_queue:
        obj = msg2json(message)
        dataset_id = obj['id']
        metadata = obj['metadata']
        materialize = metadata.get('materialize', {})
        logger.info("Processing dataset %r from %r",
                    dataset_id, materialize.get('identifier'))

        # Compare materialization info with stored to know whether cache
        # should be ignored
        try:
            hit = self.es.get('datamart', dataset_id)
        except elasticsearch.NotFoundError:
            cache_invalid = True
        else:
            cache_invalid = materialize != hit['_source']['materialize']

        future = self.loop.run_in_executor(
            None,
            materialize_and_process_dataset,
            dataset_id,
            metadata,
            self.lazo_client,
            self.nominatim,
            self.profile_semaphore,
            cache_invalid,
        )

        future.add_done_callback(
            self.process_dataset_callback(
                message, dataset_id,
            )
        )
async def _run(self):
    connection = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    self.channel = await connection.channel()
    await self.channel.set_qos(prefetch_count=MAX_CONCURRENT_DOWNLOAD)

    await self._amqp_setup()

    # Consume profiling queue
    async for message in self.profile_queue:
        obj = msg2json(message)
        dataset_id = obj['id']
        metadata = obj['metadata']
        materialize = metadata.get('materialize', {})
        logger.info("Processing dataset %r from %r",
                    dataset_id, materialize.get('identifier'))

        future = self.loop.run_in_executor(
            None,
            materialize_and_process_dataset,
            dataset_id,
            metadata,
            LazoDeleteFirst(self.lazo_client, self.es, dataset_id),
            self.nominatim,
            self.geo_data,
            self.profile_semaphore,
        )

        future.add_done_callback(
            self.process_dataset_callback(
                message, dataset_id,
            )
        )
async def coro(future):
    metadata = msg2json(message)['metadata']
    _rie = asyncio.get_event_loop().run_in_executor
    in_thread = lambda func: _rie(None, func)
    try:
        try:
            metadata = future.result()
            if metadata['nb_rows'] == 0:
                logger.info(
                    "Dataset has no rows, not inserting into index: %r",
                    dataset_id,
                )
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        # DO delete from Lazo
                        self.lazo_client,
                    ),
                )
                self.es.index(
                    'pending',
                    dict(
                        status='error',
                        error="Dataset has no rows",
                        metadata=metadata,
                        date=datetime.utcnow().isoformat(),
                        source=metadata['source'],
                        materialize=metadata['materialize'],
                    ),
                    id=dataset_id,
                )
            else:
                # Delete dataset if already exists in index
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        # Don't delete from Lazo, we inserted during profile
                        None,
                    ),
                )

                # Insert results in Elasticsearch
                body = dict(metadata,
                            date=datetime.utcnow().isoformat() + 'Z',
                            version=os.environ['DATAMART_VERSION'])
                await in_thread(
                    lambda: add_dataset_to_index(self.es, dataset_id, body),
                )

                # Publish to RabbitMQ
                msg = dict(
                    id=dataset_id,
                )
                for key in (
                    'name', 'description', 'source', 'date', 'version',
                    'types', 'nb_rows', 'nb_columns', 'materialize',
                ):
                    if key in body:
                        msg[key] = body[key]
                await self.datasets_exchange.publish(
                    json2msg(msg),
                    dataset_id,
                )

                # Remove from alternate index
                try:
                    self.es.delete('pending', dataset_id)
                except elasticsearch.NotFoundError:
                    pass
        except DatasetTooBig as e:
            # Materializer reached size limit
            if not e.limit:
                logger.info("Dataset over size limit: %r", dataset_id)
            elif e.actual:
                logger.info(
                    "Dataset over size limit (%d > %d bytes): %r",
                    e.actual, e.limit, dataset_id,
                )
            else:
                logger.info(
                    "Dataset over size limit (%d bytes): %r",
                    e.limit, dataset_id,
                )
            await message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Dataset is too big",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
            try:
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        self.lazo_client,
                    ),
                )
            except elasticsearch.NotFoundError:
                pass
        except Exception as e:
            if isinstance(e, elasticsearch.RequestError):
                # This is a problem with our computed metadata
                sentry_sdk.capture_exception(e)
                logger.exception(
                    "Error inserting dataset %r in Elasticsearch",
                    dataset_id,
                )
            elif isinstance(e, elasticsearch.TransportError):
                # This is probably an issue with Elasticsearch
                # We'll log, nack and retry
                raise
            else:
                logger.warning("Error processing dataset %r",
                               dataset_id, exc_info=True)
            # Move message to failed queue
            await self.channel.default_exchange.publish(
                aio_pika.Message(message.body),
                self.failed_queue.name,
            )
            # Ack anyway, retrying would probably fail again
            await message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Error profiling dataset",
                    error_details=exception_details(e),
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        else:
            await message.ack()
            logger.info("Dataset %r processed successfully", dataset_id)
    except Exception:
        await message.nack()
        raise
async def coro(future):
    metadata = msg2json(message)['metadata']
    try:
        try:
            metadata = future.result()
            if metadata['nb_rows'] == 0:
                logger.info(
                    "Dataset has no rows, not inserting into index: %r",
                    dataset_id,
                )
                delete_dataset_from_index(
                    self.es,
                    dataset_id,
                    # DO delete from Lazo
                    self.lazo_client,
                )
                self.es.index(
                    'pending',
                    dict(
                        status='error',
                        error="Dataset has no rows",
                        metadata=metadata,
                        date=datetime.utcnow().isoformat(),
                        source=metadata['source'],
                        materialize=metadata['materialize'],
                    ),
                    id=dataset_id,
                )
            else:
                # Delete dataset if already exists in index
                delete_dataset_from_index(
                    self.es,
                    dataset_id,
                    # Don't delete from Lazo, we inserted during profile
                    None,
                )

                # Insert results in Elasticsearch
                body = dict(metadata,
                            date=datetime.utcnow().isoformat() + 'Z',
                            version=os.environ['DATAMART_VERSION'])
                add_dataset_to_index(self.es, dataset_id, body)

                # Publish to RabbitMQ
                await self.datasets_exchange.publish(
                    json2msg(dict(body, id=dataset_id)),
                    dataset_id,
                )

                # Remove from alternate index
                try:
                    self.es.delete('pending', dataset_id)
                except elasticsearch.NotFoundError:
                    pass
        except DatasetTooBig:
            # Materializer reached size limit
            logger.info("Dataset over size limit: %r", dataset_id)
            message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Dataset is too big",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        except Exception as e:
            if isinstance(e, elasticsearch.RequestError):
                # This is a problem with our computed metadata
                logger.exception(
                    "Error inserting dataset %r in Elasticsearch",
                    dataset_id,
                )
            elif isinstance(e, elasticsearch.TransportError):
                # This is probably an issue with Elasticsearch
                # We'll log, nack and retry
                raise
            else:
                logger.exception("Error processing dataset %r", dataset_id)
            # Move message to failed queue
            await self.channel.default_exchange.publish(
                aio_pika.Message(message.body),
                self.failed_queue.name,
            )
            # Ack anyway, retrying would probably fail again
            message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Error profiling dataset",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        else:
            message.ack()
            logger.info("Dataset %r processed successfully", dataset_id)
    except Exception:
        message.nack()
        raise
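
# The msg2json/json2msg helpers used throughout this listing are not shown
# here. The following is a minimal sketch, assuming they are thin JSON
# wrappers around aio_pika message bodies; the project's real
# implementations may differ in details (encoding options, message
# properties, etc.).
import json

import aio_pika


def msg2json(message):
    """Decode an incoming aio_pika message body as JSON (assumed helper)."""
    return json.loads(message.body.decode('utf-8'))


def json2msg(obj, **kwargs):
    """Encode a dict as JSON into an aio_pika Message (assumed helper)."""
    return aio_pika.Message(json.dumps(obj).encode('utf-8'), **kwargs)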