def __init__(self):
    self.profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILE)
    self.es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    self.lazo_client = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']),
    )
    if os.environ.get('NOMINATIM_URL'):
        self.nominatim = os.environ['NOMINATIM_URL']
    else:
        self.nominatim = None
        logger.warning("$NOMINATIM_URL is not set, not resolving URLs")
    self.geo_data = GeoData.from_local_cache()
    self.channel = None

    self.geo_data.load_areas([0, 1, 2], bounds=True)

    self.loop = asyncio.get_event_loop()
    log_future(self.loop.create_task(self._run()), logger,
               should_never_exit=True)

    # Retry a few times, in case the Elasticsearch container is not yet up
    for i in itertools.count():
        try:
            if not self.es.indices.exists('datamart'):
                raise RuntimeError("'datamart' index does not exist")
        except Exception:
            logger.warning("Can't connect to Elasticsearch, retrying...")
            if i == 5:
                raise
            else:
                time.sleep(5)
        else:
            break
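
# `log_future` is used throughout but never shown here. A minimal sketch of
# what such a helper might look like (the exact name, signature, and behavior
# of the real implementation are assumptions): it logs any exception raised
# by a background task, and terminates the process if a task flagged
# `should_never_exit` ever finishes.
import sys

def log_future(future, logger, message="Exception in future",
               should_never_exit=False):
    def log(future):
        try:
            future.result()  # Re-raises the task's exception, if any
        except Exception:
            logger.exception(message)
        if should_never_exit:
            # This task was expected to run forever; exiting is safer than
            # silently continuing without it
            logger.critical("Critical task exited, terminating process")
            sys.exit(1)

    future.add_done_callback(log)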

def check_cache():
    try:
        # Count datasets in cache
        datasets, datasets_bytes = measure_cache_dir('/cache/datasets')
        PROM_CACHE_DATASETS.set(datasets)
        PROM_CACHE_DATASETS_BYTES.set(datasets_bytes)
        logger.info("%d datasets in cache, %d bytes",
                    datasets, datasets_bytes)

        # Count augmentations in cache
        augmentations, augmentations_bytes = measure_cache_dir('/cache/aug')
        PROM_CACHE_AUGMENTATIONS.set(augmentations)
        PROM_CACHE_AUGMENTATIONS_BYTES.set(augmentations_bytes)
        logger.info("%d augmentations in cache, %d bytes",
                    augmentations, augmentations_bytes)

        # Count user datasets in cache
        user_datasets, user_data_bytes = measure_cache_dir('/cache/user_data')
        PROM_CACHE_USER_DATASETS.set(user_datasets)
        PROM_CACHE_USER_DATASETS_BYTES.set(user_data_bytes)
        logger.info("%d user datasets in cache, %d bytes",
                    user_datasets, user_data_bytes)

        # Remove from caches if max is reached
        if datasets_bytes + augmentations_bytes > CACHE_HIGH:
            fut = asyncio.get_event_loop().run_in_executor(
                None,
                clear_caches,
            )
            log_future(fut, logger)
    finally:
        # Reschedule this check every 5 minutes
        asyncio.get_event_loop().call_later(
            5 * 60,
            check_cache,
        )
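
# `measure_cache_dir` is not shown above. A plausible sketch, assuming the
# same cache layout as the inline variant of check_cache() further below
# (entries ending in '.cache', sizes computed by a `get_tree_size` helper):
import os

def measure_cache_dir(cache_dir):
    """Count the '.cache' entries in a directory and sum their sizes."""
    entries = 0
    total_bytes = 0
    for name in os.listdir(cache_dir):
        if not name.endswith('.cache'):
            continue
        entries += 1
        total_bytes += get_tree_size(os.path.join(cache_dir, name))
    return entries, total_bytes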

def on_finish(self):
    super(GracefulHandler, self).on_finish()
    app = self.application

    async def do_decrease():
        async with app.close_condition:
            app.nb_requests -= 1
            app.close_condition.notify_all()

    log_future(asyncio.get_event_loop().create_task(do_decrease()), logger)
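
# For the decrement in on_finish() to balance out, the handler presumably
# bumps the counter when a request starts. A sketch of the matching
# prepare() (an assumption; the actual handler code is not shown). Since
# Tornado handlers run on a single IOLoop thread, a plain increment is safe:
def prepare(self):
    self.application.nb_requests += 1
    return super(GracefulHandler, self).prepare()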

def try_exit(self):
    async def do_exit():
        async with self.close_condition:
            while self.nb_requests > 0:
                logger.info("%d requests in progress, waiting...",
                            self.nb_requests)
                await self.close_condition.wait()
        logger.warning("Closing gracefully")
        tornado.ioloop.IOLoop.current().stop()

    log_future(asyncio.get_event_loop().create_task(do_exit()), logger)
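
# try_exit() is presumably triggered from a signal handler so the server can
# drain in-flight requests before stopping. A usage sketch (the wiring below
# is an assumption, not shown in the code above):
import signal
import tornado.ioloop

def make_signal_handler(app):
    def handle_signal(signum, frame):
        app.is_closing = True
        # Schedule the graceful shutdown on the IOLoop; a signal handler
        # must not touch the loop directly
        tornado.ioloop.IOLoop.current().add_callback_from_signal(app.try_exit)
    return handle_signal

# signal.signal(signal.SIGTERM, make_signal_handler(application))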

def __init__(self, *args, es, redis_client, lazo, **kwargs):
    super(Application, self).__init__(*args, **kwargs)

    self.is_closing = False

    self.elasticsearch = es
    self.redis = redis_client
    self.lazo_client = lazo
    self.nominatim = os.environ['NOMINATIM_URL']
    self.channel = None

    log_future(asyncio.get_event_loop().create_task(self._amqp()), logger)

def check_cache():
    try:
        # Count datasets in cache
        datasets = 0
        datasets_bytes = 0
        for name in os.listdir('/cache/datasets'):
            path = os.path.join('/cache/datasets', name)
            if not name.endswith('.cache'):
                continue
            datasets += 1
            datasets_bytes += get_tree_size(path)
        PROM_CACHE_DATASETS.set(datasets)
        PROM_CACHE_DATASETS_BYTES.set(datasets_bytes)
        logger.info("%d datasets in cache, %d bytes",
                    datasets, datasets_bytes)

        # Count augmentations in cache
        augmentations = 0
        augmentations_bytes = 0
        for name in os.listdir('/cache/aug'):
            path = os.path.join('/cache/aug', name)
            if not name.endswith('.cache'):
                continue
            augmentations += 1
            augmentations_bytes += get_tree_size(path)
        PROM_CACHE_AUGMENTATIONS.set(augmentations)
        PROM_CACHE_AUGMENTATIONS_BYTES.set(augmentations_bytes)
        logger.info("%d augmentations in cache, %d bytes",
                    augmentations, augmentations_bytes)

        # Remove from caches if max is reached
        if datasets_bytes + augmentations_bytes > CACHE_HIGH:
            fut = asyncio.get_event_loop().run_in_executor(
                None,
                clear_caches,
            )
            log_future(fut, logger)
    finally:
        # Reschedule this check every 5 minutes
        asyncio.get_event_loop().call_later(
            5 * 60,
            check_cache,
        )
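
# `get_tree_size` is referenced but not defined here. A minimal sketch,
# assuming it returns the total size in bytes of a file or directory tree:
import os

def get_tree_size(path):
    """Total size in bytes of a file, or of all files under a directory."""
    if not os.path.isdir(path):
        return os.path.getsize(path)
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for filename in filenames:
            try:
                total += os.path.getsize(os.path.join(dirpath, filename))
            except OSError:
                pass  # File may have been removed concurrently
    return total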

async def _amqp(self):
    connection = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    self.channel = await connection.channel()
    await self.channel.set_qos(prefetch_count=1)

    # Declare profiling exchange (to publish datasets via upload)
    self.profile_exchange = await self.channel.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    # Start statistics-fetching coroutine
    log_future(
        asyncio.get_event_loop().create_task(self.update_statistics()),
        logger,
        should_never_exit=True,
    )
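
# With the 'profile' exchange declared as FANOUT, every bound queue receives
# a copy of each message. A hedged sketch of what publishing an uploaded
# dataset for profiling might look like (the method name and message shape
# are assumptions, not the actual API):
import json
import aio_pika

async def publish_for_profiling(self, dataset_id, metadata):
    await self.profile_exchange.publish(
        aio_pika.Message(
            json.dumps({'id': dataset_id, 'metadata': metadata}).encode(),
        ),
        routing_key='',  # Ignored by fanout exchanges
    )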

def callback(future):
    log_future(self.loop.create_task(coro(future)), logger)

def __init__(self, es):
    self.elasticsearch = es
    self.recent_discoveries = []

    # Set up the indices from the YAML file
    with pkg_resources.resource_stream(
            'coordinator', 'elasticsearch.yml') as stream:
        indices = yaml.safe_load(stream)
    # Retry a few times, in case the Elasticsearch container is not yet up
    for i in itertools.count():
        try:
            for name, index in indices.items():
                if not es.indices.exists(name):
                    logger.info("Creating index %r in Elasticsearch", name)
                    es.indices.create(
                        name,
                        {'mappings': index['mappings']},
                    )
        except Exception:
            logger.warning("Can't connect to Elasticsearch, retrying...")
            if i == 5:
                raise
            else:
                time.sleep(5)
        else:
            break

    # Create cache directories
    os.makedirs('/cache/datasets', exist_ok=True)
    os.makedirs('/cache/aug', exist_ok=True)

    # Load recent datasets from Elasticsearch
    try:
        recent = self.elasticsearch.search(
            index='datamart',
            body={
                'query': {
                    'match_all': {},
                },
                'sort': [
                    {'date': {'order': 'desc'}},
                ],
            },
            size=15,
        )['hits']['hits']
    except elasticsearch.ElasticsearchException:
        logger.warning("Couldn't get recent datasets from Elasticsearch")
    else:
        for h in recent:
            self.recent_discoveries.append(
                self.build_discovery(h['_id'], h['_source'])
            )

    # Start AMQP coroutine
    log_future(
        asyncio.get_event_loop().create_task(self._amqp()),
        logger,
        should_never_exit=True,
    )
    # Start statistics coroutine
    self.sources_counts = {}
    self.profiler_versions_counts = {}
    log_future(
        asyncio.get_event_loop().create_task(self.update_statistics()),
        logger,
        should_never_exit=True,
    )
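
# `build_discovery` is not shown. A plausible sketch, assuming it condenses
# an Elasticsearch hit into the few fields a "recent discoveries" view
# needs (the exact fields are an assumption):
def build_discovery(self, dataset_id, metadata):
    return {
        'id': dataset_id,
        'name': metadata.get('name', dataset_id),
        'discovered': metadata.get('date'),
    }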

def __init__(self, es):
    self.elasticsearch = es
    self._recent_discoveries = RecentList(NB_RECENT)
    self._recent_uploads = RecentList(NB_RECENT)

    # Create datasets directory
    os.makedirs('/cache/datasets', exist_ok=True)

    # Set up the indices from the YAML file
    with pkg_resources.resource_stream(
            'coordinator', 'elasticsearch.yml') as stream:
        indices = yaml.safe_load(stream)
    indices.pop('_refs', None)
    # Add custom fields
    custom_fields = os.environ.get('CUSTOM_FIELDS', None)
    if custom_fields:
        custom_fields = json.loads(custom_fields)
    if custom_fields:
        for field, opts in custom_fields.items():
            for idx, name in [
                ('datasets', field),
                ('columns', 'dataset_' + field),
                ('spatial_coverage', 'dataset_' + field),
            ]:
                indices[idx]['mappings']['properties'][name] = {
                    'type': opts['type'],
                }
    # Retry a few times, in case the Elasticsearch container is not yet up
    for i in itertools.count():
        try:
            for name, index in indices.items():
                if not es.index_exists(name):
                    logger.info("Creating index %r in Elasticsearch", name)
                    es.index_create(
                        name,
                        index,
                    )
        except Exception:
            logger.warning("Can't connect to Elasticsearch, retrying...")
            if i == 5:
                raise
            else:
                time.sleep(5)
        else:
            break

    # Start AMQP coroutine
    log_future(
        asyncio.get_event_loop().create_task(self._amqp()),
        logger,
        should_never_exit=True,
    )
    # Start statistics coroutine
    self.sources_counts = {}
    self.profiler_versions_counts = {}
    self.error_counts = {}
    log_future(
        asyncio.get_event_loop().create_task(self.update_statistics()),
        logger,
        should_never_exit=True,
    )
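
# `RecentList(NB_RECENT)` suggests a bounded, most-recent-first collection.
# A minimal sketch using collections.deque (an assumption about the actual
# class, which is not shown):
import collections

class RecentList(object):
    """Keeps only the most recent items, newest first."""
    def __init__(self, maxlen):
        self._deque = collections.deque(maxlen=maxlen)

    def insert(self, item):
        self._deque.appendleft(item)  # Oldest item falls off the other end

    def __iter__(self):
        return iter(self._deque)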