Example #1
    def __init__(self):
        self.profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILE)
        self.es = elasticsearch.Elasticsearch(
            os.environ['ELASTICSEARCH_HOSTS'].split(','))
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
        if os.environ.get('NOMINATIM_URL'):
            self.nominatim = os.environ['NOMINATIM_URL']
        else:
            self.nominatim = None
            logger.warning("$NOMINATIM_URL is not set, not resolving URLs")
        self.geo_data = GeoData.from_local_cache()
        self.channel = None

        self.geo_data.load_areas([0, 1, 2], bounds=True)

        self.loop = asyncio.get_event_loop()
        log_future(self.loop.create_task(self._run()),
                   logger,
                   should_never_exit=True)

        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                if not self.es.indices.exists('datamart'):
                    raise RuntimeError("'datamart' index does not exist")
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    raise
                else:
                    time.sleep(5)
            else:
                break
Example #2
def check_cache():
    try:
        # Count datasets in cache
        datasets, datasets_bytes = measure_cache_dir('/cache/datasets')
        PROM_CACHE_DATASETS.set(datasets)
        PROM_CACHE_DATASETS_BYTES.set(datasets_bytes)
        logger.info("%d datasets in cache, %d bytes", datasets, datasets_bytes)

        # Count augmentations in cache
        augmentations, augmentations_bytes = measure_cache_dir('/cache/aug')
        PROM_CACHE_AUGMENTATIONS.set(augmentations)
        PROM_CACHE_AUGMENTATIONS_BYTES.set(augmentations_bytes)
        logger.info("%d augmentations in cache, %d bytes", augmentations,
                    augmentations_bytes)

        # Count user datasets in cache
        user_datasets, user_data_bytes = measure_cache_dir('/cache/user_data')
        PROM_CACHE_USER_DATASETS.set(user_datasets)
        PROM_CACHE_USER_DATASETS_BYTES.set(user_data_bytes)
        logger.info("%d user datasets in cache, %d bytes", user_datasets,
                    user_data_bytes)

        # Remove from caches if max is reached
        if datasets_bytes + augmentations_bytes > CACHE_HIGH:
            fut = asyncio.get_event_loop().run_in_executor(
                None,
                clear_caches,
            )
            log_future(fut, logger)
    finally:
        asyncio.get_event_loop().call_later(
            5 * 60,
            check_cache,
        )
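
measure_cache_dir() is not shown in these snippets; Example #6 below does the same counting inline over '*.cache' entries, so a minimal sketch consistent with that logic might look like the following (the helper's exact signature and return values are assumptions):

import os


def measure_cache_dir(cache_dir):
    # Assumed helper: count '*.cache' entries in cache_dir and sum the size
    # of every file under each of them, mirroring the inline loops in
    # Example #6 below
    count = 0
    total_bytes = 0
    for name in os.listdir(cache_dir):
        if not name.endswith('.cache'):
            continue
        count += 1
        path = os.path.join(cache_dir, name)
        if os.path.isdir(path):
            for dirpath, dirnames, filenames in os.walk(path):
                for filename in filenames:
                    total_bytes += os.path.getsize(
                        os.path.join(dirpath, filename))
        else:
            total_bytes += os.path.getsize(path)
    return count, total_bytes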
Example #3
    def on_finish(self):
        # Called by Tornado once the request is done
        super(GracefulHandler, self).on_finish()

        app = self.application

        # Decrement the in-flight request counter and wake up try_exit()
        async def do_decrease():
            async with app.close_condition:
                app.nb_requests -= 1
                app.close_condition.notify_all()

        log_future(asyncio.get_event_loop().create_task(do_decrease()), logger)
Example #4
    def try_exit(self):
        # Wait until all in-flight requests have finished, then stop the loop
        async def do_exit():
            async with self.close_condition:
                while self.nb_requests > 0:
                    logger.info("%d requests in progress, waiting...",
                                self.nb_requests)
                    await self.close_condition.wait()
            logger.warning("Closing gracefully")
            tornado.ioloop.IOLoop.current().stop()

        log_future(asyncio.get_event_loop().create_task(do_exit()), logger)
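
try_exit() only runs if something calls it when the process is asked to stop. A minimal sketch of that wiring, assuming a Tornado application object that exposes try_exit() and the is_closing flag from Example #5 (install_signal_handlers itself is hypothetical, not part of the original code):

import signal

import tornado.ioloop


def install_signal_handlers(app):
    # Hypothetical wiring: mark the application as closing on SIGTERM/SIGINT
    # and ask the IOLoop to run try_exit(); add_callback_from_signal() is the
    # safe way to reach the IOLoop from a signal handler
    def handle_signal(signum, frame):
        app.is_closing = True
        tornado.ioloop.IOLoop.current().add_callback_from_signal(app.try_exit)

    signal.signal(signal.SIGTERM, handle_signal)
    signal.signal(signal.SIGINT, handle_signal)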
Example #5
    def __init__(self, *args, es, redis_client, lazo, **kwargs):
        super(Application, self).__init__(*args, **kwargs)

        self.is_closing = False

        self.elasticsearch = es
        self.redis = redis_client
        self.lazo_client = lazo
        self.nominatim = os.environ['NOMINATIM_URL']
        self.channel = None

        # Start AMQP coroutine
        log_future(asyncio.get_event_loop().create_task(self._amqp()), logger)
Example #6
def check_cache():
    try:
        # Count datasets in cache
        datasets = 0
        datasets_bytes = 0
        for name in os.listdir('/cache/datasets'):
            path = os.path.join('/cache/datasets', name)
            if not name.endswith('.cache'):
                continue
            datasets += 1
            datasets_bytes += get_tree_size(path)
        PROM_CACHE_DATASETS.set(datasets)
        PROM_CACHE_DATASETS_BYTES.set(datasets_bytes)
        logger.info("%d datasets in cache, %d bytes", datasets, datasets_bytes)

        # Count augmentations in cache
        augmentations = 0
        augmentations_bytes = 0
        for name in os.listdir('/cache/aug'):
            path = os.path.join('/cache/aug', name)
            if not name.endswith('.cache'):
                continue
            augmentations += 1
            augmentations_bytes += get_tree_size(path)
        PROM_CACHE_AUGMENTATIONS.set(augmentations)
        PROM_CACHE_AUGMENTATIONS_BYTES.set(augmentations_bytes)
        logger.info("%d augmentations in cache, %d bytes", augmentations,
                    augmentations_bytes)

        # Remove from caches if max is reached
        if datasets_bytes + augmentations_bytes > CACHE_HIGH:
            fut = asyncio.get_event_loop().run_in_executor(
                None,
                clear_caches,
            )
            log_future(fut, logger)
    finally:
        asyncio.get_event_loop().call_later(
            5 * 60,
            check_cache,
        )
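
get_tree_size() is also not part of these snippets; it is presumably a recursive directory-size helper, roughly along these lines (an assumption, not the original implementation):

import os


def get_tree_size(path):
    # Assumed helper: total size in bytes of all files under path
    if not os.path.isdir(path):
        return os.path.getsize(path)
    total = 0
    for entry in os.scandir(path):
        if entry.is_dir(follow_symlinks=False):
            total += get_tree_size(entry.path)
        else:
            total += entry.stat(follow_symlinks=False).st_size
    return total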
Example #7
    async def _amqp(self):
        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        await self.channel.set_qos(prefetch_count=1)

        # Declare profiling exchange (to publish datasets via upload)
        self.profile_exchange = await self.channel.declare_exchange(
            'profile',
            aio_pika.ExchangeType.FANOUT,
        )

        # Start statistics-fetching coroutine
        log_future(
            asyncio.get_event_loop().create_task(self.update_statistics()),
            logger,
            should_never_exit=True,
        )
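
Once _amqp() has declared the fanout exchange, a profiling request can be published through it. A rough sketch of such a publish, with a made-up JSON payload (send_profile_request and its fields are illustrative, not part of the original code):

import json

import aio_pika


async def send_profile_request(app, dataset_id, metadata):
    # Hypothetical publish on the 'profile' exchange declared in _amqp();
    # the routing key is ignored for a FANOUT exchange
    await app.profile_exchange.publish(
        aio_pika.Message(json.dumps(
            {'id': dataset_id, 'metadata': metadata},
        ).encode('utf-8')),
        '',
    )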
Example #8
    def callback(future):
        log_future(self.loop.create_task(coro(future)), logger)
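
log_future() itself is not among these examples. Its call sites (a future or task, a logger, and an optional should_never_exit flag) suggest a small helper roughly like this sketch, which is an assumption rather than the actual implementation:

import asyncio


def log_future(future, logger, message="Exception in background task",
               should_never_exit=False):
    # Log any exception the future raises; if the task was supposed to run
    # forever, treat its completion as fatal and stop the event loop
    def on_done(future):
        try:
            future.result()
        except Exception:
            logger.exception(message)
        if should_never_exit:
            logger.critical("Critical task exited, stopping event loop")
            asyncio.get_event_loop().stop()

    future.add_done_callback(on_done)
    return future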
Example #9
    def __init__(self, es):
        self.elasticsearch = es
        self.recent_discoveries = []

        # Setup the indices from YAML file
        with pkg_resources.resource_stream(
                'coordinator', 'elasticsearch.yml') as stream:
            indices = yaml.safe_load(stream)
        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                for name, index in indices.items():
                    if not es.indices.exists(name):
                        logger.info("Creating index '%r' in Elasticsearch",
                                    name)
                        es.indices.create(
                            name,
                            {'mappings': index['mappings']},
                        )
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    raise
                else:
                    time.sleep(5)
            else:
                break

        # Create cache directories
        os.makedirs('/cache/datasets', exist_ok=True)
        os.makedirs('/cache/aug', exist_ok=True)

        # Load recent datasets from Elasticsearch
        try:
            recent = self.elasticsearch.search(
                index='datamart',
                body={
                    'query': {
                        'match_all': {},
                    },
                    'sort': [
                        {'date': {'order': 'desc'}},
                    ],
                },
                size=15,
            )['hits']['hits']
        except elasticsearch.ElasticsearchException:
            logger.warning("Couldn't get recent datasets from Elasticsearch")
        else:
            for h in recent:
                self.recent_discoveries.append(
                    self.build_discovery(h['_id'], h['_source'])
                )

        # Start AMQP coroutine
        log_future(
            asyncio.get_event_loop().create_task(self._amqp()),
            logger,
            should_never_exit=True,
        )

        # Start statistics coroutine
        self.sources_counts = {}
        self.profiler_versions_counts = {}
        log_future(
            asyncio.get_event_loop().create_task(self.update_statistics()),
            logger,
            should_never_exit=True,
        )
Example #10
    def __init__(self, es):
        self.elasticsearch = es
        self._recent_discoveries = RecentList(NB_RECENT)
        self._recent_uploads = RecentList(NB_RECENT)

        # Create datasets directory
        os.makedirs('/cache/datasets', exist_ok=True)

        # Setup the indices from YAML file
        with pkg_resources.resource_stream('coordinator',
                                           'elasticsearch.yml') as stream:
            indices = yaml.safe_load(stream)
        indices.pop('_refs', None)
        # Add custom fields
        custom_fields = os.environ.get('CUSTOM_FIELDS', None)
        if custom_fields:
            custom_fields = json.loads(custom_fields)
            if custom_fields:
                for field, opts in custom_fields.items():
                    for idx, name in [
                        ('datasets', field),
                        ('columns', 'dataset_' + field),
                        ('spatial_coverage', 'dataset_' + field),
                    ]:
                        indices[idx]['mappings']['properties'][name] = {
                            'type': opts['type'],
                        }
        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                for name, index in indices.items():
                    if not es.index_exists(name):
                        logger.info("Creating index %r in Elasticsearch", name)
                        es.index_create(
                            name,
                            index,
                        )
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    raise
                else:
                    time.sleep(5)
            else:
                break

        # Start AMQP coroutine
        log_future(
            asyncio.get_event_loop().create_task(self._amqp()),
            logger,
            should_never_exit=True,
        )

        # Start statistics coroutine
        self.sources_counts = {}
        self.profiler_versions_counts = {}
        self.error_counts = {}
        log_future(
            asyncio.get_event_loop().create_task(self.update_statistics()),
            logger,
            should_never_exit=True,
        )
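
The CUSTOM_FIELDS handling above expects the environment variable to hold a JSON object mapping each extra field to options that include at least an Elasticsearch 'type'. A hypothetical value illustrating the shape it consumes:

import json
import os

# Illustrative only: one extra keyword field named 'department'
os.environ['CUSTOM_FIELDS'] = json.dumps({
    'department': {'type': 'keyword'},
})
# With this value, the loop above adds a 'department' property to the
# 'datasets' index and a 'dataset_department' property to both the
# 'columns' and 'spatial_coverage' indices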