Example #1
0
def make_app(debug=False):
    """Build the Tornado Application with its backing service clients.

    Creates Elasticsearch, Redis, and Lazo index clients from environment
    variables, then registers every URL route the service exposes.
    """
    # Service clients, configured entirely from the environment
    search_engine = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    cache = redis.Redis(host=os.environ['REDIS_HOST'])
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']),
    )

    routes = [
        URLSpec('/profile', Profile, name='profile'),
        URLSpec('/search', Search, name='search'),
        URLSpec('/download/([^/]+)', DownloadId, name='download_id'),
        URLSpec('/download', Download, name='download'),
        URLSpec('/metadata/([^/]+)', Metadata, name='metadata'),
        URLSpec('/augment', Augment, name='augment'),
        URLSpec('/upload', Upload, name='upload'),
        URLSpec('/statistics', Statistics, name='statistics'),
        URLSpec('/version', Version, name='version'),
        URLSpec('/health', Health, name='health'),
    ]
    return Application(
        routes,
        debug=debug,
        es=search_engine,
        redis_client=cache,
        lazo=lazo,
    )
Example #2
0
    async def _run(self):
        """Connect to backing services and start the background loops.

        Builds Elasticsearch and Lazo clients, opens the AMQP channel,
        performs queue/exchange setup, then schedules the main loop (and
        the query consumer, when the subclass defines ``handle_query``)
        as long-running tasks.
        """
        self.elasticsearch = elasticsearch.Elasticsearch(
            os.environ['ELASTICSEARCH_HOSTS'].split(','))
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))

        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        # Deliver at most one unacknowledged message at a time
        await self.channel.set_qos(prefetch_count=1)

        await self._amqp_setup()

        # Start profiling process
        log_future(self._call(self.main_loop), logger)

        # Only subclasses that answer on-demand queries define handle_query
        if hasattr(self, 'handle_query'):
            log_future(self.loop.create_task(self._consume_queries()),
                       logger,
                       should_never_exit=True)
Example #3
0
def clear(source):
    """Delete from the index every dataset originating from *source*.

    A dataset matches when either its 'materialize.identifier' or its
    'source' field equals the given identifier.
    """
    es = PrefixedElasticsearch()
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    # Either field may carry the identifier; a single match suffices
    query = {
        'query': {
            'bool': {
                'should': [
                    {'term': {'materialize.identifier': source}},
                    {'term': {'source': source}},
                ],
                'minimum_should_match': 1,
            },
        },
    }
    hits = es.scan(
        index='datasets,pending',
        query=query,
        _source=False,
        size=SIZE,
    )
    for hit in hits:
        delete_dataset_from_index(es, hit['_id'], lazo)
Example #4
0
def delete(datasets):
    """Remove each dataset id in *datasets* from the index and Lazo."""
    es = PrefixedElasticsearch()
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    for dataset_id in datasets:
        delete_dataset_from_index(es, dataset_id, lazo)
Example #5
0
    def __init__(self):
        """Set up service clients, geo data, and the async run loop.

        Schedules ``self._run()`` on the event loop, then blocks until
        Elasticsearch is reachable and the 'datamart' index exists.
        """
        # Bounds how many datasets are profiled concurrently
        self.profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILE)
        self.es = elasticsearch.Elasticsearch(
            os.environ['ELASTICSEARCH_HOSTS'].split(','))
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
        if os.environ.get('NOMINATIM_URL'):
            self.nominatim = os.environ['NOMINATIM_URL']
        else:
            # Geocoding is optional; warn so the omission shows up in logs
            self.nominatim = None
            logger.warning("$NOMINATIM_URL is not set, not resolving URLs")
        self.geo_data = GeoData.from_local_cache()
        # AMQP channel is opened later by the _run() task
        self.channel = None

        self.geo_data.load_areas([0, 1, 2], bounds=True)

        self.loop = asyncio.get_event_loop()
        log_future(self.loop.create_task(self._run()),
                   logger,
                   should_never_exit=True)

        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                if not self.es.indices.exists('datamart'):
                    raise RuntimeError("'datamart' index does not exist")
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    # Give up after the sixth failed attempt
                    raise
                else:
                    time.sleep(5)
            else:
                break
Example #6
0
def make_app(debug=False):
    """Create the Tornado Application serving the versioned API.

    Builds Elasticsearch, Redis, and Lazo clients from environment
    variables, registers all API routes, and installs a custom 404
    handler for unknown paths.
    """
    search_engine = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))
    cache = redis.Redis(host=os.environ['REDIS_HOST'])
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))

    # Every API endpoint is versioned under '1'; /health is unversioned
    routes = [
        ApiRule('/profile', '1', Profile),
        ApiRule('/search', '1', Search),
        ApiRule('/download/([^/]+)', '1', DownloadId),
        ApiRule('/download', '1', Download),
        ApiRule('/metadata/([^/]+)', '1', Metadata),
        ApiRule('/augment', '1', Augment),
        ApiRule('/augment/([^/]+)', '1', AugmentResult),
        ApiRule('/upload', '1', Upload),
        ApiRule('/session/new', '1', SessionNew),
        ApiRule('/session/([^/]+)', '1', SessionGet),
        ApiRule('/location', '1', LocationSearch),
        ApiRule('/statistics', '1', Statistics),
        ApiRule('/version', '1', Version),
        URLSpec('/health', Health),
    ]
    return Application(
        routes,
        debug=debug,
        es=search_engine,
        redis_client=cache,
        lazo=lazo,
        default_handler_class=CustomErrorHandler,
        default_handler_args={"status_code": 404},
    )
Example #7
0
async def import_all(folder):
    """Import dumped dataset and Lazo documents from *folder* into the index.

    Files named 'lazo.*' hold Lazo sketch documents; everything else is a
    dataset document. On an Elasticsearch transport error each write is
    retried once after a pause.
    """
    es = PrefixedElasticsearch()
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
    else:
        # No Lazo server configured; Lazo-side deletion is skipped
        lazo_client = None

    entries = os.listdir(folder)
    lazo_docs = [name for name in entries if name.startswith('lazo.')]
    dataset_docs = [name for name in entries if not name.startswith('lazo.')]

    def _load(name):
        # One dumped JSON document per file
        with open(os.path.join(folder, name), 'r') as fp:
            return json.load(fp)

    for i, name in enumerate(dataset_docs):
        if i % 50 == 0:
            print(
                "\nImporting to Elasticsearch, %d/%d" % (i, len(dataset_docs)),
                flush=True,
            )
        obj = _load(name)
        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        print('.', end='', flush=True)

    for i, name in enumerate(lazo_docs):
        if i % 500 == 0:
            print(
                "\nImporting to Lazo, %d/%d" % (i, len(lazo_docs)),
                flush=True,
            )
        obj = _load(name)
        # Strip the 'lazo.' prefix, then the trailing column suffix
        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        if i % 10 == 0:
            print('.', end='', flush=True)
Example #8
0
def delete(datasets):
    """Remove each dataset id in *datasets* from Elasticsearch and Lazo."""
    search_engine = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    for dataset_id in datasets:
        delete_dataset_from_index(search_engine, dataset_id, lazo)
Example #9
0
def clear(identifier):
    """Delete every indexed dataset materialized by *identifier*."""
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))
    lazo = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    # Scan the whole index for datasets from this materializer
    query = {
        'query': {
            'term': {'materialize.identifier': identifier},
        },
    }
    hits = elasticsearch.helpers.scan(
        es,
        index='datamart',
        query=query,
        _source=False,
        size=SIZE,
    )
    for hit in hits:
        delete_dataset_from_index(es, hit['_id'], lazo)
Example #10
0
    async def run(self):
        """Connect to backing services, then run dataset discovery.

        Any exception escaping the discoverer is reported to Sentry,
        logged, and terminates the process with a non-zero exit code.
        """
        self.elasticsearch = PrefixedElasticsearch()
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))

        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        # Deliver at most one unacknowledged message at a time
        await self.channel.set_qos(prefetch_count=1)

        await self._amqp_setup()

        # Start profiling process
        try:
            await self._call(self.discover_datasets)
        except Exception as e:
            sentry_sdk.capture_exception(e)
            logger.exception("Exception in discoverer %s", self.identifier)
            sys.exit(1)
Exemple #11
0
async def import_all(folder):
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_datasets_exchange = await amqp_chan.declare_exchange(
        'datasets',
        aio_pika.ExchangeType.TOPIC,
    )
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT'])
        )
    else:
        lazo_client = None

    print("Importing Elasticsearch data", end='', flush=True)
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        await amqp_datasets_exchange.publish(
            json2msg(dict(obj, id=dataset_id)),
            dataset_id,
        )
        print('.', end='', flush=True)

    print("Importing Lazo data", end='', flush=True)
    for name in os.listdir(folder):
        if not name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        print('.', end='', flush=True)