Example 1
def clear(source):
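    # Remove every dataset recorded for the given source from the
    # 'datasets' and 'pending' indexes, and from the Lazo index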
    es = PrefixedElasticsearch()
    lazo_client = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    hits = es.scan(
        index='datasets,pending',
        query={
            'query': {
                'bool': {
                    'should': [
                        {
                            'term': {
                                'materialize.identifier': source,
                            },
                        },
                        {
                            'term': {
                                'source': source,
                            },
                        },
                    ],
                    'minimum_should_match': 1,
                },
            },
        },
        _source=False,
        size=SIZE,
    )
    for h in hits:
        delete_dataset_from_index(es, h['_id'], lazo_client)
Example 2
async def freshen(version):
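    # Re-queue for profiling every dataset whose recorded version predates
    # the given git revision, by publishing on the 'profile' AMQP exchange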
    # Check that it's a valid version
    version_hash = subprocess.check_output(['git', 'rev-parse', version])
    version_hash = version_hash.decode('ascii').strip()
    logger.warning("Reprocessing datasets profiled before %s", version_hash)

    es = PrefixedElasticsearch()

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    reprocessed = 0
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']
        if is_version_more_recent(version, dataset_version):
            logger.debug("%s is recent enough (version=%r)",
                         h['_id'], dataset_version)
            continue

        reprocessed += 1
        logger.info("Reprocessing %s, version=%r",
                    h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )
    logger.info("Reprocessed %d datasets", reprocessed)
Example 3
def delete(datasets):
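    # Remove the given dataset IDs from the index and from Lazo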
    es = PrefixedElasticsearch()
    lazo_client = lazo_index_service.LazoIndexClient(
        host=os.environ['LAZO_SERVER_HOST'],
        port=int(os.environ['LAZO_SERVER_PORT']))
    for dataset in datasets:
        delete_dataset_from_index(es, dataset, lazo_client)
Example 4
async def import_all(folder):
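    # Re-import dataset and Lazo JSON documents dumped to a folder
    # (such as the files written by export() below) into Elasticsearch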
    es = PrefixedElasticsearch()
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT']))
    else:
        lazo_client = None

    dataset_docs = []
    lazo_docs = []
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            lazo_docs.append(name)
        else:
            dataset_docs.append(name)

    for i, name in enumerate(dataset_docs):
        if i % 50 == 0:
            print(
                "\nImporting to Elasticsearch, %d/%d" % (i, len(dataset_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        print('.', end='', flush=True)

    for i, name in enumerate(lazo_docs):
        if i % 500 == 0:
            print(
                "\nImporting to Lazo, %d/%d" % (i, len(lazo_docs)),
                flush=True,
            )
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        if i % 10 == 0:
            print('.', end='', flush=True)
Example 5
def search():
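    # List the datasets whose recorded size exceeds 10 GB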
    es = PrefixedElasticsearch()
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'range': {
                    "size": {
                        "gt": 10000000000,  # 10 GB
                    },
                },
            },
        },
        _source='size',
        size=SIZE,
    )
    for h in hits:
        print("%s %.1f GB" % (h['_id'], h['_source']['size'] / 1000000000.0))
Example 6
def export():
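    # Dump every dataset document and every Lazo document to a JSON file
    # in the current directory, one file per document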
    es = PrefixedElasticsearch()

    print("Dumping datasets", end='', flush=True)
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    for h in hits:
        # Use dataset ID as file name
        with open(encode_dataset_id(h['_id']), 'w') as fp:
            json.dump(h['_source'], fp, sort_keys=True, indent=2)

    print("Dumping Lazo data", end='', flush=True)
    hits = es.scan(
        index='lazo',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    for h in hits:
        # Use "lazo." dataset_id ".NB" as file name
        dataset_id = h['_id'].split('__.__')[0]
        fname = unique_filename(
            'lazo.{0}.{{0}}'.format(encode_dataset_id(dataset_id))
        )
        with open(fname, 'w') as fp:
            json.dump(
                dict(h['_source'], _id=h['_id']),
                fp,
                sort_keys=True,
                indent=2,
            )
Example 7
async def freshen(datasets, priority):
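    # Re-queue the given dataset IDs for profiling at the given priority,
    # by publishing on the 'profile' AMQP exchange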
    es = PrefixedElasticsearch()

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for dataset_id in datasets:
        try:
            obj = es.get('datasets', dataset_id)['_source']
        except elasticsearch.NotFoundError:
            obj = es.get('pending', dataset_id)['_source']['metadata']
            dataset_version = None
        else:
            dataset_version = obj['version']

        logger.info("Reprocessing %s, version=%r", dataset_id, dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(
                dict(id=dataset_id, metadata=metadata),
                priority=priority,
            ),
            '',
        )
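
A minimal sketch (not from the source) of driving this coroutine from a synchronous entry point; the dataset IDs are placeholders and the AMQP_* environment variables are assumed to be set as in the examples above.

import asyncio

if __name__ == '__main__':
    # Hypothetical entry point (sketch): assumes freshen() above is importable
    # and the AMQP_* environment variables are set; the IDs are placeholders
    asyncio.run(freshen(
        ['datamart.example.dataset1', 'datamart.example.dataset2'],
        priority=2,
    ))
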
Example 8
def count():
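    # Print the number of datasets per source, most frequent source first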
    es = PrefixedElasticsearch()
    sources = {}
    hits = es.scan(
        index='datasets',
        query={
            'query': {
                'match_all': {},
            },
        },
        _source='source',
        size=SIZE,
    )
    for h in hits:
        source = h['_source']['source']

        try:
            sources[source] += 1
        except KeyError:
            sources[source] = 1

    for identifier, count in sorted(sources.items(), key=lambda p: -p[1]):
        print('{: 6d} {}'.format(count, identifier))
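
The synchronous commands above (clear, delete, search, count, export) could be exposed on a command line; the following argparse wiring is a hypothetical sketch, assuming those functions are importable from a single admin module.

import argparse

def main():
    # Hypothetical CLI wiring (sketch): assumes clear/delete/search/count/export
    # from the examples above are defined in or imported into this module
    parser = argparse.ArgumentParser(description="Index administration commands")
    subparsers = parser.add_subparsers(dest='command', required=True)

    p = subparsers.add_parser('clear', help="Delete all datasets from a source")
    p.add_argument('source')
    p = subparsers.add_parser('delete', help="Delete specific datasets by ID")
    p.add_argument('datasets', nargs='+')
    subparsers.add_parser('search', help="List datasets bigger than 10 GB")
    subparsers.add_parser('count', help="Count datasets per source")
    subparsers.add_parser('export', help="Dump all documents to JSON files")

    args = parser.parse_args()
    if args.command == 'clear':
        clear(args.source)
    elif args.command == 'delete':
        delete(args.datasets)
    elif args.command == 'search':
        search()
    elif args.command == 'count':
        count()
    else:
        export()

if __name__ == '__main__':
    main()
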
Example 9
class Profiler(object):
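    # Consumes profiling requests from the 'profile' AMQP queue, materializes
    # and profiles each dataset in a worker thread, and stores the result in
    # Elasticsearch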
    def __init__(self):
        self.profile_semaphore = threading.Semaphore(MAX_CONCURRENT_PROFILE)
        self.es = PrefixedElasticsearch()
        self.lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT'])
        )
        if os.environ.get('NOMINATIM_URL'):
            self.nominatim = os.environ['NOMINATIM_URL']
        else:
            self.nominatim = None
            logger.warning(
                "$NOMINATIM_URL is not set, not resolving addresses"
            )
        self.geo_data = GeoData.from_local_cache()
        self.channel = None

        assert os.path.isdir('/cache/datasets')

        self.loop = asyncio.get_event_loop()
        log_future(self.loop.create_task(self._run()), logger,
                   should_never_exit=True)

        # Retry a few times, in case the Elasticsearch container is not yet up
        for i in itertools.count():
            try:
                if not self.es.index_exists('datasets'):
                    raise RuntimeError("'datasets' index does not exist")
            except Exception:
                logger.warning("Can't connect to Elasticsearch, retrying...")
                if i == 5:
                    raise
                else:
                    time.sleep(5)
            else:
                break

    async def _amqp_setup(self):
        # Setup the datasets exchange
        self.datasets_exchange = await self.channel.declare_exchange(
            'datasets',
            aio_pika.ExchangeType.TOPIC)

        # Setup the profiling exchange
        self.profile_exchange = await self.channel.declare_exchange(
            'profile',
            aio_pika.ExchangeType.FANOUT,
        )

        # Declare the profiling queue
        self.profile_queue = await self.channel.declare_queue(
            'profile',
            arguments={'x-max-priority': 3},
        )
        await self.profile_queue.bind(self.profile_exchange)

        # Declare the failed queue
        self.failed_queue = await self.channel.declare_queue('failed_profile')

    async def _run(self):
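        # Connect to AMQP and consume the profiling queue, dispatching each
        # dataset to materialize_and_process_dataset() in an executor thread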
        connection = await aio_pika.connect_robust(
            host=os.environ['AMQP_HOST'],
            port=int(os.environ['AMQP_PORT']),
            login=os.environ['AMQP_USER'],
            password=os.environ['AMQP_PASSWORD'],
        )
        self.channel = await connection.channel()
        await self.channel.set_qos(prefetch_count=MAX_CONCURRENT_DOWNLOAD)

        await self._amqp_setup()

        # Consume profiling queue
        async for message in self.profile_queue:
            obj = msg2json(message)
            dataset_id = obj['id']
            metadata = obj['metadata']
            materialize = metadata.get('materialize', {})

            logger.info("Processing dataset %r from %r",
                        dataset_id, materialize.get('identifier'))

            future = self.loop.run_in_executor(
                None,
                materialize_and_process_dataset,
                dataset_id,
                metadata,
                LazoDeleteFirst(self.lazo_client, self.es, dataset_id),
                self.nominatim,
                self.geo_data,
                self.profile_semaphore,
            )

            future.add_done_callback(
                self.process_dataset_callback(
                    message, dataset_id,
                )
            )

    def process_dataset_callback(self, message, dataset_id):
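        # Build the done-callback that stores the profiling result (or the
        # error, in the 'pending' index) and acks/nacks the AMQP message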
        async def coro(future):
            metadata = msg2json(message)['metadata']
            _rie = asyncio.get_event_loop().run_in_executor
            in_thread = lambda func: _rie(None, func)
            try:
                try:
                    metadata = future.result()
                    if metadata['nb_rows'] == 0:
                        logger.info(
                            "Dataset has no rows, not inserting into index: " +
                            "%r",
                            dataset_id,
                        )
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # DO delete from Lazo
                                self.lazo_client,
                            ),
                        )
                        self.es.index(
                            'pending',
                            dict(
                                status='error',
                                error="Dataset has no rows",
                                metadata=metadata,
                                date=datetime.utcnow().isoformat(),
                                source=metadata['source'],
                                materialize=metadata['materialize'],
                            ),
                            id=dataset_id,
                        )
                    else:
                        # Delete dataset if already exists in index
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # Don't delete from Lazo, we inserted during profile
                                None,
                            ),
                        )
                        # Insert results in Elasticsearch
                        body = dict(metadata,
                                    date=datetime.utcnow().isoformat() + 'Z',
                                    version=os.environ['DATAMART_VERSION'])
                        await in_thread(
                            lambda: add_dataset_to_index(self.es, dataset_id, body),
                        )

                        # Publish to RabbitMQ
                        msg = dict(
                            id=dataset_id,
                        )
                        for key in (
                            'name', 'description', 'source', 'date', 'version',
                            'types', 'nb_rows', 'nb_columns', 'materialize',
                        ):
                            if key in body:
                                msg[key] = body[key]
                        await self.datasets_exchange.publish(
                            json2msg(msg),
                            dataset_id,
                        )

                        # Remove from alternate index
                        try:
                            self.es.delete('pending', dataset_id)
                        except elasticsearch.NotFoundError:
                            pass
                except DatasetTooBig as e:
                    # Materializer reached size limit
                    if not e.limit:
                        logger.info("Dataset over size limit: %r", dataset_id)
                    elif e.actual:
                        logger.info(
                            "Dataset over size limit (%d > %d bytes): %r",
                            e.actual, e.limit,
                            dataset_id,
                        )
                    else:
                        logger.info(
                            "Dataset over size limit (%d bytes): %r",
                            e.limit, dataset_id,
                        )
                    await message.ack()
                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Dataset is too big",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                    try:
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                self.lazo_client,
                            ),
                        )
                    except elasticsearch.NotFoundError:
                        pass
                except Exception as e:
                    if isinstance(e, elasticsearch.RequestError):
                        # This is a problem with our computed metadata
                        sentry_sdk.capture_exception(e)
                        logger.exception(
                            "Error inserting dataset %r in Elasticsearch",
                            dataset_id,
                        )
                    elif isinstance(e, elasticsearch.TransportError):
                        # This is probably an issue with Elasticsearch
                        # We'll log, nack and retry
                        raise
                    else:
                        logger.warning("Error processing dataset %r",
                                       dataset_id, exc_info=True)
                    # Move message to failed queue
                    await self.channel.default_exchange.publish(
                        aio_pika.Message(message.body),
                        self.failed_queue.name,
                    )
                    # Ack anyway, retrying would probably fail again
                    await message.ack()

                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Error profiling dataset",
                            error_details=exception_details(e),
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                else:
                    await message.ack()
                    logger.info("Dataset %r processed successfully",
                                dataset_id)
            except Exception:
                await message.nack()
                raise

        def callback(future):
            log_future(self.loop.create_task(coro(future)), logger)

        return callback