Example No. 1
import_all(folder) walks a folder of exported metadata files (skipping the lazo.* entries), reads each JSON document, and publishes the dataset to the profile fan-out exchange so it gets profiled again.
async def import_all(folder):
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for name in os.listdir(folder):
        if not name.startswith('lazo.'):
            dataset_id = decode_dataset_id(name)
            path = os.path.join(folder, name)
            with open(path, 'r') as fp:
                obj = json.load(fp)
            metadata = dict(name=obj['name'],
                            materialize=obj['materialize'],
                            source=obj.get('source', 'unknown'))
            if obj.get('description'):
                metadata['description'] = obj['description']
            if obj.get('date'):
                metadata['date'] = obj['date']
            if obj.get('manual_annotations'):
                metadata['manual_annotations'] = obj['manual_annotations']
            await amqp_profile_exchange.publish(
                json2msg(dict(id=dataset_id, metadata=metadata)),
                '',
            )
            print('.', end='', flush=True)
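This snippet depends on the AMQP_* environment variables and on project helpers (decode_dataset_id, json2msg) that are not shown here. A minimal sketch of how such a coroutine might be driven from a script entry point, taking the folder from the command line (the entry point itself is an assumption, not part of the original code):

import asyncio
import sys

def main():
    # Run the import coroutine to completion against the folder given on the
    # command line, e.g.: python import_all.py /path/to/exported-index
    asyncio.get_event_loop().run_until_complete(import_all(sys.argv[1]))

if __name__ == '__main__':
    main()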
Example No. 2
freshen(datasets) fetches the given dataset IDs from the Elasticsearch datamart index and re-queues each one on the profile exchange.
async def freshen(datasets):
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = [es.get('datamart', d) for d in datasets]
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']

        logger.info("Reprocessing %s, version=%r", h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )
Example No. 3
freshen(version) resolves a git revision, scans the entire datamart index, and re-queues every dataset whose recorded version predates that revision.
async def freshen(version):
    # Check that it's a valid version
    version_hash = subprocess.check_output(['git', 'rev-parse', version])
    version_hash = version_hash.decode('ascii').strip()
    logger.warning("Reprocessing datasets profiled before %s", version_hash)

    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = elasticsearch.helpers.scan(
        es,
        index='datamart',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    reprocessed = 0
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']
        if is_version_more_recent(version, dataset_version):
            logger.debug("%s is recent enough (version=%r)", h['_id'],
                         dataset_version)
            continue

        reprocessed += 1
        logger.info("Reprocessing %s, version=%r", h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )
    logger.info("Reprocessed %d datasets", reprocessed)
Example No. 4
freshen(datasets, priority) re-queues specific datasets, falling back to the pending index when a dataset is not in the datasets index yet, and publishes with an explicit message priority.
async def freshen(datasets, priority):
    es = PrefixedElasticsearch()

    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for dataset_id in datasets:
        try:
            obj = es.get('datasets', dataset_id)['_source']
        except elasticsearch.NotFoundError:
            obj = es.get('pending', dataset_id)['_source']['metadata']
            dataset_version = None
        else:
            dataset_version = obj['version']

        logger.info("Reprocessing %s, version=%r", dataset_id, dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(
                dict(id=dataset_id, metadata=metadata),
                priority=priority,
            ),
            '',
        )
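json2msg and its counterpart msg2json (used in Examples No. 5 and No. 10 below) are project helpers that these excerpts do not show. A plausible sketch, assuming json2msg simply wraps the JSON-encoded payload in an aio_pika.Message and forwards keyword arguments such as priority:

import json
import aio_pika

def json2msg(obj, **kwargs):
    # Serialize the dict and wrap it in an AMQP message; extra keyword
    # arguments (e.g. priority=...) are passed through to aio_pika.Message
    return aio_pika.Message(json.dumps(obj).encode('utf-8'), **kwargs)

def msg2json(message):
    # Inverse helper: decode the JSON body of an incoming message
    return json.loads(message.body.decode('utf-8'))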
Example No. 5
coro(future) is the callback that runs once profiling completes: it indexes the result in Elasticsearch (offloading blocking calls to a thread via run_in_executor), publishes the dataset to the datasets exchange, and records failures in the pending index.
        async def coro(future):
            metadata = msg2json(message)['metadata']
            _rie = asyncio.get_event_loop().run_in_executor
            in_thread = lambda func: _rie(None, func)
            try:
                try:
                    metadata = future.result()
                    if metadata['nb_rows'] == 0:
                        logger.info(
                            "Dataset has no rows, not inserting into index: " +
                            "%r",
                            dataset_id,
                        )
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # DO delete from Lazo
                                self.lazo_client,
                            ),
                        )
                        self.es.index(
                            'pending',
                            dict(
                                status='error',
                                error="Dataset has no rows",
                                metadata=metadata,
                                date=datetime.utcnow().isoformat(),
                                source=metadata['source'],
                                materialize=metadata['materialize'],
                            ),
                            id=dataset_id,
                        )
                    else:
                        # Delete dataset if already exists in index
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                # Don't delete from Lazo, we inserted during profile
                                None,
                            ),
                        )
                        # Insert results in Elasticsearch
                        body = dict(metadata,
                                    date=datetime.utcnow().isoformat() + 'Z',
                                    version=os.environ['DATAMART_VERSION'])
                        await in_thread(
                            lambda: add_dataset_to_index(self.es, dataset_id, body),
                        )

                        # Publish to RabbitMQ
                        msg = dict(
                            id=dataset_id,
                        )
                        for key in (
                            'name', 'description', 'source', 'date', 'version',
                            'types', 'nb_rows', 'nb_columns', 'materialize',
                        ):
                            if key in body:
                                msg[key] = body[key]
                        await self.datasets_exchange.publish(
                            json2msg(msg),
                            dataset_id,
                        )

                        # Remove from alternate index
                        try:
                            self.es.delete('pending', dataset_id)
                        except elasticsearch.NotFoundError:
                            pass
                except DatasetTooBig as e:
                    # Materializer reached size limit
                    if not e.limit:
                        logger.info("Dataset over size limit: %r", dataset_id)
                    elif e.actual:
                        logger.info(
                            "Dataset over size limit (%d > %d bytes): %r",
                            e.actual, e.limit,
                            dataset_id,
                        )
                    else:
                        logger.info(
                            "Dataset over size limit (%d bytes): %r",
                            e.limit, dataset_id,
                        )
                    await message.ack()
                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Dataset is too big",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                    try:
                        await in_thread(
                            lambda: delete_dataset_from_index(
                                self.es,
                                dataset_id,
                                self.lazo_client,
                            ),
                        )
                    except elasticsearch.NotFoundError:
                        pass
                except Exception as e:
                    if isinstance(e, elasticsearch.RequestError):
                        # This is a problem with our computed metadata
                        sentry_sdk.capture_exception(e)
                        logger.exception(
                            "Error inserting dataset %r in Elasticsearch",
                            dataset_id,
                        )
                    elif isinstance(e, elasticsearch.TransportError):
                        # This is probably an issue with Elasticsearch
                        # We'll log, nack and retry
                        raise
                    else:
                        logger.warning("Error processing dataset %r",
                                       dataset_id, exc_info=True)
                    # Move message to failed queue
                    await self.channel.default_exchange.publish(
                        aio_pika.Message(message.body),
                        self.failed_queue.name,
                    )
                    # Ack anyway, retrying would probably fail again
                    await message.ack()

                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Error profiling dataset",
                            error_details=exception_details(e),
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                else:
                    await message.ack()
                    logger.info("Dataset %r processed successfully",
                                dataset_id)
            except Exception:
                await message.nack()
                raise
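The in_thread helper defined at the top of this coroutine pushes blocking Elasticsearch calls onto the default executor so they do not stall the event loop. A self-contained illustration of the same pattern (blocking_work is a made-up stand-in for a synchronous call):

import asyncio
import time

async def main():
    loop = asyncio.get_running_loop()
    in_thread = lambda func: loop.run_in_executor(None, func)

    def blocking_work():
        time.sleep(1)  # stands in for a synchronous Elasticsearch call
        return 'done'

    # The event loop stays free to schedule other tasks while the call blocks
    print(await in_thread(blocking_work))

asyncio.run(main())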
Example No. 6
A post() request handler that accepts either an uploaded file or a URL, stores the dataset, records it as queued in the pending index, and publishes it to the profiling queue.
    async def post(self):
        if 'file' in self.request.files:
            file = self.request.files['file'][0]
            metadata = dict(
                filename=file.filename,
                name=self.get_body_argument('name', None),
                source='upload',
                materialize=dict(identifier='datamart.upload',
                                 date=datetime.utcnow().isoformat() + 'Z'),
            )
            description = self.get_body_argument('description', None)
            if description:
                metadata['description'] = description
            dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

            # Write file to shared storage
            dataset_dir = os.path.join('/datasets', dataset_id)
            os.mkdir(dataset_dir)
            try:
                with open(os.path.join(dataset_dir, 'main.csv'), 'wb') as fp:
                    fp.write(file.body)
            except Exception:
                shutil.rmtree(dataset_dir)
                raise
        elif self.get_body_argument('address', None):
            # Check the URL
            address = self.get_body_argument('address')
            response = await self.http_client.fetch(address, raise_error=False)
            if response.code != 200:
                return self.send_error_json(
                    400, "Invalid URL ({} {})".format(
                        response.code, response.reason,
                    ),
                )

            # Metadata with 'direct_url' in materialization info
            metadata = dict(
                name=self.get_body_argument('name', None),
                source='upload',
                materialize=dict(identifier='datamart.url',
                                 direct_url=address,
                                 date=datetime.utcnow().isoformat() + 'Z'),
            )
            description = self.get_body_argument('description', None)
            if description:
                metadata['description'] = description
            dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
        else:
            return self.send_error_json(400, "No file")

        # Add to alternate index
        self.application.elasticsearch.index(
            'pending',
            dict(
                status='queued',
                metadata=metadata,
                date=datetime.utcnow().isoformat(),
                source='upload',
                materialize=metadata['materialize'],
            ),
            id=dataset_id,
        )

        # Publish to the profiling queue
        await self.application.profile_exchange.publish(
            json2msg(
                dict(
                    id=dataset_id,
                    metadata=metadata,
                ),
                # Lower priority than on-demand datasets, but higher than base
                priority=1,
            ),
            '',
        )

        return self.send_json({'id': dataset_id})
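Assuming the handler above is mounted on an /upload route (the route, host, and port here are assumptions for illustration), a client could exercise both branches roughly like this:

import requests

# File-upload branch
resp = requests.post(
    'http://localhost:8002/upload',
    files={'file': ('data.csv', open('data.csv', 'rb'), 'text/csv')},
    data={'name': 'My dataset', 'description': 'An example upload'},
)
print(resp.json())  # e.g. {'id': 'datamart.upload.<hex>'}

# URL branch
resp = requests.post(
    'http://localhost:8002/upload',
    data={'address': 'https://example.org/data.csv', 'name': 'Remote dataset'},
)
print(resp.json())  # e.g. {'id': 'datamart.url.<hex>'}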
Example No. 7
A variant of the upload handler that also validates configured custom fields and accepts optional manual annotations passed as JSON.
    async def post(self):
        metadata = dict(
            name=self.get_body_argument('name', None),
            source='upload',
            materialize=dict(identifier='datamart.upload',
                             date=datetime.utcnow().isoformat() + 'Z'),
        )
        description = self.get_body_argument('description', None)
        if description:
            metadata['description'] = description
        for field, opts in self.application.custom_fields.items():
            value = self.get_body_argument(field, None)
            if value:
                if 'type' in opts:
                    type_ = opts['type']
                    if type_ == 'integer':
                        value = int(value)
                    elif type_ == 'float':
                        value = float(value)
                metadata[field] = value
            elif opts.get('required', False):
                return await self.send_error_json(
                    400,
                    "Missing field %s" % field,
                )

        if 'file' in self.request.files:
            file = self.request.files['file'][0]
            metadata['filename'] = file.filename
            manual_annotations = self.get_body_argument(
                'manual_annotations',
                None,
            )
            if manual_annotations:
                try:
                    manual_annotations = json.loads(manual_annotations)
                except json.JSONDecodeError:
                    return await self.send_error_json(
                        400,
                        "Invalid manual annotations",
                    )
                metadata['manual_annotations'] = manual_annotations

            dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

            # Write file to shared storage
            dataset_dir = os.path.join('/datasets', dataset_id)
            os.mkdir(dataset_dir)
            try:
                with open(os.path.join(dataset_dir, 'main.csv'), 'wb') as fp:
                    fp.write(file.body)
            except Exception:
                shutil.rmtree(dataset_dir)
                raise
        elif self.get_body_argument('address', None):
            # Check the URL
            address = self.get_body_argument('address')
            response = await self.http_client.fetch(address, raise_error=False)
            if response.code != 200:
                return await self.send_error_json(
                    400,
                    "Invalid URL ({} {})".format(
                        response.code,
                        response.reason,
                    ),
                )

            # Set identifier
            metadata['materialize']['identifier'] = 'datamart.url'

            # Set 'direct_url'
            metadata['materialize']['direct_url'] = address
            dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
        else:
            return await self.send_error_json(400, "No file")

        # Add to alternate index
        self.application.elasticsearch.index(
            'pending',
            dict(
                status='queued',
                metadata=metadata,
                date=datetime.utcnow().isoformat(),
                source='upload',
                materialize=metadata['materialize'],
            ),
            id=dataset_id,
        )

        # Publish to the profiling queue
        await self.application.profile_exchange.publish(
            json2msg(
                dict(
                    id=dataset_id,
                    metadata=metadata,
                ),
                # Lower priority than on-demand datasets, but higher than base
                priority=1,
            ),
            '',
        )

        return await self.send_json({'id': dataset_id})
Example No. 8
import_all(folder) restores an exported index: it re-inserts dataset documents and Lazo data into Elasticsearch, retrying once after a pause on transport errors, and announces each dataset on the datasets topic exchange.
async def import_all(folder):
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_datasets_exchange = await amqp_chan.declare_exchange(
        'datasets',
        aio_pika.ExchangeType.TOPIC,
    )
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT'])
        )
    else:
        lazo_client = None

    print("Importing Elasticsearch data", end='', flush=True)
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        await amqp_datasets_exchange.publish(
            json2msg(dict(obj, id=dataset_id)),
            dataset_id,
        )
        print('.', end='', flush=True)

    print("Importing Lazo data", end='', flush=True)
    for name in os.listdir(folder):
        if not name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)

        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        print('.', end='', flush=True)
Example No. 9
An upload handler variant that writes the uploaded file to an object store and checks submitted URLs with a blocking request run in a worker thread before queuing the dataset for profiling.
    async def post(self):
        metadata = dict(
            name=self.get_body_argument('name', None),
            source='upload',
            materialize=dict(identifier='datamart.upload',
                             date=datetime.utcnow().isoformat() + 'Z'),
        )
        description = self.get_body_argument('description', None)
        if description:
            metadata['description'] = description
        for field, opts in self.application.custom_fields.items():
            value = self.get_body_argument(field, None)
            if value:
                if 'type' in opts:
                    type_ = opts['type']
                    if type_ == 'integer':
                        value = int(value)
                    elif type_ == 'float':
                        value = float(value)
                metadata[field] = value
            elif opts.get('required', False):
                return await self.send_error_json(
                    400,
                    "Missing field %s" % field,
                )

        if 'file' in self.request.files:
            file = self.request.files['file'][0]
            metadata['filename'] = file.filename
            manual_annotations = self.get_body_argument(
                'manual_annotations',
                None,
            )
            if manual_annotations:
                try:
                    manual_annotations = json.loads(manual_annotations)
                except json.JSONDecodeError:
                    return await self.send_error_json(
                        400,
                        "Invalid manual annotations",
                    )
                metadata['manual_annotations'] = manual_annotations

            dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

            # Write file to shared storage
            object_store = get_object_store()
            with object_store.open('datasets', dataset_id, 'wb') as fp:
                fp.write(file.body)
            await asyncio.sleep(3)  # Object store is eventually consistent
        elif self.get_body_argument('address', None):
            # Check the URL
            address = self.get_body_argument('address')

            def try_get():
                with advocate_session() as http_session:
                    return http_session.get(
                        address,
                        headers={'User-Agent': 'Auctus'},
                    )

            response = await asyncio.get_event_loop().run_in_executor(
                None,
                try_get,
            )
            if response.status_code != 200:
                return await self.send_error_json(
                    400, "Invalid URL ({} {})".format(
                        response.status_code, response.reason,
                    ),
                )

            # Set identifier
            metadata['materialize']['identifier'] = 'datamart.url'

            # Set 'direct_url'
            metadata['materialize']['direct_url'] = address
            dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
        else:
            return await self.send_error_json(400, "No file")

        # Add to alternate index
        self.application.elasticsearch.index(
            'pending',
            dict(
                status='queued',
                metadata=metadata,
                date=datetime.utcnow().isoformat(),
                source='upload',
                materialize=metadata['materialize'],
            ),
            id=dataset_id,
        )

        # Publish to the profiling queue
        await self.application.profile_exchange.publish(
            json2msg(
                dict(
                    id=dataset_id,
                    metadata=metadata,
                ),
                # Lower priority than on-demand datasets, but higher than base
                priority=1,
            ),
            '',
        )

        return await self.send_json({'id': dataset_id})
Example No. 10
A variant of the profiling callback from Example No. 5 that calls Elasticsearch directly on the event loop and acknowledges messages without awaiting.
        async def coro(future):
            metadata = msg2json(message)['metadata']
            try:
                try:
                    metadata = future.result()
                    if metadata['nb_rows'] == 0:
                        logger.info(
                            "Dataset has no rows, not inserting into index: " +
                            "%r",
                            dataset_id,
                        )
                        delete_dataset_from_index(
                            self.es,
                            dataset_id,
                            # DO delete from Lazo
                            self.lazo_client,
                        )
                        self.es.index(
                            'pending',
                            dict(
                                status='error',
                                error="Dataset has no rows",
                                metadata=metadata,
                                date=datetime.utcnow().isoformat(),
                                source=metadata['source'],
                                materialize=metadata['materialize'],
                            ),
                            id=dataset_id,
                        )
                    else:
                        # Delete dataset if already exists in index
                        delete_dataset_from_index(
                            self.es,
                            dataset_id,
                            # Don't delete from Lazo, we inserted during profile
                            None,
                        )
                        # Insert results in Elasticsearch
                        body = dict(metadata,
                                    date=datetime.utcnow().isoformat() + 'Z',
                                    version=os.environ['DATAMART_VERSION'])
                        add_dataset_to_index(self.es, dataset_id, body)

                        # Publish to RabbitMQ
                        await self.datasets_exchange.publish(
                            json2msg(dict(body, id=dataset_id)),
                            dataset_id,
                        )

                        # Remove from alternate index
                        try:
                            self.es.delete('pending', dataset_id)
                        except elasticsearch.NotFoundError:
                            pass
                except DatasetTooBig:
                    # Materializer reached size limit
                    logger.info("Dataset over size limit: %r", dataset_id)
                    message.ack()
                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Dataset is too big",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                except Exception as e:
                    if isinstance(e, elasticsearch.RequestError):
                        # This is a problem with our computed metadata
                        logger.exception(
                            "Error inserting dataset %r in Elasticsearch",
                            dataset_id,
                        )
                    elif isinstance(e, elasticsearch.TransportError):
                        # This is probably an issue with Elasticsearch
                        # We'll log, nack and retry
                        raise
                    else:
                        logger.exception("Error processing dataset %r",
                                         dataset_id)
                    # Move message to failed queue
                    await self.channel.default_exchange.publish(
                        aio_pika.Message(message.body),
                        self.failed_queue.name,
                    )
                    # Ack anyway, retrying would probably fail again
                    message.ack()

                    self.es.index(
                        'pending',
                        dict(
                            status='error',
                            error="Error profiling dataset",
                            metadata=metadata,
                            date=datetime.utcnow().isoformat(),
                            source=metadata['source'],
                            materialize=metadata['materialize'],
                        ),
                        id=dataset_id,
                    )
                else:
                    message.ack()
                    logger.info("Dataset %r processed successfully",
                                dataset_id)
            except Exception:
                message.nack()
                raise
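In both this example and Example No. 5, messages that would most likely fail again on retry are republished to self.failed_queue through the default exchange. That queue is declared elsewhere in the service; a minimal sketch of such a declaration with aio_pika (the queue name is an assumption):

import aio_pika

async def declare_failed_queue(channel):
    # A durable queue that holds messages whose processing failed, so they
    # can be inspected or replayed later
    return await channel.declare_queue('failed_profile', durable=True)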