async def import_all(folder):
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for name in os.listdir(folder):
        if not name.startswith('lazo.'):
            dataset_id = decode_dataset_id(name)
            path = os.path.join(folder, name)
            with open(path, 'r') as fp:
                obj = json.load(fp)
            metadata = dict(name=obj['name'],
                            materialize=obj['materialize'],
                            source=obj.get('source', 'unknown'))
            if obj.get('description'):
                metadata['description'] = obj['description']
            if obj.get('date'):
                metadata['date'] = obj['date']
            if obj.get('manual_annotations'):
                metadata['manual_annotations'] = obj['manual_annotations']
            await amqp_profile_exchange.publish(
                json2msg(dict(id=dataset_id, metadata=metadata)),
                '',
            )
            print('.', end='', flush=True)

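# Hedged usage sketch (not part of the original module): import_all() is a
# coroutine, so a command-line entry point could drive it along these lines.
# The argument handling here is an assumption for illustration only.
import asyncio
import sys

if __name__ == '__main__':
    asyncio.get_event_loop().run_until_complete(import_all(sys.argv[1]))
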
async def freshen(datasets):
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = [es.get('datamart', d) for d in datasets]
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']
        logger.info("Reprocessing %s, version=%r",
                    h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )

async def freshen(version):
    # Check that it's a valid version
    version_hash = subprocess.check_output(['git', 'rev-parse', version])
    version_hash = version_hash.decode('ascii').strip()
    logger.warning("Reprocessing datasets profiled before %s", version_hash)

    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(','))
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    hits = elasticsearch.helpers.scan(
        es,
        index='datamart',
        query={
            'query': {
                'match_all': {},
            },
        },
        size=SIZE,
    )
    reprocessed = 0
    for h in hits:
        obj = h['_source']
        dataset_version = obj['version']
        if is_version_more_recent(version, dataset_version):
            logger.debug("%s is recent enough (version=%r)",
                         h['_id'], dataset_version)
            continue

        reprocessed += 1
        logger.info("Reprocessing %s, version=%r",
                    h['_id'], dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(dict(id=h['_id'], metadata=metadata)),
            '',
        )
    logger.info("Reprocessed %d datasets", reprocessed)

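# Minimal sketch of the is_version_more_recent() helper assumed above; the
# project's real implementation may differ. Since versions are git revisions,
# one way to decide "recent enough" is to ask git whether the reference
# version is an ancestor of the version recorded on the dataset.
import subprocess

def is_version_more_recent(reference, dataset_version):
    # `git merge-base --is-ancestor A B` exits 0 when A is an ancestor of B,
    # i.e. the dataset was profiled at or after the reference version
    returncode = subprocess.call(
        ['git', 'merge-base', '--is-ancestor', reference, dataset_version],
    )
    return returncode == 0
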
async def freshen(datasets, priority):
    es = PrefixedElasticsearch()
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_profile_exchange = await amqp_chan.declare_exchange(
        'profile',
        aio_pika.ExchangeType.FANOUT,
    )

    for dataset_id in datasets:
        try:
            obj = es.get('datasets', dataset_id)['_source']
        except elasticsearch.NotFoundError:
            obj = es.get('pending', dataset_id)['_source']['metadata']
            dataset_version = None
        else:
            dataset_version = obj['version']
        logger.info("Reprocessing %s, version=%r",
                    dataset_id, dataset_version)
        metadata = dict(name=obj['name'],
                        materialize=obj['materialize'],
                        source=obj.get('source', 'unknown'))
        if obj.get('description'):
            metadata['description'] = obj['description']
        if obj.get('date'):
            metadata['date'] = obj['date']
        if obj.get('manual_annotations'):
            metadata['manual_annotations'] = obj['manual_annotations']
        await amqp_profile_exchange.publish(
            json2msg(
                dict(id=dataset_id, metadata=metadata),
                priority=priority,
            ),
            '',
        )

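# Hypothetical sketch of the json2msg()/msg2json() helpers used throughout
# these functions: presumably thin JSON wrappers around aio_pika.Message,
# forwarding keyword arguments such as priority. The real helpers may differ.
import json

import aio_pika

def json2msg(obj, **kwargs):
    return aio_pika.Message(json.dumps(obj).encode('utf-8'), **kwargs)

def msg2json(msg):
    return json.loads(msg.body.decode('utf-8'))
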
async def coro(future):
    metadata = msg2json(message)['metadata']
    _rie = asyncio.get_event_loop().run_in_executor
    in_thread = lambda func: _rie(None, func)
    try:
        try:
            metadata = future.result()
            if metadata['nb_rows'] == 0:
                logger.info(
                    "Dataset has no rows, not inserting into index: " +
                    "%r",
                    dataset_id,
                )
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        # DO delete from Lazo
                        self.lazo_client,
                    ),
                )
                self.es.index(
                    'pending',
                    dict(
                        status='error',
                        error="Dataset has no rows",
                        metadata=metadata,
                        date=datetime.utcnow().isoformat(),
                        source=metadata['source'],
                        materialize=metadata['materialize'],
                    ),
                    id=dataset_id,
                )
            else:
                # Delete dataset if already exists in index
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        # Don't delete from Lazo, we inserted during profile
                        None,
                    ),
                )
                # Insert results in Elasticsearch
                body = dict(metadata,
                            date=datetime.utcnow().isoformat() + 'Z',
                            version=os.environ['DATAMART_VERSION'])
                await in_thread(
                    lambda: add_dataset_to_index(self.es, dataset_id, body),
                )

                # Publish to RabbitMQ
                msg = dict(
                    id=dataset_id,
                )
                for key in (
                    'name', 'description', 'source', 'date', 'version',
                    'types', 'nb_rows', 'nb_columns', 'materialize',
                ):
                    if key in body:
                        msg[key] = body[key]
                await self.datasets_exchange.publish(
                    json2msg(msg),
                    dataset_id,
                )

                # Remove from alternate index
                try:
                    self.es.delete('pending', dataset_id)
                except elasticsearch.NotFoundError:
                    pass
        except DatasetTooBig as e:
            # Materializer reached size limit
            if not e.limit:
                logger.info("Dataset over size limit: %r", dataset_id)
            elif e.actual:
                logger.info(
                    "Dataset over size limit (%d > %d bytes): %r",
                    e.actual, e.limit, dataset_id,
                )
            else:
                logger.info(
                    "Dataset over size limit (%d bytes): %r",
                    e.limit, dataset_id,
                )
            await message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Dataset is too big",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
            try:
                await in_thread(
                    lambda: delete_dataset_from_index(
                        self.es,
                        dataset_id,
                        self.lazo_client,
                    ),
                )
            except elasticsearch.NotFoundError:
                pass
        except Exception as e:
            if isinstance(e, elasticsearch.RequestError):
                # This is a problem with our computed metadata
                sentry_sdk.capture_exception(e)
                logger.exception(
                    "Error inserting dataset %r in Elasticsearch",
                    dataset_id,
                )
            elif isinstance(e, elasticsearch.TransportError):
                # This is probably an issue with Elasticsearch
                # We'll log, nack and retry
                raise
            else:
                logger.warning("Error processing dataset %r",
                               dataset_id, exc_info=True)
            # Move message to failed queue
            await self.channel.default_exchange.publish(
                aio_pika.Message(message.body),
                self.failed_queue.name,
            )
            # Ack anyway, retrying would probably fail again
            await message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Error profiling dataset",
                    error_details=exception_details(e),
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        else:
            await message.ack()
            logger.info("Dataset %r processed successfully", dataset_id)
    except Exception:
        await message.nack()
        raise

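# The in_thread() helper above is a thin wrapper over run_in_executor(); a
# standalone equivalent looks like this (assuming the default thread-pool
# executor is acceptable for the blocking Elasticsearch calls).
import asyncio

async def run_blocking(func):
    # Off-load a blocking callable to the default executor so the event loop
    # stays responsive while Elasticsearch is queried
    return await asyncio.get_event_loop().run_in_executor(None, func)
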
async def post(self):
    if 'file' in self.request.files:
        file = self.request.files['file'][0]
        metadata = dict(
            filename=file.filename,
            name=self.get_body_argument('name', None),
            source='upload',
            materialize=dict(identifier='datamart.upload',
                             date=datetime.utcnow().isoformat() + 'Z'),
        )
        description = self.get_body_argument('description', None)
        if description:
            metadata['description'] = description
        dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

        # Write file to shared storage
        dataset_dir = os.path.join('/datasets', dataset_id)
        os.mkdir(dataset_dir)
        try:
            with open(os.path.join(dataset_dir, 'main.csv'), 'wb') as fp:
                fp.write(file.body)
        except Exception:
            shutil.rmtree(dataset_dir)
            raise
    elif self.get_body_argument('address', None):
        # Check the URL
        address = self.get_body_argument('address')
        response = await self.http_client.fetch(address, raise_error=False)
        if response.code != 200:
            return self.send_error_json(
                400,
                "Invalid URL ({} {})".format(
                    response.code, response.reason,
                ),
            )

        # Metadata with 'direct_url' in materialization info
        metadata = dict(
            name=self.get_body_argument('name', None),
            source='upload',
            materialize=dict(identifier='datamart.url',
                             direct_url=address,
                             date=datetime.utcnow().isoformat() + 'Z'),
        )
        description = self.get_body_argument('description', None)
        if description:
            metadata['description'] = description
        dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
    else:
        return self.send_error_json(400, "No file")

    # Add to alternate index
    self.application.elasticsearch.index(
        'pending',
        dict(
            status='queued',
            metadata=metadata,
            date=datetime.utcnow().isoformat(),
            source='upload',
            materialize=metadata['materialize'],
        ),
        id=dataset_id,
    )

    # Publish to the profiling queue
    await self.application.profile_exchange.publish(
        json2msg(
            dict(
                id=dataset_id,
                metadata=metadata,
            ),
            # Lower priority than on-demand datasets, but higher than base
            priority=1,
        ),
        '',
    )

    return self.send_json({'id': dataset_id})

async def post(self):
    metadata = dict(
        name=self.get_body_argument('name', None),
        source='upload',
        materialize=dict(identifier='datamart.upload',
                         date=datetime.utcnow().isoformat() + 'Z'),
    )
    description = self.get_body_argument('description', None)
    if description:
        metadata['description'] = description
    for field, opts in self.application.custom_fields.items():
        value = self.get_body_argument(field, None)
        if value:
            if 'type' in opts:
                type_ = opts['type']
                if type_ == 'integer':
                    value = int(value)
                elif type_ == 'float':
                    value = float(value)
            metadata[field] = value
        elif opts.get('required', False):
            return await self.send_error_json(
                400,
                "Missing field %s" % field,
            )

    if 'file' in self.request.files:
        file = self.request.files['file'][0]
        metadata['filename'] = file.filename
        manual_annotations = self.get_body_argument(
            'manual_annotations',
            None,
        )
        if manual_annotations:
            try:
                manual_annotations = json.loads(manual_annotations)
            except json.JSONDecodeError:
                return await self.send_error_json(
                    400,
                    "Invalid manual annotations",
                )
            metadata['manual_annotations'] = manual_annotations
        dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

        # Write file to shared storage
        dataset_dir = os.path.join('/datasets', dataset_id)
        os.mkdir(dataset_dir)
        try:
            with open(os.path.join(dataset_dir, 'main.csv'), 'wb') as fp:
                fp.write(file.body)
        except Exception:
            shutil.rmtree(dataset_dir)
            raise
    elif self.get_body_argument('address', None):
        # Check the URL
        address = self.get_body_argument('address')
        response = await self.http_client.fetch(address, raise_error=False)
        if response.code != 200:
            return await self.send_error_json(
                400,
                "Invalid URL ({} {})".format(
                    response.code, response.reason,
                ),
            )

        # Set identifier
        metadata['materialize']['identifier'] = 'datamart.url'
        # Set 'direct_url'
        metadata['materialize']['direct_url'] = address
        dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
    else:
        return await self.send_error_json(400, "No file")

    # Add to alternate index
    self.application.elasticsearch.index(
        'pending',
        dict(
            status='queued',
            metadata=metadata,
            date=datetime.utcnow().isoformat(),
            source='upload',
            materialize=metadata['materialize'],
        ),
        id=dataset_id,
    )

    # Publish to the profiling queue
    await self.application.profile_exchange.publish(
        json2msg(
            dict(
                id=dataset_id,
                metadata=metadata,
            ),
            # Lower priority than on-demand datasets, but higher than base
            priority=1,
        ),
        '',
    )

    return await self.send_json({'id': dataset_id})

async def import_all(folder):
    es = elasticsearch.Elasticsearch(
        os.environ['ELASTICSEARCH_HOSTS'].split(',')
    )
    amqp_conn = await aio_pika.connect_robust(
        host=os.environ['AMQP_HOST'],
        port=int(os.environ['AMQP_PORT']),
        login=os.environ['AMQP_USER'],
        password=os.environ['AMQP_PASSWORD'],
    )
    amqp_chan = await amqp_conn.channel()
    amqp_datasets_exchange = await amqp_chan.declare_exchange(
        'datasets',
        aio_pika.ExchangeType.TOPIC,
    )
    if 'LAZO_SERVER_HOST' in os.environ:
        lazo_client = lazo_index_service.LazoIndexClient(
            host=os.environ['LAZO_SERVER_HOST'],
            port=int(os.environ['LAZO_SERVER_PORT'])
        )
    else:
        lazo_client = None

    print("Importing Elasticsearch data", end='', flush=True)
    for name in os.listdir(folder):
        if name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)
        dataset_id = decode_dataset_id(name)
        try:
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            delete_dataset_from_index(es, dataset_id, lazo_client)
            add_dataset_to_index(es, dataset_id, obj)
        await amqp_datasets_exchange.publish(
            json2msg(dict(obj, id=dataset_id)),
            dataset_id,
        )
        print('.', end='', flush=True)

    print("Importing Lazo data", end='', flush=True)
    for name in os.listdir(folder):
        if not name.startswith('lazo.'):
            continue
        path = os.path.join(folder, name)
        with open(path, 'r') as fp:
            obj = json.load(fp)
        dataset_id = decode_dataset_id(name[5:]).rsplit('.', 1)[0]
        lazo_es_id = obj.pop('_id')
        assert lazo_es_id.split('__.__')[0] == dataset_id
        try:
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        except elasticsearch.TransportError:
            print('X', end='', flush=True)
            time.sleep(10)  # If writing can't keep up, needs a real break
            add_dataset_to_lazo_storage(es, lazo_es_id, obj)
        print('.', end='', flush=True)

async def post(self):
    metadata = dict(
        name=self.get_body_argument('name', None),
        source='upload',
        materialize=dict(identifier='datamart.upload',
                         date=datetime.utcnow().isoformat() + 'Z'),
    )
    description = self.get_body_argument('description', None)
    if description:
        metadata['description'] = description
    for field, opts in self.application.custom_fields.items():
        value = self.get_body_argument(field, None)
        if value:
            if 'type' in opts:
                type_ = opts['type']
                if type_ == 'integer':
                    value = int(value)
                elif type_ == 'float':
                    value = float(value)
            metadata[field] = value
        elif opts.get('required', False):
            return await self.send_error_json(
                400,
                "Missing field %s" % field,
            )

    if 'file' in self.request.files:
        file = self.request.files['file'][0]
        metadata['filename'] = file.filename
        manual_annotations = self.get_body_argument(
            'manual_annotations',
            None,
        )
        if manual_annotations:
            try:
                manual_annotations = json.loads(manual_annotations)
            except json.JSONDecodeError:
                return await self.send_error_json(
                    400,
                    "Invalid manual annotations",
                )
            metadata['manual_annotations'] = manual_annotations
        dataset_id = 'datamart.upload.%s' % uuid.uuid4().hex

        # Write file to shared storage
        object_store = get_object_store()
        with object_store.open('datasets', dataset_id, 'wb') as fp:
            fp.write(file.body)
        await asyncio.sleep(3)  # Object store is eventually consistent
    elif self.get_body_argument('address', None):
        # Check the URL
        address = self.get_body_argument('address')

        def try_get():
            with advocate_session() as http_session:
                return http_session.get(
                    address,
                    headers={'User-Agent': 'Auctus'},
                )

        response = await asyncio.get_event_loop().run_in_executor(
            None,
            try_get,
        )
        if response.status_code != 200:
            return await self.send_error_json(
                400,
                "Invalid URL ({} {})".format(
                    response.status_code, response.reason,
                ),
            )

        # Set identifier
        metadata['materialize']['identifier'] = 'datamart.url'
        # Set 'direct_url'
        metadata['materialize']['direct_url'] = address
        dataset_id = 'datamart.url.%s' % uuid.uuid4().hex
    else:
        return await self.send_error_json(400, "No file")

    # Add to alternate index
    self.application.elasticsearch.index(
        'pending',
        dict(
            status='queued',
            metadata=metadata,
            date=datetime.utcnow().isoformat(),
            source='upload',
            materialize=metadata['materialize'],
        ),
        id=dataset_id,
    )

    # Publish to the profiling queue
    await self.application.profile_exchange.publish(
        json2msg(
            dict(
                id=dataset_id,
                metadata=metadata,
            ),
            # Lower priority than on-demand datasets, but higher than base
            priority=1,
        ),
        '',
    )

    return await self.send_json({'id': dataset_id})

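# Hedged usage sketch: assuming this handler is mounted at an /upload route
# (the URL, host, and port below are illustrative, not taken from the code),
# a dataset file could be submitted with the `requests` library like so.
import requests

response = requests.post(
    'http://localhost:8002/upload',  # hypothetical address and path
    files={'file': open('main.csv', 'rb')},
    data={'name': 'My dataset', 'description': 'Example upload'},
)
print(response.json()['id'])  # e.g. 'datamart.upload.<hex>'
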
async def coro(future):
    metadata = msg2json(message)['metadata']
    try:
        try:
            metadata = future.result()
            if metadata['nb_rows'] == 0:
                logger.info(
                    "Dataset has no rows, not inserting into index: " +
                    "%r",
                    dataset_id,
                )
                delete_dataset_from_index(
                    self.es,
                    dataset_id,
                    # DO delete from Lazo
                    self.lazo_client,
                )
                self.es.index(
                    'pending',
                    dict(
                        status='error',
                        error="Dataset has no rows",
                        metadata=metadata,
                        date=datetime.utcnow().isoformat(),
                        source=metadata['source'],
                        materialize=metadata['materialize'],
                    ),
                    id=dataset_id,
                )
            else:
                # Delete dataset if already exists in index
                delete_dataset_from_index(
                    self.es,
                    dataset_id,
                    # Don't delete from Lazo, we inserted during profile
                    None,
                )
                # Insert results in Elasticsearch
                body = dict(metadata,
                            date=datetime.utcnow().isoformat() + 'Z',
                            version=os.environ['DATAMART_VERSION'])
                add_dataset_to_index(self.es, dataset_id, body)

                # Publish to RabbitMQ
                await self.datasets_exchange.publish(
                    json2msg(dict(body, id=dataset_id)),
                    dataset_id,
                )

                # Remove from alternate index
                try:
                    self.es.delete('pending', dataset_id)
                except elasticsearch.NotFoundError:
                    pass
        except DatasetTooBig:
            # Materializer reached size limit
            logger.info("Dataset over size limit: %r", dataset_id)
            message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Dataset is too big",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        except Exception as e:
            if isinstance(e, elasticsearch.RequestError):
                # This is a problem with our computed metadata
                logger.exception(
                    "Error inserting dataset %r in Elasticsearch",
                    dataset_id,
                )
            elif isinstance(e, elasticsearch.TransportError):
                # This is probably an issue with Elasticsearch
                # We'll log, nack and retry
                raise
            else:
                logger.exception("Error processing dataset %r", dataset_id)
            # Move message to failed queue
            await self.channel.default_exchange.publish(
                aio_pika.Message(message.body),
                self.failed_queue.name,
            )
            # Ack anyway, retrying would probably fail again
            message.ack()
            self.es.index(
                'pending',
                dict(
                    status='error',
                    error="Error profiling dataset",
                    metadata=metadata,
                    date=datetime.utcnow().isoformat(),
                    source=metadata['source'],
                    materialize=metadata['materialize'],
                ),
                id=dataset_id,
            )
        else:
            message.ack()
            logger.info("Dataset %r processed successfully", dataset_id)
    except Exception:
        message.nack()
        raise