def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    """Report the runtime and completion count of a finished task to statsd.

    The start timestamp stored under ``task_id`` in ``tasks_start_time`` is
    removed and used to compute the elapsed time in milliseconds. A timing
    metric and a counter are then emitted, both tagged with the task name,
    the lower-cased ``state`` (``'unknown'`` when falsy) and the hostname.

    For ``redash.tasks.execute_query`` the tags also carry ``args[1]`` as
    ``data_source_id``, and the state tag becomes ``'exception'`` when
    ``retval`` is an ``Exception`` instance.

    ``**kw`` absorbs any extra keyword arguments the signal dispatcher may
    pass, so they cannot raise ``TypeError`` before the ``try`` is entered.
    Any exception raised inside the body is logged and swallowed so that
    metrics reporting cannot fail the caller.
    """
    try:
        # time.time() is in seconds; the metric is reported in milliseconds.
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        tags = {
            'name': task.name,
            'state': (state or 'unknown').lower(),
            'hostname': socket.gethostname(),
        }
        if task.name == 'redash.tasks.execute_query':
            if isinstance(retval, Exception):
                tags['state'] = 'exception'
            # NOTE(review): assumes the second positional task argument is
            # the data source id — confirm against execute_query's signature.
            tags['data_source_id'] = args[1]

        metric = "celery.task.runtime"
        logging.debug(
            "metric=%s",
            json.dumps({'metric': metric, 'tags': tags, 'value': run_time}),
        )
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(metric_name('celery.task.count', tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
def update_health_status():
    """Run a health-check query against every data source and store the result.

    For each data source the runner's ``noop_query`` is used unless the
    environment variable ``REDASH_CUSTOM_HEALTH_QUERIES_<id>`` holds a
    non-empty value. The outcome (status, start time, human-readable start
    time, runtime) is passed to ``store_health_status``. Data sources whose
    check raises ``NotImplementedError`` are skipped without storing anything.
    """
    for source in models.DataSource.query:
        logger.info(u"task=update_health_status state=start ds_id=%s", source.id)

        elapsed = None
        default_query = source.query_runner.noop_query
        ds_id = str(source.id)

        # A non-empty per-data-source environment variable overrides the default.
        env_name = "REDASH_CUSTOM_HEALTH_QUERIES_{data_source_id}".format(data_source_id=ds_id)
        query_text = os.environ.get(env_name, "") or default_query

        try:
            start_time = time.time()
            test_connection(source.query_runner, query_text)
            elapsed = time.time() - start_time
        except NotImplementedError:
            logger.info(u"Unable to compute health status without test query for %s", source.name)
            continue
        except Exception:
            logger.warning(u"Failed health check for the data source: %s", source.name, exc_info=1)
            statsd_client.incr('update_health_status.error')
            logger.info(
                u"task=update_health_status state=error ds_id=%s runtime=%.2f",
                source.id,
                time.time() - start_time,
            )

        # ``elapsed`` stays None only when the check raised.
        if elapsed is None:
            outcome = "FAIL"
        else:
            outcome = "SUCCESS"

        report = {
            "status": outcome,
            "last_run": start_time,
            "last_run_human": str(parse_human_time(str(start_time))),
            "runtime": elapsed,
        }
        store_health_status(ds_id, source.name, query_text, report)
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    """Emit per-task runtime and outcome metrics to statsd once a task ends.

    The metric names embed the task name with the ``redash.tasks.`` prefix
    removed and remaining dots replaced by underscores. Tags carry the
    lower-cased state and the hostname; for ``redash.tasks.execute_query``
    they also carry ``args[1]`` as ``data_source_id``, and the state becomes
    ``'exception'`` when ``retval`` is an ``Exception``. Failures inside the
    handler are logged and never propagated.
    """
    try:
        finished_at = time.time()
        started_at = tasks_start_time.pop(task_id)
        run_time = 1000 * (finished_at - started_at)  # seconds -> milliseconds

        outcome = (state or 'unknown').lower()
        tags = {'state': outcome, 'hostname': socket.gethostname()}

        if task.name == 'redash.tasks.execute_query':
            if isinstance(retval, Exception):
                outcome = 'exception'
                tags['state'] = outcome
            tags['data_source_id'] = args[1]

        short_name = task.name.replace('redash.tasks.', '')
        normalized_task_name = short_name.replace('.', '_')

        metric = "celery.task_runtime.{}".format(normalized_task_name)
        payload = {'metric': metric, 'tags': tags, 'value': run_time}
        logging.debug("metric=%s", json_dumps(payload))

        statsd_client.timing(metric_name(metric, tags), run_time)

        counter = 'celery.task.{}.{}'.format(normalized_task_name, outcome)
        statsd_client.incr(metric_name(counter, tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    """Publish timing and completion-count metrics for a finished task.

    Elapsed time is measured from the timestamp stored under ``task_id`` in
    ``tasks_start_time`` (the entry is removed) and reported in
    milliseconds. Every error raised while reporting is logged and
    suppressed.
    """
    try:
        elapsed_ms = 1000 * (time.time() - tasks_start_time.pop(task_id))

        final_state = (state or "unknown").lower()
        tags = {"state": final_state, "hostname": socket.gethostname()}

        is_query_task = task.name == "redash.tasks.execute_query"
        if is_query_task and isinstance(retval, Exception):
            # A returned exception object counts as a failed query.
            final_state = "exception"
            tags["state"] = "exception"
        if is_query_task:
            tags["data_source_id"] = args[1]

        # Strip the package prefix and flatten remaining dots for the metric key.
        normalized_task_name = task.name.replace("redash.tasks.", "").replace(".", "_")
        runtime_metric = "celery.task_runtime.{}".format(normalized_task_name)

        logging.debug(
            "metric=%s",
            json_dumps({"metric": runtime_metric, "tags": tags, "value": elapsed_ms}),
        )
        statsd_client.timing(metric_name(runtime_metric, tags), elapsed_ms)

        count_metric = "celery.task.{}.{}".format(normalized_task_name, final_state)
        statsd_client.incr(metric_name(count_metric, tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
def health_status():
    """Probe each data source with a test query and record the result.

    The runner's ``noop_query`` is used unless
    ``settings.CUSTOM_HEALTH_QUERIES`` contains an entry keyed by the data
    source id (as a string). The result is handed to
    ``update_health_status`` with status ``"SUCCESS"`` when the probe
    completed and ``"FAIL"`` when it raised.
    """
    for ds in models.DataSource.query:
        logger.info(u"task=health_status state=start ds_id=%s", ds.id)

        elapsed = None
        query_text = ds.query_runner.noop_query
        overrides = settings.CUSTOM_HEALTH_QUERIES
        ds_id = str(ds.id)

        has_override = bool(overrides) and ds_id in overrides
        if has_override:
            query_text = overrides[ds_id]

        try:
            start_time = time.time()
            ds.query_runner.test_connection(query_text)
            elapsed = time.time() - start_time
        except Exception:
            logger.warning(u"Failed health check for the data source: %s", ds.name, exc_info=1)
            statsd_client.incr('health_status.error')
            logger.info(
                u"task=health_status state=error ds_id=%s runtime=%.2f",
                ds.id,
                time.time() - start_time,
            )

        # ``elapsed`` is only set when the probe finished without raising.
        if elapsed is not None:
            outcome = "SUCCESS"
        else:
            outcome = "FAIL"

        update_health_status(
            ds_id,
            ds.name,
            query_text,
            {
                "status": outcome,
                "last_run": start_time,
                "last_run_human": str(parse_human_time(str(start_time))),
                "runtime": elapsed,
            },
        )
def run_query(self, query, user):
    """Execute ``query`` and return a ``(json_data, error)`` pair.

    The query text is split into statements which are executed in order on
    one cursor. If the cursor exposes a result description after the last
    statement, up to ``ROW_LIMIT`` rows are fetched and serialised together
    with the column definitions. A ``"truncated"`` flag is added when more
    rows were available. Otherwise a single-row placeholder result is
    returned.

    On ``pyodbc.Error`` the function returns ``(None, message)``, where the
    message is the second exception argument when present and
    ``str(exception)`` otherwise.

    The cursor is now closed on the failure path as well as on success, so
    a failing statement no longer leaks it.

    ``user`` is accepted for interface compatibility and is not used here.
    """
    try:
        cursor = self._get_cursor()
        try:
            for stmt in split_sql_statements(query):
                cursor.execute(stmt)

            if cursor.description is not None:
                result_set = cursor.fetchmany(ROW_LIMIT)
                columns = self.fetch_columns(
                    [
                        (col[0], TYPES_MAP.get(col[1], TYPE_STRING))
                        for col in cursor.description
                    ]
                )
                # Column names are the same for every row; build them once.
                column_names = [column["name"] for column in columns]
                rows = [dict(zip(column_names, row)) for row in result_set]
                data = {"columns": columns, "rows": rows}

                # A full batch plus one more available row means rows were dropped.
                if len(result_set) >= ROW_LIMIT and cursor.fetchone() is not None:
                    logger.warning("Truncated result set.")
                    statsd_client.incr("redash.query_runner.databricks.truncated")
                    data["truncated"] = True
            else:
                data = {
                    "columns": [{"name": "result", "type": TYPE_STRING}],
                    "rows": [{"result": "No data was returned."}],
                }

            json_data = json_dumps(data)
            error = None
        except BaseException:
            # Release the cursor on failure, but never let a close error
            # replace the exception that actually caused the failure.
            try:
                cursor.close()
            except pyodbc.Error:
                logger.warning("Failed to close cursor after a failed query.", exc_info=True)
            raise
        # Success path: a close failure is still reported as a query error.
        cursor.close()
    except pyodbc.Error as e:
        error = str(e.args[1]) if len(e.args) > 1 else str(e)
        json_data = None

    return json_data, error
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    """Send runtime and count metrics for a completed task to statsd.

    Elapsed time is computed from the timestamp stored under ``task_id`` in
    ``tasks_start_time`` (the entry is removed) and reported in
    milliseconds. Metrics are tagged with the task name, the lower-cased
    ``state`` (``"unknown"`` when falsy) and the hostname.

    For ``redash.tasks.execute_query`` the tags additionally include
    ``args[1]`` as ``data_source_id``, and the state tag is set to
    ``"exception"`` when ``retval`` is an ``Exception`` instance.

    ``**kw`` tolerates extra keyword arguments from the signal dispatcher,
    which would otherwise raise ``TypeError`` outside the ``try`` below. Any
    error raised while reporting is logged and suppressed.
    """
    try:
        # time.time() yields seconds; the timing metric is in milliseconds.
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        tags = {
            "name": task.name,
            "state": (state or "unknown").lower(),
            "hostname": socket.gethostname(),
        }
        if task.name == "redash.tasks.execute_query":
            if isinstance(retval, Exception):
                tags["state"] = "exception"
            # NOTE(review): assumes args[1] is the data source id — confirm
            # against the execute_query task signature.
            tags["data_source_id"] = args[1]

        metric = "celery.task.runtime"
        logging.debug(
            "metric=%s",
            json.dumps({"metric": metric, "tags": tags, "value": run_time}),
        )
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(metric_name("celery.task.count", tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
def execute_job(self, job, queue):
    """Run ``job`` via the parent implementation while tracking statsd counters.

    Increments the per-queue ``running`` and ``started`` counters before
    delegating. Afterwards, whether or not the parent call raised, it
    decrements ``running`` and increments ``finished`` or ``failed``
    depending on whether the job status equals ``JobStatus.FINISHED``.
    """
    queue_name = queue.name
    running_metric = "rq.jobs.running.{}".format(queue_name)

    statsd_client.incr(running_metric)
    statsd_client.incr("rq.jobs.started.{}".format(queue_name))
    try:
        super().execute_job(job, queue)
    finally:
        statsd_client.decr(running_metric)

        if job.get_status() == JobStatus.FINISHED:
            outcome_template = "rq.jobs.finished.{}"
        else:
            outcome_template = "rq.jobs.failed.{}"
        statsd_client.incr(outcome_template.format(queue_name))
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    """Record how long a task ran and how it ended, as statsd metrics.

    A timing metric named after the normalised task name and a counter
    named after the task name plus final state are emitted. The whole body
    is guarded: any exception is logged and not re-raised.
    """
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        result_state = (state or 'unknown').lower()
        tags = {'state': result_state, 'hostname': socket.gethostname()}

        if task.name == 'redash.tasks.execute_query':
            # Query tasks may hand back the exception as their return value.
            failed = isinstance(retval, Exception)
            if failed:
                tags['state'] = 'exception'
                result_state = 'exception'
            tags['data_source_id'] = args[1]

        # e.g. the "redash.tasks." prefix is dropped and dots become underscores.
        normalized_task_name = task.name.replace('redash.tasks.', '').replace('.', '_')

        metric = "celery.task_runtime.{}".format(normalized_task_name)
        debug_payload = json_dumps({'metric': metric, 'tags': tags, 'value': run_time})
        logging.debug("metric=%s", debug_payload)

        timing_key = metric_name(metric, tags)
        statsd_client.timing(timing_key, run_time)

        count_key = metric_name('celery.task.{}.{}'.format(normalized_task_name, result_state), tags)
        statsd_client.incr(count_key)
    except Exception:
        logging.exception("Exception during task_postrun handler.")
def refresh_schema(data_source_id):
    """Force a schema refresh for one data source and report the outcome.

    Loads the data source by id, calls ``get_schema(refresh=True)`` and
    logs/counts one of three outcomes: success, timeout
    (``JobTimeoutException``) or error (any other ``Exception``).
    Exceptions raised by the refresh are not propagated.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()

    def elapsed():
        # Seconds since the refresh started.
        return time.time() - start_time

    try:
        ds.get_schema(refresh=True)
        logger.info(u"task=refresh_schema state=finished ds_id=%s runtime=%.2f", ds.id, elapsed())
        statsd_client.incr("refresh_schema.success")
    except JobTimeoutException:
        logger.info(u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f", ds.id, elapsed())
        statsd_client.incr("refresh_schema.timeout")
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr("refresh_schema.error")
        logger.info(u"task=refresh_schema state=failed ds_id=%s runtime=%.2f", ds.id, elapsed())
def refresh_schema(data_source_id):
    """Refresh the cached schema of a data source and record the result.

    The data source is fetched by id and ``get_schema(refresh=True)`` is
    invoked. Success, ``SoftTimeLimitExceeded`` and any other exception are
    each logged with the elapsed runtime and counted in statsd; none of
    them is re-raised.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()

    def runtime():
        # Seconds elapsed since the task started.
        return time.time() - start_time

    try:
        ds.get_schema(refresh=True)
        logger.info(
            u"task=refresh_schema state=finished ds_id=%s runtime=%.2f",
            ds.id,
            runtime(),
        )
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(
            u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
            ds.id,
            runtime(),
        )
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(
            u"Failed refreshing schema for the data source: %s",
            ds.name,
            exc_info=1,
        )
        statsd_client.incr('refresh_schema.error')
        logger.info(
            u"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
            ds.id,
            runtime(),
        )
def refresh_schema(data_source_id):
    """Synchronise persisted table/column metadata with a data source's schema.

    The schema is fetched from the data source's query runner. For every
    table in it, a ``TableMetadata`` row is updated or created and a
    ``ColumnMetadata`` row is updated or created per column. The session is
    then committed, ``get_table_sample_data`` is queued for the table, and
    columns of that table missing from the fetched schema are flagged
    ``exists = False``. Finally, tables of this data source missing from
    the fetched schema are flagged ``exists = False``.

    Success, ``SoftTimeLimitExceeded`` and any other exception are logged
    and counted in statsd; exceptions are not re-raised.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()
    try:
        seen_tables = set()
        schema = ds.query_runner.get_schema(get_stats=True)

        for table in schema:
            table_name = table['name']
            seen_tables.add(table_name)

            # first() is used on the assumption that a data source holds at
            # most one table with a given name.
            table_row = TableMetadata.query.filter(
                TableMetadata.name == table_name,
                TableMetadata.data_source_id == ds.id,
            ).first()

            if not table_row:
                has_metadata = 'metadata' in table
                table_row = TableMetadata(
                    org_id=ds.org_id,
                    name=table_name,
                    data_source_id=ds.id,
                    column_metadata=has_metadata,
                )
                models.db.session.add(table_row)
                # Flush so the new row gets the id the column rows refer to.
                models.db.session.flush()
            else:
                TableMetadata.query.filter(
                    TableMetadata.id == table_row.id,
                ).update({"exists": True})

            seen_columns = set()
            for position, column_name in enumerate(table['columns']):
                seen_columns.add(column_name)

                column_values = {
                    'org_id': ds.org_id,
                    'table_id': table_row.id,
                    'name': column_name,
                    'type': None,
                    'example': None,
                    'exists': True,
                }
                if 'metadata' in table:
                    column_values['type'] = table['metadata'][position]['type']

                # Update the column row when it already exists, else insert it.
                column_row = ColumnMetadata.query.filter(
                    ColumnMetadata.name == column_name,
                    ColumnMetadata.table_id == table_row.id,
                ).options(load_only('id')).first()

                if not column_row:
                    models.db.session.add(ColumnMetadata(**column_values))
                else:
                    ColumnMetadata.query.filter(
                        ColumnMetadata.id == column_row.id,
                    ).update(column_values)

            models.db.session.commit()

            get_table_sample_data.apply_async(
                args=(data_source_id, table, table_row.id),
                queue=settings.SCHEMAS_REFRESH_QUEUE,
            )

            # Columns of this table that were not in the fetched schema are
            # flagged as no longer existing.
            stale_columns = ColumnMetadata.query.filter(
                ColumnMetadata.exists == True,
                ColumnMetadata.table_id == table_row.id,
                ~ColumnMetadata.name.in_(tuple(seen_columns)),
            )
            stale_columns.update(
                {"exists": False, "updated_at": db.func.now()},
                synchronize_session='fetch',
            )

        # Tables of this data source that were not in the fetched schema are
        # flagged as no longer existing.
        stale_tables = TableMetadata.query.filter(
            TableMetadata.exists == True,
            TableMetadata.data_source_id == ds.id,
            ~TableMetadata.name.in_(tuple(seen_tables)),
        )
        stale_tables.update(
            {"exists": False, "updated_at": db.func.now()},
            synchronize_session='fetch',
        )

        models.db.session.commit()

        logger.info(
            u"task=refresh_schema state=finished ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(
            u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr('refresh_schema.error')
        logger.info(
            u"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
def enqueue_job(self, *args, **kwargs):
    """Enqueue a job via the parent implementation and count it in statsd.

    All arguments are forwarded unchanged. After the parent call returns,
    the ``rq.jobs.created.<name>`` counter for this object's ``name`` is
    incremented and the parent's return value is handed back.
    """
    created_job = super().enqueue_job(*args, **kwargs)

    counter_name = "rq.jobs.created.{}".format(self.name)
    statsd_client.incr(counter_name)

    return created_job
def refresh_schema(data_source_id, max_type_string_length=250):
    """Refresh persisted schema metadata for a data source under a redis lock.

    A non-blocking lock keyed by the data source id is attempted first. If
    it is already held, the function logs that and returns without doing
    any work. Otherwise the schema is fetched from the query runner, table
    and column metadata are stored through ``models.TableMetadata.store``
    and ``models.ColumnMetadata.store``, rows missing from the fetched
    schema are flagged ``exists = False``, the session is committed and the
    schema cache is repopulated. The lock is released in all cases.

    Args:
        data_source_id: id of the data source to refresh.
        max_type_string_length: limit handed to ``truncate_long_string``
            for each column type string.

    Success, ``JobTimeoutException`` and any other exception are logged and
    counted in statsd; exceptions are not re-raised.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info("task=refresh_schema state=start ds_id=%s", ds.id)

    lock_key = "data_source:schema:refresh:{}:lock".format(data_source_id)
    lock = redis_connection.lock(lock_key, timeout=settings.SCHEMA_REFRESH_TIME_LIMIT)
    acquired = lock.acquire(blocking=False)
    start_time = time.time()

    if not acquired:
        logger.info("task=refresh_schema state=alreadylocked ds_id=%s", ds.id)
        return

    logger.info("task=refresh_schema state=locked ds_id=%s", ds.id)
    try:
        # Names present in the freshly fetched schema.
        existing_tables_set = set()
        existing_columns_set = set()

        # Payloads handed to the store() helpers.
        table_data = {}
        column_data = {}

        columns_by_table = {}
        metadata_by_table = {}

        for fetched_table in ds.query_runner.get_schema(get_stats=True):
            name = fetched_table["name"]
            existing_tables_set.add(name)

            table_data[name] = {
                "org_id": ds.org_id,
                "name": name,
                "data_source_id": ds.id,
                "column_metadata": "metadata" in fetched_table,
                "exists": True,
            }
            columns_by_table[name] = fetched_table["columns"]
            metadata_by_table[name] = fetched_table.get("metadata", None)

        models.TableMetadata.store(ds, existing_tables_set, table_data)

        persisted_tables = models.TableMetadata.query.filter(
            models.TableMetadata.exists.is_(True),
            models.TableMetadata.data_source_id == ds.id,
        ).all()

        for table in persisted_tables:
            fetched_columns = columns_by_table.get(table.name, [])
            for position, column in enumerate(fetched_columns):
                existing_columns_set.add(column)

                column_data[column] = {
                    "org_id": ds.org_id,
                    "table_id": table.id,
                    "name": column,
                    "type": None,
                    "exists": True,
                }

                if table.column_metadata:
                    raw_type = metadata_by_table[table.name][position]["type"]
                    column_data[column]["type"] = truncate_long_string(
                        raw_type, max_type_string_length
                    )

            models.ColumnMetadata.store(table, existing_columns_set, column_data)

            # Columns of this table that were not in the fetched schema are
            # flagged as no longer existing.
            existing_columns_list = list(existing_columns_set)
            stale_columns = models.ColumnMetadata.query.filter(
                models.ColumnMetadata.exists.is_(True),
                models.ColumnMetadata.table_id == table.id,
                ~models.ColumnMetadata.name.in_(existing_columns_list),
            )
            stale_columns.update(
                {"exists": False, "updated_at": models.db.func.now()},
                synchronize_session="fetch",
            )

            # The same set object is reused for the next table.
            existing_columns_set.clear()

        # Tables that were not in the fetched schema are flagged as no
        # longer existing.
        existing_tables_list = list(existing_tables_set)
        stale_tables = models.TableMetadata.query.filter(
            models.TableMetadata.exists.is_(True),
            models.TableMetadata.data_source_id == ds.id,
            ~models.TableMetadata.name.in_(existing_tables_list),
        )
        stale_tables.update(
            {"exists": False, "updated_at": models.db.func.now()},
            synchronize_session="fetch",
        )

        models.db.session.commit()

        logger.info("task=refresh_schema state=caching ds_id=%s", ds.id)
        ds.schema_cache.populate(forced=True)
        logger.info("task=refresh_schema state=cached ds_id=%s", ds.id)

        logger.info(
            "task=refresh_schema state=finished ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr("refresh_schema.success")
    except JobTimeoutException:
        logger.info(
            "task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr("refresh_schema.timeout")
    except Exception:
        logger.warning("Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr("refresh_schema.error")
        logger.info(
            "task=refresh_schema state=failed ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
    finally:
        lock.release()
        logger.info("task=refresh_schema state=unlocked ds_id=%s", ds.id)
def refresh_schema(data_source_id):
    """Rebuild the persisted table and column metadata for a data source.

    The schema is fetched from the data source's query runner. Table
    metadata is written through ``insert_or_update_table_metadata`` and
    committed. For every persisted table of the data source that is flagged
    as existing, column metadata is written through
    ``insert_or_update_column_metadata`` and committed, and columns missing
    from the fetched schema are flagged ``exists = False``. Finally, tables
    missing from the fetched schema are flagged ``exists = False``.

    Column type strings are passed through ``truncate_long_string`` with a
    limit of 250. Success, ``SoftTimeLimitExceeded`` and any other
    exception are logged and counted in statsd; exceptions are not
    re-raised.
    """
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()

    MAX_TYPE_STRING_LENGTH = 250

    try:
        schema = ds.query_runner.get_schema(get_stats=True)

        # Names present in the freshly fetched schema.
        existing_tables_set = set()
        existing_columns_set = set()

        # Payloads handed to the insert/update helpers.
        table_data = {}
        column_data = {}

        columns_by_table = {}
        metadata_by_table = {}

        for fetched_table in schema:
            name = fetched_table['name']
            existing_tables_set.add(name)

            table_data[name] = {
                'org_id': ds.org_id,
                'name': name,
                'data_source_id': ds.id,
                'column_metadata': "metadata" in fetched_table,
                'exists': True,
            }
            columns_by_table[name] = fetched_table['columns']
            metadata_by_table[name] = fetched_table.get('metadata', None)

        insert_or_update_table_metadata(ds, existing_tables_set, table_data)
        models.db.session.commit()

        persisted_tables = TableMetadata.query.filter(
            TableMetadata.exists.is_(True),
            TableMetadata.data_source_id == ds.id,
        ).all()

        for table in persisted_tables:
            fetched_columns = columns_by_table.get(table.name, [])
            for position, column in enumerate(fetched_columns):
                existing_columns_set.add(column)

                column_data[column] = {
                    'org_id': ds.org_id,
                    'table_id': table.id,
                    'name': column,
                    'type': None,
                    'exists': True,
                }

                if table.column_metadata:
                    raw_type = metadata_by_table[table.name][position]['type']
                    column_data[column]['type'] = truncate_long_string(
                        raw_type, MAX_TYPE_STRING_LENGTH
                    )

            insert_or_update_column_metadata(table, existing_columns_set, column_data)
            models.db.session.commit()

            # Columns of this table that were not in the fetched schema are
            # flagged as no longer existing.
            existing_columns_list = list(existing_columns_set)
            stale_columns = ColumnMetadata.query.filter(
                ColumnMetadata.exists.is_(True),
                ColumnMetadata.table_id == table.id,
                ~ColumnMetadata.name.in_(existing_columns_list),
            )
            stale_columns.update(
                {"exists": False, "updated_at": db.func.now()},
                synchronize_session='fetch',
            )

            # The same set object is reused for the next table.
            existing_columns_set.clear()

        # Tables that were not in the fetched schema are flagged as no
        # longer existing.
        existing_tables_list = list(existing_tables_set)
        stale_tables = TableMetadata.query.filter(
            TableMetadata.exists.is_(True),
            TableMetadata.data_source_id == ds.id,
            ~TableMetadata.name.in_(existing_tables_list),
        )
        stale_tables.update(
            {"exists": False, "updated_at": db.func.now()},
            synchronize_session='fetch',
        )

        models.db.session.commit()

        logger.info(
            u"task=refresh_schema state=finished ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(
            u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr('refresh_schema.error')
        logger.info(
            u"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )