Example #1
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval,
                         state):
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        tags = {
            'name': task.name,
            'state': (state or 'unknown').lower(),
            'hostname': socket.gethostname()
        }
        if task.name == 'redash.tasks.execute_query':
            if isinstance(retval, Exception):
                tags['state'] = 'exception'

            tags['data_source_id'] = args[1]

        metric = "celery.task.runtime"
        logging.debug(
            "metric=%s",
            json.dumps({
                'metric': metric,
                'tags': tags,
                'value': run_time
            }))
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(metric_name('celery.task.count', tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
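Note that the handler above pops task start times from a tasks_start_time dict that is filled by a matching task_prerun signal handler, which is not shown here. A minimal sketch of that counterpart, assuming a module-level dict and standard Celery signal wiring:

import time

from celery.signals import task_prerun

tasks_start_time = {}  # task_id -> wall-clock start time, read by task_postrun_handler


@task_prerun.connect
def task_prerun_handler(signal, sender, task_id, task, args, kwargs, **kw):
    # Record when the task started so the postrun handler can compute run_time.
    tasks_start_time[task_id] = time.time()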
Example #2
def update_health_status():
    for data_source in models.DataSource.query:
        logger.info(u"task=update_health_status state=start ds_id=%s", data_source.id)

        runtime = None
        query_text = data_source.query_runner.noop_query
        ds_id = str(data_source.id)

        custom_query_env_var = "REDASH_CUSTOM_HEALTH_QUERIES_{data_source_id}".format(data_source_id=ds_id)
        custom_query = os.environ.get(custom_query_env_var, "")
        query_text = custom_query or query_text

        try:
            start_time = time.time()
            test_connection(data_source.query_runner, query_text)
            runtime = time.time() - start_time
        except NotImplementedError:
            logger.info(u"Unable to compute health status without test query for %s", data_source.name)
            continue
        except Exception as e:
            logger.warning(u"Failed health check for the data source: %s", data_source.name, exc_info=1)
            statsd_client.incr('update_health_status.error')
            logger.info(u"task=update_health_status state=error ds_id=%s runtime=%.2f", data_source.id, time.time() - start_time)

        status = {
            "status": "FAIL" if runtime is None else "SUCCESS",
            "last_run": start_time,
            "last_run_human": str(parse_human_time(str(start_time))),
            "runtime": runtime
        }
        store_health_status(ds_id, data_source.name, query_text, status)
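The test_connection helper called above is not part of this snippet. A plausible sketch, assuming it raises NotImplementedError when no test query is available and turns query errors into exceptions (the exact helper in the original codebase may differ):

def test_connection(query_runner, query_text):
    # Hypothetical helper matching the call above: without a query the health
    # status cannot be computed; otherwise run it and surface any error.
    if query_text is None:
        raise NotImplementedError()
    data, error = query_runner.run_query(query_text, None)
    if error is not None:
        raise Exception(error)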
Example #3
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval,
                         state, **kw):
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        state = (state or 'unknown').lower()
        tags = {'state': state, 'hostname': socket.gethostname()}
        if task.name == 'redash.tasks.execute_query':
            if isinstance(retval, Exception):
                tags['state'] = 'exception'
                state = 'exception'

            tags['data_source_id'] = args[1]

        normalized_task_name = task.name.replace('redash.tasks.',
                                                 '').replace('.', '_')
        metric = "celery.task_runtime.{}".format(normalized_task_name)
        logging.debug(
            "metric=%s",
            json_dumps({
                'metric': metric,
                'tags': tags,
                'value': run_time
            }))
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(
            metric_name(
                'celery.task.{}.{}'.format(normalized_task_name, state), tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
Example #4
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval,
                         state, **kw):
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        state = (state or "unknown").lower()
        tags = {"state": state, "hostname": socket.gethostname()}
        if task.name == "redash.tasks.execute_query":
            if isinstance(retval, Exception):
                tags["state"] = "exception"
                state = "exception"

            tags["data_source_id"] = args[1]

        normalized_task_name = task.name.replace("redash.tasks.",
                                                 "").replace(".", "_")
        metric = "celery.task_runtime.{}".format(normalized_task_name)
        logging.debug(
            "metric=%s",
            json_dumps({
                "metric": metric,
                "tags": tags,
                "value": run_time
            }))
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(
            metric_name(
                "celery.task.{}.{}".format(normalized_task_name, state), tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
Example #5
def health_status():
    for ds in models.DataSource.query:
        logger.info(u"task=health_status state=start ds_id=%s", ds.id)

        runtime = None
        query_text = ds.query_runner.noop_query
        custom_queries = settings.CUSTOM_HEALTH_QUERIES
        ds_id = str(ds.id)

        if custom_queries and ds_id in custom_queries:
            query_text = custom_queries[ds_id]

        try:
            start_time = time.time()
            ds.query_runner.test_connection(query_text)
            runtime = time.time() - start_time
        except Exception as e:
            logger.warning(u"Failed health check for the data source: %s",
                           ds.name,
                           exc_info=1)
            statsd_client.incr('health_status.error')
            logger.info(
                u"task=health_status state=error ds_id=%s runtime=%.2f", ds.id,
                time.time() - start_time)

        update_health_status(
            ds_id, ds.name, query_text, {
                "status": "SUCCESS" if runtime is not None else "FAIL",
                "last_run": start_time,
                "last_run_human": str(parse_human_time(str(start_time))),
                "runtime": runtime
            })
Example #6
    def run_query(self, query, user):
        try:
            cursor = self._get_cursor()

            statements = split_sql_statements(query)
            for stmt in statements:
                cursor.execute(stmt)

            if cursor.description is not None:
                result_set = cursor.fetchmany(ROW_LIMIT)
                columns = self.fetch_columns(
                    [
                        (i[0], TYPES_MAP.get(i[1], TYPE_STRING))
                        for i in cursor.description
                    ]
                )

                rows = [
                    dict(zip((column["name"] for column in columns), row))
                    for row in result_set
                ]

                data = {"columns": columns, "rows": rows}

                if (
                    len(result_set) >= ROW_LIMIT
                    and cursor.fetchone() is not None
                ):
                    logger.warning("Truncated result set.")
                    statsd_client.incr("redash.query_runner.databricks.truncated")
                    data["truncated"] = True
                json_data = json_dumps(data)
                error = None
            else:
                error = None
                json_data = json_dumps(
                    {
                        "columns": [{"name": "result", "type": TYPE_STRING}],
                        "rows": [{"result": "No data was returned."}],
                    }
                )

            cursor.close()
        except pyodbc.Error as e:
            if len(e.args) > 1:
                error = str(e.args[1])
            else:
                error = str(e)
            json_data = None

        return json_data, error
Example #7
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state):
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        tags = {"name": task.name, "state": (state or "unknown").lower(), "hostname": socket.gethostname()}
        if task.name == "redash.tasks.execute_query":
            if isinstance(retval, Exception):
                tags["state"] = "exception"

            tags["data_source_id"] = args[1]

        metric = "celery.task.runtime"
        logging.debug("metric=%s", json.dumps({"metric": metric, "tags": tags, "value": run_time}))
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(metric_name("celery.task.count", tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
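All of these handlers depend on a shared statsd_client and a metric_name helper that folds the tags dict into the metric key. A minimal sketch of what those could look like using the statsd package; the tag-flattening scheme shown here is an assumption, not necessarily what the original codebase does:

from statsd import StatsClient

statsd_client = StatsClient(host="127.0.0.1", port=8125, prefix="redash")


def metric_name(name, tags):
    # Illustrative only: append sorted key=value pairs to the metric name, e.g.
    # "celery.task.runtime,hostname=worker-1,name=execute_query,state=success".
    suffix = ",".join("{}={}".format(k, v) for k, v in sorted(tags.items()))
    return "{},{}".format(name, suffix) if suffix else name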
Example #8
    def execute_job(self, job, queue):
        statsd_client.incr("rq.jobs.running.{}".format(queue.name))
        statsd_client.incr("rq.jobs.started.{}".format(queue.name))
        try:
            super().execute_job(job, queue)
        finally:
            statsd_client.decr("rq.jobs.running.{}".format(queue.name))
            if job.get_status() == JobStatus.FINISHED:
                statsd_client.incr("rq.jobs.finished.{}".format(queue.name))
            else:
                statsd_client.incr("rq.jobs.failed.{}".format(queue.name))
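The method above belongs to an RQ Worker subclass. A minimal sketch of the complete wrapper and of how such a worker might be started; the class name StatsdRecordingWorker, the queue name, the Redis connection details, and the statsd_client import location are assumptions:

from redis import Redis
from rq import Queue, Worker
from rq.job import JobStatus

from redash import statsd_client  # assumed import location


class StatsdRecordingWorker(Worker):
    """Assumed wrapper class: emits statsd counters around each job, as in the example above."""

    def execute_job(self, job, queue):
        statsd_client.incr("rq.jobs.running.{}".format(queue.name))
        statsd_client.incr("rq.jobs.started.{}".format(queue.name))
        try:
            super().execute_job(job, queue)
        finally:
            statsd_client.decr("rq.jobs.running.{}".format(queue.name))
            if job.get_status() == JobStatus.FINISHED:
                statsd_client.incr("rq.jobs.finished.{}".format(queue.name))
            else:
                statsd_client.incr("rq.jobs.failed.{}".format(queue.name))


if __name__ == "__main__":
    redis_conn = Redis()
    StatsdRecordingWorker([Queue("queries", connection=redis_conn)],
                          connection=redis_conn).work()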
Example #9
def task_postrun_handler(signal, sender, task_id, task, args, kwargs, retval, state, **kw):
    try:
        run_time = 1000 * (time.time() - tasks_start_time.pop(task_id))

        state = (state or 'unknown').lower()
        tags = {'state': state, 'hostname': socket.gethostname()}
        if task.name == 'redash.tasks.execute_query':
            if isinstance(retval, Exception):
                tags['state'] = 'exception'
                state = 'exception'

            tags['data_source_id'] = args[1]

        normalized_task_name = task.name.replace('redash.tasks.', '').replace('.', '_')
        metric = "celery.task_runtime.{}".format(normalized_task_name)
        logging.debug("metric=%s", json_dumps({'metric': metric, 'tags': tags, 'value': run_time}))
        statsd_client.timing(metric_name(metric, tags), run_time)
        statsd_client.incr(metric_name('celery.task.{}.{}'.format(normalized_task_name, state), tags))
    except Exception:
        logging.exception("Exception during task_postrun handler.")
Example #10
def refresh_schema(data_source_id):
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()
    try:
        ds.get_schema(refresh=True)
        logger.info(
            u"task=refresh_schema state=finished ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr("refresh_schema.success")
    except JobTimeoutException:
        logger.info(
            u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
        statsd_client.incr("refresh_schema.timeout")
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s",
                       ds.name,
                       exc_info=1)
        statsd_client.incr("refresh_schema.error")
        logger.info(
            u"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
            ds.id,
            time.time() - start_time,
        )
Example #11
def refresh_schema(data_source_id):
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()
    try:
        ds.get_schema(refresh=True)
        logger.info(u"task=refresh_schema state=finished ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr('refresh_schema.error')
        logger.info(u"task=refresh_schema state=failed ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
Example #12
def refresh_schema(data_source_id):
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()

    try:
        existing_tables = set()
        schema = ds.query_runner.get_schema(get_stats=True)
        for table in schema:
            table_name = table['name']
            existing_tables.add(table_name)

            # Assume that there will only exist 1 table with a given name for a given data source so we use first()
            persisted_table = TableMetadata.query.filter(
                TableMetadata.name == table_name,
                TableMetadata.data_source_id == ds.id,
            ).first()

            if persisted_table:
                TableMetadata.query.filter(
                    TableMetadata.id == persisted_table.id, ).update(
                        {"exists": True})
            else:
                metadata = 'metadata' in table
                persisted_table = TableMetadata(org_id=ds.org_id,
                                                name=table_name,
                                                data_source_id=ds.id,
                                                column_metadata=metadata)
                models.db.session.add(persisted_table)
                models.db.session.flush()

            existing_columns = set()
            for i, column in enumerate(table['columns']):
                existing_columns.add(column)
                column_metadata = {
                    'org_id': ds.org_id,
                    'table_id': persisted_table.id,
                    'name': column,
                    'type': None,
                    'example': None,
                    'exists': True
                }
                if 'metadata' in table:
                    column_metadata['type'] = table['metadata'][i]['type']

                # If the column exists, update it, otherwise create a new one.
                persisted_column = ColumnMetadata.query.filter(
                    ColumnMetadata.name == column,
                    ColumnMetadata.table_id == persisted_table.id,
                ).options(load_only('id')).first()
                if persisted_column:
                    ColumnMetadata.query.filter(
                        ColumnMetadata.id == persisted_column.id, ).update(
                            column_metadata)
                else:
                    models.db.session.add(ColumnMetadata(**column_metadata))
            models.db.session.commit()

            get_table_sample_data.apply_async(
                args=(data_source_id, table, persisted_table.id),
                queue=settings.SCHEMAS_REFRESH_QUEUE)

            # If a column did not exist, set the 'column_exists' flag to false.
            existing_columns_list = tuple(existing_columns)
            ColumnMetadata.query.filter(
                ColumnMetadata.exists == True,
                ColumnMetadata.table_id == persisted_table.id,
                ~ColumnMetadata.name.in_(existing_columns_list),
            ).update({
                "exists": False,
                "updated_at": db.func.now()
            },
                     synchronize_session='fetch')

        # If a table did not exist in the get_schema() response above, set the 'exists' flag to false.
        existing_tables_list = tuple(existing_tables)
        tables_to_update = TableMetadata.query.filter(
            TableMetadata.exists == True,
            TableMetadata.data_source_id == ds.id,
            ~TableMetadata.name.in_(existing_tables_list)).update(
                {
                    "exists": False,
                    "updated_at": db.func.now()
                },
                synchronize_session='fetch')

        models.db.session.commit()

        logger.info(
            u"task=refresh_schema state=finished ds_id=%s runtime=%.2f", ds.id,
            time.time() - start_time)
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
                    ds.id,
                    time.time() - start_time)
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s",
                       ds.name,
                       exc_info=1)
        statsd_client.incr('refresh_schema.error')
        logger.info(u"task=refresh_schema state=failed ds_id=%s runtime=%.2f",
                    ds.id,
                    time.time() - start_time)
Example #13
    def enqueue_job(self, *args, **kwargs):
        job = super().enqueue_job(*args, **kwargs)
        statsd_client.incr("rq.jobs.created.{}".format(self.name))
        return job
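Likewise, enqueue_job above is a method on an RQ Queue subclass. A short sketch of the wrapper and of how it might be used; the class name StatsdRecordingQueue and the import locations are assumptions:

from redis import Redis
from rq import Queue

from redash import statsd_client  # assumed import location


class StatsdRecordingQueue(Queue):
    """Assumed wrapper class: counts every job created on this queue."""

    def enqueue_job(self, *args, **kwargs):
        job = super().enqueue_job(*args, **kwargs)
        statsd_client.incr("rq.jobs.created.{}".format(self.name))
        return job


# Hypothetical usage: every enqueue bumps "rq.jobs.created.default".
queue = StatsdRecordingQueue("default", connection=Redis())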
Example #14
def refresh_schema(data_source_id, max_type_string_length=250):
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info("task=refresh_schema state=start ds_id=%s", ds.id)
    lock_key = "data_source:schema:refresh:{}:lock".format(data_source_id)
    lock = redis_connection.lock(lock_key,
                                 timeout=settings.SCHEMA_REFRESH_TIME_LIMIT)
    acquired = lock.acquire(blocking=False)
    start_time = time.time()

    if acquired:
        logger.info("task=refresh_schema state=locked ds_id=%s", ds.id)
        try:
            # Stores data from the updated schema that tells us which
            # columns and which tables currently exist
            existing_tables_set = set()
            existing_columns_set = set()

            # Stores data that will be inserted into postgres
            table_data = {}
            column_data = {}

            new_column_names = {}
            new_column_metadata = {}

            for table in ds.query_runner.get_schema(get_stats=True):
                table_name = table["name"]
                existing_tables_set.add(table_name)

                table_data[table_name] = {
                    "org_id": ds.org_id,
                    "name": table_name,
                    "data_source_id": ds.id,
                    "column_metadata": "metadata" in table,
                    "exists": True,
                }
                new_column_names[table_name] = table["columns"]
                new_column_metadata[table_name] = table.get("metadata", None)

            models.TableMetadata.store(ds, existing_tables_set, table_data)

            all_existing_persisted_tables = models.TableMetadata.query.filter(
                models.TableMetadata.exists.is_(True),
                models.TableMetadata.data_source_id == ds.id,
            ).all()

            for table in all_existing_persisted_tables:
                for i, column in enumerate(new_column_names.get(
                        table.name, [])):
                    existing_columns_set.add(column)
                    column_data[column] = {
                        "org_id": ds.org_id,
                        "table_id": table.id,
                        "name": column,
                        "type": None,
                        "exists": True,
                    }

                    if table.column_metadata:
                        column_type = new_column_metadata[
                            table.name][i]["type"]
                        column_type = truncate_long_string(
                            column_type, max_type_string_length)
                        column_data[column]["type"] = column_type

                models.ColumnMetadata.store(table, existing_columns_set,
                                            column_data)

                existing_columns_list = list(existing_columns_set)

                # If a column did not exist, set the 'column_exists' flag to false.
                models.ColumnMetadata.query.filter(
                    models.ColumnMetadata.exists.is_(True),
                    models.ColumnMetadata.table_id == table.id,
                    ~models.ColumnMetadata.name.in_(existing_columns_list),
                ).update(
                    {
                        "exists": False,
                        "updated_at": models.db.func.now()
                    },
                    synchronize_session="fetch",
                )

                # Clear the set for the next round
                existing_columns_set.clear()

            # If a table did not exist in the get_schema() response above,
            # set the 'exists' flag to false.
            existing_tables_list = list(existing_tables_set)
            models.TableMetadata.query.filter(
                models.TableMetadata.exists.is_(True),
                models.TableMetadata.data_source_id == ds.id,
                ~models.TableMetadata.name.in_(existing_tables_list),
            ).update(
                {
                    "exists": False,
                    "updated_at": models.db.func.now()
                },
                synchronize_session="fetch",
            )

            models.db.session.commit()

            logger.info("task=refresh_schema state=caching ds_id=%s", ds.id)
            ds.schema_cache.populate(forced=True)
            logger.info("task=refresh_schema state=cached ds_id=%s", ds.id)

            logger.info(
                "task=refresh_schema state=finished ds_id=%s runtime=%.2f",
                ds.id,
                time.time() - start_time,
            )
            statsd_client.incr("refresh_schema.success")
        except JobTimeoutException:
            logger.info(
                "task=refresh_schema state=timeout ds_id=%s runtime=%.2f",
                ds.id,
                time.time() - start_time,
            )
            statsd_client.incr("refresh_schema.timeout")
        except Exception:
            logger.warning("Failed refreshing schema for the data source: %s",
                           ds.name,
                           exc_info=1)
            statsd_client.incr("refresh_schema.error")
            logger.info(
                "task=refresh_schema state=failed ds_id=%s runtime=%.2f",
                ds.id,
                time.time() - start_time,
            )
        finally:
            lock.release()
            logger.info("task=refresh_schema state=unlocked ds_id=%s", ds.id)
    else:
        logger.info("task=refresh_schema state=alreadylocked ds_id=%s", ds.id)
Example #15
def refresh_schema(data_source_id):
    ds = models.DataSource.get_by_id(data_source_id)
    logger.info(u"task=refresh_schema state=start ds_id=%s", ds.id)
    start_time = time.time()

    MAX_TYPE_STRING_LENGTH = 250
    try:
        schema = ds.query_runner.get_schema(get_stats=True)

        # Stores data from the updated schema that tells us which
        # columns and which tables currently exist
        existing_tables_set = set()
        existing_columns_set = set()

        # Stores data that will be inserted into postgres
        table_data = {}
        column_data = {}

        new_column_names = {}
        new_column_metadata = {}
        for table in schema:
            table_name = table['name']
            existing_tables_set.add(table_name)

            table_data[table_name] = {
                'org_id': ds.org_id,
                'name': table_name,
                'data_source_id': ds.id,
                'column_metadata': "metadata" in table,
                'exists': True
            }
            new_column_names[table_name] = table['columns']
            new_column_metadata[table_name] = table.get('metadata', None)

        insert_or_update_table_metadata(ds, existing_tables_set, table_data)
        models.db.session.commit()

        all_existing_persisted_tables = TableMetadata.query.filter(
            TableMetadata.exists.is_(True),
            TableMetadata.data_source_id == ds.id,
        ).all()

        for table in all_existing_persisted_tables:
            for i, column in enumerate(new_column_names.get(table.name, [])):
                existing_columns_set.add(column)
                column_data[column] = {
                    'org_id': ds.org_id,
                    'table_id': table.id,
                    'name': column,
                    'type': None,
                    'exists': True
                }

                if table.column_metadata:
                    column_type = new_column_metadata[table.name][i]['type']
                    column_type = truncate_long_string(column_type, MAX_TYPE_STRING_LENGTH)
                    column_data[column]['type'] = column_type

            insert_or_update_column_metadata(table, existing_columns_set, column_data)
            models.db.session.commit()

            existing_columns_list = list(existing_columns_set)

            # If a column did not exist, set the 'column_exists' flag to false.
            ColumnMetadata.query.filter(
                ColumnMetadata.exists.is_(True),
                ColumnMetadata.table_id == table.id,
                ~ColumnMetadata.name.in_(existing_columns_list),
            ).update({
                "exists": False,
                "updated_at": db.func.now()
            }, synchronize_session='fetch')

            # Clear the set for the next round
            existing_columns_set.clear()

        # If a table did not exist in the get_schema() response above,
        # set the 'exists' flag to false.
        existing_tables_list = list(existing_tables_set)
        TableMetadata.query.filter(
            TableMetadata.exists.is_(True),
            TableMetadata.data_source_id == ds.id,
            ~TableMetadata.name.in_(existing_tables_list)
        ).update({
            "exists": False,
            "updated_at": db.func.now()
        }, synchronize_session='fetch')

        models.db.session.commit()

        logger.info(u"task=refresh_schema state=finished ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
        statsd_client.incr('refresh_schema.success')
    except SoftTimeLimitExceeded:
        logger.info(u"task=refresh_schema state=timeout ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
        statsd_client.incr('refresh_schema.timeout')
    except Exception:
        logger.warning(u"Failed refreshing schema for the data source: %s", ds.name, exc_info=1)
        statsd_client.incr('refresh_schema.error')
        logger.info(u"task=refresh_schema state=failed ds_id=%s runtime=%.2f", ds.id, time.time() - start_time)
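The refresh_schema variants above catch different timeout exceptions depending on the task runner: the Celery-based versions catch SoftTimeLimitExceeded, while the RQ-based versions catch JobTimeoutException. For reference, the standard import locations for those exceptions are:

# Celery-based tasks
from celery.exceptions import SoftTimeLimitExceeded

# RQ-based tasks
from rq.timeouts import JobTimeoutException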