Example 1
    def run(self):
        to_delta = {}
        with open('/proc/stat') as stat:
            for line in stat:
                cols = line.split()
                if len(cols) == 0:
                    continue
                if cols[0] == 'cpu':
                    # Convert the values to int, then to milliseconds
                    to_delta = {
                        'time_user': ((int(cols[1]) + int(cols[2])) * 1000 /
                                      self.hz),
                        'time_system': (int(cols[3]) + int(cols[6]) +
                                        int(cols[7])) * 1000 / self.hz,
                        'time_idle': int(cols[4]) * 1000 / self.hz,
                        'time_iowait': int(cols[5]) * 1000 / self.hz,
                        'time_steal': int(cols[8]) * 1000 / self.hz
                    }

        # /proc/stat counters are cumulative since boot; compute the
        # deltas since the previous run
        (interval, metrics) = self.delta('global', to_delta)

        # No deltas on the first call
        if interval is None:
            return []

        metrics['measure_interval'] = interval
        metrics['datetime'] = now()
        metrics['cpu'] = 'global'

        return [metrics]
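
The snippet relies on a self.delta helper that is not shown. A minimal sketch of how such a helper could work, assuming an in-process last_measure cache (the class and attribute names here are illustrative, not the actual implementation):

import time

class DeltaMixin:
    # Hypothetical helper: remembers the previous measure per key and
    # returns the elapsed interval plus per-counter differences.
    last_measure = {}

    def delta(self, key, current_values):
        current_time = time.time()
        if key not in self.last_measure:
            # First call for this key: nothing to diff against yet.
            self.last_measure[key] = (current_time, current_values)
            return (None, {})
        previous_time, previous_values = self.last_measure[key]
        self.last_measure[key] = (current_time, current_values)
        deltas = dict(
            (name, value - previous_values.get(name, 0))
            for name, value in current_values.items()
        )
        return (current_time - previous_time, deltas)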
Example 2
def get_statements(http_context, app):
    """Return a snapshot of latest statistics of executed SQL statements
    """
    config = app.config
    dbname = config.statements.dbname
    assert dbname == "postgres", dbname
    snapshot_datetime = now()
    conninfo = dict(config.postgresql, dbname=dbname)
    try:
        with Postgres(**conninfo).connect() as conn:
            data = list(conn.query(query))
    except Exception as e:
        pg_version = app.postgres.fetch_version()
        if (pg_version < 90600
                or 'relation "pg_stat_statements" does not exist' in str(e)):
            raise HTTPError(
                404, "pg_stat_statements not enabled on database %s" % dbname)
        logger.error(
            "Failed to get pg_stat_statements data on database %s: %s",
            dbname,
            e,
        )
        raise HTTPError(500, e)
    else:
        return {"snapshot_datetime": snapshot_datetime, "data": data}
Example 3
    def run(self):
        # Everything is already gathered in the inventory, just add
        # the time
        out = []
        datetime = now()
        sysinfo = SysInfo()
        for fs in sysinfo.file_systems():
            fs['datetime'] = datetime
            out.append(fs)

        return out
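
SysInfo.file_systems() is defined elsewhere; the probe only stamps every entry with one shared timestamp so all rows of a run align. As an illustration, a minimal generator built on os.statvfs (the field names are assumptions, not the actual SysInfo schema):

import os

def file_systems(mount_points=('/',)):
    # Yield one dict per mount point with total and available bytes.
    for mount_point in mount_points:
        st = os.statvfs(mount_point)
        yield {
            'mount_point': mount_point,
            'total': st.f_blocks * st.f_frsize,
            'avail': st.f_bavail * st.f_frsize,
        }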
Example 4
def monitoring_collector_worker(app):
    """
    Run probes and store the collected metrics locally.
    """
    logger.debug("Starting monitoring collector")
    config = app.config
    conninfo = dict(
        host=config.postgresql.host,
        port=config.postgresql.port,
        user=config.postgresql.user,
        database=config.postgresql.dbname,
        password=config.postgresql.password,
        dbnames=config.monitoring.dbnames,
        instance=config.postgresql.instance,
    )

    system_info = host_info(config.temboard.hostname)
    # Load the probes to run
    probes = load_probes(
        config.monitoring,
        config.temboard.home
    )

    instance = instance_info(conninfo, system_info['hostname'])

    logger.debug("Running probes")
    # Gather the data from probes
    data = run_probes(probes, [instance])

    # Prepare the output
    output = dict(
        datetime=now(),
        hostinfo=system_info,
        instances=remove_passwords([instance]),
        data=data,
        version=__VERSION__,
    )
    logger.debug(output)

    # Add data to metrics table
    db.add_metric(
        config.temboard.home,
        'monitoring.db',
        time.time(),
        output
    )

    logger.debug("Done")
Example 5
def monitoring_collector_worker(app):
    """
    Run probes and push the collected metrics into a queue.
    """
    logger.debug("Starting monitoring collector")
    config = app.config
    conninfo = dict(
        host=config.postgresql.host,
        port=config.postgresql.port,
        user=config.postgresql.user,
        database=config.postgresql.dbname,
        password=config.postgresql.password,
        dbnames=config.monitoring.dbnames,
        instance=config.postgresql.instance,
    )

    system_info = host_info(config.temboard.hostname)
    # Load the probes to run
    probes = load_probes(config.monitoring, config.temboard.home)

    instance = instance_info(conninfo, system_info['hostname'])

    logger.debug("Running probes")
    # Gather the data from probes
    data = run_probes(probes, [instance])

    # Prepare and send output
    output = dict(
        datetime=now(),
        hostinfo=system_info,
        instances=remove_passwords([instance]),
        data=data,
        version=__VERSION__,
    )
    logger.debug(output)
    q = Queue(os.path.join(config.temboard.home, 'metrics.q'),
              max_size=1024 * 1024 * 10,
              overflow_mode='slide')
    q.push(Message(content=json.dumps(output)))
    logger.debug("Done")
Example 6
def monitoring_collector_worker(config):
    """
    Run probes and push the collected metrics into a queue.
    """
    signal.signal(signal.SIGTERM, monitoring_worker_sigterm_handler)
    # Convert the config dict to a namedtuple for attribute access
    Config = collections.namedtuple(
        '__config', ['temboard', 'plugins', 'postgresql', 'logging'])
    config = Config(
        temboard=config['temboard'],
        plugins=config['plugins'],
        postgresql=config['postgresql'],
        logging=config['logging'],
    )

    logger.debug("Starting collector")

    try:
        system_info = host_info(config.temboard['hostname'])
    except Exception as e:
        logger.exception(e)
        logger.debug("Failed")
        sys.exit(1)

    # Load the probes to run
    try:
        probes = load_probes(config.plugins['monitoring'],
                             config.temboard['home'])
        config.plugins['monitoring']['conninfo'] = [{
            'host': config.postgresql['host'],
            'port': config.postgresql['port'],
            'user': config.postgresql['user'],
            'database': config.postgresql['dbname'],
            'password': config.postgresql['password'],
            'dbnames': config.plugins['monitoring']['dbnames'],
            'instance': config.postgresql['instance'],
        }]

        # Validate connection information from the config, and ensure
        # the instance is available
        instances = []
        for conninfo in config.plugins['monitoring']['conninfo']:
            instances.append(instance_info(conninfo, system_info['hostname']))

        logger.debug("Running probes")
        # Gather the data from probes
        data = run_probes(probes, instances)

        # Prepare and send output
        output = {
            'datetime': now(),
            'hostinfo': system_info,
            'instances': remove_passwords(instances),
            'data': data,
            'version': __VERSION__
        }
        logger.debug(output)
        q = Queue('%s/metrics.q' % (config.temboard['home']),
                  max_size=1024 * 1024 * 10,
                  overflow_mode='slide')
        q.push(Message(content=json.dumps(output)))
        logger.debug("Done")
    except Exception as e:
        logger.exception(e)
        logger.error("Could not collect data")
        sys.exit(1)
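
The namedtuple conversion at the top exists only to give the plain config dict attribute-style access (config.temboard instead of config['temboard']). types.SimpleNamespace achieves the same in one call; a sketch of the equivalent:

import types

config = types.SimpleNamespace(
    temboard=config['temboard'],
    plugins=config['plugins'],
    postgresql=config['postgresql'],
    logging=config['logging'],
)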
Example 7
    def run(self, conninfo):
        version = self.get_version(conninfo)

        if conninfo['standby']:
            return []

        metric = {
            'datetime': now(),
            'port': conninfo['port']
        }
        if version < 100000:
            sql = """
            SELECT count(s.f) AS total,
                   sum((pg_stat_file('pg_xlog/'||s.f)).size) AS total_size,
                   pg_current_xlog_location() as current_location
            FROM pg_ls_dir('pg_xlog') AS s(f)
            WHERE f ~ E'^[0-9A-F]{24}$'
            """
        else:
            sql = """
            SELECT count(s.f) AS total,
                   sum((pg_stat_file('pg_wal/'||s.f)).size) AS total_size,
                   pg_current_wal_lsn() as current_location
            FROM pg_ls_dir('pg_wal') AS s(f)
            WHERE f ~ E'^[0-9A-F]{24}$'
            """
        rows = self.run_sql(conninfo, sql)

        metric['total'] = rows[0]['total']
        metric['total_size'] = rows[0]['total_size']
        metric['current_location'] = rows[0]['current_location']

        if version < 100000:
            sql = r"""
            SELECT count(s.f) AS archive_ready
            FROM pg_ls_dir('pg_xlog/archive_status') AS s(f)
            WHERE f ~ E'\.ready$'
            """
        else:
            sql = r"""
            SELECT count(s.f) AS archive_ready
            FROM pg_ls_dir('pg_wal/archive_status') AS s(f)
            WHERE f ~ E'\.ready$'
            """
        rows = self.run_sql(conninfo, sql)

        metric['archive_ready'] = rows[0]['archive_ready']

        # Calculate the written size using the delta between the
        # positions of two runs. The current xlog location must be
        # converted to a number first.
        m = re.match(r'^([0-9A-F]+)/([0-9A-F]+)$', metric['current_location'])
        if m:
            current = int("0xff000000", 0) * \
                int("0x" + m.group(1), 0) + int("0x" + m.group(2), 0)
        else:
            logger.error("Unable to convert xlog location to a number")
            return []

        (interval, delta) = self.delta(conninfo['instance'].replace('/', ''),
                                       {'written_size': current})

        # Empty the first time
        if interval is None:
            return []

        metric['measure_interval'] = interval
        metric.update(delta)

        return [metric]
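
As a worked example of the conversion above: for a current_location of '16/B374D848', m.group(1) is '16' and m.group(2) is 'B374D848', so with the snippet's 0xFF000000 bytes-per-logical-file factor:

>>> 0xff000000 * int('16', 16) + int('B374D848', 16)
97130960968

The difference of that number between two runs, divided by measure_interval, gives the WAL write rate.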
Example 8
def supervision_collector_worker(commands, command, config):
    """
    Run probes and push the collected metrics into a queue.
    """
    signal.signal(signal.SIGTERM, supervision_worker_sigterm_handler)

    start_time = time.time() * 1000
    set_logger_name("supervision_collector_worker")
    logger = get_logger(config)
    # TODO: logging methods in supervision plugin must be aligned.
    logging.root = logger
    logger.debug("Starting with pid=%s" % (os.getpid()))
    logger.debug("commandid=%s" % (command.commandid))
    command.state = COMMAND_START
    command.time = time.time()

    try:
        command.pid = os.getpid()
        commands.update(command)
        system_info = host_info(config.temboard['hostname'])
    except Exception as e:
        logger.traceback(get_tb())
        logger.error(str(e))
        logger.debug("Failed.")
        sys.exit(1)

    # Load the probes to run
    try:
        probes = load_probes(config.plugins['supervision'],
                             config.temboard['home'])
        config.plugins['supervision']['conninfo'] = [{
            'host': config.postgresql['host'],
            'port': config.postgresql['port'],
            'user': config.postgresql['user'],
            'database': config.postgresql['dbname'],
            'password': config.postgresql['password'],
            'dbnames': config.plugins['supervision']['dbnames'],
            'instance': config.postgresql['instance'],
        }]

        # Validate connection information from the config, and ensure
        # the instance is available
        instances = []
        for conninfo in config.plugins['supervision']['conninfo']:
            logging.debug("Validate connection information on instance \"%s\"",
                          conninfo['instance'])
            instances.append(instance_info(conninfo, system_info['hostname']))

        # Gather the data from probes
        data = run_probes(probes, instances)

        # Prepare and send output
        output = {
            'datetime': now(),
            'hostinfo': system_info,
            'instances': remove_passwords(instances),
            'data': data,
            'version': __VERSION__
        }
        logger.debug("Collected data: %s" % (output))
        q = Queue('%s/metrics.q' % (config.temboard['home']),
                  max_size=1024 * 1024 * 10,
                  overflow_mode='slide')
        q.push(Message(content=json.dumps(output)))
    except Exception as e:
        logger.traceback(get_tb())
        logger.error(str(e))
        logger.debug("Failed.")
        sys.exit(1)

    logger.debug("Duration: %s." % (str(time.time() * 1000 - start_time)))
    logger.debug("Done.")