Example #1
def main():
  """dfstats main loop"""
  try:
    f_mounts = open("/proc/mounts", "r")
  except IOError as e:
    utils.err("error: can't open /proc/mounts: %s" % e)
    return 13 # Ask tcollector to not respawn us
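The snippet above stops right after opening /proc/mounts. As a minimal sketch of the continuation (illustrative only, not the collector's actual loop), each line of /proc/mounts splits into device, mount point, and filesystem type:

# Illustrative sketch, not the collector's real loop: each /proc/mounts
# line is "device mountpoint fstype options dump pass".
with open("/proc/mounts") as f_mounts:
    for line in f_mounts:
        fields = line.split()
        if len(fields) < 3:
            continue
        device, mount_point, fs_type = fields[:3]
        # a real dfstats collector would call os.statvfs(mount_point) here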
Example #2
def find_conf_file(pid):
    """Returns config file for couchbase-server."""
    try:
        fd = open('/proc/%s/cmdline' % pid)
    except IOError as e:
        utils.err("Couchbase (pid %s) went away? %s" % (pid, e))
        return
Example #3
def find_databases(dbs=None):
  """Returns a map of dbname (string) to DB instances to monitor.

  Args:
    dbs: A map of dbname (string) to DB instances already monitored.
      This map will be modified in place if it's not None.
  """
  sockfiles = find_sockfiles()
  if dbs is None:
    dbs = {}
  for sockfile in sockfiles:
    dbname = get_dbname(sockfile)
    if dbname in dbs:
      continue
    if not dbname:
      continue
    try:
      db = mysql_connect(sockfile)
      cursor = db.cursor()
      cursor.execute("SELECT VERSION()")
    except (EnvironmentError, EOFError, RuntimeError, socket.error,
            MySQLdb.MySQLError) as e:
      utils.err("Couldn't connect to %s: %s" % (sockfile, e))
      continue
    version = cursor.fetchone()[0]
    dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
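find_databases constructs DB records whose fields match the constructor call above; the real class in the collector may carry extra behavior, but a minimal stand-in could be:

import collections

# Minimal stand-in for the DB record used above (assumption: a plain
# value object; the real class may add reconnect helpers).
DB = collections.namedtuple("DB", "sockfile dbname db cursor version")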
Example #4
def read_socket(sock):
    """
    Connect to the HAProxy stats socket and read the data from the show stat
    command, allowing up to three retries before aborting. This setup assumes
    that the socket will be closed and doesn't try to keep it open, reconnecting
    on each attempt to fetch the statistics. (Should better handle restarts
    and reloads of the monitored process.)
    """

    stats = ''

    # Establish a socket to connect to the unix socket on HAProxy
    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.connect(DEFAULT_SOCKET)

    for attempt in range(3):
        try:
            sock.send("show stat\n")
            data = sock.recv(4096)
            while data:
                stats += data
                data = sock.recv(4096)
            return stats.split("\n")
        except IOError as error:
            utils.err("Error: Connection to HAProxy socket lost: %s (%d)" %
                      (error, attempt))
            sock.close()
            # A closed socket cannot be reconnected; create a fresh one
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(DEFAULT_SOCKET)
            # Reset stats in case it was broken mid-stream
            stats = ''
Example #5
def main(args):
    """Collects and dumps stats from a MySQL server."""
    if not find_sockfiles():  # Nothing to monitor.
        return 13  # Ask tcollector to not respawn us.
    if MySQLdb is None:
        utils.err("error: Python module `MySQLdb' is missing")
        return 1

    last_db_refresh = now()
    dbs = find_databases()
    while True:
        ts = now()
        if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
            find_databases(dbs)
            last_db_refresh = ts

        errs = []
        for dbname, db in dbs.iteritems():
            try:
                collect(db)
            except (EnvironmentError, EOFError, RuntimeError, socket.error,
                    MySQLdb.MySQLError) as e:
                if isinstance(e, IOError) and e.errno == errno.EPIPE:
                    # Exit on a broken pipe.  There's no point in continuing
                    # because no one will read our stdout anyway.
                    return 2
                utils.err("error: failed to collect data from %s: %s" %
                          (db, e))
                errs.append(dbname)

        for dbname in errs:
            del dbs[dbname]

        sys.stdout.flush()
        time.sleep(COLLECTION_INTERVAL)
Example #6
def main():
    pid = haproxy_pid()
    if not pid:
        utils.err("Error: HAProxy is not running")
        return 13  # Ask tcollector to not respawn us.

    conf_file = find_conf_file(pid)
    if not conf_file:
        return 13

    sock_file = find_sock_file(conf_file)
    if sock_file is None:
        utils.err("Error: HAProxy is not listening on any unix domain socket")
        return 13

    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.connect(sock_file)

    # put haproxy to interactive mode, otherwise haproxy closes
    # connection after first command.
    # See haproxy documentation section 9.2. Unix Socket commands.
    sock.send("prompt\n")

    while True:
        collect_stats(sock)
        time.sleep(COLLECTION_INTERVAL)
Example #7
def main():
    if json is None:
        utils.err("This collector requires the 'json' Python module.")
        return 13
    while True:
        read_impala_log()
        time.sleep(1)
Example #8
def cloudwatch_query_metric(cloudwatch, region, metric):
    end = datetime.datetime.utcnow()
    start = end - datetime.timedelta(seconds=COLLECTION_INTERVAL)
    global STATISTICS
    # TODO: statistics no longer need to be one at a time so refactor that
    response = cloudwatch.get_metric_statistics(
        Namespace=metric["Namespace"],
        MetricName=metric["MetricName"],
        Dimensions=metric["Dimensions"],
        StartTime=start,
        EndTime=end,
        Period=300,
        Statistics=list(STATISTICS),
        Unit='Count'
    )

    for datapoint in response['Datapoints']:
        for statistic in STATISTICS:
            timestamp = format_timestamp(str(datapoint['Timestamp']))
            value = int(datapoint[statistic])
            metric_name, tags = build_tag_list(metric['MetricName'].lower(), region, metric['Dimensions'])
            namespace = metric["Namespace"].lower().replace('/', '.')
            output = "%s.%s.%s %s %s %s" % (
                namespace, metric_name, statistic.lower(), str(timestamp),
                str(value),
                tags)
            #sys.stderr.write('output: %s\n' % (output))
            if validate_line_parses(output):
                sendQueue.put({'timestamp': timestamp, 'output': output})
            else:
                utils.err("Invalid Line: %s" % output)
Example #9
def read_socket(sock):
    """
    Connect to the HAProxy stats socket and read the data from the show stat
    command, allowing up to three retries before aborting. This setup assumes
    that the socket will be closed and doesn't try to keep it open, reconnecting
    on each attempt to fetch the statistics. (Should better handle restarts
    and reloads of the monitored process.)
    """

    stats = ''

    # Establish a socket to connect to the unix socket on HAProxy
    sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    sock.connect(DEFAULT_SOCKET)

    for attempt in range(3):
        try:
            sock.send("show stat\n")
            data = sock.recv(4096)
            while data:
                stats += data
                data = sock.recv(4096)
            return stats.split("\n")
        except IOError as error:
            utils.err("Error: Connection to HAProxy socket lost: %s (%d)" %
                      (error, attempt))
            sock.close()
            # A closed socket cannot be reconnected; create a fresh one
            sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
            sock.connect(DEFAULT_SOCKET)
            # Reset stats in case it was broken mid-stream
            stats = ''
Example #10
def main(argv):
  with utils.lower_privileges(self._logger):
      socket.setdefaulttimeout(DEFAULT_TIMEOUT)
      servers = []

      if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 1

      for conf in elasticsearch_conf.get_servers():
        server = httplib.HTTPConnection(*conf)
        try:
          server.connect()
        except socket.error as exc:
          if exc.errno == errno.ECONNREFUSED:
            continue
          raise
        servers.append(server)

      if len(servers) == 0:
        return 13  # No ES running, ask tcollector to not respawn us.

      status = node_status(server)
      version = status["version"]["number"]

      while True:
        for server in servers:
          _collect_server(server, version)
        time.sleep(COLLECTION_INTERVAL)
Example #11
def main(argv):
    utils.drop_privileges()
    socket.setdefaulttimeout(DEFAULT_TIMEOUT)
    servers = []

    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 1

    for conf in elasticsearch_conf.get_servers():
        server = HTTPConnection(*conf)
        try:
            server.connect()
        except socket.error as exc:
            if exc.errno == errno.ECONNREFUSED:
                continue
            raise
        servers.append(server)

    if len(servers) == 0:
        return 13  # No ES running, ask tcollector to not respawn us.

    lock = threading.Lock()
    while True:
        threads = []
        for server in servers:
            status = node_status(server)
            version = status["version"]["number"]
            t = threading.Thread(target=_collect_server,
                                 args=(server, version, lock))
            t.start()
            threads.append(t)
        for thread in threads:
            thread.join()
        time.sleep(COLLECTION_INTERVAL)
Example #12
    def process_metric(self, timestamp, metric, tags, value, mbean_domain, mbean_properties):
        if not mbean_domain.startswith("kafka") and not mbean_domain == "java.lang":
            utils.err("Unexpected mbean domain = %r" % mbean_domain)
            return

        if mbean_domain == "java.lang":
            jmx_service = mbean_properties.pop("type", "jvm")
        # Kafka producer metrics
        elif mbean_domain == "kafka.producer":
            self._process_kafka_producer_metric(timestamp, metric, tags, value, mbean_domain, mbean_properties)
            return
        # Kafka consumer metrics
        elif mbean_domain == "kafka.consumer":
            self._process_kafka_consumer_metric(timestamp, metric, tags, value, mbean_domain, mbean_properties)
            return
        # Kafka broker metrics
        elif mbean_domain.startswith("kafka."):
            domain_parts = mbean_domain.split(".")
            # drop the kafka prefix
            mbean_domain = mbean_domain[len("kafka."):]
            jmx_service = mbean_properties.get("type", domain_parts[-1])
        else:
            return

        if mbean_properties:
            tags += " " + " ".join(k + "=" + v for k, v in
                                   mbean_properties.iteritems())

        jmx_service = JmxMonitor.SHORT_SERVICE_NAMES.get(jmx_service, jmx_service)
        metric = mbean_domain + "." + jmx_service.lower() + "." + metric

        self.emit(metric, timestamp, value, tags)
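SHORT_SERVICE_NAMES is a class-level lookup that shortens verbose JMX service names before they are embedded in the metric name; its real contents are not shown above, so the entries below are purely illustrative:

class JmxMonitor(object):
    # Illustrative entries only; the real mapping ships with the monitor.
    SHORT_SERVICE_NAMES = {
        "GarbageCollector": "gc",
        "OperatingSystem": "os",
        "Threading": "threads",
    }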
Example #13
def main(argv):
    with utils.lower_privileges(self._logger):
        socket.setdefaulttimeout(DEFAULT_TIMEOUT)
        servers = []

        if json is None:
            utils.err("This collector requires the `json' Python module.")
            return 1

        for conf in elasticsearch_conf.get_servers():
            server = httplib.HTTPConnection(*conf)
            try:
                server.connect()
            except socket.error as exc:
                if exc.errno == errno.ECONNREFUSED:
                    continue
                raise
            servers.append(server)

        if len(servers) == 0:
            return 13  # No ES running, ask tcollector to not respawn us.

        status = node_status(server)
        version = status["version"]["number"]

        while True:
            for server in servers:
                _collect_server(server, version)
            time.sleep(COLLECTION_INTERVAL)
Example #14
def main():
    """dfstats main loop"""
    try:
        f_mounts = open("/proc/mounts", "r")
    except IOError as e:
        utils.err("error: can't open /proc/mounts: %s" % e)
        return 13  # Ask tcollector to not respawn us
Example #15
def get_metrics(webserver_url, username, password, params):
    try:
        r = requests.get(webserver_url,
                         auth=(username, password),
                         verify=False,
                         params=params)
    except requests.exceptions.ConnectionError as error:
        print >> sys.stderr, "Error connecting: %s" % error
        utils.err("Connection error: %s" % error)
        raise

    try:
        r.raise_for_status()
    except requests.exceptions.HTTPError as error:
        print >> sys.stderr, "Request was not successful: %s" % error
        utils.err("HTTP error getting metrics from '%s' - %s" %
                  (webserver_url, error))
        return 13  # tell tcollector to not respawn

    response = r.json()
    try:
        data = response['data']
    except KeyError as e:
        print >> sys.stderr, "Did not get a 'data' key in the response."
        print >> sys.stderr, response
        raise
    return data
Example #16
def find_sock_file(conf_file):
  """Returns the unix socket file of haproxy."""
  try:
    fd = open(conf_file)
  except IOError as e:
    utils.err("Error: %s. Config file path is relative: %s" % (e, conf_file))
    return None
Example #17
def find_bindir_path(config_file):
  """Returns the bin directory path"""
  try:
    fd = open(config_file)
  except IOError as e:
    utils.err("Error for Config file (%s): %s" % (config_file, e))
    return None
Example #18
def main():
  pid = haproxy_pid()
  if not pid:
    utils.err("Error: HAProxy is not running")
    return 13  # Ask tcollector to not respawn us.

  conf_file = find_conf_file(pid)
  if not conf_file:
    return 13

  sock_file = find_sock_file(conf_file)
  if sock_file is None:
    utils.err("Error: HAProxy is not listening on any unix domain socket")
    return 13

  sock = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
  sock.connect(sock_file)

  # put haproxy to interactive mode, otherwise haproxy closes
  # connection after first command.
  # See haproxy documentation section 9.2. Unix Socket commands.
  sock.send("prompt\n")

  while True:
    collect_stats(sock)
    time.sleep(COLLECTION_INTERVAL)
Example #19
def find_conf_file(pid):
  """Returns config file for couchbase-server."""
  try:
    fd = open('/proc/%s/cmdline' % pid)
  except IOError as e:
    utils.err("Couchbase (pid %s) went away? %s" % (pid, e))
    return
Example #20
def main():
    try:
        check_imports()

        conn = libvirt.openReadOnly(LIBVIRT_URI)
        if conn is None:
            utils.err("Failed to open connection to the hypervisor")
            return ERROR_CODE_DONT_RETRY

        while True:
            domains = conn.listAllDomains()
            random.shuffle(domains)
            pids = get_pids()

            count = 0
            for domain in domains:
                if process_domain(domain, pids.get(domain.UUIDString())):
                    count += 1  # count only successfully processed VMs

            # write libvirt.vm.count metric
            print("%s %d %s" % (FIELDS["count"], int(time.time()), count))

            sys.stdout.flush()
            time.sleep(INTERVAL)

    except LibvirtVmProcessingError as err:
        utils.err(err.value)
        return ERROR_CODE_DONT_RETRY
Example #21
def main(args):
    """ Calls HadoopYarnNodeManager at interval secs
      and emits metrics to stdout for TCollector """
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    utils.drop_privileges()
    parser = argparse.ArgumentParser()
    parser.add_argument('-H',
                        '--host',
                        default='localhost',
                        help='Host to connect to (default: localhost)')
    parser.add_argument('-P',
                        '--port',
                        default=8042,
                        type=int,
                        help='Port to connect to (default: 8042)')
    parser.add_argument('-i',
                        '--interval',
                        default=90,
                        type=int,
                        help='Interval at which to emit metrics')
    args = parser.parse_args(args[1:])
    host = args.host
    port = args.port
    interval = args.interval
    yarn_service = HadoopYarnNodeManager(host=host, port=port)
    while True:
        yarn_service.emit()
        time.sleep(interval)
    return 0
Example #22
def find_conf_file(pid):
  """Returns the conf file of haproxy."""
  try:
    output = subprocess.check_output(["ps", "--no-headers", "-o", "cmd", pid])
  except subprocess.CalledProcessError as e:
    utils.err("HAProxy (pid %s) went away? %s" % (pid, e))
    return None
Example #23
def find_databases(dbs=None):
    """Returns a map of dbname (string) to DB instances to monitor.

  Args:
    dbs: A map of dbname (string) to DB instances already monitored.
      This map will be modified in place if it's not None.
  """
    sockfiles = find_sockfiles()
    if dbs is None:
        dbs = {}
    for sockfile in sockfiles:
        dbname = get_dbname(sockfile)
        if dbname in dbs:
            continue
        if not dbname:
            continue
        try:
            db = mysql_connect(sockfile)
            cursor = db.cursor()
            cursor.execute("SELECT VERSION()")
        except (EnvironmentError, EOFError, RuntimeError, socket.error,
                MySQLdb.MySQLError) as e:
            utils.err("Couldn't connect to %s: %s" % (sockfile, e))
            continue
        version = cursor.fetchone()[0]
        dbs[dbname] = DB(sockfile, dbname, db, cursor, version)
Example #24
def main(args):
  """Collects and dumps stats from a MySQL server."""
  if not find_sockfiles():  # Nothing to monitor.
    return 13               # Ask tcollector to not respawn us.
  if MySQLdb is None:
    utils.err("error: Python module `MySQLdb' is missing")
    return 1

  last_db_refresh = now()
  dbs = find_databases()
  while True:
    ts = now()
    if ts - last_db_refresh >= DB_REFRESH_INTERVAL:
      find_databases(dbs)
      last_db_refresh = ts

    errs = []
    for dbname, db in dbs.iteritems():
      try:
        collect(db)
      except (EnvironmentError, EOFError, RuntimeError, socket.error,
              MySQLdb.MySQLError) as e:
        if isinstance(e, IOError) and e.errno == errno.EPIPE:
          # Exit on a broken pipe.  There's no point in continuing
          # because no one will read our stdout anyway.
          return 2
        utils.err("error: failed to collect data from %s: %s" % (db, e))
        errs.append(dbname)

    for dbname in errs:
      del dbs[dbname]

    sys.stdout.flush()
    time.sleep(COLLECTION_INTERVAL)
Example #25
def main(argv):
  utils.drop_privileges()
  socket.setdefaulttimeout(DEFAULT_TIMEOUT)
  servers = []

  if json is None:
    utils.err("This collector requires the `json' Python module.")
    return 1

  for conf in elasticsearch_conf.get_servers():
    server = HTTPConnection(*conf)
    try:
      server.connect()
    except socket.error as exc:
      if exc.errno == errno.ECONNREFUSED:
        continue
      raise
    servers.append(server)

  if len(servers) == 0:
    return 13  # No ES running, ask tcollector to not respawn us.

  lock = threading.Lock()
  while True:
    threads = []
    for server in servers:
      status = node_status(server)
      version = status["version"]["number"]
      t = threading.Thread(target=_collect_server, args=(server, version, lock))
      t.start()
      threads.append(t)
    for thread in threads:
      thread.join()
    time.sleep(COLLECTION_INTERVAL)
Example #26
def find_bindir_path(config_file):
    """Returns the bin directory path"""
    try:
        fd = open(config_file)
    except IOError as e:
        utils.err("Error for Config file (%s): %s" % (config_file, e))
        return None
Example #27
def process_gc_log(collector):

    prefix = collector['prefix']
    # get latest gc log to process
    gc_log = get_latest_gc_log(collector['log_dir'],
                               collector['log_name_pattern'])

    # update current_file and current_file_pos if this is the first time to
    # process the gc log
    if collector['current_file'] != gc_log:
        collector['current_file'] = gc_log
        with open(gc_log, 'rb') as file_handler:
            collector['current_file_pos'] = get_file_end(file_handler)
        return
    try:
        with open(gc_log, 'rb') as file_handler:

            pos = collector['current_file_pos']
            collector['current_file_pos'] = get_file_end(file_handler)
            file_handler.seek(pos)

            # Do not use foreach loop because inside function process_gc_record
            # will call file_handler.readline(). The reason is that some GC
            # event are multiline and need to be processed as a whole
            while True:
                line = file_handler.readline()
                if len(line) == 0:
                    break
                pattern_name, matcher = match_pattern(line)
                if pattern_name == GC_START_TIME_PATTERN:
                    year, month, day, hour, minute, second, timezone = [
                        int(matcher.group(i)) for i in range(1, 8)
                    ]
                    cause = matcher.group(8)
                    timestamp = true_unix_timestamp(year, month, day, hour,
                                                    minute, second, timezone)
                    process_gc_record(prefix, file_handler, timestamp, cause,
                                      collector)
                else:
                    unmatched_gc_log(line)

        current_timestamp_in_sec = int(time.time())

        if collector['timestamp'] is not None:
            for gen, value in collector['gensize'].items():
                print("%s.gc.g1.gensize %s %s gen=%s" % (
                    prefix, current_timestamp_in_sec, value, gen))

        # publish gc event count metrics
        for event, value in collector['count'].items():
            print "%s.gc.g1.event.count %s %s event=%s" % (
                prefix, current_timestamp_in_sec, value, event)

    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        utils.err(''.join(
            traceback.format_exception(exc_type, exc_value, exc_traceback)))

    return 0
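match_pattern and GC_START_TIME_PATTERN are defined elsewhere in the collector. Since groups 1-7 are parsed as integers (year through numeric timezone offset) and group 8 is the GC cause, the pattern presumably resembles this hypothetical sketch for G1 timestamps such as 2015-06-17T21:04:18.995-0700: ... (G1 Evacuation Pause):

import re

# Hypothetical approximation of GC_START_TIME_PATTERN; the real regex
# ships with the collector. int() copes with offsets like "-0700".
GC_START_TIME_RE = re.compile(
    r"^(\d{4})-(\d{2})-(\d{2})T(\d{2}):(\d{2}):(\d{2})\.\d+"
    r"([+-]\d{4}).*\(([^)]+)\)")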
Example #28
def main():
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)

    db_filename = settings['sqlitedb']
    dbcache = sqlite3.connect(':memory:')
    cachecur = dbcache.cursor()
    cachecur.execute("ATTACH DATABASE '%s' as 'dbfile'" % (db_filename,))
    cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
    cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')

    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema == settings['mysql']['db']:
            table = binlogevent.table
            log_pos = binlogevent.packet.log_pos
            if table == 'history' or table == 'history_uint':
                for row in binlogevent.rows:
                    r = row['values']
                    itemid = r['itemid']
                    cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,))
                    row = cachecur.fetchone()
                    if (row is not None):
                        print("zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3]))
                        if ((int(time.time()) - sample_last_ts) > settings['internal_metric_interval']): # Sample internal metrics @ 10s intervals
                            sample_last_ts = int(time.time())
                            print("tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos))
                            print("tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss))
                            print("tcollector.zabbix_bridge.timestamp_drift %d %s" % (sample_last_ts, (sample_last_ts - r['clock'])))
                            if ((key_lookup_miss - last_key_lookup_miss) > settings['dbrefresh']):
                                print("tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % (sample_last_ts, (key_lookup_miss - last_key_lookup_miss)))
                                cachecur.execute('DROP TABLE zabbix_cache')
                                cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
                                cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')
                                last_key_lookup_miss = key_lookup_miss
                    else:
                        # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                        utils.err("error: Key lookup miss for %s" % (itemid))
                        key_lookup_miss += 1
                sys.stdout.flush()

    dbcache.close()
    stream.close()
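The bridge assumes a pre-built zabbix_cache table in the SQLite file named by settings['sqlitedb']; from the SELECT above it needs at least the four columns below (a hedged sketch, the real schema may carry more):

import sqlite3

# Assumed minimal schema for dbfile.zabbix_cache; the real table is
# produced by a separate cache-building step. The path is illustrative.
conn = sqlite3.connect("zabbix_cache.db")
conn.execute("CREATE TABLE IF NOT EXISTS zabbix_cache ("
             "id INTEGER PRIMARY KEY, key TEXT, host TEXT, proxy TEXT)")
conn.commit()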
Example #29
def find_sock_file(conf_file):
    """Returns the unix socket file of haproxy."""
    try:
        fd = open(conf_file)
    except IOError as e:
        utils.err("Error: %s. Config file path is relative: %s" %
                  (e, conf_file))
        return None
Example #30
def find_conf_file(pid):
    """Returns the conf file of haproxy."""
    try:
        output = subprocess.check_output(
            ["ps", "--no-headers", "-o", "cmd", pid])
    except subprocess.CalledProcessError as e:
        utils.err("HAProxy (pid %s) went away? %s" % (pid, e))
        return None
Example #31
def main(args):
    """Collects and dumps stats from a PostgreSQL server."""

    try:
        db = postgresqlutils.connect()
    except Exception as e:
        utils.err("error: Could not initialize collector: %s" % (e))
        return 13  # Ask tcollector to not respawn us
Example #32
def main():
    """ifstat main loop"""

    try:
        f_netdev = open("/proc/net/dev")
    except IOError as e:
        utils.err("error: can't open /proc/net/dev: %s" % e)
        return 13 # Ask tcollector to not respawn us
Example #33
def main():
    """ifstat main loop"""

    try:
        f_netdev = open("/proc/net/dev")
    except IOError as e:
        utils.err("error: can't open /proc/net/dev: %s" % e)
        return 13  # Ask tcollector to not respawn us
Example #34
def main():
    utils.drop_privileges()
    if BinLogStreamReader is None:
        utils.err("error: Python module `pymysqlreplication' is missing")
        return 1
    settings = zabbix_bridge_conf.get_settings()

    # Set blocking to True if you want to block and wait for the next event at
    # the end of the stream
    stream = BinLogStreamReader(connection_settings=settings['mysql'],
                                server_id=settings['slaveid'],
                                only_events=[WriteRowsEvent],
                                resume_stream=True,
                                blocking=True)

    db_filename = settings['sqlitedb']
    dbcache = sqlite3.connect(':memory:')
    cachecur = dbcache.cursor()
    cachecur.execute("ATTACH DATABASE '%s' as 'dbfile'" % (db_filename,))
    cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
    cachecur.execute('CREATE UNIQUE INDEX uniq_zid on zabbix_cache (id)')

    # tcollector.zabbix_bridge namespace for internal Zabbix bridge metrics.
    log_pos = 0
    key_lookup_miss = 0
    sample_last_ts = int(time.time())
    last_key_lookup_miss = 0

    for binlogevent in stream:
        if binlogevent.schema == settings['mysql']['db']:
            table = binlogevent.table
            log_pos = binlogevent.packet.log_pos
            if table == 'history' or table == 'history_uint':
                for row in binlogevent.rows:
                    r = row['values']
                    itemid = r['itemid']
                    cachecur.execute('SELECT id, key, host, proxy FROM zabbix_cache WHERE id=?', (itemid,))
                    row = cachecur.fetchone()
                    if (row is not None):
                        print "zbx.%s %d %s host=%s proxy=%s" % (row[1], r['clock'], r['value'], row[2], row[3])
                        if ((int(time.time()) - sample_last_ts) > settings['internal_metric_interval']): # Sample internal metrics @ 10s intervals
                            sample_last_ts = int(time.time())
                            print "tcollector.zabbix_bridge.log_pos %d %s" % (sample_last_ts, log_pos)
                            print "tcollector.zabbix_bridge.key_lookup_miss %d %s" % (sample_last_ts, key_lookup_miss)
                            print "tcollector.zabbix_bridge.timestamp_drift %d %s" % (sample_last_ts, (sample_last_ts - r['clock']))
                            if ((key_lookup_miss - last_key_lookup_miss) > settings['dbrefresh']):
                                print "tcollector.zabbix_bridge.key_lookup_miss_reload %d %s" % (sample_last_ts, (key_lookup_miss - last_key_lookup_miss))
                                cachecur.execute('DROP TABLE zabbix_cache')
                                cachecur.execute('CREATE TABLE zabbix_cache AS SELECT * FROM dbfile.zabbix_cache')
                                last_key_lookup_miss = key_lookup_miss
                    else:
                        # TODO: Consider https://wiki.python.org/moin/PythonDecoratorLibrary#Retry
                        utils.err("error: Key lookup miss for %s" % (itemid))
                        key_lookup_miss += 1
                sys.stdout.flush()

    dbcache.close()
    stream.close()
Example #35
def get_dbname(sockfile):
  """Returns the name of the DB based on the path to the socket file."""
  if sockfile in DEFAULT_SOCKFILES:
    return "default"
  m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile)
  if not m:
    utils.err("error: couldn't guess the name of the DB for " + sockfile)
    return None
  return m.group(1)
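For instance, under the path convention the regex encodes (paths here are illustrative):

# Illustrative paths; DEFAULT_SOCKFILES comes from the collector's config.
print(get_dbname("/var/run/mysql-ecommerce/mysqld.sock"))  # -> "ecommerce"
print(get_dbname("/tmp/mysqld.sock"))  # no match: error logged, returns None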
Example #36
def loop(self):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    while True:
        self.emit()
        time.sleep(self.delay)
    return 0
Example #37
def get_dbname(sockfile):
    """Returns the name of the DB based on the path to the socket file."""
    if sockfile in DEFAULT_SOCKFILES:
        return "default"
    m = re.search("/mysql-(.+)/[^.]+\.sock$", sockfile)
    if not m:
        utils.err("error: couldn't guess the name of the DB for " + sockfile)
        return None
    return m.group(1)
Example #38
def validate_config():
    aws_profile = aws_cloudwatch_conf.get_aws_profile()
    access_key, secret_access_key = aws_cloudwatch_conf.get_accesskey_secretkey()
    if (access_key == '<access_key_id>' or secret_access_key == '<secret_access_key>') and aws_profile is None:
        utils.err("Cloudwatch Collector is not configured\n")
        sys.exit(13)
    if not aws_cloudwatch_conf.enabled:
        utils.err("Cloudwatch Collector is not enabled\n")
        sys.exit(13)
Example #39
def collect(db):
    """
  Collects and prints stats.

  Here we collect only general info, for full list of data for collection
  see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html
  """

    try:
        cursor = db.cursor()

        # general statistics
        cursor.execute(
            "SELECT pg_stat_database.*, pg_database_size"
            " (pg_database.datname) AS size FROM pg_database JOIN"
            " pg_stat_database ON pg_database.datname ="
            " pg_stat_database.datname WHERE pg_stat_database.datname"
            " NOT IN ('template0', 'template1', 'postgres')")
        ts = time.time()
        stats = cursor.fetchall()

        #  datid |  datname   | numbackends | xact_commit | xact_rollback | blks_read  |  blks_hit   | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files |  temp_bytes  | deadlocks | blk_read_time | blk_write_time |          stats_reset          |     size
        result = {}
        for stat in stats:
            database = stat[1]
            result[database] = stat

        for database in result:
            for i in range(2, len(cursor.description)):
                metric = cursor.description[i].name
                value = result[database][i]
                try:
                    if metric in ("stats_reset"):
                        continue
                    print("postgresql.%s %i %s database=%s" %
                          (metric, ts, value, database))
                except Exception:
                    utils.err("got here")
                    continue

        # connections
        cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity"
                       " GROUP BY pg_stat_activity.datname")
        ts = time.time()
        connections = cursor.fetchall()

        for database, connection in connections:
            print("postgresql.connections %i %s database=%s" %
                  (ts, connection, database))

    except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
        if isinstance(e, IOError) and e.errno == errno.EPIPE:
            # exit on a broken pipe. There is no point in continuing
            # because no one will read our stdout anyway.
            return 2
        utils.err("error: failed to collect data: %s" % e)
Example #40
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    hbase_service = HBaseMaster()
    while True:
        hbase_service.emit()
        time.sleep(90)
    return 0
Example #41
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    rm_node_service = HadoopResourceManager()
    while True:
        rm_node_service.emit()
        time.sleep(90)
    return 0
Example #42
def validate_line_parses(line):
    parsed = re.match(r'^([-_./a-zA-Z0-9]+)\s+'  # Metric name.
                      r'(\d+\.?\d+)\s+'  # Timestamp.
                      r'(\S+?)'  # Value (int or float).
                      r'((?:\s+[-_./a-zA-Z0-9]+=[-_./a-zA-Z0-9]+)*)$',  # Tags
                      line)
    if parsed is None:
        utils.err("invalid data: %s\n" % (line))
        return False
    return True
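A quick check with illustrative lines shows what passes and what does not:

# Illustrative inputs only.
validate_line_parses("proc.loadavg.1min 1434567890 0.5 host=web1")  # True
validate_line_parses("bad line with spaces in the metric name")     # False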
Example #43
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    name_node_service = HadoopNameNode()
    while True:
        name_node_service.emit()
        time.sleep(90)
    return 0
Example #44
def collect():
  """Collects HTTP latencies in milliseconds from a list of ports in configuration"""
  ts = time.time()
  try:
    for metric, url in httpconf.urls().iteritems():
      response = requests.get(url)
      latency = response.elapsed.total_seconds() * 1000
      print("%s %i %f" % (metric, ts, latency))
  except Exception as e:
    utils.err("error: something wrong happened in http: %s" % e)
Example #45
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    hbase_service = HBaseMaster()
    while True:
        hbase_service.emit()
        time.sleep(90)
    return 0
Example #46
def postgres_connect(sockdir):
    """Connects to the PostgreSQL server using the specified socket file."""
    user, password = postgresqlconf.get_user_password()

    try:
        return psycopg2.connect("host='%s' user='******' password='******' "
                                "connect_timeout='%s' dbname=postgres" %
                                (sockdir, user, password, CONNECT_TIMEOUT))
    except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
        utils.err("Couldn't connect to DB: %s" % (e))
Example #47
def main(args):
    utils.drop_privileges()
    if json is None:
        utils.err("This collector requires the `json' Python module.")
        return 13  # Ask tcollector not to respawn us
    datanode_service = HadoopDataNode()
    while True:
        datanode_service.emit()
        time.sleep(15)
    return 0
Example #48
def collect(db):
  """
  Collects and prints stats.

  Here we collect only general info, for full list of data for collection
  see http://www.postgresql.org/docs/9.2/static/monitoring-stats.html
  """

  try:
    cursor = db.cursor()

    # general statistics
    cursor.execute("SELECT pg_stat_database.*, pg_database_size"
                   " (pg_database.datname) AS size FROM pg_database JOIN"
                   " pg_stat_database ON pg_database.datname ="
                   " pg_stat_database.datname WHERE pg_stat_database.datname"
                   " NOT IN ('template0', 'template1', 'postgres')")
    ts = time.time()
    stats = cursor.fetchall()

#  datid |  datname   | numbackends | xact_commit | xact_rollback | blks_read  |  blks_hit   | tup_returned | tup_fetched | tup_inserted | tup_updated | tup_deleted | conflicts | temp_files |  temp_bytes  | deadlocks | blk_read_time | blk_write_time |          stats_reset          |     size     
    result = {}
    for stat in stats:
      database = stat[1]
      result[database] = stat

    for database in result:
      for i in range(2,len(cursor.description)):
        metric = cursor.description[i].name
        value = result[database][i]
        try:
          if metric in ("stats_reset"):
            continue
          print ("postgresql.%s %i %s database=%s"
                 % (metric, ts, value, database))
        except Exception:
          utils.err("got here")
          continue

    # connections
    cursor.execute("SELECT datname, count(datname) FROM pg_stat_activity"
                   " GROUP BY pg_stat_activity.datname")
    ts = time.time()
    connections = cursor.fetchall()

    for database, connection in connections:
      print ("postgresql.connections %i %s database=%s"
             % (ts, connection, database))

  except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
    if isinstance(e, IOError) and e.errno == errno.EPIPE:
      # exit on a broken pipe. There is no point in continuing
      # because no one will read our stdout anyway.
      return 2
    utils.err("error: failed to collect data: %s" % e)
Example #49
def postgres_connect(sockdir):
  """Connects to the PostgreSQL server using the specified socket file."""
  user, password = postgresqlconf.get_user_password()

  try:
    return psycopg2.connect("host='%s' user='******' password='******' "
                            "connect_timeout='%s' dbname=postgres"
                            % (sockdir, user, password,
                            CONNECT_TIMEOUT))
  except (EnvironmentError, EOFError, RuntimeError, socket.error) as e:
    utils.err("Couldn't connect to DB: %s" % (e))
Example #50
def scan_zk_instances():
    """ 
    Finding out all the running instances of zookeeper
    - Using netstat, finds out all listening java processes.	 
    - Figures out ZK instances among java processes by looking for the 
      string "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline.
    """

    instances = []
    try:
        listen_sock = subprocess.check_output(["netstat", "-lnpt"],
                                              stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        utils.err("netstat directory doesn't exist in PATH variable")
        return instances

    for line in listen_sock.split("\n"):
        if not "java" in line:
            continue
        listen_sock = line.split()[3]
        tcp_version = line.split()[0]

        m = re.match("(.+):(\d+)", listen_sock)
        ip = m.group(1)
        port = int(m.group(2))

        pid = int(line.split()[6].split("/")[0])
        try:
            fd = open("/proc/%d/cmdline" % pid)
            cmdline = fd.readline()
            if "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline:
                data = ""
                try:
                    if tcp_version == "tcp6":
                        sock = socket.socket(socket.AF_INET6,
                                             socket.SOCK_STREAM)
                    else:
                        sock = socket.socket(socket.AF_INET,
                                             socket.SOCK_STREAM)
                    sock.settimeout(0.5)
                    sock.connect((ip, port))
                    sock.send("ruok\n")
                    data = sock.recv(1024)
                except:
                    pass
                finally:
                    sock.close()
                if data == "imok":
                    instances.append([ip, port, tcp_version])
                    data = ""
        except:
            continue
        finally:
            fd.close()
    return instances
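The liveness probe in the middle of the loop is ZooKeeper's four-letter-word protocol: send "ruok" and a healthy server answers "imok". In isolation (address and port are illustrative):

import socket

# Standalone version of the "ruok" probe used above.
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.settimeout(0.5)
sock.connect(("127.0.0.1", 2181))
sock.send("ruok\n")
print(sock.recv(1024))  # a healthy ZooKeeper replies "imok"
sock.close()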
Example #51
def find_conf_file(pid):
  """Returns config file for couchbase-server."""
  try:
    fd = open('/proc/%s/cmdline' % pid)
  except IOError as e:
    utils.err("Couchbase (pid %s) went away ? %s" % (pid, e))
    return
  try:
    config = fd.read().split("config_path")[1].split("\"")[1]
    return config
  finally:
    fd.close()
Example #52
def connect_socket(tcp_version, port):
    sock = None
    if tcp_version == "tcp6":
        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        ipaddr = '::1'
    else:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ipaddr = '127.0.0.1'
    try:
        sock.connect((ipaddr, port))
    except Exception as err:
        utils.err(err)
        return None
    return sock
Example #53
def get_role_status():
    ms_checker_host = "localhost:3300"
    command_is_slave = "curl " + ms_checker_host + "/checkSlave"
    s, o = commands.getstatusoutput(command_is_slave)
    if o == "" or s != 0:
        utils.err("Error checking mysql role, status %s" % s)
    elif s == 0:
        utils.err("INFO: status msg: %s" % o)
        if "not" not in o.lower():
            return 1

    return 0
Example #54
def find_conf_file(pid):
    """Returns config file for couchbase-server."""
    try:
        fd = open('/proc/%s/cmdline' % pid)
    except IOError as e:
        utils.err("Couchbase (pid %s) went away ? %s" % (pid, e))
        return
    try:
        config = fd.read().split("config_path")[1].split("\"")[1]
        return config
    finally:
        fd.close()
Example #55
def scan_zk_instances():
    """ 
    Finding out all the running instances of zookeeper
    - Using netstat, finds out all listening java processes.	 
    - Figures out ZK instances among java processes by looking for the 
      string "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline.
    """

    instances = []
    try:
        listen_sock = subprocess.check_output(["netstat", "-lnpt"], stderr=subprocess.PIPE)
    except subprocess.CalledProcessError:
        utils.err("netstat directory doesn't exist in PATH variable")
        return instances

    for line in listen_sock.split("\n"):
        if not "java" in line:
            continue
        listen_sock = line.split()[3]
        tcp_version = line.split()[0]

        m = re.match("(.+):(\d+)", listen_sock)
        ip = m.group(1)
        port = int(m.group(2))

        pid = int(line.split()[6].split("/")[0])
        try:
            fd = open("/proc/%d/cmdline" % pid)
            cmdline = fd.readline()
            if "org.apache.zookeeper.server.quorum.QuorumPeerMain" in cmdline:
                data = ""
                try:
                    if tcp_version == "tcp6" or ip == "::":
                        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
                        ip = "::1"
                    else:
                        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                        ip = "127.0.0.1"
                    sock.settimeout(0.5)
                    sock.connect((ip, port))
                    sock.send("ruok\n")
                    data = sock.recv(1024)
                except:
                    pass
                finally:
                    sock.close()
                if data == "imok":	
                    instances.append([ip, port, tcp_version])
                    data = ""
        except:
            continue
        finally:
            fd.close()
    return instances 
Example #56
def connect_socket(tcp_version, port):
    sock = None
    if tcp_version == "tcp6":
        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        ipaddr = '::1'
    else:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ipaddr = '127.0.0.1'
    try:
        sock.connect((ipaddr, port))
    except Exception as err:
        utils.err(err)
        return None
    return sock
Example #57
def process_gc_log(collector):

    prefix = collector['prefix']
    # get latest gc log to process
    gc_log = get_latest_gc_log(collector['log_dir'], collector['log_name_pattern'])

    # update current_file and current_file_pos if this is the first time to
    # process the gc log
    if collector['current_file'] != gc_log:
        collector['current_file'] = gc_log
        with open(gc_log, 'rb') as file_handler:
            collector['current_file_pos'] = get_file_end(file_handler)
        return
    try:
        with open(gc_log, 'rb') as file_handler:

            pos = collector['current_file_pos']
            collector['current_file_pos'] = get_file_end(file_handler)
            file_handler.seek(pos)

            # Do not use foreach loop because inside function process_gc_record
            # will call file_handler.readline(). The reason is that some GC
            # event are multiline and need to be processed as a whole
            while True:
                line = file_handler.readline()
                if len(line) == 0:
                    break
                pattern_name, matcher = match_pattern(line)
                if pattern_name == GC_START_TIME_PATTERN:
                    year, month, day, hour, minute, second, timezone = [int(matcher.group(i)) for i in range(1, 8)]
                    cause = matcher.group(8)
                    timestamp = true_unix_timestamp(year, month, day, hour, minute, second, timezone)
                    process_gc_record(prefix, file_handler, timestamp, cause, collector)
                else:
                    unmatched_gc_log(line)

        current_timestamp_in_sec = int(time.time())

        if collector['timestamp'] is not None:
            for gen, value in collector['gensize'].items():
                print("%s.gc.g1.gensize %s %s gen=%s" % (prefix, current_timestamp_in_sec, value, gen))

        # publish gc event count metrics
        for event, value in collector['count'].items():
            print "%s.gc.g1.event.count %s %s event=%s" % (prefix, current_timestamp_in_sec, value, event)

    except Exception:
        exc_type, exc_value, exc_traceback = sys.exc_info()
        utils.err(''.join(
            traceback.format_exception(exc_type, exc_value, exc_traceback)))

    return 0
Example #58
def find_bindir_path(config_file):
  """Returns the bin directory path"""
  try:
    fd = open(config_file)
  except IOError as e:
    utils.err("Error for Config file (%s): %s" % (config_file, e))
    return None
  try:
    for line in fd:
      if line.startswith("{path_config_bindir"):
        return line.split(",")[1].split("\"")[1]
  finally:
    fd.close()
Example #59
def main(args):
  """Collects and dumps stats from a PostgreSQL server."""

  try:
    db = postgresqlutils.connect()
  except Exception as e:
    utils.err("error: Could not initialize collector: %s" % (e))
    return 13 # Ask tcollector to not respawn us

  while True:
    collect(db)
    sys.stdout.flush()
    time.sleep(COLLECTION_INTERVAL)
Example #60
def process_domain(domain, pid):
    """Process one domain (vm)"""
    # skip vms that are not running
    if domain.isActive() != 1:
        utils.err("Domain %s is inactive. Skipping." % domain.name())
        return False
    if not pid:
        utils.err("Cannot find PID for domain %s. Skipping." % domain.name())
        return False
    if not psutil.pid_exists(pid):
        utils.err("PID %d no longer exists for domain %s. Skipping." %
                  (pid, domain.name()))
        return False

    # populate vm structure with metrics
    try:
        vm = {}
        vm[FIELDS["cpu_time"]] = get_cpu_time(pid)
        vm[FIELDS["cpu_load"]] = get_cpu_load(pid)
        vm[FIELDS["memory"]] = get_memory(domain)
        vm[FIELDS["max_memory"]] = domain.maxMemory()
        vm[FIELDS["max_vcpus"]] = domain.maxVcpus()

        xml = BeautifulSoup(domain.XMLDesc())
        vm[TAG_DEPLOY_ID] = domain.name()
        vm[TAG_TYPE] = get_type(domain, xml)

        vm.update(get_network_traffic(domain, xml))
        vm.update(get_disk_io(domain, xml))
    except LibvirtVmDataError as err:
        utils.err(err.value)
        return False

    print_vm(vm)
    return True
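process_domain and main rely on a FIELDS table mapping short keys to full metric names, plus two tag-name constants. The comment in main confirms FIELDS["count"] is libvirt.vm.count; the other entries below follow the same shape but are assumptions:

# Assumed shape of the collector's constants; only "count" is confirmed
# by the "write libvirt.vm.count metric" comment above.
FIELDS = {
    "cpu_time": "libvirt.vm.cpu_time",
    "cpu_load": "libvirt.vm.cpu_load",
    "memory": "libvirt.vm.memory",
    "max_memory": "libvirt.vm.max_memory",
    "max_vcpus": "libvirt.vm.max_vcpus",
    "count": "libvirt.vm.count",
}
TAG_DEPLOY_ID = "deploy_id"
TAG_TYPE = "type"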