def run(self): """ Run all instances. """ # Store run statistics if needed before, after = None, None if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME: try: before = AgentCheck._collect_internal_stats() except Exception: # It's fine if we can't collect stats for the run, just log and proceed self.log.debug( "Failed to collect Agent Stats before check {0}".format( self.name)) instance_statuses = [] for i, instance in enumerate(self.instances): try: min_collection_interval = instance.get( 'min_collection_interval', self.init_config.get('min_collection_interval', self.DEFAULT_MIN_COLLECTION_INTERVAL)) now = time.time() if now - self.last_collection_time[i] < min_collection_interval: self.log.debug( "Not running instance #{0} of check {1} as it ran less than {2}s ago" .format(i, self.name, min_collection_interval)) continue self.last_collection_time[i] = now check_start_time = None if self.in_developer_mode: check_start_time = timeit.default_timer() self.check(copy.deepcopy(instance)) instance_check_stats = None if check_start_time is not None: instance_check_stats = { 'run_time': timeit.default_timer() - check_start_time } if self.has_warnings(): instance_status = check_status.InstanceStatus( i, check_status.STATUS_WARNING, warnings=self.get_warnings(), instance_check_stats=instance_check_stats) else: instance_status = check_status.InstanceStatus( i, check_status.STATUS_OK, instance_check_stats=instance_check_stats) except Exception, e: self.log.exception("Check '%s' instance #%s failed" % (self.name, i)) instance_status = check_status.InstanceStatus( i, check_status.STATUS_ERROR, error=str(e), tb=traceback.format_exc()) finally:
def run(self):
    """Run the check against every configured instance.

    An instance is skipped when less than its ``min_collection_interval``
    has elapsed since ``self.last_collection_time[i]``. The interval is read
    from the instance, falling back to ``init_config`` and then to
    ``DEFAULT_MIN_COLLECTION_INTERVAL``. Every instance that does run is
    handed a deep copy of its configuration.

    Returns:
        list: one ``check_status.InstanceStatus`` per instance that was
        actually run (OK, WARNING or ERROR). Skipped instances add no entry.
    """
    instance_statuses = []
    for i, instance in enumerate(self.instances):
        try:
            min_collection_interval = instance.get(
                'min_collection_interval',
                self.init_config.get(
                    'min_collection_interval',
                    self.DEFAULT_MIN_COLLECTION_INTERVAL))
            now = time.time()
            if now - self.last_collection_time[i] < min_collection_interval:
                self.log.debug(
                    "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                    .format(i, self.name, min_collection_interval))
                # Skipped instances produce no status entry at all.
                continue
            self.last_collection_time[i] = now

            # Deep copy so the check cannot mutate the stored configuration.
            self.check(copy.deepcopy(instance))

            if self.has_warnings():
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_WARNING,
                    warnings=self.get_warnings())
            else:
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_OK)
        except Exception as e:
            # A failing instance is recorded as an error; the remaining
            # instances still run.
            self.log.exception(
                "Check '%s' instance #%s failed" % (self.name, i))
            instance_status = check_status.InstanceStatus(
                i, check_status.STATUS_ERROR, error=e,
                tb=traceback.format_exc())
        instance_statuses.append(instance_status)
    # Hand the collected statuses back to the caller instead of dropping them.
    return instance_statuses
def run(self):
    """Run the check against every configured instance.

    Each instance is passed to ``self.check``. A run that raises is logged
    and recorded as an error status; the remaining instances still run.

    Returns:
        list: one ``check_status.InstanceStatus`` per instance, in instance
        order, with ``STATUS_OK`` or ``STATUS_ERROR``.
    """
    instance_statuses = []
    for i, instance in enumerate(self.instances):
        try:
            self.check(instance)
            instance_status = check_status.InstanceStatus(
                i, check_status.STATUS_OK)
        except Exception as e:
            self.log.exception(
                "Check '%s' instance #%s failed" % (self.name, i))
            # The exception object is passed positionally, as the third
            # argument of InstanceStatus.
            instance_status = check_status.InstanceStatus(
                i, check_status.STATUS_ERROR, e)
        instance_statuses.append(instance_status)
    # Hand the collected statuses back to the caller instead of dropping them.
    return instance_statuses
def run(self):
    """Run the check against every configured instance.

    Each instance is passed to ``self.check``. If the check left warnings
    behind (``self.has_warnings()``), they are fetched with
    ``self.get_warnings()`` and attached to a warning status. A run that
    raises is logged and recorded as an error status together with the
    formatted traceback; the remaining instances still run.

    Returns:
        list: one ``check_status.InstanceStatus`` per instance, in instance
        order, with ``STATUS_OK``, ``STATUS_WARNING`` or ``STATUS_ERROR``.
    """
    instance_statuses = []
    for i, instance in enumerate(self.instances):
        try:
            self.check(instance)
            if self.has_warnings():
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_WARNING,
                    warnings=self.get_warnings())
            else:
                instance_status = check_status.InstanceStatus(
                    i, check_status.STATUS_OK)
        except Exception as e:
            self.log.exception(
                "Check '%s' instance #%s failed" % (self.name, i))
            instance_status = check_status.InstanceStatus(
                i, check_status.STATUS_ERROR, error=e,
                tb=traceback.format_exc())
        instance_statuses.append(instance_status)
    # Hand the collected statuses back to the caller instead of dropping them.
    return instance_statuses
def run(self):
    """Run every configured instance and report one status per executed run.

    In developer mode (and unless this check is the agent-metrics check
    itself) internal agent stats are collected before and after the run;
    when profiling is allowed the difference is stored and logged. Failures
    to collect those stats are only logged at debug level.

    An instance is skipped when less than its ``min_collection_interval``
    (instance value, else ``self.min_collection_interval``) has elapsed
    since its last run. Instance metadata is rolled up after every loop
    iteration, including skipped and failed ones.

    Returns:
        list: one ``check_status.InstanceStatus`` per instance that was
        actually run (OK, WARNING or ERROR). Skipped instances add no entry.
    """
    stats_before, stats_after = None, None
    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_before = AgentCheck._collect_internal_stats()
        except Exception:
            # Stats are best effort: log and carry on with the run.
            self.log.debug(
                "Failed to collect Agent Stats before check {0}".format(
                    self.name))

    instance_statuses = []
    for idx, instance in enumerate(self.instances):
        try:
            min_collection_interval = instance.get(
                'min_collection_interval', self.min_collection_interval)
            now = time.time()
            if now - self.last_collection_time[idx] < min_collection_interval:
                self.log.debug(
                    "Not running instance #{0} of check {1} as it ran less than {2}s ago"
                    .format(idx, self.name, min_collection_interval))
                continue
            self.last_collection_time[idx] = now

            # Only time the check when running in developer mode.
            started_at = (timeit.default_timer()
                          if self.in_developer_mode else None)
            self.check(copy.deepcopy(instance))
            if started_at is None:
                run_stats = None
            else:
                run_stats = {'run_time': timeit.default_timer() - started_at}

            status_kwargs = {'instance_check_stats': run_stats}
            if self.has_warnings():
                status_code = check_status.STATUS_WARNING
                status_kwargs['warnings'] = self.get_warnings()
            else:
                status_code = check_status.STATUS_OK
            instance_status = check_status.InstanceStatus(
                idx, status_code, **status_kwargs)
        except Exception as e:
            self.log.exception(
                "Check '%s' instance #%s failed" % (self.name, idx))
            instance_status = check_status.InstanceStatus(
                idx, check_status.STATUS_ERROR,
                error=str(e), tb=traceback.format_exc())
        finally:
            self._roll_up_instance_metadata()

        # Drop any leftover warnings so the next instance starts clean.
        self.get_warnings()
        instance_statuses.append(instance_status)

    if self.in_developer_mode and self.name != AGENT_METRICS_CHECK_NAME:
        try:
            stats_after = AgentCheck._collect_internal_stats()
            if self.allow_profiling:
                self._set_internal_profiling_stats(stats_before, stats_after)
                log.info(
                    "\n \t %s %s"
                    % (self.name,
                       pretty_statistics(self._internal_profiling_stats)))
        except Exception:
            # Stats are best effort: log and still return the statuses.
            self.log.debug(
                "Failed to collect Agent Stats after check {0}".format(
                    self.name))

    return instance_statuses
def run(self):
    """Ping every configured host with fping and report per-instance statuses.

    All instance addresses (``instance['addr']``) are handed to a single
    ``Fping`` object, which is run repeatedly until the elapsed time reaches
    ``self._last_check_time``. For every address in a round's result:

    * a ``None`` value counts as a loss (``loss_cnt``) and is tallied in
      ``failures``;
    * any other value is submitted to the ``<basename>.rtt`` histogram;
    * ``total_cnt`` is incremented and instance metadata is rolled up.

    Instance statuses are taken from the first round only: WARNING for a
    loss, OK otherwise. After the last round one event is emitted for each
    address that had at least one loss.

    Returns:
        list: ``check_status.InstanceStatus`` objects positioned by instance
        order. Slots for addresses missing from the first round's result
        stay ``None``.
    """
    instance_by_addr = {}
    hosts = []
    for instance in self.instances:
        addr = instance['addr']
        # If an address repeats, the last instance using it wins here.
        instance_by_addr[addr] = instance
        hosts.append(addr)

    # Position of the first occurrence of each address, i.e. exactly what
    # hosts.index(addr) would return, without a linear scan per lookup.
    host_position = {}
    for position, addr in enumerate(hosts):
        host_position.setdefault(addr, position)

    instance_statuses = [None] * len(hosts)
    fping = Fping(hosts, self._ping_timeout)

    # Record elapsed time for fping.
    check_start_time = timeit.default_timer()
    elapsed_time = 0
    num = 0
    failures = {}
    while elapsed_time < self._last_check_time:
        # presumably a mapping of address -> round-trip time, or None when
        # there was no reply; verify against Fping.run()
        result = fping.run()
        exec_time = timeit.default_timer()
        elapsed_time = exec_time - check_start_time
        num += 1
        instance_check_stats = {
            'run_time': timeit.default_timer() - check_start_time
        }
        # Only the first round decides the reported instance status.
        first_round = num == 1

        for addr, v in result.items():
            instance = instance_by_addr[addr]
            if v is None:
                self._increment_with_tags('loss_cnt', instance)
                failures[addr] = failures.get(addr, 0) + 1
                if first_round:
                    instance_status = check_status.InstanceStatus(
                        host_position[addr], check_status.STATUS_WARNING,
                        warnings=self.get_warnings(),
                        instance_check_stats=instance_check_stats)
            else:
                self.histogram('%s.rtt' % self._basename, v,
                               tags=self._instance_tags(instance))
                if first_round:
                    instance_status = check_status.InstanceStatus(
                        host_position[addr], check_status.STATUS_OK,
                        instance_check_stats=instance_check_stats)

            self._increment_with_tags('total_cnt', instance)
            self._roll_up_instance_metadata()
            if first_round:
                instance_statuses[host_position[addr]] = instance_status

    # failures is only non-empty if at least one round ran, so exec_time is
    # always bound when this loop body executes.
    for addr, failure_count in failures.items():
        self.event({
            'timestamp': int(exec_time),
            'event_type': self._basename,
            'msg_title': 'fping timeout',
            'msg_text': 'ICMP Network Unreachable for ICMP Echo sent to %s %d times' % (addr, failure_count),
            'aggregation_key': md5(addr).hexdigest()
        })

    elapsed_time = timeit.default_timer() - check_start_time
    self.log.info("elapsed_time:%s[sec] check_times: %d" %
                  (round(elapsed_time, 2), num))
    return instance_statuses