Example #1
  def __init__(self, initializer_module, in_minutes=True):

    self.cachedir = initializer_module.config.alerts_cachedir
    self.stacks_dir = initializer_module.config.stacks_dir
    self.common_services_dir = initializer_module.config.common_services_dir
    self.extensions_dir = initializer_module.config.extensions_dir
    self.host_scripts_dir = initializer_module.config.host_scripts_dir
    self.configuration_builder = initializer_module.configuration_builder

    self._cluster_configuration = initializer_module.configurations_cache
    self.alert_definitions_cache = initializer_module.alert_definitions_cache

    self.config = initializer_module.config

    # the amount of time, in seconds, that an alert can run after its scheduled time
    alert_grace_period = int(self.config.get('agent', 'alert_grace_period', 5))

    apscheduler_standalone = False

    self.APS_CONFIG = {
      'apscheduler.threadpool.core_threads': 3,
      'apscheduler.coalesce': True,
      'apscheduler.standalone': apscheduler_standalone,
      'apscheduler.misfire_grace_time': alert_grace_period,
      'apscheduler.threadpool.context_injector': self._job_context_injector if not apscheduler_standalone else None,
      'apscheduler.threadpool.agent_config': self.config
    }

    self._collector = AlertCollector()
    self.__scheduler = Scheduler(self.APS_CONFIG)
    self.__in_minutes = in_minutes
    self.recovery_manager = initializer_module.recovery_manager

    # register python exit handler
    ExitHelper().register(self.exit_handler)
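
All of these examples drive APScheduler 2.x through a plain gconfig dictionary whose keys carry the `apscheduler.` prefix. Below is a minimal sketch, assuming stock APScheduler 2.x, of how such a config maps onto the `Scheduler`, `add_interval_job`, and `job.name` calls the examples use. Note that the `threadpool.context_injector` and `threadpool.agent_config` keys seen in Example #1 are not stock APScheduler options and appear to rely on the patched copy of the library that Ambari bundles.

# Minimal sketch, assuming stock APScheduler 2.x (not code from the examples).
from apscheduler.scheduler import Scheduler

aps_config = {
    'apscheduler.threadpool.core_threads': 3,   # worker threads for jobs
    'apscheduler.coalesce': True,               # collapse missed runs into one
    'apscheduler.standalone': False,            # start() returns; runs in a background thread
    'apscheduler.misfire_grace_time': 5         # seconds a job may fire late
}

scheduler = Scheduler(aps_config)
scheduler.start()

# The examples name each job with the alert definition's UUID so it can
# be found and unscheduled later.
job = scheduler.add_interval_job(lambda: None, seconds=30)
job.name = 'some-definition-uuid'

scheduler.shutdown(wait=False)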
Example #2
  def __init__(self, cachedir, stacks_dir, common_services_dir, host_scripts_dir,
      alert_grace_period, cluster_configuration, config, recovery_manager,
      in_minutes=True):

    self.cachedir = cachedir
    self.stacks_dir = stacks_dir
    self.common_services_dir = common_services_dir
    self.host_scripts_dir = host_scripts_dir

    self._cluster_configuration = cluster_configuration
    
    if not os.path.exists(cachedir):
      try:
        os.makedirs(cachedir)
      except:
        logger.critical("[AlertScheduler] Could not create the cache directory {0}".format(cachedir))

    self.APS_CONFIG = {
      'apscheduler.threadpool.core_threads': 3,
      'apscheduler.coalesce': True,
      'apscheduler.standalone': False,
      'apscheduler.misfire_grace_time': alert_grace_period
    }

    self._collector = AlertCollector()
    self.__scheduler = Scheduler(self.APS_CONFIG)
    self.__in_minutes = in_minutes
    self.config = config
    self.recovery_manager = recovery_manager

    # register python exit handler
    ExitHelper().register(self.exit_handler)
Example #3
    def __init__(self,
                 cachedir,
                 stacks_dir,
                 common_services_dir,
                 host_scripts_dir,
                 cluster_configuration,
                 config,
                 in_minutes=True):

        self.cachedir = cachedir
        self.stacks_dir = stacks_dir
        self.common_services_dir = common_services_dir
        self.host_scripts_dir = host_scripts_dir

        self._cluster_configuration = cluster_configuration

        if not os.path.exists(cachedir):
            try:
                os.makedirs(cachedir)
            except:
                logger.critical(
                    "[AlertScheduler] Could not create the cache directory {0}"
                    .format(cachedir))

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.config = config

        # register python exit handler
        ExitHelper().register(self.exit_handler)
Example #4
    def __init__(self,
                 cachedir,
                 stacks_dir,
                 common_services_dir,
                 extensions_dir,
                 host_scripts_dir,
                 cluster_configuration,
                 config,
                 recovery_manager,
                 in_minutes=True):

        self.cachedir = cachedir
        self.stacks_dir = stacks_dir
        self.common_services_dir = common_services_dir
        self.extensions_dir = extensions_dir
        self.host_scripts_dir = host_scripts_dir

        self._cluster_configuration = cluster_configuration

        # a mapping between a cluster name and a unique hash for all definitions
        self._cluster_hashes = {}

        # the amount of time, in seconds, that an alert can run after its scheduled time
        alert_grace_period = int(config.get('agent', 'alert_grace_period', 5))

        if not os.path.exists(cachedir):
            try:
                os.makedirs(cachedir)
            except:
                logger.critical(
                    "[AlertScheduler] Could not create the cache directory {0}"
                    .format(cachedir))

        apscheduler_standalone = False

        self.APS_CONFIG = {
            'apscheduler.threadpool.core_threads': 3,
            'apscheduler.coalesce': True,
            'apscheduler.standalone': apscheduler_standalone,
            'apscheduler.misfire_grace_time': alert_grace_period,
            'apscheduler.threadpool.context_injector':
                self._job_context_injector if not apscheduler_standalone else None,
            'apscheduler.threadpool.agent_config': config
        }

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(self.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.config = config
        self.recovery_manager = recovery_manager

        # register python exit handler
        ExitHelper().register(self.exit_handler)
Example #5
    def __init__(self, cachedir, stacks_dir, in_minutes=True):
        self.cachedir = cachedir
        self.stacks_dir = stacks_dir

        if not os.path.exists(cachedir) and AlertSchedulerHandler.make_cachedir:
            try:
                os.makedirs(cachedir)
            except:
                logger.critical(
                    "Could not create the cache directory {0}".format(
                        cachedir))
                pass

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.__config_maps = {}
Example #6
  def __init__(self, cachedir, stacks_dir, common_services_dir, host_scripts_dir,
      cluster_configuration, config, recovery_manager, in_minutes=True):

    self.cachedir = cachedir
    self.stacks_dir = stacks_dir
    self.common_services_dir = common_services_dir
    self.host_scripts_dir = host_scripts_dir

    self._cluster_configuration = cluster_configuration

    # a mapping between a cluster name and a unique hash for all definitions
    self._cluster_hashes = {}

    # the amount of time, in seconds, that an alert can run after its scheduled time
    alert_grace_period = int(config.get('agent', 'alert_grace_period', 5))

    if not os.path.exists(cachedir):
      try:
        os.makedirs(cachedir)
      except:
        logger.critical("[AlertScheduler] Could not create the cache directory {0}".format(cachedir))

    self.APS_CONFIG = {
      'apscheduler.threadpool.core_threads': 3,
      'apscheduler.coalesce': True,
      'apscheduler.standalone': False,
      'apscheduler.misfire_grace_time': alert_grace_period
    }

    self._collector = AlertCollector()
    self.__scheduler = Scheduler(self.APS_CONFIG)
    self.__in_minutes = in_minutes
    self.config = config
    self.recovery_manager = recovery_manager

    # register python exit handler
    ExitHelper().register(self.exit_handler)
Example #7
class AlertSchedulerHandler():
  FILENAME = 'definitions.json'
  TYPE_PORT = 'PORT'
  TYPE_METRIC = 'METRIC'
  TYPE_SCRIPT = 'SCRIPT'
  TYPE_WEB = 'WEB'
  TYPE_RECOVERY = 'RECOVERY'

  def __init__(self, cachedir, stacks_dir, common_services_dir, host_scripts_dir,
      alert_grace_period, cluster_configuration, config, recovery_manager,
      in_minutes=True):

    self.cachedir = cachedir
    self.stacks_dir = stacks_dir
    self.common_services_dir = common_services_dir
    self.host_scripts_dir = host_scripts_dir

    self._cluster_configuration = cluster_configuration
    
    if not os.path.exists(cachedir):
      try:
        os.makedirs(cachedir)
      except:
        logger.critical("[AlertScheduler] Could not create the cache directory {0}".format(cachedir))

    self.APS_CONFIG = {
      'apscheduler.threadpool.core_threads': 3,
      'apscheduler.coalesce': True,
      'apscheduler.standalone': False,
      'apscheduler.misfire_grace_time': alert_grace_period
    }

    self._collector = AlertCollector()
    self.__scheduler = Scheduler(self.APS_CONFIG)
    self.__in_minutes = in_minutes
    self.config = config
    self.recovery_manager = recovery_manager

    # register python exit handler
    ExitHelper().register(self.exit_handler)


  def exit_handler(self):
    """
    Exit handler
    """
    self.stop()


  def update_definitions(self, heartbeat):
    """
    Updates the persisted alert definitions JSON.
    :param heartbeat:
    :return:
    """
    if 'alertDefinitionCommands' not in heartbeat:
      logger.warning("There are no alert definition commands in the heartbeat; unable to update definitions")
      return

    # prune out things we don't want to store
    alert_definitions = []
    for command in heartbeat['alertDefinitionCommands']:
      command_copy = command.copy()

      # no need to store these since we always use the in-memory cached values
      if 'configurations' in command_copy:
        del command_copy['configurations']

      alert_definitions.append(command_copy)

    # write out the new definitions
    with open(os.path.join(self.cachedir, self.FILENAME), 'w') as f:
      json.dump(alert_definitions, f, indent=2)

    # reschedule only the jobs that have changed
    self.reschedule()


  def __make_function(self, alert_def):
    return lambda: alert_def.collect()


  def start(self):
    """ loads definitions from file and starts the scheduler """

    if self.__scheduler is None:
      return

    if self.__scheduler.running:
      self.__scheduler.shutdown(wait=False)
      self.__scheduler = Scheduler(self.APS_CONFIG)

    alert_callables = self.__load_definitions()

    # schedule each definition
    for _callable in alert_callables:
      self.schedule_definition(_callable)

    logger.info("[AlertScheduler] Starting {0}; currently running: {1}".format(
      str(self.__scheduler), str(self.__scheduler.running)))

    self.__scheduler.start()


  def stop(self):
    if self.__scheduler is not None:
      self.__scheduler.shutdown(wait=False)
      self.__scheduler = Scheduler(self.APS_CONFIG)

    logger.info("[AlertScheduler] Stopped the alert scheduler.")

  def reschedule(self):
    """
    Removes scheduled jobs whose UUIDs are no longer valid and schedules
    jobs for definitions whose UUIDs are not currently scheduled.
    """
    jobs_scheduled = 0
    jobs_removed = 0

    definitions = self.__load_definitions()
    scheduled_jobs = self.__scheduler.get_jobs()

    # for every scheduled job, see if its UUID is still valid
    for scheduled_job in scheduled_jobs:
      uuid_valid = False

      for definition in definitions:
        definition_uuid = definition.get_uuid()
        if scheduled_job.name == definition_uuid:
          uuid_valid = True
          break

      # jobs without valid UUIDs should be unscheduled
      if not uuid_valid:
        jobs_removed += 1
        logger.info("[AlertScheduler] Unscheduling {0}".format(scheduled_job.name))
        self._collector.remove_by_uuid(scheduled_job.name)
        self.__scheduler.unschedule_job(scheduled_job)

    # for every definition, determine if there is a scheduled job
    for definition in definitions:
      definition_scheduled = False
      for scheduled_job in scheduled_jobs:
        definition_uuid = definition.get_uuid()
        if definition_uuid == scheduled_job.name:
          definition_scheduled = True
          break

      # if no jobs are found with the definition's UUID, schedule it
      if not definition_scheduled:
        jobs_scheduled += 1
        self.schedule_definition(definition)

    logger.info("[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled".format(
        str(jobs_scheduled), str(jobs_removed)))


  def reschedule_all(self):
    """
    Unschedules every currently scheduled job and then schedules a new job
    for every definition.
    """
    jobs_scheduled = 0
    jobs_removed = 0

    definitions = self.__load_definitions()
    scheduled_jobs = self.__scheduler.get_jobs()

    # unschedule all scheduled jobs
    for scheduled_job in scheduled_jobs:
        jobs_removed += 1
        logger.info("[AlertScheduler] Unscheduling {0}".format(scheduled_job.name))
        self._collector.remove_by_uuid(scheduled_job.name)
        self.__scheduler.unschedule_job(scheduled_job)

    # for every definition, schedule a job
    for definition in definitions:
        jobs_scheduled += 1
        self.schedule_definition(definition)

    logger.info("[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled".format(
      str(jobs_scheduled), str(jobs_removed)))


  def collector(self):
    """ gets the collector for reporting to the server """
    return self._collector


  def __load_definitions(self):
    """
    Loads all alert definitions from a file. All clusters are stored in
    a single file.
    :return:
    """
    definitions = []

    all_commands = None
    alerts_definitions_path = os.path.join(self.cachedir, self.FILENAME)
    try:
      with open(alerts_definitions_path) as fp:
        all_commands = json.load(fp)
    except:
      logger.warning('[AlertScheduler] {0} not found or invalid. No alerts will be scheduled until registration occurs.'.format(alerts_definitions_path))
      return definitions

    for command_json in all_commands:
      clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
      hostName = '' if not 'hostName' in command_json else command_json['hostName']

      for definition in command_json['alertDefinitions']:
        alert = self.__json_to_callable(clusterName, hostName, definition)

        if alert is None:
          continue

        alert.set_helpers(self._collector, self._cluster_configuration)

        definitions.append(alert)

    return definitions


  def __json_to_callable(self, clusterName, hostName, json_definition):
    """
    Converts the JSON that represents all aspects of a definition into an
    object extending BaseAlert that is used for the individual alert check.
    """
    alert = None

    try:
      source = json_definition['source']
      source_type = source.get('type', '')

      if logger.isEnabledFor(logging.DEBUG):
        logger.debug("[AlertScheduler] Creating job type {0} with {1}".format(source_type, str(json_definition)))


      if source_type == AlertSchedulerHandler.TYPE_METRIC:
        alert = MetricAlert(json_definition, source, self.config)
      elif source_type == AlertSchedulerHandler.TYPE_PORT:
        alert = PortAlert(json_definition, source)
      elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
        source['stacks_directory'] = self.stacks_dir
        source['common_services_directory'] = self.common_services_dir
        source['host_scripts_directory'] = self.host_scripts_dir
        alert = ScriptAlert(json_definition, source, self.config)
      elif source_type == AlertSchedulerHandler.TYPE_WEB:
        alert = WebAlert(json_definition, source, self.config)
      elif source_type == AlertSchedulerHandler.TYPE_RECOVERY:
        alert = RecoveryAlert(json_definition, source, self.recovery_manager)

      if alert is not None:
        alert.set_cluster(clusterName, hostName)

    except Exception, exception:
      logger.exception("[AlertScheduler] Unable to load an invalid alert definition. It will be skipped.")

    return alert
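
A detail worth noting in Example #7 (and the others): each alert is wrapped via the `__make_function` factory rather than by creating a lambda inline in the scheduling loop. Binding `alert_def` as a parameter pins the value per closure and avoids Python's late-binding behavior, where every lambda created in a loop would end up calling `collect()` on the loop variable's final value. A small self-contained demonstration:

# Closures created in a loop capture the variable, not its value at
# creation time; a factory pins the value per closure.
items = ['a', 'b', 'c']

inline = [lambda: item for item in items]                    # late binding
factory = [(lambda x: (lambda: x))(item) for item in items]  # bound per item

print([f() for f in inline])   # ['c', 'c', 'c']
print([f() for f in factory])  # ['a', 'b', 'c']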
Example #8
class AlertSchedulerHandler():
  FILENAME = 'definitions.json'
  TYPE_PORT = 'PORT'
  TYPE_METRIC = 'METRIC'
  TYPE_SCRIPT = 'SCRIPT'
  TYPE_WEB = 'WEB'

  APS_CONFIG = { 
    'threadpool.core_threads': 3,
    'coalesce': True,
    'standalone': False
  }

  def __init__(self, cachedir, stacks_dir, common_services_dir, host_scripts_dir,
      cluster_configuration, config, in_minutes=True):

    self.cachedir = cachedir
    self.stacks_dir = stacks_dir
    self.common_services_dir = common_services_dir
    self.host_scripts_dir = host_scripts_dir

    self._cluster_configuration = cluster_configuration
    
    if not os.path.exists(cachedir):
      try:
        os.makedirs(cachedir)
      except:
        logger.critical("[AlertScheduler] Could not create the cache directory {0}".format(cachedir))

    self._collector = AlertCollector()
    self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)
    self.__in_minutes = in_minutes
    self.config = config

    # register python exit handler
    atexit.register(self.exit_handler)


  def exit_handler(self):
    """
    Exit handler
    """
    self.stop()


  def update_definitions(self, heartbeat):
    """
    Updates the persisted alert definitions JSON.
    :param heartbeat:
    :return:
    """
    if 'alertDefinitionCommands' not in heartbeat:
      logger.warning("There are no alert definition commands in the heartbeat; unable to update definitions")
      return

    # prune out things we don't want to store
    alert_definitions = []
    for command in heartbeat['alertDefinitionCommands']:
      command_copy = command.copy()

      # no need to store these since we always use the in-memory cached values
      if 'configurations' in command_copy:
        del command_copy['configurations']

      alert_definitions.append(command_copy)

    # write out the new definitions
    with open(os.path.join(self.cachedir, self.FILENAME), 'w') as f:
      json.dump(alert_definitions, f, indent=2)

    # reschedule only the jobs that have changed
    self.reschedule()


  def __make_function(self, alert_def):
    return lambda: alert_def.collect()


  def start(self):
    """ loads definitions from file and starts the scheduler """

    if self.__scheduler is None:
      return

    if self.__scheduler.running:
      self.__scheduler.shutdown(wait=False)
      self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)

    alert_callables = self.__load_definitions()

    # schedule each definition
    for _callable in alert_callables:
      self.schedule_definition(_callable)

    logger.info("[AlertScheduler] Starting {0}; currently running: {1}".format(
      str(self.__scheduler), str(self.__scheduler.running)))

    self.__scheduler.start()


  def stop(self):
    if self.__scheduler is not None:
      self.__scheduler.shutdown(wait=False)
      self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)

    logger.info("[AlertScheduler] Stopped the alert scheduler.")

  def reschedule(self):
    """
    Removes scheduled jobs whose UUIDs are no longer valid and schedules
    jobs for definitions whose UUIDs are not currently scheduled.
    """
    jobs_scheduled = 0
    jobs_removed = 0

    definitions = self.__load_definitions()
    scheduled_jobs = self.__scheduler.get_jobs()

    # for every scheduled job, see if its UUID is still valid
    for scheduled_job in scheduled_jobs:
      uuid_valid = False

      for definition in definitions:
        definition_uuid = definition.get_uuid()
        if scheduled_job.name == definition_uuid:
          uuid_valid = True
          break

      # jobs without valid UUIDs should be unscheduled
      if not uuid_valid:
        jobs_removed += 1
        logger.info("[AlertScheduler] Unscheduling {0}".format(scheduled_job.name))
        self._collector.remove_by_uuid(scheduled_job.name)
        self.__scheduler.unschedule_job(scheduled_job)

    # for every definition, determine if there is a scheduled job
    for definition in definitions:
      definition_scheduled = False
      for scheduled_job in scheduled_jobs:
        definition_uuid = definition.get_uuid()
        if definition_uuid == scheduled_job.name:
          definition_scheduled = True
          break

      # if no jobs are found with the definition's UUID, schedule it
      if not definition_scheduled:
        jobs_scheduled += 1
        self.schedule_definition(definition)

    logger.info("[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled".format(
        str(jobs_scheduled), str(jobs_removed)))


  def reschedule_all(self):
    """
    Unschedules every currently scheduled job and then schedules a new job
    for every definition.
    """
    jobs_scheduled = 0
    jobs_removed = 0

    definitions = self.__load_definitions()
    scheduled_jobs = self.__scheduler.get_jobs()

    # unschedule all scheduled jobs
    for scheduled_job in scheduled_jobs:
        jobs_removed += 1
        logger.info("[AlertScheduler] Unscheduling {0}".format(scheduled_job.name))
        self._collector.remove_by_uuid(scheduled_job.name)
        self.__scheduler.unschedule_job(scheduled_job)

    # for every definition, schedule a job
    for definition in definitions:
        jobs_scheduled += 1
        self.schedule_definition(definition)

    logger.info("[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled".format(
      str(jobs_scheduled), str(jobs_removed)))


  def collector(self):
    """ gets the collector for reporting to the server """
    return self._collector


  def __load_definitions(self):
    """
    Loads all alert definitions from a file. All clusters are stored in
    a single file.
    :return:
    """
    definitions = []

    all_commands = None
    alerts_definitions_path = os.path.join(self.cachedir, self.FILENAME)
    try:
      with open(alerts_definitions_path) as fp:
        all_commands = json.load(fp)
    except:
      logger.warning('[AlertScheduler] {0} not found or invalid. No alerts will be scheduled until registration occurs.'.format(alerts_definitions_path))
      return definitions

    for command_json in all_commands:
      clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
      hostName = '' if not 'hostName' in command_json else command_json['hostName']

      for definition in command_json['alertDefinitions']:
        alert = self.__json_to_callable(clusterName, hostName, definition)

        if alert is None:
          continue

        alert.set_helpers(self._collector, self._cluster_configuration)

        definitions.append(alert)

    return definitions


  def __json_to_callable(self, clusterName, hostName, json_definition):
    """
    Converts the JSON that represents all aspects of a definition into an
    object extending BaseAlert that is used for the individual alert check.
    """
    source = json_definition['source']
    source_type = source.get('type', '')

    if logger.isEnabledFor(logging.DEBUG):
      logger.debug("[AlertScheduler] Creating job type {0} with {1}".format(source_type, str(json_definition)))

    alert = None

    if source_type == AlertSchedulerHandler.TYPE_METRIC:
      alert = MetricAlert(json_definition, source)
    elif source_type == AlertSchedulerHandler.TYPE_PORT:
      alert = PortAlert(json_definition, source)
    elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
      source['stacks_directory'] = self.stacks_dir
      source['common_services_directory'] = self.common_services_dir
      source['host_scripts_directory'] = self.host_scripts_dir
      alert = ScriptAlert(json_definition, source, self.config)
    elif source_type == AlertSchedulerHandler.TYPE_WEB:
      alert = WebAlert(json_definition, source, self.config)

    if alert is not None:
      alert.set_cluster(clusterName, hostName)

    return alert


  def schedule_definition(self, definition):
    """
    Schedule a definition (callable). Scheduled jobs are given the UUID
    as their name so that they can be identified later on.
    <p/>
    This function can be called with a definition that is disabled; it will
    simply NOOP.
    """
    # NOOP if the definition is disabled; don't schedule it
    if not definition.is_enabled():
      logger.info("[AlertScheduler] The alert {0} with UUID {1} is disabled and will not be scheduled".format(
          definition.get_name(), definition.get_uuid()))
      return

    job = None

    if self.__in_minutes:
      job = self.__scheduler.add_interval_job(self.__make_function(definition),
        minutes=definition.interval())
    else:
      job = self.__scheduler.add_interval_job(self.__make_function(definition),
        seconds=definition.interval())

    # although the documentation states that Job(kwargs) takes a name
    # key/value pair, it does not actually set the name; do it manually
    if job is not None:
      job.name = definition.get_uuid()

    logger.info("[AlertScheduler] Scheduling {0} with UUID {1}".format(
      definition.get_name(), definition.get_uuid()))


  def get_job_count(self):
    """
    Gets the number of jobs currently scheduled. This is mainly used for
    test verification of scheduling.
    """
    if self.__scheduler is None:
      return 0

    return len(self.__scheduler.get_jobs())


  def execute_alert(self, execution_commands):
    """
    Executes an alert immediately, ignoring any scheduled jobs. The existing
    jobs remain untouched. The result of this is stored in the alert
    collector for transmission during the next heartbeat
    """
    if self.__scheduler is None or execution_commands is None:
      return

    for execution_command in execution_commands:
      try:
        alert_definition = execution_command['alertDefinition']

        clusterName = '' if not 'clusterName' in execution_command else execution_command['clusterName']
        hostName = '' if not 'hostName' in execution_command else execution_command['hostName']

        alert = self.__json_to_callable(clusterName, hostName, alert_definition)

        if alert is None:
          continue

        logger.info("[AlertScheduler] Executing on-demand alert {0} ({1})".format(alert.get_name(),
            alert.get_uuid()))

        alert.set_helpers(self._collector, self._cluster_configuration)
        alert.collect()
      except:
        logger.exception("[AlertScheduler] Unable to execute the alert outside of the job scheduler")
Example #9
class AlertSchedulerHandler():
    TYPE_PORT = 'PORT'
    TYPE_METRIC = 'METRIC'
    TYPE_AMS = 'AMS'
    TYPE_SCRIPT = 'SCRIPT'
    TYPE_WEB = 'WEB'
    TYPE_RECOVERY = 'RECOVERY'

    def __init__(self, initializer_module, in_minutes=True):
        self.initializer_module = initializer_module
        self.cachedir = initializer_module.config.alerts_cachedir
        self.stacks_dir = initializer_module.config.stacks_dir
        self.common_services_dir = initializer_module.config.common_services_dir
        self.extensions_dir = initializer_module.config.extensions_dir
        self.host_scripts_dir = initializer_module.config.host_scripts_dir
        self.configuration_builder = initializer_module.configuration_builder

        self._cluster_configuration = initializer_module.configurations_cache
        self.alert_definitions_cache = initializer_module.alert_definitions_cache

        self.config = initializer_module.config

        # the amount of time, in seconds, that an alert can run after its scheduled time
        alert_grace_period = int(
            self.config.get('agent', 'alert_grace_period', 5))

        apscheduler_standalone = False

        self.APS_CONFIG = {
            'apscheduler.threadpool.core_threads': 3,
            'apscheduler.coalesce': True,
            'apscheduler.standalone': apscheduler_standalone,
            'apscheduler.misfire_grace_time': alert_grace_period,
            'apscheduler.threadpool.context_injector':
                self._job_context_injector if not apscheduler_standalone else None,
            'apscheduler.threadpool.agent_config': self.config
        }

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(self.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.recovery_manager = initializer_module.recovery_manager

        # register python exit handler
        ExitHelper().register(self.exit_handler)

    def _job_context_injector(self, config):
        """
    APScheduler hack to inject monkey-patching, context and configuration into
    all jobs inside the scheduler when the scheduler is running in embedded mode.

    Note that this function is called in job context, so all injections should
    be optimized for running time.

    :type config AmbariConfig.AmbariConfig
    """
        if not config.use_system_proxy_setting():
            from ambari_commons.network import reconfigure_urllib2_opener
            reconfigure_urllib2_opener(ignore_system_proxy=True)

    def exit_handler(self):
        """
    Exit handler
    """
        self.stop()

    def update_definitions(self, event_type):
        """
    Updates the persisted alert definitions JSON.
    :return:
    """
        # prune out things we don't want to store
        alert_definitions = []
        for cluster_id, command in self.alert_definitions_cache.iteritems():
            command_copy = Utils.get_mutable_copy(command)
            alert_definitions.append(command_copy)

        if event_type == "CREATE":
            # reschedule all jobs, creating new instances
            self.reschedule_all()
        else:
            # reschedule only the jobs that have changed
            self.reschedule()

    def __make_function(self, alert_def):
        return lambda: alert_def.collect()

    def start(self):
        """ loads definitions from file and starts the scheduler """

        if self.__scheduler is None:
            return

        if self.__scheduler.running:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(self.APS_CONFIG)

        alert_callables = self.__load_definitions()

        # schedule each definition
        for _callable in alert_callables:
            self.schedule_definition(_callable)

        logger.info(
            "[AlertScheduler] Starting {0}; currently running: {1}".format(
                str(self.__scheduler), str(self.__scheduler.running)))

        self.__scheduler.start()

    def stop(self):
        if self.__scheduler is not None:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(self.APS_CONFIG)

        logger.info("[AlertScheduler] Stopped the alert scheduler.")

    def reschedule(self):
        """
    Removes scheduled jobs whose UUIDs are no longer valid and schedules
    jobs for definitions whose UUIDs are not currently scheduled.
    """
        jobs_scheduled = 0
        jobs_removed = 0

        definitions = self.__load_definitions()
        scheduled_jobs = self.__scheduler.get_jobs()

        self.initializer_module.alert_status_reporter.reported_alerts.clear()

        # for every scheduled job, see if its UUID is still valid
        for scheduled_job in scheduled_jobs:
            uuid_valid = False

            for definition in definitions:
                definition_uuid = definition.get_uuid()
                if scheduled_job.name == definition_uuid:
                    uuid_valid = True
                    break

            # jobs without valid UUIDs should be unscheduled
            if not uuid_valid:
                jobs_removed += 1
                logger.info("[AlertScheduler] Unscheduling {0}".format(
                    scheduled_job.name))
                self._collector.remove_by_uuid(scheduled_job.name)
                self.__scheduler.unschedule_job(scheduled_job)

        # for every definition, determine if there is a scheduled job
        for definition in definitions:
            definition_scheduled = False
            for scheduled_job in scheduled_jobs:
                definition_uuid = definition.get_uuid()
                if definition_uuid == scheduled_job.name:
                    definition_scheduled = True
                    break

            # if no jobs are found with the definition's UUID, schedule it
            if not definition_scheduled:
                jobs_scheduled += 1
                self.schedule_definition(definition)

        logger.info(
            "[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled"
            .format(str(jobs_scheduled), str(jobs_removed)))

    def reschedule_all(self):
        """
    Unschedules every currently scheduled job and then schedules a new job
    for every definition.
    """
        logger.info("[AlertScheduler] Rescheduling all jobs...")

        jobs_scheduled = 0
        jobs_removed = 0

        definitions = self.__load_definitions()
        scheduled_jobs = self.__scheduler.get_jobs()

        # unschedule all scheduled jobs
        for scheduled_job in scheduled_jobs:
            jobs_removed += 1
            logger.info("[AlertScheduler] Unscheduling {0}".format(
                scheduled_job.name))
            self._collector.remove_by_uuid(scheduled_job.name)
            self.__scheduler.unschedule_job(scheduled_job)

        # for every definition, schedule a job
        for definition in definitions:
            jobs_scheduled += 1
            self.schedule_definition(definition)

        logger.info(
            "[AlertScheduler] Reschedule Summary: {0} unscheduled, {0} rescheduled"
            .format(str(jobs_removed), str(jobs_scheduled)))

    def collector(self):
        """ gets the collector for reporting to the server """
        return self._collector

    def __load_definitions(self):
        """
    Loads all alert definitions from a file. All clusters are stored in
    a single file. This will also populate the cluster-to-hash dictionary.
    :return:
    """
        definitions = []
        for cluster_id, command_json in self.alert_definitions_cache.iteritems():
            clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
            hostName = '' if not 'hostName' in command_json else command_json['hostName']
            publicHostName = '' if not 'publicHostName' in command_json else command_json['publicHostName']
            clusterHash = None if not 'hash' in command_json else command_json['hash']

            # cache the cluster and cluster hash after loading the JSON
            if clusterName != '' and clusterHash is not None:
                logger.info(
                    '[AlertScheduler] Caching cluster {0} with alert hash {1}'.
                    format(clusterName, clusterHash))

            for definition in command_json['alertDefinitions']:
                alert = self.__json_to_callable(
                    clusterName, hostName, publicHostName,
                    Utils.get_mutable_copy(definition))

                if alert is None:
                    continue

                alert.set_helpers(self._collector, self._cluster_configuration,
                                  self.configuration_builder)

                definitions.append(alert)

        return definitions

    def __json_to_callable(self, clusterName, hostName, publicHostName,
                           json_definition):
        """
    Converts the JSON that represents all aspects of a definition into an
    object extending BaseAlert that is used for the individual alert check.
    """
        alert = None

        try:
            source = json_definition['source']
            source_type = source.get('type', '')

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "[AlertScheduler] Creating job type {0} with {1}".format(
                        source_type, str(json_definition)))

            if source_type == AlertSchedulerHandler.TYPE_METRIC:
                alert = MetricAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_AMS:
                alert = AmsAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_PORT:
                alert = PortAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
                source['stacks_directory'] = self.stacks_dir
                source['common_services_directory'] = self.common_services_dir
                source['extensions_directory'] = self.extensions_dir
                source['host_scripts_directory'] = self.host_scripts_dir
                alert = ScriptAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_WEB:
                alert = WebAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_RECOVERY:
                alert = RecoveryAlert(json_definition, source, self.config,
                                      self.recovery_manager)

            if alert is not None:
                alert.set_cluster(clusterName, json_definition['clusterId'],
                                  hostName, publicHostName)

        except Exception, exception:
            logger.exception(
                "[AlertScheduler] Unable to load an invalid alert definition. It will be skipped."
            )

        return alert
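
Example #9 passes `_job_context_injector` into the scheduler through the `apscheduler.threadpool.context_injector` key. That key is not a stock APScheduler 2.x option; judging by the injector's body, the (patched) thread pool is expected to invoke it in job context, so per-thread setup, here bypassing the system proxy for urllib2, happens before an alert runs. A purely hypothetical sketch of that contract (all names illustrative, not APScheduler API):

# Hypothetical worker loop showing the implied context-injector contract;
# this is not APScheduler code.
def worker_loop(context_injector, agent_config, job_queue):
    if context_injector is not None:
        # e.g. reconfigure urllib2 to ignore the system proxy
        context_injector(agent_config)
    while True:
        job = job_queue.get()
        if job is None:
            break
        job()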
Example #10
class AlertSchedulerHandler():
    make_cachedir = True

    FILENAME = 'definitions.json'
    TYPE_PORT = 'PORT'
    TYPE_METRIC = 'METRIC'
    TYPE_SCRIPT = 'SCRIPT'

    APS_CONFIG = {
        'threadpool.core_threads': 3,
        'coalesce': True,
        'standalone': False
    }

    def __init__(self, cachedir, stacks_dir, in_minutes=True):
        self.cachedir = cachedir
        self.stacks_dir = stacks_dir

        if not os.path.exists(cachedir) and AlertSchedulerHandler.make_cachedir:
            try:
                os.makedirs(cachedir)
            except:
                logger.critical(
                    "Could not create the cache directory {0}".format(
                        cachedir))
                pass

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.__config_maps = {}

    def update_definitions(self, alert_commands, reschedule_jobs=False):
        ''' updates the persisted definitions and restarts the scheduler '''

        with open(os.path.join(self.cachedir, self.FILENAME), 'w') as f:
            json.dump(alert_commands, f, indent=2)

        if reschedule_jobs:
            self.reschedule()

    def __make_function(self, alert_def):
        return lambda: alert_def.collect()

    def start(self):
        ''' loads definitions from file and starts the scheduler '''

        if self.__scheduler is None:
            return

        if self.__scheduler.running:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)

        alert_callables = self.__load_definitions()

        # schedule each definition
        for _callable in alert_callables:
            self.schedule_definition(_callable)

        logger.debug("Starting scheduler {0}; currently running: {1}".format(
            str(self.__scheduler), str(self.__scheduler.running)))

        self.__scheduler.start()

    def stop(self):
        if self.__scheduler is not None:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(AlertSchedulerHandler.APS_CONFIG)

    def reschedule(self):
        '''
    Removes scheduled jobs whose UUIDs are no longer valid and schedules
    jobs for definitions whose UUIDs are not currently scheduled.
    '''
        jobs_scheduled = 0
        jobs_removed = 0

        definitions = self.__load_definitions()
        scheduled_jobs = self.__scheduler.get_jobs()

        # for every scheduled job, see if its UUID is still valid
        for scheduled_job in scheduled_jobs:
            uuid_valid = False

            for definition in definitions:
                definition_uuid = definition.get_uuid()
                if scheduled_job.name == definition_uuid:
                    uuid_valid = True
                    break

            # jobs without valid UUIDs should be unscheduled
            if not uuid_valid:
                jobs_removed += 1
                logger.info("Unscheduling {0}".format(scheduled_job.name))
                self._collector.remove_by_uuid(scheduled_job.name)
                self.__scheduler.unschedule_job(scheduled_job)

        # for every definition, determine if there is a scheduled job
        for definition in definitions:
            definition_scheduled = False
            for scheduled_job in scheduled_jobs:
                definition_uuid = definition.get_uuid()
                if definition_uuid == scheduled_job.name:
                    definition_scheduled = True
                    break

            # if no jobs are found with the definition's UUID, schedule it
            if not definition_scheduled:
                jobs_scheduled += 1
                self.schedule_definition(definition)

        logger.info(
            "Alert Reschedule Summary: {0} rescheduled, {1} unscheduled".
            format(str(jobs_scheduled), str(jobs_removed)))

    def collector(self):
        ''' gets the collector for reporting to the server '''
        return self._collector

    def __load_definitions(self):
        ''' loads all alert commands from the file; all clusters are stored in one file '''
        definitions = []

        all_commands = None
        try:
            with open(os.path.join(self.cachedir, self.FILENAME)) as fp:
                all_commands = json.load(fp)
        except:
            if logger.isEnabledFor(logging.DEBUG):
                traceback.print_exc()
            return definitions

        for command_json in all_commands:
            clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
            hostName = '' if not 'hostName' in command_json else command_json['hostName']

            configmap = None
            # each cluster gets a map of key/value pairs of substitution values
            self.__config_maps[clusterName] = {}
            if 'configurations' in command_json:
                configmap = command_json['configurations']

            for definition in command_json['alertDefinitions']:
                obj = self.__json_to_callable(clusterName, hostName,
                                              definition)

                if obj is None:
                    continue

                # get the config values for the alerts 'lookup keys',
                # eg: hdfs-site/dfs.namenode.http-address : host_and_port
                vals = self.__find_config_values(configmap,
                                                 obj.get_lookup_keys())
                self.__config_maps[clusterName].update(vals)

                obj.set_helpers(self._collector,
                                self.__config_maps[clusterName])

                definitions.append(obj)

        return definitions

    def __json_to_callable(self, clusterName, hostName, json_definition):
        '''
    Converts the JSON that represents all aspects of a definition into an
    object extending BaseAlert that is used for the individual alert check.
    '''
        source = json_definition['source']
        source_type = source.get('type', '')

        if logger.isEnabledFor(logging.DEBUG):
            logger.debug("Creating job type {0} with {1}".format(
                source_type, str(json_definition)))

        alert = None

        if source_type == AlertSchedulerHandler.TYPE_METRIC:
            alert = MetricAlert(json_definition, source)
        elif source_type == AlertSchedulerHandler.TYPE_PORT:
            alert = PortAlert(json_definition, source)
        elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
            source['stacks_dir'] = self.stacks_dir
            alert = ScriptAlert(json_definition, source)

        if alert is not None:
            alert.set_cluster(clusterName, hostName)

        return alert

    def __find_config_values(self, configmap, obj_keylist):
        ''' finds templated values in the configuration map provided by the server '''
        if configmap is None:
            return {}

        result = {}

        for key in obj_keylist:
            try:
                obj = configmap
                for layer in key.split('/'):
                    obj = obj[layer]
                result[key] = obj
            except KeyError:  # the nested key is missing somewhere
                pass

        return result

    def update_configurations(self, commands):
        '''
    when an execution command comes in, update any necessary values.
    status commands do not contain useful configurations
    '''
        for command in commands:
            clusterName = command['clusterName']
            if not clusterName in self.__config_maps:
                continue

            if 'configurations' in command:
                configmap = command['configurations']
                keylist = self.__config_maps[clusterName].keys()
                vals = self.__find_config_values(configmap, keylist)
                self.__config_maps[clusterName].update(vals)

    def schedule_definition(self, definition):
        '''
    Schedule a definition (callable). Scheduled jobs are given the UUID
    as their name so that they can be identified later on.
    <p/>
    This function can be called with a definition that is disabled; it will
    simply NOOP.
    '''
        # NOOP if the definition is disabled; don't schedule it
        if not definition.is_enabled():
            logger.info(
                "The alert {0} with UUID {1} is disabled and will not be scheduled"
                .format(definition.get_name(), definition.get_uuid()))
            return

        job = None

        if self.__in_minutes:
            job = self.__scheduler.add_interval_job(
                self.__make_function(definition),
                minutes=definition.interval())
        else:
            job = self.__scheduler.add_interval_job(
                self.__make_function(definition),
                seconds=definition.interval())

        # although the documentation states that Job(kwargs) takes a name
        # key/value pair, it does not actually set the name; do it manually
        if job is not None:
            job.name = definition.get_uuid()

        logger.info("Scheduling {0} with UUID {1}".format(
            definition.get_name(), definition.get_uuid()))

    def get_job_count(self):
        '''
    Gets the number of jobs currently scheduled. This is mainly used for
    test verification of scheduling
    '''
        if self.__scheduler is None:
            return 0

        return len(self.__scheduler.get_jobs())

    def execute_alert(self, execution_commands):
        '''
    Executes an alert immediately, ignoring any scheduled jobs. The existing
    jobs remain untouched. The result of this is stored in the alert
    collector for transmission during the next heartbeat
    '''
        if self.__scheduler is None or execution_commands is None:
            return

        for execution_command in execution_commands:
            try:
                alert_definition = execution_command['alertDefinition']

                clusterName = '' if not 'clusterName' in execution_command else execution_command['clusterName']
                hostName = '' if not 'hostName' in execution_command else execution_command['hostName']

                alert = self.__json_to_callable(clusterName, hostName,
                                                alert_definition)

                if alert is None:
                    continue

                logger.info("Executing on-demand alert {0} ({1})".format(
                    alert.get_name(), alert.get_uuid()))

                alert.set_helpers(self._collector,
                                  self.__config_maps[clusterName])
                alert.collect()
            except:
                logger.exception(
                    "Unable to execute the alert outside of the job scheduler")
Example #11
class AlertSchedulerHandler():
    FILENAME = 'definitions.json'
    TYPE_PORT = 'PORT'
    TYPE_METRIC = 'METRIC'
    TYPE_AMS = 'AMS'
    TYPE_SCRIPT = 'SCRIPT'
    TYPE_WEB = 'WEB'
    TYPE_RECOVERY = 'RECOVERY'

    def __init__(self,
                 cachedir,
                 stacks_dir,
                 common_services_dir,
                 extensions_dir,
                 host_scripts_dir,
                 cluster_configuration,
                 config,
                 recovery_manager,
                 in_minutes=True):

        self.cachedir = cachedir
        self.stacks_dir = stacks_dir
        self.common_services_dir = common_services_dir
        self.extensions_dir = extensions_dir
        self.host_scripts_dir = host_scripts_dir

        self._cluster_configuration = cluster_configuration

        # a mapping between a cluster name and a unique hash for all definitions
        self._cluster_hashes = {}

        # the amount of time, in seconds, that an alert can run after its scheduled time
        alert_grace_period = int(config.get('agent', 'alert_grace_period', 5))

        if not os.path.exists(cachedir):
            try:
                os.makedirs(cachedir)
            except:
                logger.critical(
                    "[AlertScheduler] Could not create the cache directory {0}"
                    .format(cachedir))

        apscheduler_standalone = False

        self.APS_CONFIG = {
            'apscheduler.threadpool.core_threads': 3,
            'apscheduler.coalesce': True,
            'apscheduler.standalone': apscheduler_standalone,
            'apscheduler.misfire_grace_time': alert_grace_period,
            'apscheduler.threadpool.context_injector':
                self._job_context_injector if not apscheduler_standalone else None,
            'apscheduler.threadpool.agent_config': config
        }

        self._collector = AlertCollector()
        self.__scheduler = Scheduler(self.APS_CONFIG)
        self.__in_minutes = in_minutes
        self.config = config
        self.recovery_manager = recovery_manager

        # register python exit handler
        ExitHelper().register(self.exit_handler)

    def _job_context_injector(self, config):
        """
    APScheduler hack to inject monkey-patching, context and configuration into
    all jobs inside the scheduler when the scheduler is running in embedded mode.

    Note that this function is called in job context, so all injections should
    be optimized for running time.

    :type config AmbariConfig.AmbariConfig
    """
        if not config.use_system_proxy_setting():
            from ambari_commons.network import reconfigure_urllib2_opener
            reconfigure_urllib2_opener(ignore_system_proxy=True)

    def exit_handler(self):
        """
    Exit handler
    """
        self.stop()

    def update_definitions(self, heartbeat):
        """
    Updates the persisted alert definitions JSON.
    :param heartbeat:
    :return:
    """
        if 'alertDefinitionCommands' not in heartbeat:
            logger.warning(
                "There are no alert definition commands in the heartbeat; unable to update definitions"
            )
            return

        # prune out things we don't want to store
        alert_definitions = []
        for command in heartbeat['alertDefinitionCommands']:
            command_copy = command.copy()

            # no need to store these since we always use the in-memory cached values
            if 'configurations' in command_copy:
                del command_copy['configurations']

            alert_definitions.append(command_copy)

        # write out the new definitions
        with open(os.path.join(self.cachedir, self.FILENAME), 'w') as f:
            json.dump(alert_definitions, f, indent=2)

        # determine how to reschedule the jobs
        reschedule_all = False
        if "clusterName" in command_copy and command_copy[
                "clusterName"] not in self._cluster_hashes:
            reschedule_all = True

        if reschedule_all:
            # reschedule all jobs, creating new instances
            self.reschedule_all()
        else:
            # reschedule only the jobs that have changed
            self.reschedule()

    def __make_function(self, alert_def):
        return lambda: alert_def.collect()

    def start(self):
        """ loads definitions from file and starts the scheduler """

        if self.__scheduler is None:
            return

        if self.__scheduler.running:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(self.APS_CONFIG)

        alert_callables = self.__load_definitions()

        # schedule each definition
        for _callable in alert_callables:
            self.schedule_definition(_callable)

        logger.info(
            "[AlertScheduler] Starting {0}; currently running: {1}".format(
                str(self.__scheduler), str(self.__scheduler.running)))

        self.__scheduler.start()

    def stop(self):
        if self.__scheduler is not None:
            self.__scheduler.shutdown(wait=False)
            self.__scheduler = Scheduler(self.APS_CONFIG)

        logger.info("[AlertScheduler] Stopped the alert scheduler.")

    def reschedule(self):
        """
    Removes scheduled jobs whose UUIDs are no longer valid and schedules
    jobs for definitions whose UUIDs are not currently scheduled.
    """
        jobs_scheduled = 0
        jobs_removed = 0

        definitions = self.__load_definitions()
        scheduled_jobs = self.__scheduler.get_jobs()

        # for every scheduled job, see if its UUID is still valid
        for scheduled_job in scheduled_jobs:
            uuid_valid = False

            for definition in definitions:
                definition_uuid = definition.get_uuid()
                if scheduled_job.name == definition_uuid:
                    uuid_valid = True
                    break

            # jobs without valid UUIDs should be unscheduled
            if not uuid_valid:
                jobs_removed += 1
                logger.info("[AlertScheduler] Unscheduling {0}".format(
                    scheduled_job.name))
                self._collector.remove_by_uuid(scheduled_job.name)
                self.__scheduler.unschedule_job(scheduled_job)

        # for every definition, determine if there is a scheduled job
        for definition in definitions:
            definition_scheduled = False
            for scheduled_job in scheduled_jobs:
                definition_uuid = definition.get_uuid()
                if definition_uuid == scheduled_job.name:
                    definition_scheduled = True
                    break

            # if no jobs are found with the definition's UUID, schedule it
            if not definition_scheduled:
                jobs_scheduled += 1
                self.schedule_definition(definition)

        logger.info(
            "[AlertScheduler] Reschedule Summary: {0} rescheduled, {1} unscheduled"
            .format(str(jobs_scheduled), str(jobs_removed)))

    def reschedule_all(self):
        """
    Unschedules every currently scheduled job and then schedules a new job
    for every definition.
    """
        logger.info("[AlertScheduler] Rescheduling all jobs...")

        jobs_scheduled = 0
        jobs_removed = 0

        definitions = self.__load_definitions()
        scheduled_jobs = self.__scheduler.get_jobs()

        # unschedule all scheduled jobs
        for scheduled_job in scheduled_jobs:
            jobs_removed += 1
            logger.info("[AlertScheduler] Unscheduling {0}".format(
                scheduled_job.name))
            self._collector.remove_by_uuid(scheduled_job.name)
            self.__scheduler.unschedule_job(scheduled_job)

        # for every definition, schedule a job
        for definition in definitions:
            jobs_scheduled += 1
            self.schedule_definition(definition)

        logger.info(
            "[AlertScheduler] Reschedule Summary: {0} unscheduled, {0} rescheduled"
            .format(str(jobs_removed), str(jobs_scheduled)))

    def collector(self):
        """ gets the collector for reporting to the server """
        return self._collector

    def __load_definitions(self):
        """
    Loads all alert definitions from a file. All clusters are stored in
    a single file. This will also populate the cluster-to-hash dictionary.
    :return:
    """
        definitions = []

        alerts_definitions_path = os.path.join(self.cachedir, self.FILENAME)
        try:
            with open(alerts_definitions_path) as fp:
                all_commands = json.load(fp)
        except:
            logger.warning(
                '[AlertScheduler] {0} not found or invalid. No alerts will be scheduled until registration occurs.'
                .format(alerts_definitions_path))
            return definitions

        for command_json in all_commands:
            clusterName = '' if not 'clusterName' in command_json else command_json['clusterName']
            hostName = '' if not 'hostName' in command_json else command_json['hostName']
            clusterHash = None if not 'hash' in command_json else command_json['hash']

            # cache the cluster and cluster hash after loading the JSON
            if clusterName != '' and clusterHash is not None:
                logger.info(
                    '[AlertScheduler] Caching cluster {0} with alert hash {1}'.
                    format(clusterName, clusterHash))
                self._cluster_hashes[clusterName] = clusterHash

            for definition in command_json['alertDefinitions']:
                alert = self.__json_to_callable(clusterName, hostName,
                                                definition)

                if alert is None:
                    continue

                alert.set_helpers(self._collector, self._cluster_configuration)

                definitions.append(alert)

        return definitions

    def __json_to_callable(self, clusterName, hostName, json_definition):
        """
    Converts the JSON that represents all aspects of a definition into an
    object extending BaseAlert that is used for the individual alert check.
    """
        alert = None

        try:
            source = json_definition['source']
            source_type = source.get('type', '')

            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "[AlertScheduler] Creating job type {0} with {1}".format(
                        source_type, str(json_definition)))

            if source_type == AlertSchedulerHandler.TYPE_METRIC:
                alert = MetricAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_AMS:
                alert = AmsAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_PORT:
                alert = PortAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_SCRIPT:
                source['stacks_directory'] = self.stacks_dir
                source['common_services_directory'] = self.common_services_dir
                source['extensions_directory'] = self.extensions_dir
                source['host_scripts_directory'] = self.host_scripts_dir
                alert = ScriptAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_WEB:
                alert = WebAlert(json_definition, source, self.config)
            elif source_type == AlertSchedulerHandler.TYPE_RECOVERY:
                alert = RecoveryAlert(json_definition, source, self.config,
                                      self.recovery_manager)

            if alert is not None:
                alert.set_cluster(clusterName, hostName)

        except Exception, exception:
            logger.exception(
                "[AlertScheduler] Unable to load an invalid alert definition. It will be skipped."
            )

        return alert