Example #1
0
File: run.py Project: dzhang30/Iris
    def log_terminate(self) -> None:
        """
        Write a termination notice to this child process' log file.

        Builds a dedicated logger for the child (named after the process) and
        records its exit code at error level.

        :return: None
        """
        child_logger = get_logger(
            'iris.{}'.format(self.name),
            self.log_file_path,
            self.log_debug_file_path,
        )
        child_logger.error(
            'Terminated the {} with exit_code {}'.format(self.name, self.get_exit_code())
        )
Example #2
0
def main(iris_config: ConfigParser) -> None:
    """
    The starting point of Iris. Creates the log directory and shoots off the Iris main process, which in turn starts its
    sub processes (Config_Service, Scheduler, Garbage_Collector)

    :param iris_config: the iris.cfg config file object
    :return: None
    """
    root_path = iris_config['main_settings']['iris_root_path']

    # Make sure the log directory exists before any logger is created
    logs_dir = os.path.join(root_path, 'logs')
    os.makedirs(logs_dir, exist_ok=True)

    # Presence of the iris.debug file toggles verbose logging
    main_logger = get_logger(
        'iris.main',
        os.path.join(logs_dir, 'iris_main.log'),
        os.path.join(root_path, 'iris.debug'),
    )

    check_iris_dev_settings(iris_config, main_logger)

    run_iris(logger=main_logger, iris_config=iris_config)
Example #3
0
File: run.py Project: dzhang30/Iris
def run_garbage_collector(global_config_path: str, local_config_path: str, prom_dir_path: str, run_frequency: float,
                          internal_metrics_whitelist: Tuple[str], log_path: str, log_debug_path: str, ) -> None:
    """
    Run the Garbage Collector service loop: lint the configs, delete stale prom
    files, and publish gauges reporting the deletion count and error state.
    This function never returns; each pass ends with a sleep of run_frequency.

    :param global_config_path: the path to the GlobalConfig object pulled down by the Config Service
    :param local_config_path: the path to the local config object created by the Config Service
    :param prom_dir_path: the path to the prom files directory where we look for stale prom files
    :param run_frequency: the frequency to which we run the Garbage Collector
    :param internal_metrics_whitelist: a list of internal metrics that we don't want the Garbage Collector to delete
    :param log_path: the path to the scheduler log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :return: None
    """
    logger = get_logger('iris.garbage_collector', log_path, log_debug_path)

    # Writer depends only on the logger, so it is built once outside the loop
    prom_writer = PromFileWriter(logger=logger)
    general_error_flag = False
    while True:  # service loop: only exits when the process is terminated
        try:
            logger.info('Resuming the Garbage_Collector')

            logger.info('Starting linter to transform the configs created by the config_service into python objs')

            linter = Linter(logger=logger)

            try:
                global_config_obj = linter.lint_global_config(global_config_path)
                local_config_obj = linter.lint_metrics_config(global_config_obj, local_config_path)
            except OSError:
                # Config files may not exist yet (config service hasn't written
                # them) — fall back to an empty config and keep collecting
                local_config_obj = {}

            gc = GarbageCollector(
                local_config_obj=local_config_obj,
                internal_metrics_whitelist=internal_metrics_whitelist,
                prom_dir_path=prom_dir_path,
                logger=logger
            )

            logger.info('Running GC to detect stale prom files')

            deleted_files = gc.delete_stale_prom_files()

            # Publish how many stale prom files were removed this pass
            metric_name = 'iris_garbage_collector_deleted_stale_files'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=len(deleted_files),
                help_str='Indicate how many stale prom files the Garbage Collector had to delete',
                type_str='gauge'
            )

            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            # Successful pass clears any error reported on a previous iteration
            general_error_flag = False

        except Exception as e:
            # Broad catch is deliberate: the service loop must survive any
            # failure; the error is surfaced via the gauge written below
            logger.error('Garbage Collector has an err: {}'.format(e))
            general_error_flag = True

        finally:
            # Always publish the error gauge and sleep, whether or not the
            # pass above succeeded
            metric_name = 'iris_garbage_collector_error'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=int(general_error_flag),
                help_str='Indicate if an exception/error has occurred in the Garbage Collector',
                type_str='gauge'
            )

            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            logger.info('Sleeping the Garbage Collector for {} seconds\n'.format(run_frequency))

            time.sleep(run_frequency)
Example #4
0
File: run.py Project: dzhang30/Iris
def run_config_service(aws_creds_path: str, s3_region_name: str,
                       s3_bucket_env: str, s3_bucket_name: str,
                       s3_download_to_path: str, ec2_region_name: str,
                       ec2_dev_instance_id: str, ec2_metadata_url: str,
                       local_config_path: str, prom_dir_path: str,
                       run_frequency: float, log_path: str,
                       log_debug_path: str, dev_mode: bool) -> None:
    """
    Run the Config Service with each of its components (S3, Linter, EC2Tags).
    This function never returns; each pass of the loop ends with a sleep of
    run_frequency seconds.

    :param aws_creds_path: path to the aws_credentials file
    :param s3_region_name: region that the S3 bucket is in
    :param s3_bucket_env: the bucket_environment/aws_profile_name, is the env the bucket is in ie prod/nonprod
    :param s3_bucket_name: the name of the bucket
    :param s3_download_to_path: the path to download the bucket content/configs
    :param ec2_region_name: region that the ec2 instance is in
    :param ec2_dev_instance_id: the instance id of the host you want to test in dev mode, see readme & iris.cfg. This
    field is not set by default in iris.cfg when running on a host. It must be manually set by the tester for debugging
    :param ec2_metadata_url: the metadata url that allows the instance to get info of itself, defined in iris.cfg
    :param local_config_path: the path we want to write the local config object to. The local config object contains
    the list of metrics the current ec2 host needs to run
    :param prom_dir_path: the path to the prom files directory that we write metric results to
    :param run_frequency: the frequency to which we run the config service
    :param log_path: the path to the config_service log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :param dev_mode: set to True when you want to run in dev mode, see readme & iris.cfg
    :return: None
    """
    logger = get_logger('iris.config_service', log_path, log_debug_path)

    # The writer depends only on the logger, so build it once outside the loop
    # (consistent with run_garbage_collector)
    prom_writer = PromFileWriter(logger=logger)

    general_error_flag = False
    missing_iris_tags_error_flag = False
    while True:  # service loop: only exits when the process is terminated
        # Run config service S3 puller to get the config files from iris bucket
        try:
            logger.info('Resuming the Config_Service')

            logger.info(
                'Downloading content from s3 bucket: {} to dir: {}'.format(
                    s3_bucket_name, s3_download_to_path))

            s3 = S3(aws_creds_path=aws_creds_path,
                    region_name=s3_region_name,
                    bucket_environment=s3_bucket_env,
                    bucket_name=s3_bucket_name,
                    dev_mode=dev_mode,
                    logger=logger)

            s3.download_bucket(s3_download_to_path)

            # run linter to transform downloaded s3 configs into Python objects. Also lints the configs for errors
            global_config_path = os.path.join(s3_download_to_path,
                                              'global_config.json')
            metrics_config_path = os.path.join(s3_download_to_path,
                                               'metrics.json')
            profile_configs_path = os.path.join(s3_download_to_path,
                                                'profiles')

            logger.info(
                'Starting linter to transform the downloaded configs into GlobalConfig, Metric, & Profile objs'
            )
            linter = Linter(logger)

            logger.info(
                'Linting Global Config file at {}'.format(global_config_path))
            global_config = linter.lint_global_config(global_config_path)

            logger.info('Linting Metrics Config file at {}'.format(
                metrics_config_path))
            metrics = linter.lint_metrics_config(global_config,
                                                 metrics_config_path)

            logger.info('Linting Profile Configs file at {}'.format(
                profile_configs_path))
            profiles = linter.lint_profile_configs(profile_configs_path)

            # run EC2Tags to retrieve the iris_tags of the host
            logger.info('Retrieving current ec2 host iris_tags')

            ec2 = EC2Tags(aws_creds_path=aws_creds_path,
                          region_name=ec2_region_name,
                          ec2_metadata_url=ec2_metadata_url,
                          dev_instance_id=ec2_dev_instance_id,
                          dev_mode=dev_mode,
                          logger=logger)

            ec2_iris_tags = ec2.get_iris_tags()

            # use iris_tags and downloaded s3 configs to generate the local_config object
            logger.info(
                'Matching retrieved iris_tags with the downloaded configs to generate the local_config obj'
            )

            iris_profile = ec2_iris_tags['ihr:iris:profile']
            if iris_profile not in profiles:
                err_msg = 'The ihr:iris:profile tag on {} is not defined in any profile config'.format(
                    ec2.instance_id)
                logger.error(err_msg)
                raise KeyError(err_msg)

            local_config_metrics = {}
            for prof_metric in profiles[iris_profile].metrics:
                if prof_metric not in metrics:
                    err_msg = 'Metric {} in profile {} not defined in metrics config'.format(
                        prof_metric, iris_profile)
                    logger.error(err_msg)
                    raise KeyError(err_msg)

                local_config_metrics[prof_metric] = metrics[
                    prof_metric].to_json()

            logger.info('Generated the local_config object')

            # Write atomically: dump to a temp file in the same directory as the
            # destination so os.rename is an atomic same-filesystem move. Using
            # the default temp dir (e.g. /tmp) could make os.rename fail with
            # EXDEV (cross-device link) when /tmp is a separate filesystem.
            local_config_dir = os.path.dirname(local_config_path) or '.'
            with tempfile.NamedTemporaryFile('w', dir=local_config_dir,
                                             delete=False) as tmpfile:
                json.dump(local_config_metrics, tmpfile, indent=2)
            os.rename(tmpfile.name, local_config_path)

            logger.info('Finished writing to local_config file at {}'.format(
                local_config_path))

            # Successful pass clears errors reported on previous iterations
            general_error_flag = False
            missing_iris_tags_error_flag = False

        except MissingIrisTagsError as e:
            logger.error('Config_Service MissingIrisTagsError: {}'.format(e))
            missing_iris_tags_error_flag = True

        # will log twice for defined err logs in iris, but will catch & log unlogged errs in code (3rd party err)
        except Exception as e:
            logger.error('Config_Service has an err: {}'.format(e))
            general_error_flag = True

        finally:
            # Always publish both error gauges and sleep, whether or not the
            # pass above succeeded
            general_error_name = 'iris_config_service_error'
            general_error_prom_builder = PromStrBuilder(
                metric_name=general_error_name,
                metric_result=int(general_error_flag),
                # Fixed copy-paste error: this gauge reports on the Config
                # Service, not the Scheduler (and 'occured' -> 'occurred')
                help_str='Indicate if a general exception/error has occurred in the Config Service',
                type_str='gauge')
            general_error_prom_string = general_error_prom_builder.create_prom_string()
            general_error_prom_file_path = os.path.join(
                prom_dir_path, '{}.prom'.format(general_error_name))

            missing_iris_tags_name = 'iris_missing_ec2_tags'
            missing_iris_tags_prom_builder = PromStrBuilder(
                metric_name=missing_iris_tags_name,
                metric_result=int(missing_iris_tags_error_flag),
                help_str='Indicate if the ec2 host is missing the iris tags',
                type_str='gauge')
            missing_iris_tags_prom_string = missing_iris_tags_prom_builder.create_prom_string()
            missing_iris_tags_prom_file_path = os.path.join(
                prom_dir_path, '{}.prom'.format(missing_iris_tags_name))

            prom_writer.write_prom_file(general_error_prom_file_path,
                                        general_error_prom_string)
            prom_writer.write_prom_file(missing_iris_tags_prom_file_path,
                                        missing_iris_tags_prom_string)

            logger.info(
                'Sleeping the Config_Service for {}\n'.format(run_frequency))

            time.sleep(run_frequency)
Example #5
0
File: run.py Project: dzhang30/Iris
def run_scheduler(global_config_path: str, local_config_path: str,
                  prom_dir_path: str, run_frequency: float,
                  internal_metrics_whitelist: Tuple[str], log_path: str,
                  log_debug_path: str) -> None:
    """
    Run the Scheduler service loop: wait for the configs produced by the
    Config Service, execute each configured metric, and publish gauges for the
    scheduler error state and custom-metric count. This function never
    returns; each pass of the loop ends with a sleep of run_frequency seconds.

    :param global_config_path: the path to the GlobalConfig object pulled down by the Config Service
    :param local_config_path: the path to the local config object created by the Config Service
    :param prom_dir_path: the path to the prom files directory that we write metric results to
    :param run_frequency: the frequency to which we run the scheduler
    :param internal_metrics_whitelist: a list of internal metrics that we don't want to count in the
    iris_custom_metrics_count metric
    :param log_path: the path to the scheduler log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :return: None
    """
    logger = get_logger('iris.scheduler', log_path, log_debug_path)

    # The writer depends only on the logger, so build it once outside the loop
    # (consistent with run_garbage_collector)
    prom_writer = PromFileWriter(logger=logger)

    error_flag = 0
    while True:  # service loop: only exits when the process is terminated
        try:
            sleep_total = 0  # the accumulated sleep time for checking the global_config and local_config
            sleep_increment = 10  # poll for global_config and local_config every 10 seconds if they don't exist
            max_wait_time = 120  # max wait/sleep time that the scheduler will wait for these configs
            while not os.path.isfile(global_config_path) or not os.path.isfile(
                    local_config_path):
                if sleep_total == max_wait_time:
                    err_msg = 'No global_config: {} or local_config: {}. The scheduler has waited for 2 mins'.format(
                        global_config_path, local_config_path)
                    logger.error('OSError: {}'.format(err_msg))
                    raise OSError(err_msg)
                else:
                    msg = 'The scheduler is still waiting on the config_service for the global_config/local_config file'
                    logger.warning(msg)
                    sleep_total += sleep_increment
                    time.sleep(sleep_increment)

            # run linter to transform the local_config file created by the config_service into objects for the scheduler
            logger.info(
                'Starting linter to transform the config files created by the config_service into python objs'
            )

            linter = Linter(logger)
            global_config_obj = linter.lint_global_config(global_config_path)
            local_config_obj = linter.lint_metrics_config(
                global_config_obj, local_config_path)
            metrics_list = list(local_config_obj.values())

            logger.info('Read local_config file metrics {}'.format(', '.join(
                [metric.name for metric in metrics_list])))

            # run scheduler to asynchronously execute each metric and asynchronously write to the metric's prom file
            scheduler = Scheduler(metrics_list, prom_dir_path, logger=logger)
            scheduler.run()

            # Successful pass clears any error reported on a previous iteration
            error_flag = 0

        # will log twice for defined err logs in iris, but will catch & log unlogged errs in code (3rd party err)
        except Exception as e:
            logger.error('Scheduler has an err: {}'.format(e))
            error_flag = 1

        finally:
            # Always publish the gauges and sleep, whether or not the pass
            # above succeeded
            metric_name = 'iris_scheduler_error'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=error_flag,
                help_str='Indicate if an exception/error has occurred in the Scheduler',
                type_str='gauge')

            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path,
                                          '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            # count how many custom metrics prom files are currently being exposed and create the prom file
            custom_metrics_count_result = 0
            for prom_file in os.listdir(prom_dir_path):
                metric_name = prom_file.replace('.prom', '')
                if metric_name not in internal_metrics_whitelist:
                    custom_metrics_count_result += 1

            metric_name = 'iris_custom_metrics_count'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=custom_metrics_count_result,
                help_str='Indicate how many custom metrics the Scheduler is exposing',
                type_str='gauge')

            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path,
                                          '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            logger.info('Sleeping the Scheduler for {} seconds\n'.format(
                run_frequency))

            time.sleep(run_frequency)