def log_terminate(self) -> None:
    """Write a termination record to this child process' log file.

    :return: None
    """
    logger_name = 'iris.{}'.format(self.name)
    proc_logger = get_logger(logger_name, self.log_file_path, self.log_debug_file_path)

    termination_msg = 'Terminated the {} with exit_code {}'.format(self.name, self.get_exit_code())
    proc_logger.error(termination_msg)
def main(iris_config: ConfigParser) -> None:
    """The starting point of Iris.

    Creates the log directory and shoots off the Iris main process, which in
    turn starts its sub processes (Config_Service, Scheduler, Garbage_Collector)

    :param iris_config: the iris.cfg config file object
    :return: None
    """
    root_dir = iris_config['main_settings']['iris_root_path']

    # Make sure the log directory exists before any logger is created.
    logs_dir = os.path.join(root_dir, 'logs')
    os.makedirs(logs_dir, exist_ok=True)

    main_log_file = os.path.join(logs_dir, 'iris_main.log')
    debug_flag_file = os.path.join(root_dir, 'iris.debug')
    main_logger = get_logger('iris.main', main_log_file, debug_flag_file)

    check_iris_dev_settings(iris_config, main_logger)
    run_iris(logger=main_logger, iris_config=iris_config)
def run_garbage_collector(global_config_path: str,
                          local_config_path: str,
                          prom_dir_path: str,
                          run_frequency: float,
                          internal_metrics_whitelist: Tuple[str, ...],
                          log_path: str,
                          log_debug_path: str) -> None:
    """
    Run the Garbage Collector. Loops forever: each pass deletes stale prom
    files and publishes bookkeeping metrics, then sleeps for run_frequency.

    :param global_config_path: the path to the GlobalConfig object pulled down by the Config Service
    :param local_config_path: the path to the local config object created by the Config Service
    :param prom_dir_path: the path to the prom files directory where we look for stale prom files
    :param run_frequency: the frequency to which we run the Garbage Collector
    :param internal_metrics_whitelist: a list of internal metrics that we don't want the Garbage Collector to delete
    :param log_path: the path to the scheduler log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :return: None (never returns; runs until the process is terminated)
    """
    # NOTE(review): annotation was Tuple[str] (a 1-element tuple); callers pass
    # an arbitrary-length whitelist, so Tuple[str, ...] is the correct type.
    logger = get_logger('iris.garbage_collector', log_path, log_debug_path)
    prom_writer = PromFileWriter(logger=logger)

    general_error_flag = False
    while True:
        try:
            logger.info('Resuming the Garbage_Collector')

            logger.info('Starting linter to transform the configs created by the config_service into python objs')
            linter = Linter(logger=logger)
            try:
                global_config_obj = linter.lint_global_config(global_config_path)
                local_config_obj = linter.lint_metrics_config(global_config_obj, local_config_path)
            except OSError:
                # Deliberate best-effort: the Config Service may not have
                # produced the config files yet, so sweep with an empty config.
                local_config_obj = {}

            gc = GarbageCollector(
                local_config_obj=local_config_obj,
                internal_metrics_whitelist=internal_metrics_whitelist,
                prom_dir_path=prom_dir_path,
                logger=logger
            )

            logger.info('Running GC to detect stale prom files')
            deleted_files = gc.delete_stale_prom_files()

            # Expose how many stale files were removed this pass.
            metric_name = 'iris_garbage_collector_deleted_stale_files'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=len(deleted_files),
                help_str='Indicate how many stale prom files the Garbage Collector had to delete',
                type_str='gauge'
            )
            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            general_error_flag = False
        except Exception as e:
            # Catch-all at the loop boundary: the GC must keep running.
            logger.error('Garbage Collector has an err: {}'.format(e))
            general_error_flag = True
        finally:
            # Always publish the error gauge, success or failure.
            metric_name = 'iris_garbage_collector_error'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=int(general_error_flag),
                help_str='Indicate if an exception/error has occurred in the Garbage Collector',
                type_str='gauge'
            )
            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            logger.info('Sleeping the Garbage Collector for {} seconds\n'.format(run_frequency))
            time.sleep(run_frequency)
def run_config_service(aws_creds_path: str, s3_region_name: str, s3_bucket_env: str, s3_bucket_name: str,
                       s3_download_to_path: str, ec2_region_name: str, ec2_dev_instance_id: str,
                       ec2_metadata_url: str, local_config_path: str, prom_dir_path: str, run_frequency: float,
                       log_path: str, log_debug_path: str, dev_mode: bool) -> None:
    """
    Run the Config Service with each of its components (S3, Linter, EC2Tags).
    Loops forever: each pass downloads configs, lints them, matches them to
    the host's iris_tags, and atomically rewrites the local config file.

    :param aws_creds_path: path to the aws_credentials file
    :param s3_region_name: region that the S3 bucket is in
    :param s3_bucket_env: the bucket_environment/aws_profile_name, is the env the bucket is in ie prod/nonprod
    :param s3_bucket_name: the name of the bucket
    :param s3_download_to_path: the path to download the bucket content/configs
    :param ec2_region_name: region that the ec2 instance is in
    :param ec2_dev_instance_id: the instance id of the host you want to test in dev mode, see readme & iris.cfg.
        This field is not set by default in iris.cfg when running on a host. It must be manually set by the tester
        for debugging
    :param ec2_metadata_url: the metadata url that allows the instance to get info of itself, defined in iris.cfg
    :param local_config_path: the path we want to write the local config object to. The local config object contains
        the list of metrics the current ec2 host needs to run
    :param prom_dir_path: the path to the prom files directory that we write metric results to
    :param run_frequency: the frequency to which we run the config service
    :param log_path: the path to the config_service log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :param dev_mode: set to True when you want to run in dev mode, see readme & iris.cfg
    :return: None (never returns; runs until the process is terminated)
    """
    logger = get_logger('iris.config_service', log_path, log_debug_path)

    general_error_flag = False
    missing_iris_tags_error_flag = False
    while True:
        # Run config service S3 puller to get the config files from iris bucket
        try:
            logger.info('Resuming the Config_Service')
            logger.info('Downloading content from s3 bucket: {} to dir: {}'.format(s3_bucket_name, s3_download_to_path))
            s3 = S3(aws_creds_path=aws_creds_path, region_name=s3_region_name, bucket_environment=s3_bucket_env,
                    bucket_name=s3_bucket_name, dev_mode=dev_mode, logger=logger)
            s3.download_bucket(s3_download_to_path)

            # run linter to transform downloaded s3 configs into Python objects. Also lints the configs for errors
            global_config_path = os.path.join(s3_download_to_path, 'global_config.json')
            metrics_config_path = os.path.join(s3_download_to_path, 'metrics.json')
            profile_configs_path = os.path.join(s3_download_to_path, 'profiles')

            logger.info('Starting linter to transform the downloaded configs into GlobalConfig, Metric, & Profile objs')
            linter = Linter(logger)

            logger.info('Linting Global Config file at {}'.format(global_config_path))
            global_config = linter.lint_global_config(global_config_path)

            logger.info('Linting Metrics Config file at {}'.format(metrics_config_path))
            metrics = linter.lint_metrics_config(global_config, metrics_config_path)

            logger.info('Linting Profile Configs file at {}'.format(profile_configs_path))
            profiles = linter.lint_profile_configs(profile_configs_path)

            # run EC2Tags to retrieve the iris_tags of the host
            logger.info('Retrieving current ec2 host iris_tags')
            ec2 = EC2Tags(aws_creds_path=aws_creds_path, region_name=ec2_region_name,
                          ec2_metadata_url=ec2_metadata_url, dev_instance_id=ec2_dev_instance_id,
                          dev_mode=dev_mode, logger=logger)
            ec2_iris_tags = ec2.get_iris_tags()

            # use iris_tags and downloaded s3 configs to generate the local_config object
            logger.info('Matching retrieved iris_tags with the downloaded configs to generate the local_config obj')
            iris_profile = ec2_iris_tags['ihr:iris:profile']
            if iris_profile not in profiles:
                err_msg = 'The ihr:iris:profile tag on {} is not defined in any profile config'.format(ec2.instance_id)
                logger.error(err_msg)
                raise KeyError(err_msg)

            local_config_metrics = {}
            for prof_metric in profiles[iris_profile].metrics:
                if prof_metric not in metrics:
                    err_msg = 'Metric {} in profile {} not defined in metrics config'.format(prof_metric, iris_profile)
                    logger.error(err_msg)
                    raise KeyError(err_msg)
                local_config_metrics[prof_metric] = metrics[prof_metric].to_json()
            logger.info('Generated the local_config object')

            # Write to a temp file in the SAME directory as the target so the
            # os.rename below is an atomic same-filesystem replace, and only
            # rename after the with-block closes (and flushes) the temp file.
            with tempfile.NamedTemporaryFile('w', delete=False,
                                             dir=os.path.dirname(local_config_path) or None) as tmpfile:
                json.dump(local_config_metrics, tmpfile, indent=2)
            os.rename(tmpfile.name, local_config_path)
            logger.info('Finished writing to local_config file at {}'.format(local_config_path))

            general_error_flag = False
            missing_iris_tags_error_flag = False
        except MissingIrisTagsError as e:
            logger.error('Config_Service MissingIrisTagsError: {}'.format(e))
            missing_iris_tags_error_flag = True
        # will log twice for defined err logs in iris, but will catch & log unlogged errs in code (3rd party err)
        except Exception as e:
            logger.error('Config_Service has an err: {}'.format(e))
            general_error_flag = True
        finally:
            # Always publish both error gauges, success or failure.
            # (help_str previously said "Scheduler" and misspelled "occurred" -
            # this gauge belongs to the Config Service.)
            general_error_name = 'iris_config_service_error'
            general_error_prom_builder = PromStrBuilder(
                metric_name=general_error_name,
                metric_result=int(general_error_flag),
                help_str='Indicate if a general exception/error has occurred in the Config Service',
                type_str='gauge')
            general_error_prom_string = general_error_prom_builder.create_prom_string()
            general_error_prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(general_error_name))

            missing_iris_tags_name = 'iris_missing_ec2_tags'
            missing_iris_tags_prom_builder = PromStrBuilder(
                metric_name=missing_iris_tags_name,
                metric_result=int(missing_iris_tags_error_flag),
                help_str='Indicate if the ec2 host is missing the iris tags',
                type_str='gauge')
            missing_iris_tags_prom_string = missing_iris_tags_prom_builder.create_prom_string()
            missing_iris_tags_prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(missing_iris_tags_name))

            prom_writer = PromFileWriter(logger=logger)
            prom_writer.write_prom_file(general_error_prom_file_path, general_error_prom_string)
            prom_writer.write_prom_file(missing_iris_tags_prom_file_path, missing_iris_tags_prom_string)

            # " seconds" added for consistency with the Scheduler/GC sleep logs
            logger.info('Sleeping the Config_Service for {} seconds\n'.format(run_frequency))
            time.sleep(run_frequency)
def run_scheduler(global_config_path: str,
                  local_config_path: str,
                  prom_dir_path: str,
                  run_frequency: float,
                  internal_metrics_whitelist: Tuple[str, ...],
                  log_path: str,
                  log_debug_path: str) -> None:
    """
    Run the Scheduler. Loops forever: each pass waits for the config files,
    lints them, executes the host's metrics, and publishes bookkeeping
    metrics, then sleeps for run_frequency.

    :param global_config_path: the path to the GlobalConfig object pulled down by the Config Service
    :param local_config_path: the path to the local config object created by the Config Service
    :param prom_dir_path: the path to the prom files directory that we write metric results to
    :param run_frequency: the frequency to which we run the scheduler
    :param internal_metrics_whitelist: a list of internal metrics that we don't want to count in the
        iris_custom_metrics_count metric
    :param log_path: the path to the scheduler log file
    :param log_debug_path: the path to the iris.debug file that triggers when we want to enable verbose logging
    :return: None (never returns; runs until the process is terminated)
    """
    # NOTE(review): annotation was Tuple[str] (a 1-element tuple); callers pass
    # an arbitrary-length whitelist, so Tuple[str, ...] is the correct type.
    logger = get_logger('iris.scheduler', log_path, log_debug_path)

    error_flag = 0
    while True:
        try:
            sleep_total = 0  # the accumulated sleep time for checking the global_config and local_config
            sleep_increment = 10  # check for global_config and local_config every 10 seconds if they don't exist
            max_wait_time = 120  # max wait/sleep time that the scheduler will wait for these configs
            while not os.path.isfile(global_config_path) or not os.path.isfile(local_config_path):
                # >= (was ==) so changing sleep_increment to a non-divisor of
                # max_wait_time can never step past the limit and wait forever
                if sleep_total >= max_wait_time:
                    err_msg = 'No global_config: {} or local_config: {}. The scheduler has waited for 2 mins'.format(
                        global_config_path, local_config_path)
                    logger.error('OSError: {}'.format(err_msg))
                    raise OSError(err_msg)
                else:
                    msg = 'The scheduler is still waiting on the config_service for the global_config/local_config file'
                    logger.warning(msg)
                    sleep_total += sleep_increment
                    time.sleep(sleep_increment)

            # run linter to transform the local_config file created by the config_service into objects for the scheduler
            logger.info('Starting linter to transform the config files created by the config_service into python objs')
            linter = Linter(logger)
            global_config_obj = linter.lint_global_config(global_config_path)
            local_config_obj = linter.lint_metrics_config(global_config_obj, local_config_path)
            metrics_list = list(local_config_obj.values())
            logger.info('Read local_config file metrics {}'.format(', '.join(
                [metric.name for metric in metrics_list])))

            # run scheduler to asynchronously execute each metric and asynchronously write to the metric's prom file
            scheduler = Scheduler(metrics_list, prom_dir_path, logger=logger)
            scheduler.run()

            error_flag = 0
        # will log twice for defined err logs in iris, but will catch & log unlogged errs in code (3rd party err)
        except Exception as e:
            logger.error('Scheduler has an err: {}'.format(e))
            error_flag = 1
        finally:
            # Always publish the error gauge, success or failure.
            prom_writer = PromFileWriter(logger=logger)

            metric_name = 'iris_scheduler_error'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=error_flag,
                help_str='Indicate if an exception/error has occurred in the Scheduler',
                type_str='gauge')
            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            # count how many custom metrics prom files are currently being exposed and create the prom file
            custom_metrics_count_result = 0
            for prom_file in os.listdir(prom_dir_path):
                # strip only the trailing '.prom' suffix (str.replace would
                # remove every occurrence); use a distinct name so the
                # metric_name used below isn't shadowed inside the loop
                exposed_name = prom_file[:-len('.prom')] if prom_file.endswith('.prom') else prom_file
                if exposed_name not in internal_metrics_whitelist:
                    custom_metrics_count_result += 1

            metric_name = 'iris_custom_metrics_count'
            prom_builder = PromStrBuilder(
                metric_name=metric_name,
                metric_result=custom_metrics_count_result,
                help_str='Indicate how many custom metrics the Scheduler is exposing',
                type_str='gauge')
            prom_string = prom_builder.create_prom_string()
            prom_file_path = os.path.join(prom_dir_path, '{}.prom'.format(metric_name))
            prom_writer.write_prom_file(prom_file_path, prom_string)

            logger.info('Sleeping the Scheduler for {} seconds\n'.format(run_frequency))
            time.sleep(run_frequency)