def stop_metrics_process():
    """
    Stop and remove the telegraf and ME services, then kill the processes
    recorded in the 'amametrics.pid' file and delete that file.

    Each service is only stopped/removed when its handler reports it as
    running (is_lad=False). The outcome message of every stop/remove call is
    logged through HUtilObject.log on success and HUtilObject.error otherwise.

    The PID file is looked up in the current working directory. Every
    non-blank line of it is passed to ``kill`` through run_command_and_log,
    and the file itself is removed once after all lines were processed.
    """
    if telhandler.is_running(is_lad=False):
        # Stop the telegraf service
        tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=False)
        if tel_out:
            HUtilObject.log(tel_msg)
        else:
            HUtilObject.error(tel_msg)

        # Delete the telegraf service
        tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
        if tel_rm_out:
            HUtilObject.log(tel_rm_msg)
        else:
            HUtilObject.error(tel_rm_msg)

    if me_handler.is_running(is_lad=False):
        # Stop the ME service
        me_out, me_msg = me_handler.stop_metrics_service(is_lad=False)
        if me_out:
            HUtilObject.log(me_msg)
        else:
            HUtilObject.error(me_msg)

        # Delete the ME service
        me_rm_out, me_rm_msg = me_handler.remove_metrics_service(is_lad=False)
        if me_rm_out:
            HUtilObject.log(me_rm_msg)
        else:
            HUtilObject.error(me_rm_msg)

    pids_filepath = os.path.join(os.getcwd(), 'amametrics.pid')

    # kill existing telemetry watcher
    if os.path.exists(pids_filepath):
        with open(pids_filepath, "r") as f:
            # Strip the trailing newline of every entry so it does not end up
            # inside the shell command.
            recorded_pids = [line.strip() for line in f]

        for pid in recorded_pids:
            if not pid:
                # A blank line would otherwise produce a bare "kill " command.
                continue
            run_command_and_log("kill " + pid)

        # Remove the PID file exactly once, after it has been closed, so it is
        # also cleaned up when it holds no entries and is not removed again
        # for every additional entry.
        run_command_and_log("rm " + pids_filepath)
def _lad_log_outcome(succeeded, message):
    """Send *message* to hutil.log when *succeeded* is truthy, else to hutil.error."""
    if succeeded:
        hutil.log(message)
    else:
        hutil.error(message)


def _lad_stop_metrics_services():
    """Stop the telegraf and ME services (is_lad=True) and log each outcome."""
    tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=True)
    _lad_log_outcome(tel_out, tel_msg)

    me_out, me_msg = me_handler.stop_metrics_service(is_lad=True)
    _lad_log_outcome(me_out, me_msg)


def _lad_remove_metrics_services():
    """Remove the telegraf and ME services (is_lad=True) and log each outcome."""
    tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
    _lad_log_outcome(tel_rm_out, tel_rm_msg)

    me_rm_out, me_rm_msg = me_handler.remove_metrics_service(is_lad=True)
    _lad_log_outcome(me_rm_out, me_rm_msg)


def _lad_read_msi_token_expiry(token_path):
    """
    Return the ``expires_on`` value stored in the JSON file at *token_path*.

    None is returned when the file does not exist, is empty, is not valid
    JSON, is not a JSON object, or has no non-empty ``expires_on`` entry.
    """
    # Function-scope import keeps this helper usable regardless of the
    # module-level import list.
    import json

    if not os.path.isfile(token_path):
        return None
    with open(token_path, "r") as token_file:
        raw_token = token_file.read()
    if not raw_token:
        return None
    try:
        token = json.loads(raw_token)
    except ValueError:
        return None
    if not isinstance(token, dict):
        return None
    return token.get("expires_on") or None


def _lad_msi_token_expires_soon(expiry_epoch, minutes=30):
    """
    Return True when *expiry_epoch* lies less than *minutes* in the future.

    An expiry value that cannot be converted to a timestamp is also reported
    as expiring, so that the caller regenerates the token.
    """
    try:
        expiry_time = datetime.datetime.fromtimestamp(int(expiry_epoch))
    except (TypeError, ValueError, OverflowError, OSError):
        return True
    remaining = expiry_time - datetime.datetime.now()
    return remaining < datetime.timedelta(minutes=minutes)


def main(command):
    """
    Dispatch one extension operation and report its status through hutil.

    The operation type is derived from *command* with
    get_extension_operation_type and stored in the global g_ext_op_type.
    Nothing is done when check_for_supported_waagent_and_distro_version()
    returns a falsy value. Handled operations are Disable, Uninstall,
    Install, Enable, "Daemon" and Update; any other value falls through
    without action.

    Any exception raised while handling the operation is logged together with
    its traceback and reported as an 'error' status; it is not re-raised.

    :param command: Command string used to determine the operation; it is
        also written to the log.
    """
    init_globals()

    global g_ext_op_type
    global me_msi_token_expiry_epoch

    g_ext_op_type = get_extension_operation_type(command)
    waagent_ext_event_type = wala_event_type_for_telemetry(g_ext_op_type)

    if not check_for_supported_waagent_and_distro_version():
        return

    try:
        hutil.log("Dispatching command:" + command)

        # Operation types are compared by value: identity comparison against
        # string constants/literals only works when the strings happen to be
        # the same object.
        if g_ext_op_type == waagent.WALAEventOperation.Disable:
            if g_dist_config.use_systemd():
                RunGetOutput(
                    'systemctl stop mdsd-lde && systemctl disable mdsd-lde')
            else:
                stop_mdsd()
            oms.tear_down_omsagent_for_lad(RunGetOutput, False)

            # Stop the telegraf and ME services
            _lad_stop_metrics_services()

            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Disable succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Uninstall:
            if g_dist_config.use_systemd():
                RunGetOutput(
                    'systemctl stop mdsd-lde && systemctl disable mdsd-lde ' +
                    '&& rm /lib/systemd/system/mdsd-lde.service')
            else:
                stop_mdsd()
            # Must remove lad-mdsd package first because of the dependencies
            cmd_exit_code, cmd_output = g_dist_config.remove_lad_mdsd()
            if cmd_exit_code != 0:
                hutil.error(
                    'lad-mdsd remove failed. Still proceeding to uninstall. '
                    'Exit code={0}, Output={1}'.format(cmd_exit_code,
                                                       cmd_output))
            oms.tear_down_omsagent_for_lad(RunGetOutput, True)

            # Stop the telegraf and ME services
            _lad_stop_metrics_services()

            # Delete the telegraf and ME services
            _lad_remove_metrics_services()

            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Uninstall succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Install:
            # Install dependencies (omsagent, which includes omi, scx).
            configurator = create_core_components_configs()
            dependencies_err, dependencies_msg = setup_dependencies_and_mdsd(
                configurator)
            if dependencies_err != 0:
                g_lad_log_helper.report_mdsd_dependency_setup_failure(
                    waagent_ext_event_type, dependencies_msg)
                hutil.do_status_report(g_ext_op_type, "error", '-1',
                                       "Install failed")
                return

            # Start the Telegraf and ME services after installation is complete
            start_telegraf_out, log_messages = telhandler.start_telegraf(
                is_lad=True)
            if start_telegraf_out:
                hutil.log("Successfully started metrics-sourcer.")
            else:
                hutil.error(log_messages)

            if enable_metrics_ext:
                # Generate/regenerate MSI Token required by ME
                (msi_token_generated, me_msi_token_expiry_epoch,
                 log_messages) = me_handler.generate_MSI_token()
                if msi_token_generated:
                    hutil.log(
                        "Successfully generated metrics-extension MSI Auth token."
                    )
                else:
                    hutil.error(log_messages)

                start_metrics_out, log_messages = me_handler.start_metrics(
                    is_lad=True)
                if start_metrics_out:
                    hutil.log("Successfully started metrics-extension.")
                else:
                    hutil.error(log_messages)

            if g_dist_config.use_systemd():
                install_lad_as_systemd_service()
            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Install succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Enable:
            if hutil.is_current_config_seq_greater_inused():
                configurator = create_core_components_configs()
                dependencies_err, dependencies_msg = setup_dependencies_and_mdsd(
                    configurator)
                if dependencies_err != 0:
                    g_lad_log_helper.report_mdsd_dependency_setup_failure(
                        waagent_ext_event_type, dependencies_msg)
                    hutil.do_status_report(g_ext_op_type, "error", '-1',
                                           "Enabled failed")
                    return

                # Start the Telegraf and ME services on Enable after installation is complete
                start_telegraf_out, log_messages = telhandler.start_telegraf(
                    is_lad=True)
                if start_telegraf_out:
                    hutil.log("Successfully started metrics-sourcer.")
                else:
                    hutil.error(log_messages)

                if enable_metrics_ext:
                    # Generate/regenerate MSI Token required by ME
                    generate_token = False
                    me_token_path = g_ext_dir + "/metrics_configs/AuthToken-MSI.json"

                    if me_msi_token_expiry_epoch is None or me_msi_token_expiry_epoch == "":
                        # No expiry known yet: try the token file, which is
                        # parsed as JSON so "expires_on" can be looked up.
                        stored_expiry = _lad_read_msi_token_expiry(
                            me_token_path)
                        if stored_expiry:
                            me_msi_token_expiry_epoch = stored_expiry
                        else:
                            generate_token = True

                    if me_msi_token_expiry_epoch and _lad_msi_token_expires_soon(
                            me_msi_token_expiry_epoch):
                        # The MSI Token will expire within 30 minutes. We need to refresh the token
                        generate_token = True

                    if generate_token:
                        (msi_token_generated, me_msi_token_expiry_epoch,
                         log_messages) = me_handler.generate_MSI_token()
                        if msi_token_generated:
                            hutil.log(
                                "Successfully refreshed metrics-extension MSI Auth token."
                            )
                        else:
                            hutil.error(log_messages)

                    start_metrics_out, log_messages = me_handler.start_metrics(
                        is_lad=True)
                    if start_metrics_out:
                        hutil.log("Successfully started metrics-extension.")
                    else:
                        hutil.error(log_messages)

            if g_dist_config.use_systemd():
                install_lad_as_systemd_service()
                RunGetOutput('systemctl enable mdsd-lde')
                # First element of the RunGetOutput result is compared by
                # value; 0 is treated as "service active".
                mdsd_lde_active = RunGetOutput(
                    'systemctl status mdsd-lde')[0] == 0
                if not mdsd_lde_active or hutil.is_current_config_seq_greater_inused():
                    RunGetOutput('systemctl restart mdsd-lde')
            else:
                # if daemon process not runs
                lad_pids = get_lad_pids()
                hutil.log("get pids:" + str(lad_pids))
                if len(lad_pids) != 2 or hutil.is_current_config_seq_greater_inused():
                    stop_mdsd()
                    start_daemon()
            hutil.set_inused_config_seq(hutil.get_seq_no())
            hutil.do_status_report(
                g_ext_op_type, "success", '0',
                "Enable succeeded, extension daemon started")
            # If the -daemon detects a problem, e.g. bad configuration, it will overwrite this status with a more
            # informative one. If it succeeds, all is well.

        elif g_ext_op_type == "Daemon":
            configurator = create_core_components_configs()
            if configurator:
                start_mdsd(configurator)

        elif g_ext_op_type == waagent.WALAEventOperation.Update:
            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Update succeeded")

    except Exception as e:
        hutil.error(
            "Failed to perform extension operation {0} with error:{1}, {2}".
            format(g_ext_op_type, e, traceback.format_exc()))
        # NOTE(review): the status code is '0' here although the status is
        # 'error' (the dependency failures above use '-1') - confirm intended.
        hutil.do_status_report(
            g_ext_op_type, 'error', '0',
            'Extension operation {0} failed:{1}'.format(g_ext_op_type, e))
def _watcher_log_outcome(succeeded, message):
    """Send *message* to HUtilObject.log when *succeeded* is truthy, else to HUtilObject.error."""
    if succeeded:
        HUtilObject.log(message)
    else:
        HUtilObject.error(message)


def _watcher_read_token_expiry(token_path):
    """
    Return the ``expires_on`` value stored in the JSON file at *token_path*.

    None is returned when the file does not exist, is empty, is not valid
    JSON, is not a JSON object, or has no non-empty ``expires_on`` entry.
    """
    if not os.path.isfile(token_path):
        return None
    with open(token_path, "r") as token_file:
        raw_token = token_file.read()
    if not raw_token:
        return None
    try:
        token = json.loads(raw_token)
    except ValueError:
        return None
    if not isinstance(token, dict):
        return None
    return token.get("expires_on") or None


def _watcher_token_expires_soon(expiry_epoch, minutes=30):
    """
    Return True when *expiry_epoch* lies less than *minutes* in the future.

    An expiry value that cannot be converted to a timestamp is also reported
    as expiring, so that the caller regenerates the token.
    """
    try:
        expiry_time = datetime.datetime.fromtimestamp(int(expiry_epoch))
    except (TypeError, ValueError, OverflowError, OSError):
        return True
    remaining = expiry_time - datetime.datetime.now()
    return remaining < datetime.timedelta(minutes=minutes)


def metrics_watcher(hutil_error, hutil_log):
    """
    Watcher thread to monitor metric configuration changes and to take action on them

    Runs forever. After an initial sleep, every 30 seconds it reads the file
    at MdsdCounterJsonPath and:

    * when the file holds an empty JSON collection, stops and removes the
      telegraf and ME services if they are running;
    * otherwise refreshes the MSI auth token when none is known or it expires
      within 30 minutes, re-applies the configuration and starts both
      services when the SHA-256 of the file content changed, and restarts a
      service that is found not running (at most 10 consecutive attempts per
      service).

    Exceptions raised during one pass are logged through *hutil_error* and
    do not end the loop.

    :param hutil_error: Callable taking one message; used for error logging.
    :param hutil_log: Callable taking one message; used for info logging.
    """
    # check every 30 seconds
    sleepTime = 30
    max_restart_retries = 10

    # sleep before starting the monitoring.
    time.sleep(sleepTime)
    last_crc = None
    me_msi_token_expiry_epoch = None

    # The retry counters live outside the loop so that consecutive failed
    # restarts accumulate and max_restart_retries can actually be reached.
    telegraf_restart_retries = 0
    me_restart_retries = 0

    while True:
        try:
            data = ''
            if os.path.isfile(MdsdCounterJsonPath):
                # Context manager closes the handle on every pass.
                with open(MdsdCounterJsonPath, "r") as counter_file:
                    data = counter_file.read()

            if data != '':
                json_data = json.loads(data)

                # sha256 needs bytes; text is encoded first, bytes pass through.
                hash_input = data if isinstance(data, bytes) else data.encode('utf-8')
                crc = hashlib.sha256(hash_input).hexdigest()

                if len(json_data) == 0:
                    last_crc = crc

                    # NOTE(review): this branch logs through HUtilObject
                    # rather than the hutil_log/hutil_error arguments, as the
                    # original did - confirm that is intended.
                    if telhandler.is_running(is_lad=False):
                        # Stop the telegraf service
                        tel_out, tel_msg = telhandler.stop_telegraf_service(
                            is_lad=False)
                        _watcher_log_outcome(tel_out, tel_msg)

                        # Delete the telegraf service
                        tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
                        _watcher_log_outcome(tel_rm_out, tel_rm_msg)

                    if me_handler.is_running(is_lad=False):
                        # Stop the ME service
                        me_out, me_msg = me_handler.stop_metrics_service(
                            is_lad=False)
                        _watcher_log_outcome(me_out, me_msg)

                        # Delete the ME service
                        me_rm_out, me_rm_msg = me_handler.remove_metrics_service(
                            is_lad=False)
                        _watcher_log_outcome(me_rm_out, me_rm_msg)
                else:
                    generate_token = False
                    # The second segment must be relative: os.path.join drops
                    # the working directory when given an absolute segment.
                    me_token_path = os.path.join(
                        os.getcwd(),
                        "config/metrics_configs/AuthToken-MSI.json")

                    if me_msi_token_expiry_epoch is None or me_msi_token_expiry_epoch == "":
                        # No expiry known yet: try the token file, which is
                        # parsed as JSON so "expires_on" can be looked up.
                        stored_expiry = _watcher_read_token_expiry(
                            me_token_path)
                        if stored_expiry:
                            me_msi_token_expiry_epoch = stored_expiry
                        else:
                            generate_token = True

                    if me_msi_token_expiry_epoch and _watcher_token_expires_soon(
                            me_msi_token_expiry_epoch):
                        # The MSI Token will expire within 30 minutes. We need to refresh the token
                        generate_token = True

                    if generate_token:
                        (msi_token_generated, me_msi_token_expiry_epoch,
                         log_messages) = me_handler.generate_MSI_token()
                        if msi_token_generated:
                            hutil_log(
                                "Successfully refreshed metrics-extension MSI Auth token."
                            )
                        else:
                            hutil_error(log_messages)

                    if crc != last_crc:
                        hutil_log("Start processing metric configuration")
                        hutil_log(data)

                        telegraf_config, telegraf_namespaces = telhandler.handle_config(
                            json_data,
                            "udp://127.0.0.1:" +
                            metrics_constants.ama_metrics_extension_udp_port,
                            "unix:///var/run/mdsd/default_influx.socket",
                            is_lad=False)

                        me_handler.setup_me(is_lad=False)

                        start_telegraf_out, log_messages = telhandler.start_telegraf(
                            is_lad=False)
                        if start_telegraf_out:
                            hutil_log("Successfully started metrics-sourcer.")
                        else:
                            hutil_error(log_messages)

                        start_metrics_out, log_messages = me_handler.start_metrics(
                            is_lad=False)
                        if start_metrics_out:
                            hutil_log("Successfully started metrics-extension.")
                        else:
                            hutil_error(log_messages)

                        last_crc = crc

                        # A newly applied configuration gets a fresh restart
                        # budget for both services.
                        telegraf_restart_retries = 0
                        me_restart_retries = 0

                    # Check if telegraf is running, if not, then restart
                    if not telhandler.is_running(is_lad=False):
                        if telegraf_restart_retries < max_restart_retries:
                            telegraf_restart_retries += 1
                            hutil_log(
                                "Telegraf binary process is not running. Restarting telegraf now. Retry count - {0}"
                                .format(telegraf_restart_retries))
                            tel_out, tel_msg = telhandler.stop_telegraf_service(
                                is_lad=False)
                            if tel_out:
                                hutil_log(tel_msg)
                            else:
                                hutil_error(tel_msg)
                            start_telegraf_out, log_messages = telhandler.start_telegraf(
                                is_lad=False)
                            if start_telegraf_out:
                                hutil_log(
                                    "Successfully started metrics-sourcer.")
                            else:
                                hutil_error(log_messages)
                        else:
                            hutil_error(
                                "Telegraf binary process is not running. Failed to restart after {0} retries. Please check telegraf.log"
                                .format(max_restart_retries))
                    else:
                        telegraf_restart_retries = 0

                    # Check if ME is running, if not, then restart
                    if not me_handler.is_running(is_lad=False):
                        if me_restart_retries < max_restart_retries:
                            me_restart_retries += 1
                            hutil_log(
                                "MetricsExtension binary process is not running. Restarting MetricsExtension now. Retry count - {0}"
                                .format(me_restart_retries))
                            me_out, me_msg = me_handler.stop_metrics_service(
                                is_lad=False)
                            if me_out:
                                hutil_log(me_msg)
                            else:
                                hutil_error(me_msg)
                            start_metrics_out, log_messages = me_handler.start_metrics(
                                is_lad=False)
                            if start_metrics_out:
                                hutil_log(
                                    "Successfully started metrics-extension.")
                            else:
                                hutil_error(log_messages)
                        else:
                            hutil_error(
                                "MetricsExtension binary process is not running. Failed to restart after {0} retries. Please check /var/log/syslog for ME logs"
                                .format(max_restart_retries))
                    else:
                        me_restart_retries = 0

        except IOError as e:
            hutil_error(
                'I/O error in monitoring metrics. Exception={0}'.format(e))

        except Exception as e:
            hutil_error('Error in monitoring metrics. Exception={0}'.format(e))

        finally:
            time.sleep(sleepTime)