Esempio n. 1
0
def stop_metrics_process():
    """
    Stop and remove the telegraf and metrics services, then kill the watcher.

    For each of the two services (telegraf via ``telhandler`` and the metrics
    service via ``me_handler``), if it is reported as running with
    ``is_lad=False`` it is stopped and then removed. Each handler call returns
    an ``(ok, message)`` pair; the message is logged through
    ``HUtilObject.log`` on success and ``HUtilObject.error`` on failure.

    Finally, every pid listed in ``amametrics.pid`` (looked up in the current
    working directory) is killed and the pid file is removed.
    """
    if telhandler.is_running(is_lad=False):
        # Stop the telegraf service
        tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=False)
        if tel_out:
            HUtilObject.log(tel_msg)
        else:
            HUtilObject.error(tel_msg)

        # Delete the telegraf service
        tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
        if tel_rm_out:
            HUtilObject.log(tel_rm_msg)
        else:
            HUtilObject.error(tel_rm_msg)

    if me_handler.is_running(is_lad=False):
        # Stop the metrics service
        me_out, me_msg = me_handler.stop_metrics_service(is_lad=False)
        if me_out:
            HUtilObject.log(me_msg)
        else:
            HUtilObject.error(me_msg)

        # Delete the metrics service
        me_rm_out, me_rm_msg = me_handler.remove_metrics_service(is_lad=False)
        if me_rm_out:
            HUtilObject.log(me_rm_msg)
        else:
            HUtilObject.error(me_rm_msg)

    pids_filepath = os.path.join(os.getcwd(), 'amametrics.pid')

    # kill existing telemetry watcher
    if os.path.exists(pids_filepath):
        # Read every pid up front so the file is closed before it is removed.
        # Lines are stripped so the trailing newline does not end up inside
        # the kill command, and blank lines are skipped instead of producing
        # a bare "kill ".
        with open(pids_filepath, "r") as f:
            pids = [line.strip() for line in f]

        for pid in pids:
            if pid:
                run_command_and_log("kill " + pid)

        # Remove the pid file exactly once, after all pids were processed
        # (previously this ran once per line, inside the loop).
        run_command_and_log("rm " + pids_filepath)
Esempio n. 2
0
def main(command):
    """
    Dispatch one extension operation derived from ``command``.

    The operation type is resolved with ``get_extension_operation_type`` and
    stored in the module-level ``g_ext_op_type``. Depending on that type the
    function handles Disable, Uninstall, Install, Enable, "Daemon" or Update.
    The outcome of each operation is reported through
    ``hutil.do_status_report``.

    The function returns ``None`` in all cases. It returns early, without
    dispatching, when ``check_for_supported_waagent_and_distro_version()`` is
    falsy, and returns right after reporting an error when dependency setup
    fails during Install or Enable. Any exception raised while dispatching is
    caught, logged together with its traceback, and reported as an 'error'
    status.

    :param command: the command string used to select the operation.
    """
    # Local import so this function does not depend on the module's import
    # block for parsing the MSI token file.
    import json

    init_globals()

    global g_ext_op_type
    global me_msi_token_expiry_epoch

    g_ext_op_type = get_extension_operation_type(command)
    waagent_ext_event_type = wala_event_type_for_telemetry(g_ext_op_type)

    if not check_for_supported_waagent_and_distro_version():
        return

    try:
        hutil.log("Dispatching command:" + command)

        # NOTE: operation types are compared with ``==`` rather than ``is``;
        # identity comparison against literals/constants only works by
        # accident of object interning.
        if g_ext_op_type == waagent.WALAEventOperation.Disable:
            if g_dist_config.use_systemd():
                RunGetOutput(
                    'systemctl stop mdsd-lde && systemctl disable mdsd-lde')
            else:
                stop_mdsd()
            oms.tear_down_omsagent_for_lad(RunGetOutput, False)

            # Stop the telegraf and ME services
            tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=True)
            if tel_out:
                hutil.log(tel_msg)
            else:
                hutil.error(tel_msg)

            me_out, me_msg = me_handler.stop_metrics_service(is_lad=True)
            if me_out:
                hutil.log(me_msg)
            else:
                hutil.error(me_msg)

            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Disable succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Uninstall:
            if g_dist_config.use_systemd():
                RunGetOutput(
                    'systemctl stop mdsd-lde && systemctl disable mdsd-lde ' +
                    '&& rm /lib/systemd/system/mdsd-lde.service')
            else:
                stop_mdsd()
            # Must remove lad-mdsd package first because of the dependencies
            cmd_exit_code, cmd_output = g_dist_config.remove_lad_mdsd()
            if cmd_exit_code != 0:
                hutil.error(
                    'lad-mdsd remove failed. Still proceeding to uninstall. '
                    'Exit code={0}, Output={1}'.format(cmd_exit_code,
                                                       cmd_output))
            oms.tear_down_omsagent_for_lad(RunGetOutput, True)

            # Stop the telegraf and ME services
            tel_out, tel_msg = telhandler.stop_telegraf_service(is_lad=True)
            if tel_out:
                hutil.log(tel_msg)
            else:
                hutil.error(tel_msg)

            me_out, me_msg = me_handler.stop_metrics_service(is_lad=True)
            if me_out:
                hutil.log(me_msg)
            else:
                hutil.error(me_msg)

            # Delete the telegraf and ME services
            tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
            if tel_rm_out:
                hutil.log(tel_rm_msg)
            else:
                hutil.error(tel_rm_msg)

            me_rm_out, me_rm_msg = me_handler.remove_metrics_service(
                is_lad=True)
            if me_rm_out:
                hutil.log(me_rm_msg)
            else:
                hutil.error(me_rm_msg)

            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Uninstall succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Install:
            # Install dependencies (omsagent, which includes omi, scx).
            configurator = create_core_components_configs()
            dependencies_err, dependencies_msg = setup_dependencies_and_mdsd(
                configurator)
            if dependencies_err != 0:
                g_lad_log_helper.report_mdsd_dependency_setup_failure(
                    waagent_ext_event_type, dependencies_msg)
                hutil.do_status_report(g_ext_op_type, "error", '-1',
                                       "Install failed")
                return

            # Start the Telegraf and ME services after installation is complete
            start_telegraf_out, log_messages = telhandler.start_telegraf(
                is_lad=True)
            if start_telegraf_out:
                hutil.log("Successfully started metrics-sourcer.")
            else:
                hutil.error(log_messages)

            if enable_metrics_ext:
                # Generate/regenerate MSI Token required by ME
                (msi_token_generated, me_msi_token_expiry_epoch,
                 log_messages) = me_handler.generate_MSI_token()
                if msi_token_generated:
                    hutil.log(
                        "Successfully generated metrics-extension MSI Auth token."
                    )
                else:
                    hutil.error(log_messages)

                start_metrics_out, log_messages = me_handler.start_metrics(
                    is_lad=True)
                if start_metrics_out:
                    hutil.log("Successfully started metrics-extension.")
                else:
                    hutil.error(log_messages)

            if g_dist_config.use_systemd():
                install_lad_as_systemd_service()
            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Install succeeded")

        elif g_ext_op_type == waagent.WALAEventOperation.Enable:
            if hutil.is_current_config_seq_greater_inused():
                configurator = create_core_components_configs()
                dependencies_err, dependencies_msg = setup_dependencies_and_mdsd(
                    configurator)
                if dependencies_err != 0:
                    g_lad_log_helper.report_mdsd_dependency_setup_failure(
                        waagent_ext_event_type, dependencies_msg)
                    hutil.do_status_report(g_ext_op_type, "error", '-1',
                                           "Enabled failed")
                    return

                # Start the Telegraf and ME services on Enable after installation is complete
                start_telegraf_out, log_messages = telhandler.start_telegraf(
                    is_lad=True)
                if start_telegraf_out:
                    hutil.log("Successfully started metrics-sourcer.")
                else:
                    hutil.error(log_messages)

                if enable_metrics_ext:
                    # Generate/regenerate MSI Token required by ME
                    generate_token = False
                    me_token_path = g_ext_dir + "/metrics_configs/AuthToken-MSI.json"

                    if me_msi_token_expiry_epoch is None or me_msi_token_expiry_epoch == "":
                        if os.path.isfile(me_token_path):
                            with open(me_token_path, "r") as f:
                                authtoken_content = f.read()

                            # The file content is text; it has to be parsed
                            # before "expires_on" can be looked up by key
                            # (indexing the raw string raises TypeError).
                            try:
                                authtoken = json.loads(authtoken_content)
                            except ValueError:
                                authtoken = None

                            if isinstance(authtoken, dict) and "expires_on" in authtoken:
                                me_msi_token_expiry_epoch = authtoken["expires_on"]
                            else:
                                generate_token = True
                        else:
                            generate_token = True

                    if me_msi_token_expiry_epoch:
                        try:
                            # The epoch may be held as a string; convert it
                            # before handing it to fromtimestamp().
                            token_expiry_time = datetime.datetime.fromtimestamp(
                                int(me_msi_token_expiry_epoch))
                        except (TypeError, ValueError, OverflowError, OSError):
                            # An unusable expiry value cannot be trusted;
                            # request a fresh token instead of failing Enable.
                            generate_token = True
                        else:
                            currentTime = datetime.datetime.now()
                            if token_expiry_time - currentTime < datetime.timedelta(
                                    minutes=30):
                                # The MSI Token will expire within 30 minutes. We need to refresh the token
                                generate_token = True

                    if generate_token:
                        generate_token = False
                        (msi_token_generated, me_msi_token_expiry_epoch,
                         log_messages) = me_handler.generate_MSI_token()
                        if msi_token_generated:
                            hutil.log(
                                "Successfully refreshed metrics-extension MSI Auth token."
                            )
                        else:
                            hutil.error(log_messages)

                    start_metrics_out, log_messages = me_handler.start_metrics(
                        is_lad=True)
                    if start_metrics_out:
                        hutil.log("Successfully started metrics-extension.")
                    else:
                        hutil.error(log_messages)

            if g_dist_config.use_systemd():
                install_lad_as_systemd_service()
                RunGetOutput('systemctl enable mdsd-lde')
                # First element of the RunGetOutput result is compared to 0
                # by value (was an identity comparison against a literal).
                mdsd_lde_active = RunGetOutput(
                    'systemctl status mdsd-lde')[0] == 0
                if not mdsd_lde_active or hutil.is_current_config_seq_greater_inused():
                    RunGetOutput('systemctl restart mdsd-lde')
            else:
                # if daemon process not runs
                lad_pids = get_lad_pids()
                hutil.log("get pids:" + str(lad_pids))
                if len(lad_pids) != 2 or hutil.is_current_config_seq_greater_inused():
                    stop_mdsd()
                    start_daemon()
            hutil.set_inused_config_seq(hutil.get_seq_no())
            hutil.do_status_report(
                g_ext_op_type, "success", '0',
                "Enable succeeded, extension daemon started")
            # If the -daemon detects a problem, e.g. bad configuration, it will overwrite this status with a more
            # informative one. If it succeeds, all is well.

        elif g_ext_op_type == "Daemon":
            configurator = create_core_components_configs()
            if configurator:
                start_mdsd(configurator)

        elif g_ext_op_type == waagent.WALAEventOperation.Update:
            hutil.do_status_report(g_ext_op_type, "success", '0',
                                   "Update succeeded")

    except Exception as e:
        hutil.error(
            "Failed to perform extension operation {0} with error:{1}, {2}".
            format(g_ext_op_type, e, traceback.format_exc()))
        hutil.do_status_report(
            g_ext_op_type, 'error', '0',
            'Extension operation {0} failed:{1}'.format(g_ext_op_type, e))
def metrics_watcher(hutil_error, hutil_log):
    """
    Watcher thread to monitor metric configuration changes and to take action on them.

    After an initial 30 second sleep the function loops forever, sleeping 30
    seconds between iterations. On each iteration it reads the file at
    ``MdsdCounterJsonPath`` (if present and non-empty) and parses it as JSON:

    * If the parsed configuration is empty, telegraf and the metrics service
      are stopped and removed when they are reported as running.
    * Otherwise the MSI auth token is refreshed when it is missing or expires
      within 30 minutes; when the SHA-256 of the file content differs from
      the last processed one, the configuration is handed to
      ``telhandler.handle_config`` and both services are (re)started; finally
      each service is restarted if it is found not running, up to 10
      consecutive attempts per service.

    Exceptions raised inside an iteration are logged through ``hutil_error``
    and do not terminate the loop. The function never returns.

    :param hutil_error: callable taking one message; used for error logging.
    :param hutil_log: callable taking one message; used for info logging.
    """

    # check every 30 seconds
    sleepTime = 30

    # sleep before starting the monitoring.
    time.sleep(sleepTime)
    last_crc = None
    me_msi_token_expiry_epoch = None

    # Restart budgets must survive across iterations; re-initialising them
    # inside the loop would make the retry limit unreachable.
    telegraf_restart_retries = 0
    me_restart_retries = 0
    max_restart_retries = 10

    while True:
        try:
            if os.path.isfile(MdsdCounterJsonPath):
                # Context manager so the handle is closed on every iteration.
                with open(MdsdCounterJsonPath, "r") as f:
                    data = f.read()

                if (data != ''):
                    json_data = json.loads(data)

                    # hashlib needs bytes; encode text read from the file.
                    raw_data = data if isinstance(data, bytes) else data.encode("utf-8")
                    crc = hashlib.sha256(raw_data).hexdigest()

                    if len(json_data) == 0:
                        last_crc = crc
                        # NOTE(review): this branch logs through HUtilObject
                        # instead of the hutil_log/hutil_error callbacks used
                        # elsewhere in this function - confirm whether that is
                        # intentional.
                        if telhandler.is_running(is_lad=False):
                            # Stop the telegraf service
                            tel_out, tel_msg = telhandler.stop_telegraf_service(
                                is_lad=False)
                            if tel_out:
                                HUtilObject.log(tel_msg)
                            else:
                                HUtilObject.error(tel_msg)

                            # Delete the telegraf service
                            tel_rm_out, tel_rm_msg = telhandler.remove_telegraf_service()
                            if tel_rm_out:
                                HUtilObject.log(tel_rm_msg)
                            else:
                                HUtilObject.error(tel_rm_msg)

                        if me_handler.is_running(is_lad=False):
                            # Stop the metrics service
                            me_out, me_msg = me_handler.stop_metrics_service(
                                is_lad=False)
                            if me_out:
                                HUtilObject.log(me_msg)
                            else:
                                HUtilObject.error(me_msg)

                            # Delete the metrics service
                            me_rm_out, me_rm_msg = me_handler.remove_metrics_service(
                                is_lad=False)
                            if me_rm_out:
                                HUtilObject.log(me_rm_msg)
                            else:
                                HUtilObject.error(me_rm_msg)
                    else:
                        generate_token = False
                        # Components are kept relative: an absolute second
                        # component would make os.path.join discard the cwd.
                        me_token_path = os.path.join(
                            os.getcwd(), "config", "metrics_configs",
                            "AuthToken-MSI.json")

                        if me_msi_token_expiry_epoch is None or me_msi_token_expiry_epoch == "":
                            if os.path.isfile(me_token_path):
                                with open(me_token_path, "r") as token_file:
                                    authtoken_content = token_file.read()

                                # The content is text and must be parsed
                                # before "expires_on" can be read by key.
                                try:
                                    authtoken = json.loads(authtoken_content)
                                except ValueError:
                                    authtoken = None

                                if isinstance(authtoken, dict) and "expires_on" in authtoken:
                                    me_msi_token_expiry_epoch = authtoken["expires_on"]
                                else:
                                    generate_token = True
                            else:
                                generate_token = True

                        if me_msi_token_expiry_epoch:
                            currentTime = datetime.datetime.now()
                            token_expiry_time = datetime.datetime.fromtimestamp(
                                int(me_msi_token_expiry_epoch))
                            if token_expiry_time - currentTime < datetime.timedelta(
                                    minutes=30):
                                # The MSI Token will expire within 30 minutes. We need to refresh the token
                                generate_token = True

                        if generate_token:
                            generate_token = False
                            (msi_token_generated, me_msi_token_expiry_epoch,
                             log_messages) = me_handler.generate_MSI_token()
                            if msi_token_generated:
                                hutil_log(
                                    "Successfully refreshed metrics-extension MSI Auth token."
                                )
                            else:
                                hutil_error(log_messages)

                        if (crc != last_crc):
                            hutil_log("Start processing metric configuration")
                            hutil_log(data)

                            telegraf_config, telegraf_namespaces = telhandler.handle_config(
                                json_data,
                                "udp://127.0.0.1:" + metrics_constants.
                                ama_metrics_extension_udp_port,
                                "unix:///var/run/mdsd/default_influx.socket",
                                is_lad=False)

                            me_handler.setup_me(is_lad=False)

                            start_telegraf_out, log_messages = telhandler.start_telegraf(
                                is_lad=False)
                            if start_telegraf_out:
                                hutil_log(
                                    "Successfully started metrics-sourcer.")
                            else:
                                hutil_error(log_messages)

                            start_metrics_out, log_messages = me_handler.start_metrics(
                                is_lad=False)
                            if start_metrics_out:
                                hutil_log(
                                    "Successfully started metrics-extension.")
                            else:
                                hutil_error(log_messages)

                            last_crc = crc

                            # A newly applied configuration gets a fresh
                            # restart budget for both services.
                            telegraf_restart_retries = 0
                            me_restart_retries = 0

                        # Check if telegraf is running, if not, then restart
                        if not telhandler.is_running(is_lad=False):
                            if telegraf_restart_retries < max_restart_retries:
                                telegraf_restart_retries += 1
                                hutil_log(
                                    "Telegraf binary process is not running. Restarting telegraf now. Retry count - {0}"
                                    .format(telegraf_restart_retries))
                                tel_out, tel_msg = telhandler.stop_telegraf_service(
                                    is_lad=False)
                                if tel_out:
                                    hutil_log(tel_msg)
                                else:
                                    hutil_error(tel_msg)
                                start_telegraf_out, log_messages = telhandler.start_telegraf(
                                    is_lad=False)
                                if start_telegraf_out:
                                    hutil_log(
                                        "Successfully started metrics-sourcer."
                                    )
                                else:
                                    hutil_error(log_messages)
                            else:
                                hutil_error(
                                    "Telegraf binary process is not running. Failed to restart after {0} retries. Please check telegraf.log"
                                    .format(max_restart_retries))
                        else:
                            # Process is healthy again; restore the budget.
                            telegraf_restart_retries = 0

                        # Check if ME is running, if not, then restart
                        if not me_handler.is_running(is_lad=False):
                            if me_restart_retries < max_restart_retries:
                                me_restart_retries += 1
                                hutil_log(
                                    "MetricsExtension binary process is not running. Restarting MetricsExtension now. Retry count - {0}"
                                    .format(me_restart_retries))
                                me_out, me_msg = me_handler.stop_metrics_service(
                                    is_lad=False)
                                if me_out:
                                    hutil_log(me_msg)
                                else:
                                    hutil_error(me_msg)
                                start_metrics_out, log_messages = me_handler.start_metrics(
                                    is_lad=False)

                                if start_metrics_out:
                                    hutil_log(
                                        "Successfully started metrics-extension."
                                    )
                                else:
                                    hutil_error(log_messages)
                            else:
                                hutil_error(
                                    "MetricsExtension binary process is not running. Failed to restart after {0} retries. Please check /var/log/syslog for ME logs"
                                    .format(max_restart_retries))
                        else:
                            # Process is healthy again; restore the budget.
                            me_restart_retries = 0

        except IOError as e:
            hutil_error(
                'I/O error in monitoring metrics. Exception={0}'.format(e))

        except Exception as e:
            hutil_error('Error in monitoring metrics. Exception={0}'.format(e))

        finally:
            time.sleep(sleepTime)