def stop_telegraf_service(is_lad):
    """
    Stop the telegraf service if the VM is using systemd, and in every case check whether the pid file exists
    and kill each recorded pid whose /proc/<pid>/cmdline contains the telegraf binary path.
    The pid file is removed once it has been processed.
    This method is called before remove_telegraf_service by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple (success, message); success is False when the systemd stop fails, when the service file
             is missing on a systemd VM, or when there is neither systemd nor a pid file
    """
    # Local import: only needed here to recognise "no such process" errors from os.kill
    import errno

    if is_lad:
        telegraf_bin = metrics_constants.lad_telegraf_bin
    else:
        telegraf_bin = metrics_constants.ama_telegraf_bin

    has_systemd = metrics_utils.is_systemd()

    # If the VM has systemd, then we will use that to stop
    if has_systemd:
        telegraf_service_path = get_telegraf_service_path()

        if not os.path.isfile(telegraf_service_path):
            return False, "Telegraf service file does not exist. Failed to stop telegraf service: metrics-sourcer.service."

        code = os.system("sudo systemctl stop metrics-sourcer")
        if code != 0:
            return False, "Unable to stop telegraf service: metrics-sourcer.service. Run systemctl status metrics-sourcer.service for more info."

    # Whether or not VM has systemd, let's check if we have any telegraf pids saved and if so, terminate the associated process
    _, configFolder = get_handler_vars()
    telegraf_conf_dir = configFolder + "/telegraf_configs/"
    telegraf_pid_path = telegraf_conf_dir + "telegraf_pid.txt"
    if os.path.isfile(telegraf_pid_path):
        with open(telegraf_pid_path, "r") as f:
            for line in f:
                pid = line.strip()
                # Ignore blank or non-numeric lines: an empty pid would otherwise resolve to
                # /proc/cmdline (the kernel command line) and int("") would raise
                if not pid.isdigit():
                    continue

                # Verify the pid actually belongs to telegraf
                cmd_path = os.path.join("/proc", pid, "cmdline")
                if not os.path.exists(cmd_path):
                    continue
                try:
                    with open(cmd_path, "r") as cmd_f:
                        # readline() yields "" for an empty cmdline instead of failing on an empty list
                        cmdline = cmd_f.readline()
                except (IOError, OSError):
                    # The process went away between the existence check and the read
                    continue

                if telegraf_bin in cmdline:
                    try:
                        os.kill(int(pid), signal.SIGKILL)
                    except OSError as e:
                        # A process that already exited is as good as stopped; anything else is a real error
                        if e.errno != errno.ESRCH:
                            raise
        os.remove(telegraf_pid_path)
    elif not has_systemd:
        return False, "Could not find telegraf service nor process to stop."

    return True, "Successfully stopped metrics-sourcer service"
Beispiel #2
0
def start_telegraf(is_lad):
    """
    Start the telegraf service if the VM is using systemd, otherwise start the binary as a process and store
    its pid in a file in the telegraf config directory.
    This method is called after config setup is completed by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple (success, log_messages); log_messages carries the failure reason when success is False
    """

    # Re using the code to grab the config directories and imds values because start will be called from Enable process outside this script
    log_messages = ""

    if is_lad:
        telegraf_bin = metrics_constants.lad_telegraf_bin
    else:
        telegraf_bin = metrics_constants.ama_telegraf_bin

    if not os.path.isfile(telegraf_bin):
        log_messages += "Telegraf binary does not exist. Failed to start telegraf service."
        return False, log_messages

    # If the VM has systemd, then we use the systemd unit to (re)start telegraf
    if metrics_utils.is_systemd():
        service_restart_status = os.system(
            "sudo systemctl restart metrics-sourcer")
        if service_restart_status != 0:
            log_messages += "Unable to start Telegraf service. Failed to start telegraf service."
            return False, log_messages

    #Else start telegraf as a process and save the pid to a file so that we can terminate it while disabling/uninstalling
    else:
        _, configFolder = get_handler_vars()
        telegraf_conf_dir = configFolder + "/telegraf_configs/"
        telegraf_agent_conf = telegraf_conf_dir + "telegraf.conf"
        telegraf_d_conf_dir = telegraf_conf_dir + "telegraf.d/"
        telegraf_pid_path = telegraf_conf_dir + "telegraf_pid.txt"

        # Build the argument vector directly instead of splitting a formatted string on spaces,
        # so that a path containing a space stays a single argument
        binary_exec_command = [
            telegraf_bin,
            "--config", telegraf_agent_conf,
            "--config-directory", telegraf_d_conf_dir,
        ]
        # NOTE(review): stdout/stderr are pipes that are only read below when the process exits within
        # the wait window; confirm a long-running telegraf cannot fill them.
        proc = subprocess.Popen(binary_exec_command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        # Sleeping for 3 seconds before checking if the process is still running, to give it ample time to relay crash info
        time.sleep(3)
        exit_code = proc.poll()

        # poll() returns None while the process is still running
        if exit_code is None:
            # Write this pid to a file for future use
            with open(telegraf_pid_path, "w+") as f:
                f.write(str(proc.pid))
        else:
            _, err = proc.communicate()
            log_messages += "Unable to run telegraf binary as a process due to error - {0}. Failed to start telegraf.".format(
                err)
            return False, log_messages
    return True, log_messages
Beispiel #3
0
def stop_metrics_service(is_lad):
    """
    Stop the metrics service if the VM is using systemd, otherwise check if the pid file exists,
    and if the pid belongs to the MetricsExtension process, if yes, then kill the process
    This method is called before remove_metrics_service by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple (success, message); message describes the failure when success is False
    """
    # Local import: only needed here to recognise "no such process" errors from os.kill
    import errno

    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin

    # If the VM has systemd, then we will use that to stop
    if metrics_utils.is_systemd():
        metrics_service_path = get_metrics_extension_service_path()

        if not os.path.isfile(metrics_service_path):
            return False, "Metrics Extension service file does not exist. Failed to stop ME service: metrics-extension.service ."

        code = os.system("sudo systemctl stop metrics-extension")
        if code != 0:
            return False, "Unable to stop Metrics Extension service: metrics-extension.service. Failed with code {0}".format(
                code)
    else:
        #This VM does not have systemd, So we will use the pid from the last ran metrics process and terminate it
        _, configFolder = get_handler_vars()
        metrics_conf_dir = configFolder + "/metrics_configs/"
        metrics_pid_path = metrics_conf_dir + "metrics_pid.txt"

        if not os.path.isfile(metrics_pid_path):
            return False, "File containing the pid for the running Metrics Extension process at {0} does not exit. Failed to stop Metrics Extension".format(
                metrics_pid_path)

        with open(metrics_pid_path, "r") as f:
            # Strip surrounding whitespace/newlines so the value can be validated and converted safely
            pid = f.read().strip()

        # Anything that is not a plain number is treated as "no pid recorded"; it is never passed to ps or int()
        if not pid.isdigit():
            return False, "No pid found for a currently running Metrics Extension process in {0}. Failed to stop Metrics Extension.".format(
                metrics_pid_path)

        # Check if the process running is indeed MetricsExtension, ignore if the process output doesn't contain MetricsExtension.
        # The command is passed as an argument list (no shell), so file content is never interpreted by a shell.
        proc = subprocess.Popen(["ps", "-o", "cmd=", pid],
                                stdout=subprocess.PIPE)
        output = proc.communicate()[0]
        if metrics_ext_bin not in output.decode('utf-8', 'ignore'):
            return False, "Found a different process running with PID {0}. Failed to stop MetricsExtension.".format(
                pid)

        try:
            os.kill(int(pid), signal.SIGKILL)
        except OSError as e:
            # A process that exited after the ps check is already stopped; anything else is a real error
            if e.errno != errno.ESRCH:
                raise

    return True, "Successfully stopped metrics-extension service"
def handle_config(config_data, me_url, mdsd_url, is_lad):
    """
    The main method to perform the task of parsing the config, writing it to disk and setting up the telegraf service
    :param config_data: Parsed Metrics Configuration from which telegraf config is created
    :param me_url: The url to which telegraf will send metrics to for MetricsExtension
    :param mdsd_url: The url to which telegraf will send metrics to for MDSD
    :param is_lad: Boolean value for whether the extension is Lad or not (AMA)
    :return: tuple (success, namespaces); (False, []) when the systemd service setup fails
    :raises Exception: when the imds response lacks the 'compute' section after all retries,
                       or lacks one of the required keys inside it
    """

    # Making the imds call to get resource id, sub id, resource group and region for the dimensions for telegraf metrics
    max_retries = 3
    sleep_time = 5

    # Only AMA (not LAD) uses the Arc endpoint, and only when Arc is installed
    if not is_lad and metrics_utils.is_arc_installed():
        imdsurl = metrics_utils.get_arc_endpoint() + "/metadata/instance?api-version=2019-11-01"
    else:
        imdsurl = "http://169.254.169.254/metadata/instance?api-version=2019-03-11"

    data = None
    for attempt in range(1, max_retries + 1):
        req = urllib.request.Request(imdsurl, headers={'Metadata': 'true'})
        res = urllib.request.urlopen(req)
        try:
            data = json.loads(res.read().decode('utf-8', 'ignore'))
        finally:
            # Always release the connection, even if reading or parsing fails
            res.close()

        if "compute" in data:
            break

        # Wait before the next attempt, but not after the final one
        if attempt < max_retries:
            time.sleep(sleep_time)
    else:
        # The loop finished without ever finding the 'compute' section
        raise Exception(
            "Unable to find 'compute' key in imds query response. Reached max retry limit of - {0} times. Failed to setup Telegraf."
            .format(max_retries))

    compute = data["compute"]

    # All of these keys are required to build the telegraf dimensions
    for required_key in ("resourceId", "subscriptionId", "resourceGroupName",
                         "location"):
        if required_key not in compute:
            raise Exception(
                "Unable to find '{0}' key in imds query response. Failed to setup Telegraf."
                .format(required_key))

    az_resource_id = compute["resourceId"]

    # If the instance is VMSS then trim the last two values from the resource id ie - "/virtualMachines/0"
    # Since ME expects the resource id in a particular format. For egs -
    # IMDS returned ID - /subscriptions/<sub-id>/resourceGroups/<rg_name>/providers/Microsoft.Compute/virtualMachineScaleSets/<VMSSName>/virtualMachines/0
    # ME expected ID- /subscriptions/<sub-id>/resourceGroups/<rg_name>/providers/Microsoft.Compute/virtualMachineScaleSets/<VMSSName>
    if "virtualMachineScaleSets" in az_resource_id:
        az_resource_id = "/".join(az_resource_id.split("/")[:-2])

    subscription_id = compute["subscriptionId"]
    resource_group = compute["resourceGroupName"]
    region = compute["location"]

    # The VM name is only passed along when the instance reports a non-empty scale set name
    virtual_machine_name = ""
    if "vmScaleSetName" in compute and compute["vmScaleSetName"] != "":
        virtual_machine_name = compute["name"]

    #call the method to first parse the configs
    output, namespaces = parse_config(config_data, me_url, mdsd_url, is_lad,
                                      az_resource_id, subscription_id,
                                      resource_group, region,
                                      virtual_machine_name)

    _, configFolder = get_handler_vars()
    if is_lad:
        telegraf_bin = metrics_constants.lad_telegraf_bin
    else:
        telegraf_bin = metrics_constants.ama_telegraf_bin

    telegraf_conf_dir = configFolder + "/telegraf_configs/"
    telegraf_agent_conf = telegraf_conf_dir + "telegraf.conf"
    telegraf_d_conf_dir = telegraf_conf_dir + "telegraf.d/"

    #call the method to write the configs
    write_configs(output, telegraf_conf_dir, telegraf_d_conf_dir)

    # Setup Telegraf service.
    # If the VM has systemd, then we will copy over the systemd unit file and use that to start/stop
    if metrics_utils.is_systemd():
        telegraf_service_setup = setup_telegraf_service(
            telegraf_bin, telegraf_d_conf_dir, telegraf_agent_conf)
        if not telegraf_service_setup:
            return False, []

    return True, namespaces
Beispiel #5
0
def setup_me(is_lad):
    """
    The main method for creating and writing MetricsExtension configuration as well as service setup
    :param is_lad: Boolean value for whether the extension is Lad or not (AMA)
    :return: True once the configs are written, the binary is in place and (on systemd VMs) the service is set up
    :raises Exception: when the AAD authentication url cannot be determined, or the MetricsExtension
                       binary shipped with the extension cannot be found
    """

    # query imds to get the required information
    az_resource_id, subscription_id, location, _ = get_imds_values(is_lad)

    # get tenantID
    # The url request will fail due to missing authentication header, but we get the auth url from the header of the request fail exception
    # The armurl is only for Public Cloud. Needs verification in Sovereign clouds
    aad_auth_url = ""
    amrurl = "https://management.azure.com/subscriptions/" + subscription_id + "?api-version=2014-04-01"
    try:
        req = urllib.request.Request(
            amrurl, headers={'Content-Type': 'application/json'})

        # urlopen alias in future backport is broken on py2.6, fails on urls with HTTPS - https://github.com/PythonCharmers/python-future/issues/167
        # Using this hack of switching between py2 and 3 to avoid this
        if sys.version_info < (2, 7):
            from urllib2 import HTTPError, Request, urlopen
            urlopen(req)
        else:
            urllib.request.urlopen(req)

    except Exception as e:
        # Not every failure carries response headers (e.g. a connection error), and the header may be
        # absent; in those cases fall through to the explicit "Unable to find AAD Authentication URL" error
        headers = getattr(e, "headers", None)
        err_res = headers.get("WWW-Authenticate") if headers is not None else None
        if err_res:
            for line in err_res.split(","):
                if "Bearer authorization_uri" in line:
                    # Split only on the first '=' so a value containing '=' is kept whole,
                    # then remove the quotes from the front and back
                    aad_auth_url = line.split("=", 1)[1][1:-1]
                    break

    if aad_auth_url == "":
        raise Exception(
            "Unable to find AAD Authentication URL in the request error response. Failed to setup ME."
        )

    #create metrics conf
    me_conf = create_metrics_extension_conf(az_resource_id, aad_auth_url)

    #create custom metrics conf
    custom_conf = create_custom_metrics_conf(location)

    #write configs to disk
    _, configFolder = get_handler_vars()
    me_config_dir = configFolder + "/metrics_configs/"

    # Clear older config directory if exists.
    if os.path.exists(me_config_dir):
        rmtree(me_config_dir)
    os.mkdir(me_config_dir)

    me_conf_path = me_config_dir + "MetricsExtensionV1_Configuration.json"
    with open(me_conf_path, "w") as f:
        f.write(me_conf)

    if is_lad:
        me_monitoring_account = "CUSTOMMETRIC_" + subscription_id
    else:
        me_monitoring_account = "CUSTOMMETRIC_" + subscription_id + "_" + location

    custom_conf_path = me_config_dir + me_monitoring_account + "_MonitoringAccount_Configuration.json"
    with open(custom_conf_path, "w") as f:
        f.write(custom_conf)

    # Copy MetricsExtension Binary to the bin location
    me_bin_local_path = os.getcwd() + "/MetricsExtensionBin/MetricsExtension"
    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
        lad_bin_path = "/usr/local/lad/bin/"
        # Checking if directory exists before copying ME bin over to /usr/local/lad/bin/
        if not os.path.exists(lad_bin_path):
            os.makedirs(lad_bin_path)
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin

    if not os.path.isfile(me_bin_local_path):
        raise Exception(
            "Unable to copy MetricsExtension Binary, could not find file at the location {0} . Failed to setup ME."
            .format(me_bin_local_path))

    # Check if previous file exist at the location, compare the two binaries,
    # If the files are not same, remove the older file, and copy the new one
    # If they are the same, then we ignore it and don't copy
    needs_copy = True
    if os.path.isfile(metrics_ext_bin):
        if filecmp.cmp(me_bin_local_path, metrics_ext_bin):
            needs_copy = False
        else:
            # Removing the file in case it is already being run in a process,
            # in which case we can get an error "text file busy" while copying
            os.remove(metrics_ext_bin)

    if needs_copy:
        copyfile(me_bin_local_path, metrics_ext_bin)
        # Owner: read/write/execute; group and others: read/execute
        os.chmod(
            metrics_ext_bin,
            stat.S_IXGRP | stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR
            | stat.S_IXUSR | stat.S_IXOTH | stat.S_IROTH)

    if is_lad:
        me_influx_port = metrics_constants.lad_metrics_extension_udp_port
    else:
        me_influx_port = metrics_constants.ama_metrics_extension_udp_port

    # setup metrics extension service
    # If the VM has systemd, then we use that to start/stop
    if metrics_utils.is_systemd():
        setup_me_service(me_config_dir, me_monitoring_account, metrics_ext_bin,
                         me_influx_port)

    return True
Beispiel #6
0
def start_metrics(is_lad):
    """
    Start the metrics service if the VM is using systemd, otherwise start the binary as a process and store
    its pid in a file in the MetricsExtension config directory.
    This method is called after config setup is completed by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple (success, log_messages); log_messages carries the failure reason when success is False
    """

    # Re using the code to grab the config directories and imds values because start will be called from Enable process outside this script
    log_messages = ""

    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin
    if not os.path.isfile(metrics_ext_bin):
        log_messages += "Metrics Extension binary does not exist. Failed to start ME service."
        return False, log_messages

    if is_lad:
        me_influx_port = metrics_constants.lad_metrics_extension_udp_port
    else:
        me_influx_port = metrics_constants.ama_metrics_extension_udp_port

    # If the VM has systemd, then we use that to start/stop
    if metrics_utils.is_systemd():
        service_restart_status = os.system(
            "sudo systemctl restart metrics-extension")
        if service_restart_status != 0:
            log_messages += "Unable to start metrics-extension.service. Failed to start ME service."
            return False, log_messages

    #Else start ME as a process and save the pid to a file so that we can terminate it while disabling/uninstalling
    else:
        _, configFolder = get_handler_vars()
        me_config_dir = configFolder + "/metrics_configs/"
        #query imds to get the subscription id (and location, used in the AMA account name)
        _, subscription_id, location, _ = get_imds_values(is_lad)

        if is_lad:
            monitoringAccount = "CUSTOMMETRIC_" + subscription_id
        else:
            monitoringAccount = "CUSTOMMETRIC_" + subscription_id + "_" + location

        metrics_pid_path = me_config_dir + "metrics_pid.txt"

        # Build the argument vector directly instead of splitting a formatted string on spaces,
        # so that a path containing a space stays a single argument
        binary_exec_command = [
            metrics_ext_bin,
            "-TokenSource", "MSI",
            "-Input", "influxdb_udp",
            "-InfluxDbUdpPort", str(me_influx_port),
            "-DataDirectory", me_config_dir,
            "-LocalControlChannel",
            "-MonitoringAccount", monitoringAccount,
            "-LogLevel", "Error",
        ]
        # NOTE(review): stdout/stderr are pipes that are only read below when the process exits within
        # the wait window; confirm a long-running MetricsExtension cannot fill them.
        proc = subprocess.Popen(binary_exec_command,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        #sleeping for 3 seconds before checking if the process is still running, to give it ample time to relay crash info
        time.sleep(3)
        exit_code = proc.poll()

        # poll() returns None while the process is still running
        if exit_code is None:
            #write this pid to a file for future use
            with open(metrics_pid_path, "w+") as f:
                f.write(str(proc.pid))
        else:
            _, err = proc.communicate()
            log_messages += "Unable to run MetricsExtension binary as a process due to error - {0}. Failed to start MetricsExtension.".format(
                err)
            return False, log_messages
    return True, log_messages