def stop_telegraf_service(is_lad):
    """
    Stop the telegraf service and terminate any telegraf process recorded in the pid file.

    If the VM is using systemd, the metrics-sourcer unit is stopped through systemctl.
    Whether or not systemd is present, every pid saved in telegraf_pid.txt is then checked
    against /proc/<pid>/cmdline and killed if it belongs to the Telegraf binary, after
    which the pid file is removed.
    This method is called before remove_telegraf_service by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple of (success boolean, status message)
    """
    if is_lad:
        telegraf_bin = metrics_constants.lad_telegraf_bin
    else:
        telegraf_bin = metrics_constants.ama_telegraf_bin

    # If the VM has systemd, then we will use that to stop
    if metrics_utils.is_systemd():
        telegraf_service_path = get_telegraf_service_path()
        if not os.path.isfile(telegraf_service_path):
            return False, "Telegraf service file does not exist. Failed to stop telegraf service: metrics-sourcer.service."

        code = os.system("sudo systemctl stop metrics-sourcer")
        if code != 0:
            return False, "Unable to stop telegraf service: metrics-sourcer.service. Run systemctl status metrics-sourcer.service for more info."

    # Whether or not VM has systemd, let's check if we have any telegraf pids saved
    # and if so, terminate the associated process
    _, configFolder = get_handler_vars()
    telegraf_conf_dir = configFolder + "/telegraf_configs/"
    telegraf_pid_path = telegraf_conf_dir + "telegraf_pid.txt"

    if os.path.isfile(telegraf_pid_path):
        with open(telegraf_pid_path, "r") as f:
            saved_pids = [line.strip() for line in f.readlines()]

        for pid in saved_pids:
            # Skip blank or malformed entries. An empty pid would otherwise resolve to
            # /proc/cmdline (the kernel command line) instead of a process entry.
            if not pid.isdigit():
                continue

            # Verify the pid actually belongs to telegraf
            cmd_path = os.path.join("/proc", pid, "cmdline")
            if not os.path.exists(cmd_path):
                continue

            with open(cmd_path, "r") as cmd_f:
                cmdline = cmd_f.readlines()

            # cmdline is empty for kernel threads and zombie processes, so guard the
            # index before inspecting the first entry.
            if cmdline and cmdline[0].find(telegraf_bin) >= 0:
                os.kill(int(pid), signal.SIGKILL)

        os.remove(telegraf_pid_path)
    elif not metrics_utils.is_systemd():
        return False, "Could not find telegraf service nor process to stop."

    return True, "Successfully stopped metrics-sourcer service"
def start_telegraf(is_lad):
    """
    Start the telegraf service if VM is using is systemd, otherwise start the binary as a process
    and store the pid to a file in the telegraf config directory.
    This method is called after config setup is completed by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple of (success boolean, log messages string; empty on success)
    """
    # Re using the code to grab the config directories and imds values because start
    # will be called from Enable process outside this script
    log_messages = ""

    if is_lad:
        telegraf_bin = metrics_constants.lad_telegraf_bin
    else:
        telegraf_bin = metrics_constants.ama_telegraf_bin

    if not os.path.isfile(telegraf_bin):
        log_messages += "Telegraf binary does not exist. Failed to start telegraf service."
        return False, log_messages

    # If the VM has systemd, then we will use the systemd unit to start/stop
    if metrics_utils.is_systemd():
        service_restart_status = os.system("sudo systemctl restart metrics-sourcer")
        if service_restart_status != 0:
            log_messages += "Unable to start Telegraf service. Failed to start telegraf service."
            return False, log_messages

    # Else start telegraf as a process and save the pid to a file so that we can
    # terminate it while disabling/uninstalling
    else:
        _, configFolder = get_handler_vars()
        telegraf_conf_dir = configFolder + "/telegraf_configs/"
        telegraf_agent_conf = telegraf_conf_dir + "telegraf.conf"
        telegraf_d_conf_dir = telegraf_conf_dir + "telegraf.d/"
        telegraf_pid_path = telegraf_conf_dir + "telegraf_pid.txt"

        # Build the argument vector directly instead of formatting a string and splitting
        # it on spaces, so that paths containing spaces stay a single argument.
        binary_exec_command = [
            telegraf_bin,
            "--config", telegraf_agent_conf,
            "--config-directory", telegraf_d_conf_dir,
        ]

        proc = subprocess.Popen(binary_exec_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # Sleeping for 3 seconds before checking if the process is still running,
        # to give it ample time to relay crash info
        time.sleep(3)
        exit_code = proc.poll()

        if exit_code is None:
            # Process is running successfully; write its pid to a file for future use
            with open(telegraf_pid_path, "w+") as f:
                f.write(str(proc.pid))
        else:
            # Process already exited; collect stderr for the failure message
            _, err = proc.communicate()
            log_messages += "Unable to run telegraf binary as a process due to error - {0}. Failed to start telegraf.".format(err)
            return False, log_messages

    return True, log_messages
def stop_metrics_service(is_lad):
    """
    Stop the metrics service if VM is using is systemd, otherwise check if the pid_file exists,
    and if the pid belongs to the MetricsExtension process, if yes, then kill the process
    This method is called before remove_metrics_service by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple of (success boolean, status message)
    """
    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin

    # If the VM has systemd, then we will use that to stop
    if metrics_utils.is_systemd():
        metrics_service_path = get_metrics_extension_service_path()
        if not os.path.isfile(metrics_service_path):
            return False, "Metrics Extension service file does not exist. Failed to stop ME service: metrics-extension.service ."

        code = os.system("sudo systemctl stop metrics-extension")
        if code != 0:
            return False, "Unable to stop Metrics Extension service: metrics-extension.service. Failed with code {0}".format(code)
    else:
        # This VM does not have systemd, so we will use the pid from the last ran
        # metrics process and terminate it
        _, configFolder = get_handler_vars()
        metrics_conf_dir = configFolder + "/metrics_configs/"
        metrics_pid_path = metrics_conf_dir + "metrics_pid.txt"

        if not os.path.isfile(metrics_pid_path):
            return False, "File containing the pid for the running Metrics Extension process at {0} does not exit. Failed to stop Metrics Extension".format(metrics_pid_path)

        with open(metrics_pid_path, "r") as f:
            pid = f.read().strip()

        # Treat an empty, whitespace-only or non-numeric file as "no pid". A non-numeric
        # value must never reach ps (it could be parsed as an option) nor int().
        if not pid.isdigit():
            return False, "No pid found for a currently running Metrics Extension process in {0}. Failed to stop Metrics Extension.".format(metrics_pid_path)

        # Check if the process running is indeed MetricsExtension, ignore if the process
        # output doesn't contain MetricsExtension. Run ps with an argument list (no shell)
        # so the pid file content is never interpreted by a shell.
        proc = subprocess.Popen(["ps", "-o", "cmd=", pid], stdout=subprocess.PIPE)
        output = proc.communicate()[0]

        if metrics_ext_bin in output.decode('utf-8', 'ignore'):
            os.kill(int(pid), signal.SIGKILL)
        else:
            return False, "Found a different process running with PID {0}. Failed to stop MetricsExtension.".format(pid)

    return True, "Successfully stopped metrics-extension service"
def handle_config(config_data, me_url, mdsd_url, is_lad):
    """
    The main method to perform the task of parsing the config, writing it to disk and
    setting up the telegraf service
    :param config_data: Parsed Metrics Configuration from which telegraf config is created
    :param me_url: The url to which telegraf will send metrics to for MetricsExtension
    :param mdsd_url: The url to which telegraf will send metrics to for MDSD
    :param is_lad: Boolean value for whether the extension is Lad or not (AMA)
    :return: tuple of (True, namespaces) on success, (False, []) if the systemd service setup fails
    """
    # Making the imds call to get resource id, sub id, resource group and region
    # for the dimensions for telegraf metrics
    max_retries = 3
    sleep_time = 5

    # Only AMA on an Arc-enabled machine queries the Arc endpoint; everything else uses IMDS
    if not is_lad and metrics_utils.is_arc_installed():
        imdsurl = metrics_utils.get_arc_endpoint()
        imdsurl += "/metadata/instance?api-version=2019-11-01"
    else:
        imdsurl = "http://169.254.169.254/metadata/instance?api-version=2019-03-11"

    data = None
    retries = 1
    while retries <= max_retries:
        req = urllib.request.Request(imdsurl, headers={'Metadata': 'true'})
        res = urllib.request.urlopen(req)
        data = json.loads(res.read().decode('utf-8', 'ignore'))

        if "compute" in data:
            break

        # Response did not carry the compute section; wait and query again
        retries += 1
        time.sleep(sleep_time)

    if retries > max_retries:
        raise Exception(
            "Unable to find 'compute' key in imds query response. Reached max retry limit of - {0} times. Failed to setup Telegraf."
            .format(max_retries))

    compute = data["compute"]

    # Keys that must be present in the compute section, checked in this order
    required_keys = (
        ("resourceId", "Unable to find 'resourceId' key in imds query response. Failed to setup Telegraf."),
        ("subscriptionId", "Unable to find 'subscriptionId' key in imds query response. Failed to setup Telegraf."),
        ("resourceGroupName", "Unable to find 'resourceGroupName' key in imds query response. Failed to setup Telegraf."),
        ("location", "Unable to find 'location' key in imds query response. Failed to setup Telegraf."),
    )
    for key, missing_key_message in required_keys:
        if key not in compute:
            raise Exception(missing_key_message)

    az_resource_id = compute["resourceId"]
    subscription_id = compute["subscriptionId"]
    resource_group = compute["resourceGroupName"]
    region = compute["location"]

    # If the instance is VMSS then trim the last two values from the resource id ie - "/virtualMachines/0"
    # Since ME expects the resource id in a particular format. For egs -
    # IMDS returned ID - /subscriptions/<sub-id>/resourceGroups/<rg_name>/providers/Microsoft.Compute/virtualMachineScaleSets/<VMSSName>/virtualMachines/0
    # ME expected ID- /subscriptions/<sub-id>/resourceGroups/<rg_name>/providers/Microsoft.Compute/virtualMachineScaleSets/<VMSSName>
    if "virtualMachineScaleSets" in az_resource_id:
        az_resource_id = "/".join(az_resource_id.split("/")[:-2])

    # The instance name is only passed along when a non-empty scale set name is present
    virtual_machine_name = ""
    if "vmScaleSetName" in compute and compute["vmScaleSetName"] != "":
        virtual_machine_name = compute["name"]

    # call the method to first parse the configs
    output, namespaces = parse_config(config_data, me_url, mdsd_url, is_lad,
                                      az_resource_id, subscription_id,
                                      resource_group, region,
                                      virtual_machine_name)

    _, configFolder = get_handler_vars()
    telegraf_bin = (metrics_constants.lad_telegraf_bin
                    if is_lad else metrics_constants.ama_telegraf_bin)

    telegraf_conf_dir = configFolder + "/telegraf_configs/"
    telegraf_agent_conf = telegraf_conf_dir + "telegraf.conf"
    telegraf_d_conf_dir = telegraf_conf_dir + "telegraf.d/"

    # call the method to write the configs
    write_configs(output, telegraf_conf_dir, telegraf_d_conf_dir)

    # Setup Telegraf service.
    # If the VM has systemd, then we will copy over the systemd unit file and use that to start/stop
    if metrics_utils.is_systemd():
        if not setup_telegraf_service(telegraf_bin, telegraf_d_conf_dir, telegraf_agent_conf):
            return False, []

    return True, namespaces
def setup_me(is_lad):
    """
    The main method for creating and writing MetricsExtension configuration as well as service setup
    :param is_lad: Boolean value for whether the extension is Lad or not (AMA)
    :return: True on success
    :raises Exception: if the AAD authentication url cannot be determined, or if the
        MetricsExtension binary shipped with the extension cannot be found
    """
    # query imds to get the required information
    az_resource_id, subscription_id, location, data = get_imds_values(is_lad)

    # get tenantID
    # The url request will fail due to missing authentication header, but we get the auth url
    # from the header of the request fail exception
    # The armurl is only for Public Cloud. Needs verification in Sovereign clouds
    aad_auth_url = ""
    amrurl = "https://management.azure.com/subscriptions/" + subscription_id + "?api-version=2014-04-01"
    try:
        req = urllib.request.Request(amrurl, headers={'Content-Type': 'application/json'})

        # urlopen alias in future backport is broken on py2.6, fails on urls with HTTPS -
        # https://github.com/PythonCharmers/python-future/issues/167
        # Using this hack of switching between py2 and 3 to avoid this
        if sys.version_info < (2, 7):
            from urllib2 import urlopen
            urlopen(req)
        else:
            urllib.request.urlopen(req)
    except Exception as e:
        # Only HTTP error responses carry headers. Other failures (e.g. no network) have
        # none, so read them defensively and let the empty-url check below report the
        # problem instead of failing here with an unrelated AttributeError.
        headers = getattr(e, "headers", None)
        auth_header = headers.get("WWW-Authenticate") if headers is not None else None
        if auth_header:
            for part in auth_header.split(","):
                if "Bearer authorization_uri" in part:
                    # Split on the first "=" only so a url containing "=" stays intact,
                    # then remove the quotes from the front and back
                    aad_auth_url = part.split("=", 1)[1][1:-1]
                    break

    if aad_auth_url == "":
        raise Exception("Unable to find AAD Authentication URL in the request error response. Failed to setup ME.")

    # create metrics conf
    me_conf = create_metrics_extension_conf(az_resource_id, aad_auth_url)

    # create custom metrics conf
    custom_conf = create_custom_metrics_conf(location)

    # write configs to disk
    _, configFolder = get_handler_vars()
    me_config_dir = configFolder + "/metrics_configs/"

    # Clear older config directory if exists.
    if os.path.exists(me_config_dir):
        rmtree(me_config_dir)
    os.mkdir(me_config_dir)

    me_conf_path = me_config_dir + "MetricsExtensionV1_Configuration.json"
    with open(me_conf_path, "w") as f:
        f.write(me_conf)

    if is_lad:
        me_monitoring_account = "CUSTOMMETRIC_" + subscription_id
    else:
        me_monitoring_account = "CUSTOMMETRIC_" + subscription_id + "_" + location

    custom_conf_path = me_config_dir + me_monitoring_account + "_MonitoringAccount_Configuration.json"
    with open(custom_conf_path, "w") as f:
        f.write(custom_conf)

    # Copy MetricsExtension Binary to the bin location
    me_bin_local_path = os.getcwd() + "/MetricsExtensionBin/MetricsExtension"
    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin

    if is_lad:
        lad_bin_path = "/usr/local/lad/bin/"
        # Checking if directory exists before copying ME bin over to /usr/local/lad/bin/
        if not os.path.exists(lad_bin_path):
            os.makedirs(lad_bin_path)

    if not os.path.isfile(me_bin_local_path):
        raise Exception("Unable to copy MetricsExtension Binary, could not find file at the location {0} . Failed to setup ME.".format(me_bin_local_path))

    # rwx for the owner, r-x for group and others
    executable_mode = (stat.S_IXGRP | stat.S_IRGRP | stat.S_IRUSR | stat.S_IWUSR |
                       stat.S_IXUSR | stat.S_IXOTH | stat.S_IROTH)

    # Check if previous file exist at the location, compare the two binaries,
    # If the files are not same, remove the older file, and copy the new one
    # If they are the same, then we ignore it and don't copy
    if os.path.isfile(metrics_ext_bin):
        if not filecmp.cmp(me_bin_local_path, metrics_ext_bin):
            # Removing the file in case it is already being run in a process,
            # in which case we can get an error "text file busy" while copying
            os.remove(metrics_ext_bin)
            copyfile(me_bin_local_path, metrics_ext_bin)
            os.chmod(metrics_ext_bin, executable_mode)
    else:
        # No previous binary exist, simply copy it and make it executable
        copyfile(me_bin_local_path, metrics_ext_bin)
        os.chmod(metrics_ext_bin, executable_mode)

    if is_lad:
        me_influx_port = metrics_constants.lad_metrics_extension_udp_port
    else:
        me_influx_port = metrics_constants.ama_metrics_extension_udp_port

    # setup metrics extension service
    # If the VM has systemd, then we use that to start/stop
    if metrics_utils.is_systemd():
        setup_me_service(me_config_dir, me_monitoring_account, metrics_ext_bin, me_influx_port)

    return True
def start_metrics(is_lad):
    """
    Start the metrics service if VM is using is systemd, otherwise start the binary as a process
    and store the pid to a file in the MetricsExtension config directory.
    This method is called after config setup is completed by the main extension code
    :param is_lad: boolean whether the extension is LAD or not (AMA)
    :return: tuple of (success boolean, log messages string; empty on success)
    """
    # Re using the code to grab the config directories and imds values because start
    # will be called from Enable process outside this script
    log_messages = ""

    if is_lad:
        metrics_ext_bin = metrics_constants.lad_metrics_extension_bin
    else:
        metrics_ext_bin = metrics_constants.ama_metrics_extension_bin

    if not os.path.isfile(metrics_ext_bin):
        log_messages += "Metrics Extension binary does not exist. Failed to start ME service."
        return False, log_messages

    if is_lad:
        me_influx_port = metrics_constants.lad_metrics_extension_udp_port
    else:
        me_influx_port = metrics_constants.ama_metrics_extension_udp_port

    # If the VM has systemd, then we use that to start/stop
    if metrics_utils.is_systemd():
        service_restart_status = os.system("sudo systemctl restart metrics-extension")
        if service_restart_status != 0:
            log_messages += "Unable to start metrics-extension.service. Failed to start ME service."
            return False, log_messages

    # Else start ME as a process and save the pid to a file so that we can
    # terminate it while disabling/uninstalling
    else:
        _, configFolder = get_handler_vars()
        me_config_dir = configFolder + "/metrics_configs/"

        # query imds to get the subscription id
        _, subscription_id, location, _ = get_imds_values(is_lad)

        if is_lad:
            monitoringAccount = "CUSTOMMETRIC_" + subscription_id
        else:
            monitoringAccount = "CUSTOMMETRIC_" + subscription_id + "_" + location

        metrics_pid_path = me_config_dir + "metrics_pid.txt"

        # Build the argument vector directly instead of formatting a string and splitting
        # it on spaces, so that paths containing spaces stay a single argument.
        binary_exec_command = [
            metrics_ext_bin,
            "-TokenSource", "MSI",
            "-Input", "influxdb_udp",
            "-InfluxDbUdpPort", str(me_influx_port),
            "-DataDirectory", me_config_dir,
            "-LocalControlChannel",
            "-MonitoringAccount", monitoringAccount,
            "-LogLevel", "Error",
        ]

        proc = subprocess.Popen(binary_exec_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        # sleeping for 3 seconds before checking if the process is still running,
        # to give it ample time to relay crash info
        time.sleep(3)
        exit_code = proc.poll()

        if exit_code is None:
            # Process is running successfully; write its pid to a file for future use
            with open(metrics_pid_path, "w+") as f:
                f.write(str(proc.pid))
        else:
            # Process already exited; collect stderr for the failure message
            _, err = proc.communicate()
            log_messages += "Unable to run MetricsExtension binary as a process due to error - {0}. Failed to start MetricsExtension.".format(err)
            return False, log_messages

    return True, log_messages