def run_umc(self, umcdef, GlobalContext):
    """Launch one umc instance as a child process and return its handle.

    Consecutive launches are kept at least ``UmcRunTask.MIN_RUN_DELAY``
    seconds apart, the instance log directory is created when it is missing
    and the command built from ``UmcRunTask.UMC_LAUNCH_CMD`` is executed
    through ``UmcRunTask.DEFAULT_SHELL``.

    Args:
        umcdef: definition of the umc instance; the attributes read here are
            umc_instanceid, umc_toolid, delay, count, params,
            rotation_timelimit, log_dir, log_file_groups and options.
        GlobalContext: runner context; ``homeDir`` is passed to the launch
            command as ``umc_home``.

    Returns:
        The ``psutil.Popen`` object of the started shell.

    Raises:
        OSError: when the log directory is missing and cannot be created.
    """
    # check minimum umc runs delay
    since_last_run = time.time() - self.last_run_time
    if since_last_run < UmcRunTask.MIN_RUN_DELAY:
        wait_for = UmcRunTask.MIN_RUN_DELAY - since_last_run
        Msg.info2_msg("Sleeping %.2f seoncds before running the next umc instance..." % wait_for)
        time.sleep(wait_for)

    # create log directory for this tool if it does not exist
    log_dir = get_umc_instance_log_dir(umcdef.umc_instanceid, GlobalContext)
    if not os.path.exists(log_dir):
        try:
            os.makedirs(log_dir)
        except OSError:
            # somebody else may have created the directory between the check
            # and the call; only a directory that is still missing is an error
            if not os.path.isdir(log_dir):
                raise

    # tell what we are doing
    Msg.info1_msg(
        "Starting umc instance id '{umc_instanceid}': umc='{umc_toolid}', delay={delay}, "
        "count={count}, params='{params}', rotation_timelimit={rotation_timelimit}, "
        "log_dir='{log_dir}', log_file_groups={log_file_groups}".format(
            umc_instanceid=umcdef.umc_instanceid,
            umc_toolid=umcdef.umc_toolid,
            delay=umcdef.delay,
            count=umcdef.count,
            params=umcdef.params,
            rotation_timelimit=umcdef.rotation_timelimit,
            log_dir=umcdef.log_dir,
            log_file_groups=umcdef.log_file_groups))

    # it is important to set setsid as there might be child processes that use tty,
    # this should provide a dedicated tty for them; example of such process is sqlcl
    preexec = None
    if "setsid" in umcdef.options:
        preexec = ctypes.CDLL('libc.so.6').setsid

    launch_cmd = UmcRunTask.UMC_LAUNCH_CMD.format(
        umc_instanceid=umcdef.umc_instanceid,
        umc_toolid=umcdef.umc_toolid,
        delay=umcdef.delay,
        count=umcdef.count,
        params=umcdef.params,
        rotation_timelimit=umcdef.rotation_timelimit,
        umc_home=GlobalContext.homeDir,
        log_dir=log_dir,
        log_file_groups=umcdef.log_file_groups)
    p = psutil.Popen(launch_cmd, shell=True, executable=UmcRunTask.DEFAULT_SHELL,
                     preexec_fn=preexec, stdin=None, stdout=None, stderr=None)

    # remember the launch time for the throttling of the next launch
    self.last_run_time = time.time()
    return p
def run_task(self, GlobalContext, tdef):
    """Check for orphans: live processes of our process group that are not our descendants.

    The pids of the current process group are taken from
    ``self.get_all_pgids()``; a pid is an orphan when it still accepts
    signal 0 but is not found among the (recursive) children of this process.

    Args:
        GlobalContext: not used by this task.
        tdef: not used by this task.

    Returns:
        bool: False when at least one orphan was found (a warning announcing
        the pause is logged), True otherwise.
    """
    group_pids = self.get_all_pgids()[str(os.getpgrp())]
    # a set gives constant-time membership tests instead of a scan per pid
    child_pids = set(p.pid for p in psutil.Process().children(recursive=True))

    orphans = []
    for pid in group_pids:
        # compare with the same integer value that is used for the signal, so
        # that pids delivered as strings still match psutil's integer pids
        pid_no = int(pid)
        try:
            os.kill(pid_no, 0)
        except OSError:
            # we are not so fast, the process ended in the meantime
            continue
        # the process is live; check it exists in the process tree
        if pid_no not in child_pids:
            orphans.append(pid)

    # pause if there are orphans
    if orphans:
        Msg.warn_msg("There are %d orphan processes, will pause umcrunner until orhpans exist!" % (len(orphans)))
        Msg.info2_msg("The orhpans are: %s" % orphans)
        return False
    return True
def run_task(self, GlobalContext, tdef):
    """Refresh every umc instance and, in verbose mode, log the open file handles.

    Args:
        GlobalContext: provides ``umcdefs``, the instances to refresh.
        tdef: not used by this task.

    Returns:
        bool: always True.
    """
    for instance_def in GlobalContext.umcdefs:
        self.refresh_single_instance(instance_def, GlobalContext)

    if not Msg.verbose:
        return True

    # report number of open handles per type
    # NOTE(review): the result of this call is never used; the call is kept
    # because its side effects (if any) are not visible from here
    utils.fd_table_status()
    Msg.info2_msg('Open file handles: %s' % utils.fd_table_status_str())
    return True
def run_task(self, GlobalContext, tdef):
    """Check the number of descendant processes against the configured maximum.

    Args:
        GlobalContext: provides ``params.max_processes``.
        tdef: not used by this task.

    Returns:
        bool: False when there are more descendants than ``max_processes``
        (a warning announcing the pause is logged), True otherwise.
    """
    child_count = len(psutil.Process().children(True))
    Msg.info2_msg("There are %d children processes." % (child_count))

    limit = GlobalContext.params.max_processes
    if child_count > limit:
        Msg.warn_msg("The current number of child processes %d exceeds the maximum of %d; umcrunner will be paused."
                     % (child_count, limit))
        return False
    return True
def eval_transform(transform_exprs, timestamp, tags, fields):
    """Apply transformation expressions to the tags and fields of a datapoint.

    Every non-None tag and field is turned into a variable by means of
    ``exec``, each expression in ``transform_exprs`` is executed and the
    resulting variables are collected again: names that are keys of ``tags``
    become tags, names that are keys of ``fields`` become fields and any
    other name becomes a field when its value is an int or float and a tag
    otherwise.

    Returns a ``(tags, fields)`` pair of new dicts, or the original ``tags``
    and ``fields`` when anything outside the per-expression handling fails.

    NOTE(review): ``umc_id`` is neither a parameter nor assigned in this
    function; it has to come from an enclosing or global scope, otherwise the
    outer ``except`` handler itself raises NameError - confirm.
    NOTE(review): the code relies on ``exec`` creating and reading function
    locals (``value`` below); this looks like Python 2 semantics - verify
    before running under Python 3.
    NOTE(review): tag and field values are spliced into executed code; only
    trusted input should reach this function.
    """
    try:
        # declare variables and assign values to them
        for k, v in tags.items():
            if v is not None:
                # the tag value is embedded as a double-quoted string literal
                exec(k + "=\"" + v + "\"")
        for k, v in fields.items():
            if v is not None:
                # the field value is embedded as a Python expression via str(v)
                exec(k + "=" + str(v))

        # transform
        for expr in transform_exprs:
            try:
                exec(expr)
            except Exception as ex:
                # a failing expression is only reported, the next ones still run
                pass
                Msg.info2_msg("Error when evaluating transformation '%s': %s" % (expr, str(ex)))

        # get only variables that come from tags and fields, remove all local ones
        # the list in the below expression must contain all local variables in this function prior to this call!
        nf = { k: v for k, v in locals().items() if k not in [ "k", "v", "umc_id", "transform_exprs", "timestamp", "tags", "fields", "expr", "ex" ] }

        __t2 = {}
        __f2 = {}
        for k, v in nf.items():
            if k in tags.keys():
                exec("__t2['%s']=%s" % (k, k))
            elif k in fields.keys():
                exec("__f2['%s']=%s" % (k, k))
            else:
                # new tag or field that resulted from transformation:
                # numbers are stored as fields, everything else as tags
                exec("value=%s" % (k))
                if isinstance(value, int) or isinstance(value, float):
                    exec("__f2['%s']=%s" % (k, k))
                else:
                    exec("__t2['%s']=%s" % (k, k))
        # // for

        return __t2, __f2
    except Exception as e:
        # fall back to the untouched input when the evaluation fails
        Msg.err_msg("Error when evaluating transformations for %s: %s" % (umc_id, str(e)))
        return tags, fields
def purge_cache(self):
    """Remove cache items that nobody holds a lock on and that are incomplete or expired.

    An item is removed when its lock has no owner and either its
    ``created_time`` or ``age`` is None or more than ``age`` seconds have
    passed since ``created_time``. Every removal is logged.
    """
    def is_purgeable(item):
        # an item whose lock is owned must stay in the cache
        if item.lock._RLock__owner:
            return False
        if item.created_time is None or item.age is None:
            return True
        return time.time() - item.created_time > item.age

    # collect the keys first, the dictionary must not change while it is iterated
    expired_urls = [url for url in self.data if is_purgeable(self.data[url])]

    # purge
    for url in expired_urls:
        del self.data[url]
        Msg.info2_msg("The cache item %s has been purged from the cache." % url)
def run_all(self):
    """Run every task that is due and update the global paused state.

    A task runs when its interval has elapsed, it is allowed to run in the
    current paused state (``run_on_global_pause``), its own pause
    (``run_after``) is over and it is not disabled. A falsy task result
    pauses the remaining tasks of this cycle. A task that exceeds its hard
    time limit is disabled, one that exceeds its soft time limit is paused
    for ``pause_for`` seconds.

    Finally ``GlobalContext.paused`` is set to True unless all task results
    that are not None are truthy, and a change of the state is logged.
    """
    paused = self.GlobalContext.paused
    for task in self.tasks:
        interval_elapsed = time.time() - task.last_run_time > task.time_interval
        if not (interval_elapsed and (task.run_on_global_pause or not paused)):
            continue
        # locally paused
        if not (task.run_after == 0 or time.time() > task.run_after):
            continue
        if task.disabled:
            continue

        # inform that the task is resumed if it was paused
        if task.run_after > 0:
            task.run_after = 0
            Msg.info1_msg("The task %s is resumed." % (task.name))

        # run the task
        started_at = time.time()
        task.result = task.target.run_task(self.GlobalContext, task)
        finished_at = time.time()
        if not task.result:
            paused = True
        task.last_run_time = finished_at
        task.last_run_duration = finished_at - started_at

        over_hard_limit = task.time_limit_disable > 0 and task.last_run_duration > task.time_limit_disable
        if over_hard_limit:
            # check to be disabled due to hard limit
            task.disabled = True
            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the hard maximum of %.2f seconds. The task will be disabled."
                         % (task.name, task.last_run_duration, task.time_limit_disable))
        elif task.time_limit_pause > 0 and task.last_run_duration > task.time_limit_pause:
            # check to be paused due to soft limit
            task.run_after = finished_at + task.pause_for
            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the soft maximum of %.2f seconds. The task will be paused for %.2f seconds."
                         % (task.name, task.last_run_duration, task.time_limit_pause, task.pause_for))
        else:
            # report on task duration
            Msg.info2_msg("The task %s was running for %.2f seconds." % (task.name, task.last_run_duration))

    # tasks that have not produced a result yet do not influence the state
    previous_state = self.GlobalContext.paused
    known_results = [t.result for t in self.tasks if t.result is not None]
    self.GlobalContext.paused = not all(known_results)
    if self.GlobalContext.paused != previous_state:
        new_state = "PAUSED" if self.GlobalContext.paused else "RESUMED"
        Msg.warn_msg("umcrunner state has been %s." % (new_state))
def read_umcdefs(self, reader, writer):
    """Build the umc definitions for all configured umc instances.

    For every instance returned by ``self.get_umc_instances()`` a definition
    holding the instance, its ``enabled`` flag and its reader and writer
    definitions is created. A definition is disabled when the configuration
    says so, when the reader or writer definition is missing or when the
    writer itself is disabled.

    Args:
        reader: object whose ``read_umcdef(umc_id, instance)`` provides the
            reader definition.
        writer: object whose ``read_umcdef(umc_id, instance)`` provides the
            writer definition.

    Returns:
        dict: umc id -> definition; when an id occurs more than once an error
        is logged and only its first definition is kept.
    """
    umcdefs = {}
    for instance in self.get_umc_instances():
        umc_id = instance["umc-id"]
        umcdef = Map(umcid=umc_id, enabled=False, writer=None, reader=None, instance=instance)
        umcdef.enabled = self.value_element(instance, "enabled", False)
        umcdef.writer = writer.read_umcdef(umc_id, instance)
        umcdef.reader = reader.read_umcdef(umc_id, instance)
        Msg.info1_msg("Definition retrieved for umc %s" % (umc_id))

        if not umcdef.enabled:
            Msg.info1_msg(
                "umc id %s is disabled by configuration, no datapoints will be read." % (umc_id))
        elif umcdef.writer is None or umcdef.reader is None:
            Msg.info2_msg(
                "umc id %s does not have reader or writer definitions and it will be disabled." % (umc_id))
            umcdef.enabled = False

        # disable if the writer is not enabled; a missing writer has already
        # been handled above and must not be dereferenced here
        if umcdef.writer is not None and not umcdef.writer.enabled:
            Msg.info2_msg(
                "umc id %s is disabled as its writer is disabled. No data will be read for this umc id."
                % (umc_id))
            umcdef.enabled = False

        if umc_id in umcdefs:
            Msg.err_msg(
                "There is a duplicate umc instance with id '%s' in the configuration file!" % (umc_id))
        else:
            umcdefs[umc_id] = umcdef

    return umcdefs
def run_task(self, GlobalContext, tdef):
    """Check the number of zombie descendants against the number of umc instances.

    Args:
        GlobalContext: provides ``umcdefs``; its length is the allowed number
            of zombie processes.
        tdef: not used by this task.

    Returns:
        bool: False when there are more zombies than umc instances (a warning
        announcing the pause is logged), True otherwise.
    """
    zombie_count = 0
    for child in psutil.Process().children(True):
        try:
            is_zombie = child.status() == psutil.STATUS_ZOMBIE
        except Exception:
            # the status could not be read; such a process is not counted
            continue
        if is_zombie:
            zombie_count += 1

    Msg.info2_msg("There are %d zombie processes" % (zombie_count))

    allowed = len(GlobalContext.umcdefs)
    if zombie_count > allowed:
        Msg.warn_msg("There are %d zombie processes which exceeds the number of umc instances %d. Will pause umc runner until the zombie processes will disappear!"
                     % (zombie_count, allowed))
        return False
    return True
def __init__(self, config, writerDef):
    """Initialise the OMC writer and read its connection parameters.

    Args:
        config: passed to the parent class initialiser.
        writerDef: passed to the parent class initialiser.

    Raises:
        Exception: when the ``connect.data-url`` parameter is not defined.
    """
    super(OMCWriter, self).__init__(config, writerDef)

    # read params
    params = dict(
        base_url=self.param("connect.base-url"),
        data_url=self.param("connect.data-url"),
        proxies=self.param("connect.proxies"),
        user=self.param("connect.user"),
        upass=self.param("connect.pass", ""),
        connect_timeout=self.param("connect.connect-timeout", 5),
        read_timeout=self.param("connect.read-timeout", 10),
        omc_inprogress_timeout=self.param("connect.omc-inprogress-timeout", 120))
    self.omc_params = Map(**params)

    # print params; the password must never end up in the log
    printable = dict(params)
    if printable["upass"]:
        printable["upass"] = "***"
    Msg.info2_msg("OMC Writer parameters: %s" % (printable,))

    # check the data url was defined
    if self.omc_params.data_url is None:
        raise Exception("Invalid connection details (data_url is missing).")
def __send_request(self):
    """Send the proxy request and store the outcome in ``self.response``.

    Only the methods "get" and "post" are supported. The request carries a
    ``Via`` header with the local host name and uses the proxy connect and
    read timeouts from ``GlobalContext.params``. Any failure, an unsupported
    method included, is logged as a warning and not raised.
    """
    try:
        Msg.info2_msg("Sending proxy request %s %s" % (self.method.upper(), self.url))
        via_header = {"Via": "1.1 %s" % socket.gethostname()}

        if self.method == "get":
            sender = requests.get
        elif self.method == "post":
            sender = requests.post
        else:
            raise Exception("Method %s is not supported!" % self.method)

        timeouts = (GlobalContext.params.proxy_timeout_connect,
                    GlobalContext.params.proxy_timeout_read)
        self.response = sender(self.url, timeout=timeouts, headers=via_header)
    except Exception as e:
        Msg.warn_msg("Proxy request to %s failed: %s" % (self.url, str(e)))
def write(self, datapoints, exit_event=None):
    """Upload a batch of datapoints to OMC and wait until OMC has processed it.

    The batch is sent with a POST request to ``omc_params.data_url``; the
    status URI from the response is then polled once per second until the
    status is no longer "IN_PROGRESS".

    Args:
        datapoints: the batch to upload.
        exit_event: optional object with ``is_set()`` and ``wait(timeout)``;
            once it is set the polling stops and the method returns without
            reporting a result. Without it the upload is polled until it
            finishes or times out.

    Raises:
        Exception: when the upload or a status request returns a status code
            of 300 or more, when OMC reports the status "FAILED" or when the
            upload is still in progress after
            ``omc_params.omc_inprogress_timeout`` seconds.
    """
    Msg.info2_msg("Uploading %d records to OMC..." % len(datapoints))
    Msg.info2_msg("The batch contains entity types %s" % self.get_all_entity_types(datapoints))

    response = self.run_request('POST', self.omc_params.data_url, datapoints, 'application/octet-stream')
    if response.status_code >= 300:
        raise Exception("OMC data upload request failed with status code %d. \nResponse payload: %s"
                        % (response.status_code, response.text))

    resp = json.loads(response.text)
    status_uri = resp["statusUri"]
    Msg.info2_msg("Upload request sent, waiting for the result at %s up to %s seconds..."
                  % (status_uri, self.omc_params.omc_inprogress_timeout))
    start_t = time.time()

    def exit_requested():
        # a missing exit event means nobody can ask us to stop
        return exit_event is not None and exit_event.is_set()

    # poll also when there is no exit event, otherwise an upload that is still
    # in progress would be reported as processed
    while resp["status"] == "IN_PROGRESS" and not exit_requested():
        response = self.run_request('GET', status_uri)
        if response.status_code >= 300:
            raise Exception("OMC status request failed with status code %d" % response.status_code)
        resp = json.loads(response.text)
        if resp["status"] == "IN_PROGRESS":
            # wait only certain number of seconds
            if time.time() - start_t > self.omc_params.omc_inprogress_timeout:
                Msg.err_msg("Upload failed, the datapoints in the batch will be discarded, they contain the following entity types: %s"
                            % self.get_all_entity_types(datapoints))
                raise Exception("OMC upload failed due to a timeout of %d seconds while waiting for OMC to confirm the data was uploaded successfully! The status response payload is %s"
                                % (self.omc_params.omc_inprogress_timeout, resp))
            # wait; the event wait returns early when the exit is requested
            if exit_event is not None:
                exit_event.wait(1)
            else:
                time.sleep(1)

    if resp["status"] == "FAILED":
        raise Exception("OMC upload reuqest failed. %s. Response payload: %s"
                        % (resp.get("errorMessage"), resp))
    elif not exit_requested():
        # a missing errorMessage must not turn a processed upload into an error
        Msg.info2_msg("OMC upload reuqest processed in %d seconds. %s: %s. Response payload: %s"
                      % (time.time() - start_t, resp["status"], resp.get("errorMessage"), resp))
def run_task(self, GlobalContext, tdef):
    """Start the enabled umc instances that are due and log an overview.

    Every enabled instance is handled under its own lock: an instance with a
    process is reported as running, an instance whose ``start_after`` time has
    not passed yet is reported as waiting, an instance that was started less
    than ``params.min_starting_time`` seconds ago is not started again and
    any other instance is started with ``self.run_umc``. A failed start is
    logged and does not stop the processing of the other instances.

    Args:
        GlobalContext: provides ``umcdefs`` and ``params.min_starting_time``.
        tdef: not used by this task.

    NOTE(review): nothing is returned (implicit None), unlike the sibling
    tasks that return True or False - confirm that callers expect this.
    """
    running = []
    started = []
    waiting = []
    for umcdef in GlobalContext.umcdefs:
        if not umcdef.enabled:
            continue
        umcdef.lock.acquire()
        try:
            if umcdef.proc is not None:
                running.append("%s, PID=%d" % (umcdef.umc_instanceid, umcdef.proc.pid))
            elif not (time.time() > umcdef.start_after):
                waiting.append("%s, WT=%.2fs" % (umcdef.umc_instanceid, umcdef.start_after - time.time()))
            elif (umcdef.last_started_time is not None
                  and time.time() - umcdef.last_started_time < GlobalContext.params.min_starting_time):
                Msg.warn_msg("umc instance id '%s' starting frequency is too high (<%d seconds), will not start it now!"
                             % (umcdef.umc_instanceid, GlobalContext.params.min_starting_time))
                waiting.append("%s, WT=%.2fs" % (umcdef.umc_instanceid, GlobalContext.params.min_starting_time))
            else:
                try:
                    # run umc instance as a child process
                    umcdef.proc = self.run_umc(umcdef, GlobalContext)

                    # one timestamp for all start time attributes of this run
                    start_t = time.time()
                    umcdef.start_after = 0
                    umcdef.last_started_time = start_t
                    umcdef.num_runs = umcdef.num_runs + 1
                    if umcdef.first_started_time == 0:
                        umcdef.first_started_time = start_t
                    started.append("%s, PID=%d" % (umcdef.umc_instanceid, umcdef.proc.pid))
                except Exception as e:
                    Msg.warn_msg("Error occurred while starting umc instance %s. The exception was: %s"
                                 % (umcdef.umc_instanceid, str(e)))
        finally:
            umcdef.lock.release()

    Msg.info2_msg("Running: %s" % (running))
    Msg.info2_msg("Started: %s" % (started))
    Msg.info2_msg("Waiting: %s" % (waiting))
def log_request(self, size):
    """Log the client address, the request line and ``size`` as one info2 message.

    NOTE(review): this looks like an override of the HTTP request handler's
    ``log_request``, whose first positional argument is the response code -
    confirm what ``size`` really carries.
    """
    details = (self.address_string(), self.requestline, str(size))
    Msg.info2_msg('HTTP request from (%s) %s %s' % details)
def process_cluster_request(self, method, path_def, allow_all, cache_maxage, is_stream, get_content):
    """Serve a request for a resource that is addressed by a hostname in its path.

    Depending on the servers the hostname resolves to
    (``self.get_server_list``) the request is

    * fanned out to all servers and answered with the merged JSON arrays
      (several servers and ``allow_all``); the merged result is cached for
      ``cache_maxage`` seconds when that value is positive,
    * redirected with a 308 response (one server which is not this one),
    * answered locally by means of ``get_content`` (one server, this one),
    * rejected with 404 (anything else) or with 400 (a fan-out request that
      already carries a ``Via`` header).

    Args:
        method: HTTP method used for the fanned-out proxy requests.
        path_def: path definition the request path is matched against.
        allow_all: whether the fan-out to several servers is allowed.
        cache_maxage: maximum age of the cached fan-out result in seconds; a
            value that is not positive disables the caching.
        is_stream: when true, ``get_content(params)`` is only called and no
            response is sent here (presumably it writes the response itself -
            confirm).
        get_content: callable taking the path parameters; for non-stream
            requests it returns an object with ``code`` and ``json`` or None.

    Returns:
        None when the path does not match ``path_def`` or has no hostname,
        True when the request was served or redirected and False when an
        error response (400 or 404) was sent.
    """
    params = PathDef(path_def).params(self.path)

    # path must be a valid path and hostname param must exist in it
    if params is None or params.params.hostname is None:
        return None

    def forward_url(server_def):
        # URL of this request on the given server
        return "http://{address}:{tcp_port}{fw_path}".format(
            address=server_def.address,
            tcp_port=server_def.tcp_port,
            fw_path=params.replace(params, Map(hostname=server_def["hostname"])))

    # get a list of servers this should be proxied to
    # if there is more than one, then proxy them, otherwise run locally or redirect via client
    server_list = self.get_server_list(params)

    # hostname is "all", will forward to individual umcrunner servers
    if len(server_list) > 1 and allow_all:
        # a request that has been proxied already must not be proxied again
        if self.headers.get("Via") is not None:
            Msg.warn_msg(
                "A request to %s can only come from a client, not a proxy! (%s)"
                % (self.path, self.headers.get("Via")))
            self.send(400, None, "Request to the resource that comes via a proxy is not allowed!")
            return False

        # acquire lock on this path to prevent other threads from doing the same
        cache.acquire_lock(self.path)
        try:
            content = cache.get(self.path)
            if content is None:
                # not in cache; proxy to all umcrunner hosts including "me" (this one)
                Msg.info2_msg("Sending %d proxy requests." % (len(server_list)))
                start_t = time.time()
                prqs = []
                for server_def in server_list:
                    prqs.append(ProxyRequest(method, forward_url(server_def),
                                             GlobalContext.params.proxy_run_threads))
                    prqs[-1].send_request()

                # wait for all responses
                for x in prqs:
                    x.wait_for_response()

                # get all "valid" responses
                resp = [r for r in prqs if r.response is not None]
                Msg.info2_msg(
                    "Data from %d proxy requests retrieved in %.2f seconds."
                    % (len(resp), time.time() - start_t))

                # the result from individual servers should always be a json array;
                # strip the brackets of the non-empty ones and join their items
                bodies = [r.response.text.strip() for r in resp]
                content = Map(content="[%s]" % ",".join([b[1:-1] for b in bodies if b != "[]"]))

                # add result to cache
                if cache_maxage > 0:
                    cache.create_data(self.path, content.content, time.time(), cache_maxage)
            else:
                Msg.info2_msg("Serving request for %s from cache." % self.path)

            # send back response
            self.send(200, {"Content-Type": "application/json"}, content.content)
        finally:
            cache.release_lock(self.path)
        return True

    if len(server_list) != 1:
        self.send(404, None,
                  "The host '%s' cannot be found or is not allowed!" % params.params.hostname)
        return False

    # one hostname only; params.params.hostname should be a valid hostname
    server_def = server_list[0]
    if not server_def.me:
        # host should be a known host, redirect the request onto it rather than being a proxy
        location_url = forward_url(server_def)
        Msg.info2_msg("Redirecting the request to '%s'" % location_url)
        self.send(308, {"Location": location_url}, "")
        # a response has been sent, so report the request as handled; None is
        # reserved for paths that do not match
        return True

    if is_stream:
        get_content(params)
        return True

    content = get_content(params)
    if content is not None:
        self.send(content.code, {"Content-Type": "application/json"},
                  "[%s]" % ",".join(content.json))
    else:
        # should not happen really
        self.send(500, None, "")
    return True
def on_terminate(proc):
    """Log the pid and the exit code of a terminated process.

    Args:
        proc: process object providing ``pid`` and ``returncode``.
    """
    message = "...process {} terminated with exit code {}".format(proc.pid, proc.returncode)
    Msg.info2_msg(message)