Example 1
 def refresh_single_instance(self, umcdef, GlobalContext):
     umcdef.lock.acquire()
     try:
         if umcdef.proc is not None:
             try:
                 # update the process return code if any
                 umcdef.proc.poll()
                 if not(umcdef.proc.is_running()) and umcdef.proc.returncode is not None:
                     rc=umcdef.proc.returncode
                     if rc != 0:
                         Msg.warn_msg("umc instance %s failed/terminated with exit code %d. Will attempt to restart it after %d seconds."
                             %(umcdef.umc_instanceid,rc,GlobalContext.params.run_after_failure))
                         umcdef.start_after=time.time()+GlobalContext.params.run_after_failure
                         umcdef.num_errors = umcdef.num_errors + 1
                         umcdef.lasterror_time = time.time()
                     umcdef.returncodes.insert(0,(time.time(), rc))
                     if len(umcdef.returncodes)>GlobalContext.params.retc_history:
                         del umcdef.returncodes[-(len(umcdef.returncodes)-GlobalContext.params.retc_history):]
     
                 # clear the process reference if it is no longer running or has become a zombie;
                 # this happens when the process ends normally but we still hold a reference to it
                 if not(umcdef.proc.is_running()) or (umcdef.proc.is_running() and umcdef.proc.status() == psutil.STATUS_ZOMBIE):   
                     del umcdef.proc
                     umcdef.proc=None
                     umcdef.last_started_time=0
                     sleep(0.1)
                                     
             except Exception as e:
                 Msg.warn_msg("There was a problem when quering the process with pid %d: %s"%(umcdef.proc.pid,str(e)))
                 if e.__class__ == psutil.NoSuchProcess:
                     umcdef.proc=None
                     umcdef.last_started_time=0
                 pass
     finally:
         umcdef.lock.release()
Example 2
    def __run_httpd(self):
        # start the server
        Msg.info1_msg("Starting http server on at %s:%s." %
                      (self.address, self.tcp_port))
        try:
            self.exit = Event()
            self.httpd = ThreadedHTTPServer((self.address, int(self.tcp_port)),
                                            Handler,
                                            bind_and_activate=False)
            self.httpd.allow_reuse_address = True
            self.httpd.timeout = 1
            self.httpd.server_bind()
            self.httpd.server_activate()
        except Exception as e:
            Msg.warn_msg("Cannot start HTTP server due to: %s." % (str(e)))
            return

        # serve the requests
        try:
            while not (self.exit.is_set()):
                self.httpd.handle_request()
        finally:
            Msg.info1_msg("Closing HTTP server.")
            try:
                self.httpd.server_close()
            except Exception as e:
                Msg.warn_msg(
                    "Error occurred while closing the HTTP server: %s" %
                    (str(e)))
Example 3
    def run_task(self, GlobalContext, tdef):
        orphans = []
        pids = self.get_all_pgids()[str(os.getpgrp())]
        procs = psutil.Process().children(recursive=True)
        
        for pid in pids:
            try:
                os.kill(int(pid), 0)
            except OSError:
                # the process has already ended in the meantime
                pass
            else:
                # the process is alive; check that it exists in the process tree
                found = False
                for p in procs:
                    if p.pid == int(pid):
                        found = True
                        break
                
                if not found:
                    orphans.append(pid)
            # else
        # for pid

        # pause if there are orphans
        if len(orphans)>0:
            Msg.warn_msg("There are %d orphan processes; umcrunner will be paused until the orphans disappear!"%(len(orphans)))
            Msg.info2_msg("The orphans are: %s"%orphans)
            return False
        else:
            return True
Example 4
def terminate_process_children(proc, timeout=10):
    # get all children processes
    procs = proc.children(recursive=True)
    
    Msg.info1_msg("Terminating process tree of pid %d with %d children..."%(proc.pid,len(procs)))

    if len(procs) > 0:
        # send SIGTERM
        for p in procs:
            try:
                p.terminate()
            except Exception:
                pass
            
        # wait for processes to die
        gone, alive = psutil.wait_procs(procs, timeout=timeout, callback=on_terminate)
        
        # send a force kill if there are still live processes
        if alive:
            Msg.warn_msg("There were %d child processes that did not terminate within the timeout of %d seconds. Killing them..."
                %(len(alive),timeout))
            for p in alive:
                try:
                    p.kill()
                except Exception:
                    pass
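The on_terminate callback passed to psutil.wait_procs above is defined outside of this snippet; a minimal sketch of what it could look like (the log message is an assumption):

def on_terminate(proc):
    # invoked by psutil.wait_procs for every child process that terminates
    Msg.info1_msg("Process %d terminated with exit code %s"%(proc.pid, proc.returncode))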
Example 5
    def run_task(self, GlobalContext, tdef):
        kids=psutil.Process().children(recursive=True)

        Msg.info2_msg("There are %d child processes."%(len(kids)))
            
        if len(kids) > GlobalContext.params.max_processes:
            Msg.warn_msg("The current number of child processes %d exceeds the maximum of %d; umcrunner will be paused."
                %(len(kids),GlobalContext.params.max_processes))
            return False
        else:
            return True    
Example 6
 def run_task(self, GlobalContext, tdef):    
     if GlobalContext.umcdefs is not None:
         for ud in GlobalContext.umcdefs:
             if ud.enabled:
                 ud.lock.acquire()
                 try:
                     log_stats=Map(backlog_total=0, errorlog_mtime=0, errorlog_size=0, errorlog_tail=[])                    
                     log_dir=get_umc_instance_log_dir(ud.umc_instanceid, GlobalContext)                
                     
                     if os.path.isdir(log_dir):
                         for file in [os.path.basename(f.path) for f in scandir(log_dir)]:
                             # match the log file waiting to be consumed
                             # there is a maximum of 9 groups (1-9)
                             m1 = re.match(r"^{umc_instanceid}_[0-9\-]+.log.([1-9])$".format(umc_instanceid=ud.umc_instanceid), file) 
                             if m1:
                                 fg_key="backlog_group_%s"%m1.group(1)
                                 if log_stats.get(fg_key) is None:
                                     log_stats[fg_key]=1
                                 else:
                                     log_stats[fg_key]+=1
                                 log_stats.backlog_total += 1
                             # // if match log file
                             
                             # match the error log
                             m2 = re.match(r"^{umc_instanceid}(_[0-9\-]+)?.error.out$".format(umc_instanceid=ud.umc_instanceid), file) 
                             if m2:
                                 stat=os.stat(log_dir + "/" + file)
                                 log_stats.errorlog_size=stat.st_size
                                 if log_stats.errorlog_size>0:
                                     log_stats.errorlog_mtime=stat.st_mtime
                                 else:
                                     log_stats.errorlog_mtime=0
                                 #the below takes too much time to finish, better not run this
                                 #log_stats.errorlog_tail=utils.tail(log_dir + "/" + file, 10)
                             # // if match error log
                         # // for 
                     else:
                         Msg.warn_msg("Directory %s does not exist!"%log_dir)
                     
                     # update log stats
                     ud.log_stats = log_stats                    
                 finally:
                     ud.lock.release()
             # // if enabled
         # // for
     # // if 
     
     return True           
Example 7
    def __init__(self, globalCtx):
        global GlobalContext
        GlobalContext = globalCtx

        self.enabled = False
        self.thread = None

        if GlobalContext.params.http_enabled:
            sl_def = GlobalContext.server_list.get(socket.gethostname())
            if sl_def is not None and sl_def.address is not None and sl_def.tcp_port is not None and sl_def.me:
                self.enabled = True
                self.address = sl_def.address
                self.tcp_port = sl_def.tcp_port
            else:
                Msg.warn_msg(
                    "Cannot determine umcrunner's address and/or tcp_port for http server to bind to. The http server will not be started!"
                )
        else:
            Msg.info1_msg("HTTP server is disabled.")
Example 8
    def run_task(self, GlobalContext, tdef):
        kids=psutil.Process().children(recursive=True)
        
        nz = 0
        for p in kids:
            try:
                if p.status() == psutil.STATUS_ZOMBIE:
                    nz = nz + 1
            except Exception:
                pass

        Msg.info2_msg("There are %d zombie processes"%(nz))

        if nz > len(GlobalContext.umcdefs):
            Msg.warn_msg("There are %d zombie processes which exceeds the number of umc instances %d. Will pause umc runner until the zombie processes will disappear!"%
                (nz,len(GlobalContext.umcdefs)))
            return False
        else:
            return True        
Example 9
    def run_task(self, GlobalContext, tdef):
        running=[]; started=[]; waiting=[]
        for umcdef in GlobalContext.umcdefs:
            if umcdef.enabled:
                umcdef.lock.acquire()
                try:
                    if umcdef.proc is None and time.time()>umcdef.start_after:
                        if umcdef.last_started_time is not None and time.time()-umcdef.last_started_time < GlobalContext.params.min_starting_time:
                            Msg.warn_msg("umc instance id '%s' starting frequency is too high (<%d seconds), will not start it now!"
                                %(umcdef.umc_instanceid,GlobalContext.params.min_starting_time))
                            waiting.append("%s, WT=%.2fs"%(umcdef.umc_instanceid,GlobalContext.params.min_starting_time))                        
                        else:
                            try:
                                # run umcinstance as a child process
                                umcdef.proc = self.run_umc(umcdef, GlobalContext)
                                
                                # start time
                                start_t=time.time()
                                umcdef.start_after=0
                                umcdef.last_started_time=start_t
                                umcdef.num_runs = umcdef.num_runs + 1
                                if umcdef.first_started_time == 0:
                                    umcdef.first_started_time = time.time()
                                
                                started.append("%s, PID=%d"%(umcdef.umc_instanceid,umcdef.proc.pid))
                            except Exception as e:
                                Msg.warn_msg("Error occurred while starting umc instance %s. The exception was: %s"%(umcdef.umc_instanceid, str(e)))
                                pass
                    else:
                        if umcdef.proc is not None: 
                            running.append("%s, PID=%d"%(umcdef.umc_instanceid,umcdef.proc.pid))
                        else: 
                            waiting.append("%s, WT=%.2fs"%(umcdef.umc_instanceid,umcdef.start_after-time.time()))
                finally:
                    umcdef.lock.release()
        # for

        time_run = time.time()
        Msg.info2_msg("Running: %s"%(running))                
        Msg.info2_msg("Started: %s"%(started))                
        Msg.info2_msg("Waiting: %s"%(waiting))                
Example 10
 def __send_request(self):
     try:
         Msg.info2_msg("Sending proxy request %s %s" %
                       (self.method.upper(), self.url))
         headers = {"Via": "1.1 %s" % socket.gethostname()}
         if self.method == "get":
             self.response = requests.get(
                 self.url,
                 timeout=(GlobalContext.params.proxy_timeout_connect,
                          GlobalContext.params.proxy_timeout_read),
                 headers=headers)
         elif self.method == "post":
             self.response = requests.post(
                 self.url,
                 timeout=(GlobalContext.params.proxy_timeout_connect,
                          GlobalContext.params.proxy_timeout_read),
                 headers=headers)
         else:
             raise Exception("Method %s is not supported!" % self.method)
     except Exception as e:
         Msg.warn_msg("Proxy request to %s failed: %s" % (self.url, str(e)))
         pass
Example 11
    def __init__(self, config, writer_id):
        self.config = config

        # read common reader's params
        base_key = "common.umcpush.reader-params"
        self.params = Map(
            max_batchsize_rows=self.config.value(
                base_key + ".max-batchsize-rows", 50),
            max_batchsize_files=self.config.value(
                base_key + ".max-batchsize-files", 300),
            log_file_group=self.config.value(base_key + ".log-file-group", 1),
            common_tags=self.config.value(base_key +
                                          ".common-tags").split(','),
            common_fields=self.config.value(base_key +
                                            ".common-fields").split(','),
            default_timefield=self.config.value(
                base_key + ".default-timefield", "datetime"),
            default_timeformat=self.config.value(
                base_key + ".default-timeformat", "%Y-%m-%d %H:%M:%S"),
            tzoffset=utils.float_ex(
                self.config.value(base_key + ".tzoffset", 0), 0))

        # update any value that may be overriden in writer's specific parameters
        writers = config.value("common.umcpush.writers")
        for writer in writers:
            if writer["writer-id"] == writer_id:
                rparams = writer["reader-params"]
                if rparams is not None:
                    for k, v in rparams.items():
                        k = k.replace("-", "_")
                        if self.params.get(k) is not None:
                            self.params[k] = v
                        else:
                            Msg.warn_msg(
                                "The reader param %s is invalid for writer %s" %
                                (k, writer_id))
Example 12
    def run_all(self):
        paused = self.GlobalContext.paused
        for tdef in self.tasks:
            if time.time()-tdef.last_run_time > tdef.time_interval and (tdef.run_on_global_pause or not(paused)):
                if tdef.run_after==0 or time.time()>tdef.run_after:
                    if not(tdef.disabled):                         
                        # inform that the task is resumed if it was paused
                        if tdef.run_after>0:
                            tdef.run_after=0
                            Msg.info1_msg("The task %s is resumed."%(tdef.name))
                        
                        # run the task    
                        start_t=time.time()
                        tdef.result = tdef.target.run_task(self.GlobalContext, tdef)
                        end_t=time.time()
                        if not(tdef.result):
                            paused = True
                        tdef.last_run_time = end_t
                        tdef.last_run_duration=end_t-start_t
                        
                        # check to be disabled due to hard limit
                        if tdef.time_limit_disable>0 and tdef.last_run_duration > tdef.time_limit_disable:
                            tdef.disabled=True
                            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the hard maximum of %.2f seconds. The task will be disabled."
                                %(tdef.name, tdef.last_run_duration, tdef.time_limit_disable))
                                
                        # check to be paused due to soft limit
                        elif tdef.time_limit_pause>0 and tdef.last_run_duration > tdef.time_limit_pause:
                            tdef.run_after=end_t+tdef.pause_for
                            Msg.warn_msg("The task %s was running for %.2f seconds which is more than the soft maximum of %.2f seconds. The task will be paused for %.2f seconds."
                                %(tdef.name, tdef.last_run_duration, tdef.time_limit_pause, tdef.pause_for))
                        else:
                            # report on task duration
                            Msg.info2_msg("The task %s was running for %.2f seconds."%(tdef.name,tdef.last_run_duration))

                    # // not disabled
                # // locally paused
            else:
                pass

        old_paused = self.GlobalContext.paused
        self.GlobalContext.paused = not(all([ tdef.result for tdef in self.tasks if tdef.result is not None ]))

        if self.GlobalContext.paused != old_paused:
            Msg.warn_msg("umcrunner state has been %s."%("PAUSED" if self.GlobalContext.paused else "RESUMED"))
Example 13
    def process_cluster_request(self, method, path_def, allow_all,
                                cache_maxage, is_stream, get_content):
        params = PathDef(path_def).params(
            self.path)  #get_path_params(path_def, self.path)

        # path must be a valid path and hostname param must exist in it
        if params is None or params.params.hostname is None:
            return None

        # get a list of servers this should be proxied to
        # if there is more than one, proxy to all of them; otherwise run the request locally or redirect the client
        server_list = self.get_server_list(params)

        # hostname is "all", will forward to individual umcrunner servers
        if len(server_list) > 1 and allow_all:
            # check if this has been proxied already
            if self.headers.get("Via") is None:
                # acquire lock on this path to prevent other threads from doing the same
                cache.acquire_lock(self.path)
                try:
                    # check if in cache
                    content = cache.get(self.path)
                    if content is None:
                        # not in cache
                        # proxy to all umcrunner hosts including "me" (this one)
                        Msg.info2_msg("Sending %d proxy requests." %
                                      (len(server_list)))

                        start_t = time.time()
                        prqs = []
                        for server_def in server_list:
                            prqs.append(
                                ProxyRequest(
                                    method,
                                    'http://{address}:{tcp_port}{fw_path}'.
                                    format(
                                        address=server_def.address,
                                        tcp_port=server_def.tcp_port,
                                        fw_path=params.replace(
                                            params,
                                            Map(hostname=server_def["hostname"]
                                                ))),
                                    GlobalContext.params.proxy_run_threads))
                            prqs[-1].send_request()

                        # wait for all responses
                        for x in prqs:
                            x.wait_for_response()

                        # get all "valid" responses
                        resp = [r for r in prqs if r.response is not None]
                        Msg.info2_msg(
                            "Data from %d proxy requests retrieved in %.2f seconds."
                            % (len(resp), time.time() - start_t))

                        # add result to cache; the result from individual servers should always be json array
                        content = Map(content="[%s]" % ",".join([
                            r.response.text.strip()[1:-1]
                            for r in resp if r.response.text.strip() != "[]"
                        ]))
                        if cache_maxage > 0:
                            cache.create_data(self.path, content.content,
                                              time.time(), cache_maxage)
                    # if not in cache
                    else:
                        Msg.info2_msg("Serving request for %s from cache." %
                                      self.path)

                    # send back response
                    self.send(200, {"Content-Type": "application/json"},
                              content.content)
                finally:
                    cache.release_lock(self.path)
                return True
            # if not via
            else:
                Msg.warn_msg(
                    "A request to %s can only come from a client, not a proxy! (%s)"
                    % (self.path, self.headers.get("Via")))
                self.send(
                    400, None,
                    "Request to the resource that comes via a proxy is not allowed!"
                )
                return False
        # // if multiple hostnames
        elif len(server_list) == 1:
            # params.params.hostname should be a valid hostname
            server_def = server_list[0]
            if not (server_def.me):
                # the host is a known host; redirect the request to it rather than proxying it
                location_url = "http://{address}:{tcp_port}{fw_path}".format(
                    address=server_def.address,
                    tcp_port=server_def.tcp_port,
                    fw_path=params.replace(
                        params, Map(hostname=server_def["hostname"])))
                Msg.info2_msg("Redirecting the request to '%s'" % location_url)
                self.send(308, {"Location": location_url}, "")
                return
            else:
                if not (is_stream):
                    content = get_content(params)
                    if content is not None:
                        self.send(content.code,
                                  {"Content-Type": "application/json"},
                                  "[%s]" % ",".join(content.json))
                    else:
                        # should not happen really
                        self.send(500, None, "")
                    return True
                else:
                    get_content(params)
                    return True
        # // if one hostname only
        else:
            self.send(
                404, None, "The host '%s' cannot be found or is not allowed!" %
                params.params.hostname)
            return False
Example 14
    def run_task(self, GlobalContext, tdef):
        umc_counts=Map(count=0, enabled=0, disabled=0, running=0, waiting=0, num_children=0,
            rss=0, cpu=0, cpu_s=0, runs=0, errors=0, last_errortime=0, backlog_total=0)
        if GlobalContext.umcdefs is not None:
            for ud in GlobalContext.umcdefs:
                ud.lock.acquire()
                try:
                    umc_counts.count += 1
                    if ud.enabled:
                        umc_counts.enabled += 1
                    else:
                        umc_counts.disabled += 1
                    umc_counts.errors += ud.num_errors
                    umc_counts.runs += ud.num_runs
                    
                    # update last error time from the error log if it is more recent
                    if ud.log_stats is not None and ud.log_stats.errorlog_mtime > ud.lasterror_time:
                        ud.lasterror_time = ud.log_stats.errorlog_mtime

                    if ud.lasterror_time > umc_counts.last_errortime:
                        umc_counts.last_errortime = ud.lasterror_time
                    
                    if time.time()<ud.start_after:
                        umc_counts.waiting += 1
                    umc_counts.backlog_total += ud.log_stats.backlog_total if ud.get("log_stats") and ud.get("log_stats").get("backlog_total") else 0 

                    # umc instance statistics
                    stats = {}
                    
                    # process info
                    p = {}
                    try:
                        if ud.proc is not None:
                            umc_counts.running += 1
                            
                            p["top_pid"] = ud.proc.pid
                            #p["uptime"] = time.time() - ud.proc.create_time()
                            p["uptime"] = time.time()-ud.last_started_time
                            p["cmdline"] = ud.proc.cmdline()
                            
                            kids = ud.proc.children(True)
                            rss = 0.0; cpu = 0
                            for k in kids:
                                d = k.as_dict(attrs=['cpu_times', 'memory_info'])
                                cpu = cpu + d["cpu_times"].user
                                rss = rss + d["memory_info"].rss

                            p["rss"] = float(rss/1024/1024) # in MB
                            p["cpu"] = cpu   
                            p["cpu_s"] = cpu/p["uptime"]   
                            p["num_chproc"] = len(kids)
                            
                            umc_counts.rss += p["rss"]
                            umc_counts.cpu += p["cpu"]
                            umc_counts.cpu_s += p["cpu_s"]                            
                            umc_counts.num_children += p["num_chproc"]    
                        # // end if
                    except Exception as e:
                        Msg.warn_msg("Error occurred when retrieving process info: %s"%str(e))
                        pass
                    
                    stats["p"] = p
                    ud.stats = stats        
                finally:
                    ud.lock.release()
            # // for            
            
            # umcrunner stats
            proc=psutil.Process()
            d = proc.as_dict(attrs=['cpu_times', 'memory_info'])
            uptime=time.time()-proc.create_time()
            hostname=socket.gethostname()
            GlobalContext.umcrunner_stats = Map(
                pid=proc.pid,
                hostname=hostname,
                uptime=uptime,
                cpu=d["cpu_times"].user,
                cpu_s=d["cpu_times"].user/uptime,
                rss=float(d["memory_info"].rss/1024/1024),
                threads=proc.num_threads(),
                umc_counts=umc_counts,
                link_umcinstances="/stats/hosts/{hostname}/umc/all".format(hostname=hostname)
            )
            
            return True
Example 15
    def read_datapoints(self, logfilename, umcdef, create_writeitem_func):
        datapoints = []
        notags = False
        nofields = False
        tzoffset = self.params.tzoffset

        if umcdef.enabled:
            # read datapoints
            with open(logfilename, 'r') as csvfile:
                reader = csv.DictReader(csvfile, delimiter=',')
                for row in reader:
                    # remove None keys
                    row = {k: v for k, v in row.items() if k is not None}

                    # timestamp
                    try:
                        if not (umcdef.reader.timefield in row):
                            raise ValueError("Cannot find time field '" +
                                             umcdef.reader.timefield +
                                             "' in data row!")
                        if umcdef.reader.timeformat == "_unix_" or umcdef.reader.timeformat == "_time_s_":
                            timestamp = long(
                                row[umcdef.reader.timefield]) * 1000000000
                        elif umcdef.reader.timeformat == "_time_ms_":
                            timestamp = long(
                                row[umcdef.reader.timefield]) * 1000000
                        else:
                            if umcdef.reader.tzfield is not None and umcdef.reader.tzfield in row:
                                tzoffset = utils.float_ex(
                                    row[umcdef.reader.tzfield],
                                    self.params.tzoffset)
                            timestamp = (self.unix_time_millis(
                                datetime.datetime.strptime(
                                    row[umcdef.reader.timefield],
                                    umcdef.reader.timeformat)) - int(
                                        tzoffset * 60 * 60 * 1000)) * 1000000
                    except Exception as e:
                        # output error and skip this row
                        Msg.err_msg(
                            "Cannot read or convert time to timestamp for %s: %s"
                            % (umcdef.umcid, str(e)))
                        continue

                    # create tags and fields
                    tags = {
                        k: str(v)
                        for k, v in row.items() if k in umcdef.reader.tcols
                    }
                    fields = {
                        k: utils.float_ex(v)
                        for k, v in row.items() if k in umcdef.reader.fcols
                    }
                    notags = (len(tags) == 0)

                    # only add this row if there is at least one field with some value
                    if len([v
                            for k, v in fields.items() if v is not None]) > 0:
                        # evaluate transformations
                        if umcdef.reader.transform is not None:
                            tags, fields = eval_transform(
                                umcdef.reader.transform, timestamp, tags,
                                fields)

                        # only add this row if filter holds on this row or there is no filter
                        if umcdef.reader.filter is None or eval_filter(
                                umcdef.reader.filter, timestamp, tags, fields):
                            try:
                                records = create_writeitem_func(
                                    umcdef, timestamp, fields, tags)
                                if records is not None and isinstance(
                                        records, list):
                                    datapoints += records
                            except Exception as e:
                                Msg.err_msg(
                                    "Error occured while creating data points item: %s"
                                    % str(e))
                        # // if write data

                # // end reading rows
            # // end open file

        # check for no tags
        if notags and len(datapoints) > 0:
            Msg.warn_msg(
                "The definition of %s contains no tags presented in the log file %s!"
                % (umcdef.umcid, os.path.basename(logfilename)))

        return datapoints