def safe_run(function,
             name=None,
             backoff=0.25,     # Backoff time increment
             backoff_max=60,   # Longest allowable backoff
             restart=True      # Call again if the function returns
             ):
    """
    Safely call a long-running lambda (usually a main program),
    catching and logging exceptions.

    The lambda is re-called immediately if it simply returns (and
    'restart' is true) or after a linearly-increasing backoff if it
    raises an exception.  The backoff always applies if the function
    has not yet returned successfully, and the delay resets to its
    initial value once the lambda runs longer than the current
    backoff delay before failing.

    Arguments:

    function - The lambda to call; called with no arguments.
    name - Name passed to the logger.
    backoff - Initial delay and the increment added after each wait,
        in seconds.
    backoff_max - Longest allowable backoff, in seconds.
    restart - True to call the function again after it returns.

    Returns when the function returns and 'restart' is false, or when
    a KeyboardInterrupt arrives.

    Raises ValueError if 'function' is not a function object.
    """

    if not isinstance(function, type(lambda: 0)):
        raise ValueError("Function provided is not a lambda.")

    log = pscheduler.Log(name=name,
                         prefix='safe_run',
                         signals=False,
                         quiet=True)

    initial_backoff = backoff
    current_backoff = backoff
    runs = 0  # Number of times the function returned without raising

    while True:

        try:
            started = pscheduler.time_now()
            function()
            runs += 1
            if not restart:
                break

        except KeyboardInterrupt:
            break

        except Exception:
            ran = pscheduler.time_now() - started
            ran_seconds = pscheduler.timedelta_as_seconds(ran)

            log.error("Program threw an exception after %s", ran)
            log.exception()

            # Running longer than the backoff is a good excuse to try
            # starting over.
            if ran_seconds > current_backoff and runs != 0:
                current_backoff = initial_backoff
                log.error("Restarting immediately.")
            else:
                log.error("Waiting %s seconds before restarting",
                          current_backoff)
                time.sleep(current_backoff)
                if current_backoff < backoff_max:
                    # Grow linearly but never past the ceiling.
                    current_backoff = min(current_backoff + initial_backoff,
                                          backoff_max)
                log.error("Restarting")
def safe_run(
        function,
        name=None,
        backoff=0.25,     # Backoff time increment
        backoff_max=60,   # Longest allowable backoff
        restart=True      # Call again if the function returns
):
    """
    Safely call a long-running lambda (usually a main program),
    catching and logging exceptions.

    The lambda will be re-called immediately if it simply returns
    (provided 'restart' is true) or after a linearly-increasing
    backoff if it raises an exception.  The backoff always applies if
    the function has not yet returned successfully; the delay goes
    back to its initial value once the lambda runs any longer than
    the current backoff delay before raising.

    Arguments:

    function - The lambda to call; it is invoked without arguments.
    name - Name handed to the logger.
    backoff - Starting delay, also the amount added after each wait
        (seconds).
    backoff_max - Longest allowable backoff (seconds).
    restart - True to call the function again when it returns.

    The call ends when the function returns with 'restart' false or
    when a KeyboardInterrupt is raised.

    Raises ValueError if 'function' is not a function object.
    """

    if not isinstance(function, type(lambda: 0)):
        raise ValueError("Function provided is not a lambda.")

    log = pscheduler.Log(name=name, prefix='safe_run',
                         signals=False, quiet=True)

    initial_backoff = backoff
    current_backoff = backoff

    # Count of calls that returned without raising
    runs = 0

    while True:

        try:
            started = pscheduler.time_now()
            function()
            runs += 1
            if not restart:
                break

        except KeyboardInterrupt:
            break

        except Exception:
            ran = pscheduler.time_now() - started
            ran_seconds = pscheduler.timedelta_as_seconds(ran)

            log.error("Program threw an exception after %s", ran)
            log.exception()

            # Running longer than the backoff is a good excuse to try
            # starting over.
            if ran_seconds > current_backoff and runs != 0:
                current_backoff = initial_backoff
                log.error("Restarting immediately.")
                continue

            log.error("Waiting %s seconds before restarting",
                      current_backoff)
            time.sleep(current_backoff)

            # Linear growth, capped so the delay cannot pass the maximum.
            if current_backoff < backoff_max:
                current_backoff = min(current_backoff + initial_backoff,
                                      backoff_max)

            log.error("Restarting")
def get_status():
    """
    Build a status report and return it as a JSON response.

    The report contains:

    time - The current time in ISO 8601 form.
    services - The first column of the row read from heartbeat_json
        (an empty dictionary if that query fails), with a "database"
        entry holding the database uptime added when it can be
        determined.
    runs - "last-finished" and "last-scheduled" timestamps, each None
        when the query fails or returns no rows.

    Every database lookup is best-effort: a failure in one leaves
    that part of the report empty instead of failing the request.
    """

    response = {
        "time": pscheduler.datetime_as_iso8601(pscheduler.time_now())
    }

    # Get the heartbeat status
    try:
        services = dbcursor_query("SELECT * FROM heartbeat_json",
                                  onerow=True).fetchone()[0]
    except Exception:
        services = {}

    # This must be in place before the database entry is added;
    # adding the entry first fails and loses the uptime.
    response["services"] = services

    # Add the database status
    try:
        # Query the database and calculate how long the server has run
        cursor = dbcursor_query(
            "SELECT extract(epoch from current_timestamp - pg_postmaster_start_time())",
            onerow=True)
        uptime = pscheduler.seconds_as_timedelta(cursor.fetchone()[0])
        services["database"] = {
            "uptime": str(pscheduler.timedelta_as_iso8601(uptime))
        }
    except Exception:
        # Best effort; report without the uptime if it can't be had.
        pass

    runs = {}

    # Query the database for last run information
    try:
        cursor = dbcursor_query(
            "SELECT times_actual FROM run WHERE state=run_state_finished()")
        # 'upper' is read as an attribute of each value, presumably the
        # upper bound of a range type -- TODO confirm against the schema.
        finished = [row[0].upper for row in cursor.fetchall()]
        runs["last-finished"] = str(
            pscheduler.datetime_as_iso8601(max(finished)))
    except Exception:
        # Handles empty result and faulty query
        runs["last-finished"] = None

    # Query the database for last scheduled information
    try:
        cursor = dbcursor_query("SELECT added FROM run")
        added = [row[0] for row in cursor.fetchall()]
        runs["last-scheduled"] = str(
            pscheduler.datetime_as_iso8601(max(added)))
    except Exception:
        # Handles empty result and faulty query
        runs["last-scheduled"] = None

    response["runs"] = runs

    return ok_json(response)
def safe_run(
        function,
        name=None,
        backoff=0.25,      # Backoff time increment
        backoff_max=60.0,  # Longest allowable backoff
        restart=True       # Call again if the function returns
):
    """
    Safely call a long-running lambda (usually a main program),
    catching and logging exceptions.

    The lambda is called once.  If it should run again, the calling
    program is re-exec'd using the same arguments (sys.argv):
    immediately if the lambda simply returns and 'restart' is true,
    or after a linearly-increasing backoff if it raises an exception.
    The backoff always applies if the function has not yet returned
    successfully, and the delay resets to its initial value once the
    lambda runs longer than the current backoff delay before failing.

    The backoff state is carried across the exec in the environment
    variable named by STATE_VARIABLE.

    Arguments:

    function - The lambda to call; called with no arguments.
    name - Name passed to the logger.
    backoff - Initial delay and the increment added after each wait,
        in seconds.  Ignored when state is inherited.
    backoff_max - Longest allowable backoff, in seconds.
    restart - True to re-exec after the function returns.

    This function does not return: it exits the program (status 0
    when no restart is called for, 1 if inherited state cannot be
    decoded) or replaces the process by way of os.execvp().

    Raises ValueError if 'function' is not a function object.
    """

    if not isinstance(function, type(lambda: 0)):
        raise ValueError("Function provided is not a lambda.")

    log = pscheduler.Log(name=name, prefix='safe_run',
                         signals=False, quiet=True)

    # Inherit state from the environment

    if STATE_VARIABLE in os.environ:

        try:
            # NOTE(review): Unpickling data taken from the environment
            # trusts whoever launched this process.  Consider a
            # data-only format such as JSON for this state.
            depickled = pickle.loads(
                os.environ[STATE_VARIABLE].encode('ascii'))

            initial_backoff = depickled['initial_backoff']
            current_backoff = depickled['current_backoff']
            runs = depickled['runs']

            # Checked explicitly because asserts vanish under -O.
            if type(initial_backoff) not in (int, float):
                raise ValueError("initial_backoff is not a number")
            if type(current_backoff) not in (int, float):
                raise ValueError("current_backoff is not a number")
            if type(runs) is not int:
                raise ValueError("runs is not an integer")

        except Exception as ex:
            log.error("Failed to decode %s '%s': %s"
                      % (STATE_VARIABLE, os.environ[STATE_VARIABLE], ex))
            sys.exit(1)

    else:

        initial_backoff = backoff
        current_backoff = backoff
        runs = 0

    # Run the function

    do_restart = False

    try:

        started = pscheduler.time_now()
        function()
        runs += 1
        do_restart = restart

    except KeyboardInterrupt:

        pass

    except Exception:

        ran = pscheduler.time_now() - started
        ran_seconds = pscheduler.timedelta_as_seconds(ran)

        log.error("Program threw an exception after %s", ran)
        log.exception()

        # Running longer than the backoff is a good excuse to try
        # starting over.
        if ran_seconds > current_backoff and runs != 0:
            current_backoff = initial_backoff
        else:
            log.error("Waiting %s seconds before restarting",
                      current_backoff)
            time.sleep(current_backoff)
            if current_backoff < backoff_max:
                # Grow linearly but never past the ceiling.
                current_backoff = min(current_backoff + initial_backoff,
                                      backoff_max)

        do_restart = True

    if not do_restart:
        log.error("Exiting")
        sys.exit(0)

    log.error("Restarting: %s", sys.argv)

    #
    # Pickle the current state to pass along
    #

    to_pickle = {
        'initial_backoff': initial_backoff,
        'current_backoff': current_backoff,
        'runs': runs
    }

    # Protocol 0 is plain ASCII, so the result can be stored as text
    # in the environment.
    os.environ[STATE_VARIABLE] = pickle.dumps(to_pickle, 0).decode('ascii')

    os.execvp(sys.argv[0], sys.argv)
def run_program(
        argv,               # Program name and args
        stdin=None,         # What to send to stdin
        line_call=None,     # Lambda to call when a line arrives
        timeout=None,       # Seconds
        timeout_ok=False,   # Treat timeouts as not being an error
        fail_message=None,  # Exit with this failure message
        env=None,           # Environment for new process, None=existing
        env_add=None,       # Add hash to existing environment
        attempts=10):       # Max attempts to start the process
    """
    Run a program and return the results.

    Arguments:

    argv - Array containing arguments, including name of program
    stdin=s - String containing what should be sent to standard input
    line_call=l - Call lambda l with one argument containing a line which
        arrived on stdout each time that happens.  If provided, the
        'stdout' return value will be None.
    timeout=n - Wait n seconds for the program to finish, otherwise kill it.
    timeout_ok - True to prevent timeouts from being treated as errors.
        Only the single-shot (no line_call) mode consults this; a
        timeout in line_call mode always returns status 2.
    fail_message=s - Exit program and include string s if program fails.
    env=h - Pass environment hash 'h' to the child process, using the
        existing environment if the value is None.
    env_add=h - Add contents of hash 'h' to environment.
    attempts=n - Maximum number of tries at starting the process when
        doing so fails with EAGAIN.

    Return Values:

    status - Status code returned by the program
    stdout - Contents of standard output as a single string
    stderr - Contents of standard error as a single string

    Any exception raised while starting or running the program is
    caught and reported as status 2 with the traceback in stderr.

    Raises Exception if any element of argv is None.
    """

    process = None

    if any(arg is None for arg in argv):
        raise Exception("Can't run with null arguments.")

    # Build up a new, incorruptable copy of the environment for the
    # child process to use.

    if env_add is None:
        env_add = {}

    if env is None and len(env_add) == 0:
        new_env = None
    else:
        new_env = (os.environ if env is None else env).copy()
        new_env.update(env_add)

    def __get_process(argv, new_env, attempts):
        """Try to start a process, handling EAGAINs."""
        while attempts > 0:
            attempts -= 1
            try:
                return _Popen(argv,
                              stdin=subprocess32.PIPE,
                              stdout=subprocess32.PIPE,
                              stderr=subprocess32.PIPE,
                              env=new_env)
            except OSError as ex:
                # Non-EAGAIN or last attempt gets re-raised.  A bare
                # raise keeps the original traceback intact.
                if ex.errno != errno.EAGAIN or attempts == 0:
                    raise
                # TODO: Should we sleep a bit here?

        # Only reachable when called with fewer than one attempt.
        raise RuntimeError("No attempts were made to start the process.")

    try:
        process = __get_process(argv, new_env, attempts)

        __running_add(process)

        if line_call is None:

            # Single-shot I/O with optional timeout

            try:
                stdout, stderr = process.communicate(stdin, timeout=timeout)
                status = process.returncode

            except subprocess32.TimeoutExpired:
                _end_process(process)
                status = 0 if timeout_ok else 2
                stdout = ''
                stderr = "Process took too long to run."

        else:

            # Read one line at a time, passing each to the line_call lambda

            if not isinstance(line_call, type(lambda: 0)):
                raise ValueError("Function provided is not a lambda.")

            if stdin is not None:
                process.stdin.write(stdin)
            process.stdin.close()

            stderr = ''

            stdout_fileno = process.stdout.fileno()
            stderr_fileno = process.stderr.fileno()

            fds = [stdout_fileno, stderr_fileno]

            if timeout is not None:
                end_time = pscheduler.time_now() \
                    + pscheduler.seconds_as_timedelta(timeout)
            else:
                time_left = None

            while True:

                if timeout is not None:
                    time_left = pscheduler.timedelta_as_seconds(
                        end_time - pscheduler.time_now())

                reads, _, _ = polled_select(fds, [], [], time_left)

                # Nothing readable is treated as running out of time.
                if len(reads) == 0:
                    __running_drop(process)
                    _end_process(process)
                    return 2, None, "Process took too long to run."

                for readfd in reads:
                    if readfd == stdout_fileno:
                        got_line = process.stdout.readline()
                        # An empty read means end of file.
                        if got_line:
                            # Drop the last character of the line,
                            # normally the newline.
                            line_call(got_line[:-1])
                    elif readfd == stderr_fileno:
                        got_line = process.stderr.readline()
                        if got_line:
                            stderr += got_line

                if process.poll() is not None:
                    break

            # Siphon off anything left on stdout.  Testing for
            # emptiness ends the loop at end of file whether the
            # pipe yields text or bytes.
            while True:
                got_line = process.stdout.readline()
                if not got_line:
                    break
                line_call(got_line[:-1])

            process.wait()

            status = process.returncode
            stdout = None

    except Exception as ex:
        extype, _, trace = sys.exc_info()
        status = 2
        stdout = ''
        stderr = ''.join(traceback.format_exception_only(extype, ex)) \
            + ''.join(traceback.format_exception(extype, ex, trace)).strip()

    if process is not None:
        __running_drop(process)
        _end_process(process)

    if fail_message is not None and status != 0:
        pscheduler.fail("%s: %s" % (fail_message, stderr))

    return status, stdout, stderr
def run_program(argv,               # Program name and args
                stdin=None,         # What to send to stdin
                line_call=None,     # Lambda to call when a line arrives
                timeout=None,       # Seconds
                timeout_ok=False,   # Treat timeouts as not being an error
                short=False,        # True to force timeout to 2 seconds
                fail_message=None   # Exit with this failure message
                ):
    """
    Run a program and return the results.

    Arguments:

    argv - Array containing arguments, including name of program
    stdin=s - String containing what should be sent to standard input
    line_call=l - Call lambda l with one argument containing a line which
        arrived on stdout each time that happens.  If provided, the
        'stdout' return value will be None.
    timeout=n - Wait n seconds for the program to finish, otherwise kill it.
        None means wait indefinitely.
    timeout_ok - True to prevent timeouts from being treated as errors.
        Only the single-shot (no line_call) mode consults this; a
        timeout in line_call mode always returns status 2.
    short - True to force timeout to two seconds.
        NOTE(review): Nothing in this function reads this argument;
        confirm whether it should be honored here.
    fail_message=s - Exit program and include string s if program fails.

    Return Values:

    status - Status code returned by the program
    stdout - Contents of standard output as a single string
    stderr - Contents of standard error as a single string

    Any exception raised while starting or running the program is
    caught and reported as status 2 with the traceback in stderr.

    Raises Exception if any element of argv is None.
    """

    process = None

    # Note that any() is used instead of testing the result of
    # filter(), which is not a list on Python 3 and is always true.
    if any(arg is None for arg in argv):
        raise Exception("Can't run with null arguments.")

    try:
        process = subprocess32.Popen(argv,
                                     stdin=subprocess32.PIPE,
                                     stdout=subprocess32.PIPE,
                                     stderr=subprocess32.PIPE,
                                     )

        __running_add(process)

        if line_call is None:

            # Single-shot I/O with optional timeout

            try:
                stdout, stderr = process.communicate(stdin, timeout=timeout)
                status = process.returncode

            except subprocess32.TimeoutExpired:
                # Clean up after a timeout
                try:
                    process.kill()
                except OSError:
                    pass  # Can't kill things that change UID
                process.communicate()

                status = 0 if timeout_ok else 2

                # TODO: See if the exception has the contents of stdout and
                # stderr available.
                stdout = ''
                stderr = "Process took too long to run."

        else:

            # Read one line at a time, passing each to the line_call lambda

            if not isinstance(line_call, type(lambda: 0)):
                raise ValueError("Function provided is not a lambda.")

            if stdin is not None:
                process.stdin.write(stdin)
            process.stdin.close()

            stderr = ''

            stdout_fileno = process.stdout.fileno()
            stderr_fileno = process.stderr.fileno()

            fds = [stdout_fileno, stderr_fileno]

            # No timeout means no deadline; select() then blocks
            # until something is readable.
            if timeout is None:
                end_time = None
            else:
                end_time = pscheduler.time_now() \
                    + pscheduler.seconds_as_timedelta(timeout)

            while True:

                if end_time is None:
                    time_left = None
                else:
                    time_left = pscheduler.timedelta_as_seconds(
                        end_time - pscheduler.time_now())

                if time_left is not None and time_left < 0:
                    # The deadline has passed.  select() rejects
                    # negative timeouts, so don't call it.
                    reads = []
                else:
                    reads, _, _ = select.select(fds, [], [], time_left)

                if len(reads) == 0:
                    # Timed out.  Don't leave the child running.
                    try:
                        process.kill()
                        process.wait()
                    except OSError:
                        pass  # Can't kill things that change UID
                    __running_drop(process)
                    return 2, None, "Process took too long to run."

                for fd in reads:
                    if fd == stdout_fileno:
                        line = process.stdout.readline()
                        if line != '':
                            # Drop the last character of the line,
                            # normally the newline.
                            line_call(line[:-1])
                    elif fd == stderr_fileno:
                        line = process.stderr.readline()
                        if line != '':
                            stderr += line

                if process.poll() is not None:
                    break

            process.wait()

            status = process.returncode
            stdout = None

    except Exception as ex:
        extype, _, trace = sys.exc_info()
        status = 2
        stdout = ''
        stderr = ''.join(traceback.format_exception_only(extype, ex)) \
            + ''.join(traceback.format_exception(extype, ex, trace)).strip()

    if process is not None:
        __running_drop(process)

    if fail_message is not None and status != 0:
        pscheduler.fail("%s: %s" % (fail_message, stderr))

    return status, stdout, stderr