def kill_stalled_processes(dryrun=True):
    """
    Due to a bug in the Django|Postgres backend, the `manage.py cron` process
    will occasionally hang even though all processes have been marked
    completed. We compare all recorded PIDs against those still running, and
    kill any associated with completed jobs.
    """
    pids = set(map(int, Job.objects
        .filter(is_running=False, current_pid__isnull=False)
        .exclude(current_pid='')
        .values_list('current_pid', flat=True)))
    for pid in pids:
        try:
            if utils.pid_exists(pid):  # and not utils.get_cpu_usage(pid):
                p = psutil.Process(pid)
                cmd = ' '.join(p.cmdline())
                if 'manage.py cron' in cmd:
                    jobs = Job.objects.filter(current_pid=pid)
                    job = jobs[0] if jobs else None
                    utils.smart_print('Killing process %s associated with %s.' % (pid, job))
                    if not dryrun:
                        utils.kill_process(pid)
                else:
                    print('PID %s is not a cron process.' % pid)
            else:
                print('PID %s is dead.' % pid)
        except psutil.NoSuchProcess:
            print('PID %s does not exist.' % pid)
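# ---------------------------------------------------------------------------
# Illustrative sketch only: the helpers used above (utils.pid_exists and
# utils.kill_process) are provided by the project's utils module. The
# POSIX-only functions below are a minimal sketch of the behavior this code
# assumes, not the actual implementations, and the *_sketch names are
# hypothetical.
# ---------------------------------------------------------------------------
import errno
import os
import signal
import time


def pid_exists_sketch(pid):
    """Return True if a process with the given PID appears to be running."""
    try:
        os.kill(pid, 0)  # Signal 0 checks existence/permissions without signaling.
    except OSError as e:
        # EPERM means the process exists but belongs to another user.
        return e.errno == errno.EPERM
    return True


def kill_process_sketch(pid, wait_seconds=5):
    """Terminate a process, escalating to SIGKILL if SIGTERM is ignored."""
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError:
        return True  # Process already gone.
    for _ in range(wait_seconds):
        if not pid_exists_sketch(pid):
            return True
        time.sleep(1)
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError:
        pass
    return not pid_exists_sketch(pid)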
def run_cron(jobs=None, **kwargs):
    update_heartbeat = kwargs.pop('update_heartbeat', True)
    force_run = kwargs.pop('force_run', False)
    dryrun = kwargs.pop('dryrun', False)
    clear_pid = kwargs.pop('clear_pid', False)
    sync = kwargs.pop('sync', False)
    try:
        # TODO: auto-kill inactive long-running cron processes whose
        # threads have stalled and not exited properly?
        # Check for 0 cpu usage.
        # ps -p <pid> -o %cpu
        stdout_map = defaultdict(list)  # {proc_id: [text]}
        stderr_map = defaultdict(list)  # {proc_id: [text]}
        stdout_queue = Queue()
        stderr_queue = Queue()

        if _settings.CHRONIKER_AUTO_END_STALE_JOBS and not dryrun:
            Job.objects.end_all_stale()

        # Check the PID file to prevent conflicts with prior executions.
        # TODO: is this still necessary? deprecate? As long as jobs run by
        # JobProcess don't wait for other jobs, multiple instances of cron
        # should be able to run simultaneously without issue.
        if _settings.CHRONIKER_USE_PID:
            pid_fn = _settings.CHRONIKER_PID_FN
            pid = str(os.getpid())
            any_running = Job.objects.all_running().count()
            if not any_running:
                # If no jobs are running, then even if the PID file exists,
                # it must be stale, so ignore it.
                pass
            elif os.path.isfile(pid_fn):
                try:
                    with open(pid_fn, 'r') as fin:
                        old_pid = int(fin.read())
                    if utils.pid_exists(old_pid):
                        print('%s already exists, exiting' % pid_fn)
                        sys.exit()
                    else:
                        print(('%s already exists, but contains a stale '
                            'PID, continuing') % pid_fn)
                except (ValueError, TypeError):
                    pass
            with open(pid_fn, 'w') as fout:
                fout.write(pid)
            clear_pid = True

        procs = []
        if force_run:
            q = Job.objects.all()
            if jobs:
                q = q.filter(id__in=jobs)
        else:
            q = Job.objects.due_with_met_dependencies_ordered(jobs=jobs)

        running_ids = set()
        for job in q:

            # This is necessary, otherwise we get the exception
            # "DatabaseError: SSL error: sslv3 alert bad record mac"
            # even though we're not using SSL...
            # We work around this by forcing Django to use separate
            # connections for each process by explicitly closing the
            # current connection.
            connection.close()

            # Re-check dependencies to incorporate any previous iterations
            # that marked jobs as running, potentially causing dependencies
            # to become unmet.
            Job.objects.update()
            job = Job.objects.get(id=job.id)
            if not force_run and not job.is_due_with_dependencies_met(running_ids=running_ids):
                utils.smart_print(
                    u'Job {} {} is due but has unmet dependencies.'.format(job.id, job))
                continue

            # Immediately mark the job as running so the next jobs can
            # update their dependency check.
            utils.smart_print(u'Running job {} {}.'.format(job.id, job))
            running_ids.add(job.id)
            if dryrun:
                continue
            job.is_running = True
            Job.objects.filter(id=job.id).update(is_running=job.is_running)

            # Launch the job.
            if sync:
                # Run the job synchronously.
                run_job(
                    job,
                    update_heartbeat=update_heartbeat,
                    stdout_queue=stdout_queue,
                    stderr_queue=stderr_queue,
                    force_run=force_run or job.force_run,
                )
            else:
                # Run the job asynchronously.
                job_func = partial(
                    run_job,
                    job=job,
                    force_run=force_run or job.force_run,
                    update_heartbeat=update_heartbeat,
                    name=str(job),
                )
                proc = JobProcess(
                    job=job,
                    max_seconds=job.timeout_seconds,
                    target=job_func,
                    name=str(job),
                    kwargs=dict(
                        stdout_queue=stdout_queue,
                        stderr_queue=stderr_queue,
                    ),
                )
                proc.start()
                procs.append(proc)

        if not dryrun:
            print("%d Jobs are due." % len(procs))

            # Wait for all job processes to complete.
            while procs:

                while not stdout_queue.empty():
                    proc_id, proc_stdout = stdout_queue.get()
                    stdout_map[proc_id].append(proc_stdout)

                while not stderr_queue.empty():
                    proc_id, proc_stderr = stderr_queue.get()
                    stderr_map[proc_id].append(proc_stderr)

                for proc in list(procs):
                    if not proc.is_alive():
                        print('Process %s ended.' % (proc,))
                        procs.remove(proc)
                    elif proc.is_expired:
                        print('Process %s expired.' % (proc,))
                        proc_id = proc.pid
                        proc.terminate()
                        run_end_datetime = timezone.now()
                        procs.remove(proc)

                        connection.close()
                        Job.objects.update()
                        j = Job.objects.get(id=proc.job.id)
                        run_start_datetime = j.last_run_start_timestamp

                        proc.job.is_running = False
                        proc.job.force_run = False
                        proc.job.force_stop = False
                        proc.job.save()

                        # Create log record since the job was killed before it had
                        # a chance to do so.
                        Log.objects.create(
                            job=proc.job,
                            run_start_datetime=run_start_datetime,
                            run_end_datetime=run_end_datetime,
                            success=False,
                            on_time=False,
                            hostname=socket.gethostname(),
                            stdout=''.join(stdout_map[proc_id]),
                            stderr=''.join(stderr_map[proc_id] + ['Job exceeded timeout\n']),
                        )

                time.sleep(1)

            print('!' * 80)
            print('All jobs complete!')
    finally:
        if _settings.CHRONIKER_USE_PID and os.path.isfile(pid_fn) and clear_pid:
            os.unlink(pid_fn)
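# Rough sketch of the interface run_cron() expects from JobProcess: a
# multiprocessing.Process subclass that remembers when it was started and
# reports is_expired once max_seconds have elapsed. The project's real
# JobProcess is defined elsewhere and may differ; the class below exists only
# to document the assumed contract and is not used by the code above.
import multiprocessing
import time


class TimeoutProcessSketch(multiprocessing.Process):

    def __init__(self, job=None, max_seconds=0, **kwargs):
        super(TimeoutProcessSketch, self).__init__(**kwargs)
        self.job = job
        self.max_seconds = max_seconds
        self.start_time = None

    def start(self):
        # Record the wall-clock start time so is_expired can be computed.
        self.start_time = time.time()
        return super(TimeoutProcessSketch, self).start()

    @property
    def is_expired(self):
        # A max_seconds of 0 (or an unstarted process) means "never expires".
        if not self.max_seconds or self.start_time is None:
            return False
        return (time.time() - self.start_time) > self.max_seconds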
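# Minimal usage sketch: how run_cron() might be exposed as a `manage.py cron`
# management command. The class below and its option names are illustrative
# assumptions, not the project's actual command definition; in a real command
# module the class would be named `Command` and live under
# management/commands/.
from django.core.management.base import BaseCommand


class CronCommandSketch(BaseCommand):
    help = 'Runs all scheduled jobs that are due.'

    def add_arguments(self, parser):
        parser.add_argument('--dryrun', action='store_true', default=False,
            help='Report which jobs would run without launching them.')
        parser.add_argument('--force-run', action='store_true', default=False,
            help='Run the matching jobs even if they are not yet due.')
        parser.add_argument('--sync', action='store_true', default=False,
            help='Run jobs one at a time in the current process.')
        parser.add_argument('--jobs', default='',
            help='Comma-delimited list of job ids to restrict the run to.')

    def handle(self, *args, **options):
        job_ids = [int(_) for _ in options['jobs'].split(',') if _.strip()]
        run_cron(
            jobs=job_ids or None,
            dryrun=options['dryrun'],
            force_run=options['force_run'],
            sync=options['sync'],
        )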