def worker_loop():
  '''Pulls a job from the queue and runs it by invoking run_job.py.'''
  uri = '%s/jobs_queued.json?orderBy="$key"&limitToLast=10' % DB
  jobs = req('GET', uri)
  if not jobs:
    return

  # Transactionally acquire a job. Deal with races (two workers trying to
  # acquire the same job).
  job = None
  job_id = None
  for job_id in sorted(jobs.keys(), reverse=True):
    job = try_acquire_job(job_id)
    if job is not None:
      break
    logging.info('Raced while trying to acquire job %s, retrying', job_id)
    time.sleep(int(random.random() * 3))
  if job is None:
    logging.error('Failed to acquire a job')
    return

  logging.info('Starting job %s', job_id)

  # Update the db, move the job to the running queue.
  patch_obj = {
      'jobs_queued/' + job_id: {},  # = DELETE
      'jobs_running/' + job_id: {
          'worker': WORKER_NAME
      },
      'workers/' + WORKER_NAME: make_worker_obj('RUNNING', job_id=job_id)
  }
  req('PATCH', '%s.json' % DB, body=patch_obj)

  cmd = [os.path.join(CUR_DIR, 'run_job.py'), job_id]

  # Propagate the worker's PERFETTO_ vars and merge with the job-specific vars.
  env = dict(os.environ, **{k: str(v) for (k, v) in job['env'].items()})

  # Run the job in a python subprocess, to isolate the main loop from logs
  # uploader failures.
  job_runner = subprocess.Popen(cmd, env=env)

  res = None
  cancelled = False
  timed_out = False
  time_started = time.time()
  time_last_db_poll = time_started
  polled_status = 'STARTED'
  while res is None:
    time.sleep(0.25)
    res = job_runner.poll()
    now = time.time()
    if now - time_last_db_poll > 10:  # Throttle DB polling.
      polled_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
      time_last_db_poll = now
    if now - time_started > JOB_TIMEOUT_SEC:
      logging.info('Job %s timed out, terminating', job_id)
      timed_out = True
      job_runner.terminate()
    if (sigterm.is_set() or polled_status != 'STARTED') and not cancelled:
      logging.info('Job %s cancelled, terminating', job_id)
      cancelled = True
      job_runner.terminate()

  status = ('INTERRUPTED' if sigterm.is_set() else
            'CANCELLED' if cancelled else
            'TIMED_OUT' if timed_out else
            'COMPLETED' if res == 0 else 'FAILED')
  logging.info('Job %s %s with code %s', job_id, status, res)

  # Update the DB, unless the job has been cancelled. The "is not None"
  # condition deals with a very niche case, that is, avoid creating a partial
  # job entry after doing a full clear of the DB (which is super rare, happens
  # only when re-deploying the CI).
  if polled_status is not None:
    patch = {
        'jobs/%s/status' % job_id: status,
        'jobs/%s/exit_code' % job_id: {} if res is None else res,
        'jobs/%s/time_ended' % job_id: utc_now_iso(),
        'jobs_running/%s' % job_id: {},  # = DELETE
    }
    req('PATCH', '%s.json' % DB, body=patch)
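

# try_acquire_job() is defined elsewhere in this module. The sketch below is
# only an illustration of how the "transactional acquire" mentioned above
# could be done against the Firebase Realtime Database REST API: read the job
# together with its ETag, then issue a conditional PUT so that only one of two
# racing workers succeeds (a stale ETag gets HTTP 412). The function name, the
# 'QUEUED'/'STARTED' status values and the direct use of urllib2 (auth
# handling omitted) are assumptions, not the module's actual implementation.
import json
import urllib2


def try_acquire_job_sketch(job_id):
  '''Hypothetical: flips jobs/<job_id>/status from QUEUED to STARTED.'''
  uri = '%s/jobs/%s.json' % (DB, job_id)
  # Fetch the current job object and its ETag.
  get_req = urllib2.Request(uri, headers={'X-Firebase-ETag': 'true'})
  res = urllib2.urlopen(get_req)
  etag = res.info().getheader('ETag')
  job = json.loads(res.read())
  if job is None or job.get('status') != 'QUEUED':
    return None  # The job was deleted or already taken by another worker.
  job['status'] = 'STARTED'
  # Write back only if the entry hasn't changed since we read it.
  put_req = urllib2.Request(uri, data=json.dumps(job),
                            headers={'if-match': etag})
  put_req.get_method = lambda: 'PUT'
  try:
    urllib2.urlopen(put_req)
  except urllib2.HTTPError as e:
    if e.code == 412:  # Precondition failed: another worker raced us.
      return None
    raise
  return job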


def create_stackdriver_metric_definitions():
  logging.info('Creating Stackdriver metric definitions')
  for name, metric in STACKDRIVER_METRICS.iteritems():
    logging.info('Creating metric %s', name)
    req('POST', STACKDRIVER_API + '/metricDescriptors', body=metric)
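

# For reference, each value in STACKDRIVER_METRICS is a metricDescriptor
# object as accepted by the Monitoring v3 API (the POST to
# .../metricDescriptors above). The entry below only illustrates the expected
# shape; the actual definitions live elsewhere in this module and may differ.
STACKDRIVER_METRICS_EXAMPLE = {
    'ci_job_queue_len': {
        'type': 'custom.googleapis.com/ci_job_queue_len',
        'metricKind': 'GAUGE',
        'valueType': 'INT64',
        'description': 'Number of CI jobs queued or running',
    },
}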


def update_queue_metrics(handler):
  # Update the stackdriver metric that will drive the autoscaler.
  queued = req('GET', DB + '/jobs_queued.json?shallow=true') or {}
  running = req('GET', DB + '/jobs_running.json?shallow=true') or {}
  write_metrics({'ci_job_queue_len': {'v': len(queued) + len(running)}})
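

# write_metrics() is defined elsewhere. Given the {'metric_name': {'v': N}}
# shape used above, a minimal sketch is to turn each entry into a Monitoring
# v3 timeSeries write. The 'global' resource type and the reuse of req() and
# utc_now_iso() here are assumptions for illustration only.
def write_metrics_sketch(metric_dict):
  now = utc_now_iso()
  body = {'timeSeries': []}
  for name, spec in metric_dict.iteritems():
    body['timeSeries'].append({
        'metric': {'type': 'custom.googleapis.com/' + name},
        'resource': {'type': 'global'},
        'points': [{
            'interval': {'endTime': now},
            'value': {'int64Value': str(spec['v'])},
        }],
    })
  req('POST', STACKDRIVER_API + '/timeSeries', body=body)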


def delete_job_logs(handler):
  req('DELETE', '%s/logs/%s.json' % (DB, handler.request.get('job_id')))


def delete_expired_logs(handler):
  logs = req('GET', '%s/logs.json?shallow=true' % DB) or {}
  for job_id in logs.iterkeys():
    age_days = (datetime.now() - datetime.strptime(job_id[:8], '%Y%m%d')).days
    if age_days > LOGS_TTL_DAYS:
      defer('delete_job_logs', job_id=job_id)


def check_pending_cls(handler):
  # Check if any pending CL has completed (all jobs are done). If so publish
  # the comment and vote on the CL.
  pending_cls = req('GET', '%s/cls_pending.json' % DB) or {}
  for cl_and_ps, _ in pending_cls.iteritems():
    defer('check_pending_cl', cl_and_ps=cl_and_ps)
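

# defer() is provided elsewhere in this module. Since these handlers take a
# webapp-style `handler` argument and run on App Engine, one plausible
# implementation is a thin wrapper around the task queue that re-enters the
# app on a per-action URL. The queue name and the '/controller/<action>'
# routing below are assumptions, not the actual implementation.
def defer_sketch(action, **kwargs):
  from google.appengine.api import taskqueue
  taskqueue.add(queue_name='default', url='/controller/' + action,
                params=kwargs)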