Beispiel #1
0
def worker_loop():
    ''' Pulls a job from the queue and runs it invoking run_job.py.

    Steps:
      1. Fetch up to 10 entries from jobs_queued and try to acquire one,
         walking the keys in descending order.
      2. Move the acquired job to jobs_running and mark this worker RUNNING.
      3. Spawn run_job.py in a subprocess and wait for it, terminating it on
         timeout, when sigterm is set, or when the polled job status is no
         longer 'STARTED'.
      4. Write the final status, exit code and end time back to the DB.

    Returns None in all cases (including "nothing queued" and "could not
    acquire any job").
    '''
    uri = '%s/jobs_queued.json?orderBy="$key"&limitToLast=10' % DB
    jobs = req('GET', uri)
    if not jobs:
        return

    # Transactionally acquire a job. Deal with races (two workers trying to
    # acquire the same job).
    job = None
    job_id = None
    for job_id in sorted(jobs.keys(), reverse=True):
        job = try_acquire_job(job_id)
        if job is not None:
            break
        logging.info('Raced while trying to acquire job %s, retrying', job_id)
        # Random 0-2 s back-off before trying the next candidate.
        time.sleep(int(random.random() * 3))
    if job is None:
        logging.error('Failed to acquire a job')
        return

    logging.info('Starting job %s', job_id)

    # Update the db, move the job to the running queue.
    patch_obj = {
        'jobs_queued/' + job_id: {},  # = DELETE
        'jobs_running/' + job_id: {
            'worker': WORKER_NAME
        },
        'workers/' + WORKER_NAME: make_worker_obj('RUNNING', job_id=job_id)
    }
    req('PATCH', '%s.json' % DB, body=patch_obj)

    cmd = [os.path.join(CUR_DIR, 'run_job.py'), job_id]

    # Start from the worker's own environment and overlay the job-specific
    # vars (stringified, since environment values must be strings).
    env = dict(os.environ, **{k: str(v) for (k, v) in job['env'].items()})

    # Run the job in a python subprocess, to isolate the main loop from logs
    # uploader failures.
    job_runner = subprocess.Popen(cmd, env=env)

    res = None
    cancelled = False
    timed_out = False
    time_started = time.time()
    time_last_db_poll = time_started
    polled_status = 'STARTED'
    while res is None:
        time.sleep(0.25)
        res = job_runner.poll()
        now = time.time()
        if now - time_last_db_poll > 10:  # Throttle DB polling.
            polled_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
            time_last_db_poll = now
        # Signal the timeout only once: the condition stays true on every
        # later iteration, and without the guard the log line and terminate()
        # would repeat every 250 ms until the child exits.
        if now - time_started > JOB_TIMEOUT_SEC and not timed_out:
            logging.info('Job %s timed out, terminating', job_id)
            timed_out = True
            job_runner.terminate()
        if (sigterm.is_set() or polled_status != 'STARTED') and not cancelled:
            logging.info('Job %s cancelled, terminating', job_id)
            cancelled = True
            job_runner.terminate()

    # Precedence: worker interrupted > job cancelled > timeout > exit code.
    status = (
        'INTERRUPTED' if sigterm.is_set() else 'CANCELLED' if cancelled else
        'TIMED_OUT' if timed_out else 'COMPLETED' if res == 0 else 'FAILED')
    logging.info('Job %s %s with code %s', job_id, status, res)

    # Update the DB, unless the job has been cancelled. The "is not None"
    # condition deals with a very niche case, that is, avoid creating a partial
    # job entry after doing a full clear of the DB (which is super rare, happens
    # only when re-deploying the CI).
    if polled_status is not None:
        patch = {
            'jobs/%s/status' % job_id: status,
            'jobs/%s/exit_code' % job_id: {} if res is None else res,
            'jobs/%s/time_ended' % job_id: utc_now_iso(),
            'jobs_running/%s' % job_id: {},  # = DELETE
        }
        req('PATCH', '%s.json' % (DB), body=patch)
Beispiel #2
0
def create_stackdriver_metric_definitions():
  '''POSTs every entry of STACKDRIVER_METRICS to the metricDescriptors API.

  Each entry is sent as the request body of its own POST; the entry's key is
  only used for logging.
  '''
  logging.info('Creating Stackdriver metric definitions')
  # .items() instead of .iteritems(): the latter does not exist on Python 3,
  # while .items() works on both Python 2 and 3.
  for name, metric in STACKDRIVER_METRICS.items():
    logging.info('Creating metric %s', name)
    req('POST', STACKDRIVER_API + '/metricDescriptors', body=metric)
Beispiel #3
0
def update_queue_metrics(handler):
  '''Publishes the number of queued + running jobs as 'ci_job_queue_len'.

  Per the original comment, this is the stackdriver metric that drives the
  autoscaler. The handler argument is unused.
  '''
  endpoints = ('/jobs_queued.json?shallow=true',
               '/jobs_running.json?shallow=true')
  # An empty/None response counts as zero entries.
  snapshots = [req('GET', DB + endpoint) or {} for endpoint in endpoints]
  queue_len = sum(len(snapshot) for snapshot in snapshots)
  write_metrics({'ci_job_queue_len': {'v': queue_len}})
Beispiel #4
0
def delete_job_logs(handler):
  '''Issues a DELETE for the logs entry of the request's 'job_id'.'''
  job_id = handler.request.get('job_id')
  logs_uri = '%s/logs/%s.json' % (DB, job_id)
  req('DELETE', logs_uri)
Beispiel #5
0
def delete_expired_logs(handler):
  '''Defers deletion of every logs entry older than LOGS_TTL_DAYS.

  The age of an entry is derived from the first 8 characters of its key,
  parsed as a YYYYMMDD date. The handler argument is unused.
  '''
  logs = req('GET', '%s/logs.json?shallow=true' % (DB)) or {}
  now = datetime.now()  # Loop-invariant: one reference time for the sweep.
  # Iterate the dict directly: .iterkeys() does not exist on Python 3.
  for job_id in logs:
    age_days = (now - datetime.strptime(job_id[:8], '%Y%m%d')).days
    if age_days > LOGS_TTL_DAYS:
      # NOTE(review): presumably schedules delete_job_logs asynchronously;
      # confirm against defer()'s definition.
      defer('delete_job_logs', job_id=job_id)
Beispiel #6
0
def check_pending_cls(handler):
  '''Defers a 'check_pending_cl' call for every key under cls_pending.

  Per the original comment, the purpose is to check if any pending CL has
  completed (all jobs are done) and, if so, publish the comment and vote on
  the CL; that work is done by the deferred call, not here. The handler
  argument is unused.
  '''
  pending_cls = req('GET', '%s/cls_pending.json' % DB) or {}
  # Only the keys are needed; iterating the dict directly also avoids
  # .iteritems(), which does not exist on Python 3.
  for cl_and_ps in pending_cls:
    defer('check_pending_cl', cl_and_ps=cl_and_ps)