Example #1
def write_metrics(metric_dict):
    now = utc_now_iso()
    desc = {'timeSeries': []}
    for key, spec in metric_dict.iteritems():
        desc['timeSeries'] += [{
            'metric': {
                'type': STACKDRIVER_METRICS[key]['type'],
                'labels': spec.get('l', {})
            },
            'resource': {
                'type': 'global'
            },
            'points': [{
                'interval': {
                    'endTime': now
                },
                'value': {
                    'int64Value': str(spec['v'])
                }
            }]
        }]
    try:
        req('POST', STACKDRIVER_API + '/timeSeries', body=desc)
    except Exception as e:
        # Metric updates can easily fail due to Stackdriver API limitations.
        msg = str(e)
        if 'written more frequently than the maximum sampling' not in msg:
            logging.error('Metrics update failed: %s', msg)
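
The snippet relies on a utc_now_iso() helper and on a particular shape of metric_dict, neither of which is shown. A minimal sketch, assuming the RFC 3339 timestamp format that the Stackdriver interval.endTime field expects and inferring the 'v'/'l' keys from the loop above:

from datetime import datetime


def utc_now_iso(utc_now=None):
    # Assumed helper (its real definition is not shown here): an RFC 3339 /
    # ISO 8601 UTC timestamp with a 'Z' suffix.
    return (utc_now or datetime.utcnow()).strftime('%Y-%m-%dT%H:%M:%SZ')


# Hypothetical input, inferred from the loop above: each entry carries a value
# under 'v' and optional labels under 'l'. The metric name is made up.
example_metrics = {'ci_job_queue_len': {'v': 42, 'l': {'queue': 'postsubmit'}}}
# write_metrics(example_metrics)
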
Example #2
def make_worker_obj(status, job_id=None):
    return {
        'job_id': job_id,
        'status': status,
        'last_update': utc_now_iso(),
        'host': os.getenv('WORKER_HOST', '')
    }
Example #3
def check_pending_cl(handler):
  # This function can be called twice on the same CL, e.g., when the
  # Presubmit-Ready label is applied after we have finished running all the
  # jobs (we run presubmit regardless; only the voting is gated on the label).
  cl_and_ps = handler.request.get('cl_and_ps')
  cl_obj = req('GET', '%s/cls/%s.json' % (DB, cl_and_ps))
  all_jobs = cl_obj.get('jobs', {}).keys()
  pending_jobs = []
  for job_id in all_jobs:
    job_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
    if job_status in ('QUEUED', 'STARTED'):
      pending_jobs.append(job_id)

  if pending_jobs:
    # If the CL has been pending for too long, cancel all its jobs. Upon the next
    # scan it will be deleted and optionally voted on.
    t_queued = parse_iso_time(cl_obj['time_queued'])
    age_sec = (datetime.utcnow() - t_queued).total_seconds()
    if age_sec > CL_TIMEOUT_SEC:
      logging.warning('Canceling %s, it has been pending for too long (%s sec)',
                      cl_and_ps, int(age_sec))
      for pending_job in pending_jobs:
        defer('cancel_job', job_id=pending_job)
    return

  logging.info('All jobs completed for CL %s', cl_and_ps)

  # Remove the CL from the pending queue and update end time.
  patch_obj = {
      'cls_pending/%s' % cl_and_ps: {},  # = DELETE
      'cls/%s/time_ended' % cl_and_ps: cl_obj.get('time_ended', utc_now_iso()),
  }
  req('PATCH', '%s.json' % DB, body=patch_obj)
  defer('update_cl_metrics', src='cls/' + cl_and_ps)
  for job in all_jobs:
    defer('update_job_metrics', job_id=job)
  if cl_obj.get('wants_vote'):
    defer('comment_and_vote_cl', cl_and_ps=cl_and_ps)
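
check_pending_cl() parses the time_queued field with parse_iso_time(), which is also not shown. A plausible sketch, assuming it simply inverts the utc_now_iso() format used when the CL was enqueued:

from datetime import datetime


def parse_iso_time(iso_str):
    # Assumed counterpart of utc_now_iso(): turns the 'Z'-suffixed timestamps
    # stored in the DB back into a naive UTC datetime.
    return datetime.strptime(iso_str, '%Y-%m-%dT%H:%M:%SZ')
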
Example #4
def queue_postsubmit_jobs(handler):
    '''Creates the job entries in the DB for the given branch or revision.

    Can be called in two modes:
      1. ?branch=master: retrieves the SHA1 of the head of the branch and
         re-enqueues itself in mode 2 with that revision.
      2. ?branch=master&revision=deadbeef1234: queues jobs for the given revision.
    '''
    prj = urllib.quote(GERRIT_PROJECT, '')
    branch = handler.request.get('branch')
    revision = handler.request.get('revision')
    assert branch

    if not revision:
        # Get the commit SHA1 of the head of the branch.
        url = 'https://%s/a/projects/%s/branches/%s' % (GERRIT_HOST, prj,
                                                        branch)
        revision = req('GET', url, gerrit=True)['revision']
        assert revision
        defer('queue_postsubmit_jobs', branch=branch, revision=revision)
        return

    # Get the committer datetime for the given revision.
    url = 'https://%s/a/projects/%s/commits/%s' % (GERRIT_HOST, prj, revision)
    commit_info = req('GET', url, gerrit=True)
    time_committed = commit_info['committer']['date'].split('.')[0]
    time_committed = datetime.strptime(time_committed, '%Y-%m-%d %H:%M:%S')

    # Enqueue jobs.
    src = 'branches/%s-%s' % (branch, time_committed.strftime('%Y%m%d%H%M%S'))
    now = datetime.utcnow()
    patch_obj = {
        src: {
            'rev': revision,
            'subject': commit_info['subject'][:100],
            'author': commit_info['author'].get('email', 'N/A'),
            'time_committed': utc_now_iso(time_committed),
            'time_queued': utc_now_iso(),
            'jobs': {},
        }
    }
    ref = 'refs/heads/' + branch
    append_jobs(patch_obj, src, ref, now)
    req('PATCH', DB + '.json', body=patch_obj)
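
The req(..., gerrit=True) calls above presumably account for a Gerrit REST API quirk: JSON responses are prefixed with the XSSI guard )]}' which must be stripped before decoding. A standalone sketch of that step (not the project's actual req() helper):

import json


def parse_gerrit_response(raw_body):
    # Gerrit prefixes REST responses with ")]}'" on the first line to defeat
    # XSSI attacks; strip it before JSON-decoding.
    prefix = ")]}'"
    if raw_body.startswith(prefix):
        raw_body = raw_body[len(prefix):]
    return json.loads(raw_body)
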
Example #5
def delete_stale_jobs(handler):
    '''Deletes jobs that are left in the running queue for too long.

    This is usually due to a crash in the VM that handles them.
    '''
    running_jobs = req('GET', '%s/jobs_running.json?shallow=true' % (DB)) or {}
    for job_id in running_jobs.iterkeys():
        job = req('GET', '%s/jobs/%s.json' % (DB, job_id))
        time_started = parse_iso_time(job.get('time_started', utc_now_iso()))
        age = (datetime.utcnow() - time_started).total_seconds()
        if age > JOB_TIMEOUT_SEC * 2:
            defer('cancel_job', job_id=job_id)
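
The ?shallow=true query asks the Firebase Realtime Database to return only the top-level keys (each mapped to true) instead of the full job objects, which is why every job is then fetched individually. An illustrative response shape (the job IDs are hypothetical, following the format produced by append_jobs() further below):

running_jobs_example = {
    '20190205120000--cls-1234-1--linux': True,
    '20190205120000--cls-1234-1--android': True,
}
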
Example #6
def check_new_cl(handler):
    '''Creates the CL + jobs entries in the DB for the given CL if it doesn't exist.

    If it exists, checks whether a Presubmit-Ready label has been added and, if
    so, updates it with the message + vote.
    '''
    change_id = handler.request.get('change_id')
    rev_hash = handler.request.get('rev_hash')
    cl = handler.request.get('cl')
    patchset = handler.request.get('patchset')
    ref = handler.request.get('ref')
    wants_vote = handler.request.get('wants_vote') == '1'

    # We want to do two things here:
    # 1) If the CL doesn't exist (hence vote_prop is None) carry on below and
    #    enqueue jobs for it.
    # 2) If the CL exists, we don't need to kick off new jobs. However, the user
    #    might have added a Presubmit-Ready label after we created the CL. In
    #    this case update the |wants_vote| flag and return.
    vote_prop = req('GET', '%s/cls/%s-%s/wants_vote.json' % (DB, cl, patchset))
    if vote_prop is not None:
        if vote_prop != wants_vote and wants_vote:
            logging.info('Updating wants_vote flag on %s-%s', cl, patchset)
            req('PUT',
                '%s/cls/%s-%s/wants_vote.json' % (DB, cl, patchset),
                body=True)
            # If the label is applied after all the jobs have finished, jump
            # straight to the voting.
            defer('check_pending_cl', cl_and_ps='%s-%s' % (cl, patchset))
        return

    # This is the first time we see this patchset, enqueue jobs for it.

    # Dequeue jobs for older patchsets, if any.
    defer('cancel_older_jobs', cl=cl, patchset=patchset)

    src = 'cls/%s-%s' % (cl, patchset)
    # Enqueue jobs for the latest patchset.
    patch_obj = {}
    patch_obj['cls_pending/%s-%s' % (cl, patchset)] = 0
    patch_obj[src] = {
        'change_id': change_id,
        'revision_id': rev_hash,
        'time_queued': utc_now_iso(),
        'jobs': {},
        'wants_vote': wants_vote,
    }
    append_jobs(patch_obj, src, ref)
    req('PATCH', DB + '.json', body=patch_obj)
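
Taken together, the paths used in these handlers imply a DB layout roughly like the following. This is an inferred, illustrative sketch (the concrete values and the '<job_id>' placeholder are made up), not an authoritative schema:

db_layout_example = {
    'cls': {'1234-1': {'change_id': 'I1234abcd', 'revision_id': 'deadbeef1234',
                       'time_queued': '2019-02-05T12:00:00Z',
                       'wants_vote': True, 'jobs': {'<job_id>': 0}}},
    'cls_pending': {'1234-1': 0},
    'branches': {'master-20190205120000': {'rev': 'deadbeef1234', 'jobs': {}}},
    'jobs': {'<job_id>': {'status': 'QUEUED', 'src': 'cls/1234-1'}},
    'jobs_queued': {'<job_id>': 0},
    'jobs_running': {'<job_id>': {'worker': 'worker-1'}},
    'workers': {'worker-1': {'status': 'RUNNING', 'job_id': '<job_id>'}},
}
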
Example #7
def cancel_job(handler):
    '''Cancels a job if it is not completed or failed.

    This function is racy: workers can complete the queued jobs while we mark
    them as cancelled. The result of such a race is still acceptable.
    '''
    job_id = handler.request.get('job_id')
    status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
    patch_obj = {
        'jobs_running/%s' % job_id: {},  # = DELETE,
        'jobs_queued/%s' % job_id: {},  # = DELETE,
    }
    if status in ('QUEUED', 'STARTED'):
        patch_obj['jobs/%s/status' % job_id] = 'CANCELLED'
        patch_obj['jobs/%s/time_ended' % job_id] = utc_now_iso()
    req('PATCH', DB + '.json', body=patch_obj)
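
The "# = DELETE" comments presumably rely on how the Realtime Database treats the empty-object values: it does not store empty objects, so PATCHing {} onto a key removes it, just as a null value would. A more explicit equivalent of the same patch, assuming req() JSON-encodes the body:

job_id = '20190205120000--cls-1234-1--linux'  # Hypothetical job ID.
patch_obj = {
    'jobs_running/%s' % job_id: None,  # null deletes the key on PATCH.
    'jobs_queued/%s' % job_id: None,
}
# req('PATCH', DB + '.json', body=patch_obj)
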
Example #8
def try_acquire_job(job_id):
    '''Transactionally acquires the given job.

    Returns the job JSON object if it managed to acquire the job and move it
    into the STARTED state, or None if another worker got there first.
    '''
    logging.debug('Trying to acquire job %s', job_id)

    uri = '%s/jobs/%s.json' % (DB, job_id)
    job, etag = req('GET', uri, req_etag=True)
    if job['status'] != 'QUEUED':
        return None  # Somebody else took it
    try:
        job['status'] = 'STARTED'
        job['time_started'] = utc_now_iso()
        job['worker'] = WORKER_NAME
        req('PUT', uri, body=job, etag=etag)
        return job
    except ConcurrentModificationError:
        return None
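
The req_etag=True / etag= arguments map naturally onto Firebase's conditional-request support: a GET with the X-Firebase-ETag header returns an ETag, and a subsequent PUT with if-match fails with HTTP 412 if the data changed in between. A hedged, standalone sketch of that mechanism (not the project's actual req() implementation):

import json
import urllib2


class ConcurrentModificationError(Exception):
    pass


def get_with_etag(uri):
    # Ask the Firebase REST API to return an ETag along with the data.
    request = urllib2.Request(uri, headers={'X-Firebase-ETag': 'true'})
    response = urllib2.urlopen(request)
    return json.loads(response.read()), response.info().getheader('ETag')


def put_if_unchanged(uri, body, etag):
    # Conditional write: Firebase answers 412 Precondition Failed if the data
    # changed since the ETag was read.
    request = urllib2.Request(uri, data=json.dumps(body),
                              headers={'if-match': etag})
    request.get_method = lambda: 'PUT'
    try:
        urllib2.urlopen(request)
    except urllib2.HTTPError as e:
        if e.code == 412:
            raise ConcurrentModificationError()
        raise
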
Example #9
def append_jobs(patch_obj, src, git_ref, now=None):
    '''Creates the worker jobs (defined in config.py) for the given CL.

    Jobs are keyed by timestamp-cl-patchset-config to get a fair schedule
    (workers pull jobs ordered by the key above).
    It doesn't write directly into the DB; it just appends keys to the passed
    |patch_obj|, so the whole set of CL descriptor + jobs can be added
    atomically to the datastore.
    src: e.g., cls/1234-1 (CL and patchset number) or branches/<branch>-<timestamp>.
    '''
    logging.info('Enqueueing jobs for cl %s', src)
    timestamp = (now or datetime.utcnow()).strftime('%Y%m%d%H%M%S')
    for cfg_name, env in JOB_CONFIGS.iteritems():
        job_id = '%s--%s--%s' % (timestamp, src.replace('/', '-'), cfg_name)
        logging.info('Enqueueing job %s', job_id)
        patch_obj['jobs/' + job_id] = {
            'src': src,
            'type': cfg_name,
            'env': dict(env, PERFETTO_TEST_GIT_REF=git_ref),
            'status': 'QUEUED',
            'time_queued': utc_now_iso(),
        }
        patch_obj['jobs_queued/' + job_id] = 0
        patch_obj[src]['jobs'][job_id] = 0
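
For reference, a hypothetical invocation and the keys it would add to patch_obj. The config name, env var and git ref are made up, and the helpers used above (utc_now_iso, logging) are assumed to be available in the same module:

from datetime import datetime

JOB_CONFIGS = {'linux-clang': {'SANITIZER': 'none'}}  # Stand-in for config.py.

patch_obj = {'cls/1234-1': {'jobs': {}}}
append_jobs(patch_obj, 'cls/1234-1', 'refs/changes/34/1234/1',
            now=datetime(2019, 2, 5, 12, 0, 0))
# Keys added:
#   jobs/20190205120000--cls-1234-1--linux-clang        (full job descriptor)
#   jobs_queued/20190205120000--cls-1234-1--linux-clang = 0
#   patch_obj['cls/1234-1']['jobs'][job_id] = 0          (back-reference on the CL)
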
Example #10
def worker_loop():
    '''Pulls a job from the queue and runs it by invoking run_job.py.'''
    uri = '%s/jobs_queued.json?orderBy="$key"&limitToLast=10' % DB
    jobs = req('GET', uri)
    if not jobs:
        return

    # Work out the worker number from the hostname. We try to distribute the
    # load (via the time.sleep below) so that we first fill the worker-1 slot of
    # each VM, then worker-2 and so on. This is designed so that if there is
    # only one CL (hence N jobs) in the queue, each VM gets only one job,
    # maximizing the CPU efficiency of each VM.
    try:
        worker_num = int(socket.gethostname().split('-')[-1])
    except ValueError:
        worker_num = 1

    # Transactionally acquire a job. Deal with races (two workers trying to
    # acquire the same job).
    job = None
    job_id = None
    for job_id in sorted(jobs.keys(), reverse=True):
        job = try_acquire_job(job_id)
        if job is not None:
            break
        logging.info('Raced while trying to acquire job %s, retrying', job_id)
        time.sleep(worker_num * 2 + random.random())
    if job is None:
        logging.error('Failed to acquire a job')
        return

    logging.info('Starting job %s', job_id)

    # Update the db, move the job to the running queue.
    patch_obj = {
        'jobs_queued/' + job_id: {},  # = DELETE
        'jobs_running/' + job_id: {
            'worker': WORKER_NAME
        },
        'workers/' + WORKER_NAME: make_worker_obj('RUNNING', job_id=job_id)
    }
    req('PATCH', '%s.json' % DB, body=patch_obj)

    cmd = [os.path.join(CUR_DIR, 'run_job.py'), job_id]

    # Propagate the worker's PERFETTO_* vars and merge them with the
    # job-specific vars.
    env = dict(os.environ, **{k: str(v) for (k, v) in job['env'].items()})
    # Run the job in a python subprocess, to isolate the main loop from logs
    # uploader failures.
    job_runner = subprocess.Popen(cmd, env=env)

    res = None
    cancelled = False
    timed_out = False
    time_started = time.time()
    time_last_db_poll = time_started
    polled_status = 'STARTED'
    while res is None:
        time.sleep(0.25)
        res = job_runner.poll()
        now = time.time()
        if now - time_last_db_poll > 10:  # Throttle DB polling.
            polled_status = req('GET', '%s/jobs/%s/status.json' % (DB, job_id))
            time_last_db_poll = now
        if now - time_started > JOB_TIMEOUT_SEC:
            logging.info('Job %s timed out, terminating', job_id)
            timed_out = True
            job_runner.terminate()
        if (sigterm.is_set() or polled_status != 'STARTED') and not cancelled:
            logging.info('Job %s cancelled, terminating', job_id)
            cancelled = True
            job_runner.terminate()

    status = (
        'INTERRUPTED' if sigterm.is_set() else 'CANCELLED' if cancelled else
        'TIMED_OUT' if timed_out else 'COMPLETED' if res == 0 else 'FAILED')
    logging.info('Job %s %s with code %s', job_id, status, res)

    # Update the DB, unless the job has been cancelled. The "is not None"
    # condition deals with a very niche case: it avoids creating a partial job
    # entry after a full clear of the DB (which is super rare, happens only
    # when re-deploying the CI).
    if polled_status is not None:
        patch = {
            'jobs/%s/status' % job_id: status,
            'jobs/%s/exit_code' % job_id: {} if res is None else res,
            'jobs/%s/time_ended' % job_id: utc_now_iso(),
            'jobs_running/%s' % job_id: {},  # = DELETE
        }
        req('PATCH', '%s.json' % (DB), body=patch)
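
worker_loop() refers to a module-level sigterm event and is presumably driven by an outer loop; neither is shown. A minimal sketch of that scaffolding, with the poll interval being an arbitrary choice rather than a value from the source:

import logging
import signal
import threading
import time

sigterm = threading.Event()


def handle_sigterm(signum, frame):
    # worker_loop() checks sigterm.is_set(), terminates the running job and
    # marks it INTERRUPTED; the outer loop below then exits.
    logging.warning('Received SIGTERM, exiting after the current job')
    sigterm.set()


def main():
    signal.signal(signal.SIGTERM, handle_sigterm)
    while not sigterm.is_set():
        worker_loop()
        time.sleep(5)  # Idle poll interval: an assumption, not from the source.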