def test_cq_status_fetch_stats_new_fetch_from_last_build_run(self):
    urlfetch_mock = mock.Mock()
    urlfetch_mock.return_value.content = json.dumps({
        'more': False, 'cursor': None, 'results': []})

    FetchStatus(cursor='xxx', begin='1', end='2', done=True).put()
    BuildRun(time_started=datetime.datetime(2015, 10, 30, 12, 17, 42),
             time_finished=datetime.datetime(2015, 10, 30, 13, 17, 42),
             buildnumber=0, result=0).put()
    BuildRun(time_started=datetime.datetime(2015, 10, 30, 11, 17, 42),
             time_finished=datetime.datetime(2015, 10, 30, 12, 17, 42),
             buildnumber=0, result=0).put()
    with mock.patch('google.appengine.api.urlfetch.fetch', urlfetch_mock):
      cq_status.fetch_cq_status()

    urlfetch_mock.assert_called_once_with(
      'https://chromium-cq-status.appspot.com/query?'
      'tags=action=verifier_jobs_update&begin=1446211062.0&end=1446214662.0&'
      'count=10')
 def _create_flake():
     tf = datetime.datetime.utcnow()
     ts = tf - datetime.timedelta(hours=1)
     p = PatchsetBuilderRuns(issue=1,
                             patchset=1,
                             master='tryserver.bar',
                             builder='baz').put()
     br_f0 = BuildRun(parent=p,
                      buildnumber=10,
                      result=2,
                      time_started=ts,
                      time_finished=tf).put()
     br_f1 = BuildRun(parent=p,
                      buildnumber=20,
                      result=2,
                      time_started=ts,
                      time_finished=tf).put()
     br_s0 = BuildRun(parent=p,
                      buildnumber=30,
                      result=0,
                      time_started=ts,
                      time_finished=tf).put()
     occ1 = FlakyRun(failure_run=br_f0,
                     success_run=br_s0,
                     failure_run_time_started=ts,
                     failure_run_time_finished=tf,
                     flakes=[
                         FlakeOccurrence(name='step1', failure='testX'),
                     ])
     occ2 = FlakyRun(failure_run=br_f1,
                     success_run=br_s0,
                     failure_run_time_started=ts,
                     failure_run_time_finished=tf,
                     flakes=[
                         FlakeOccurrence(name='step2', failure='testX'),
                         FlakeOccurrence(name='step3', failure='step3'),
                     ])
     f = Flake(name='testX',
               count_day=10,
               occurrences=[occ1.put(), occ2.put()],
               is_step=True,
               issue_id=123456)
     return f, [occ1, occ2]
    def test_ignores_null_flaky_runs(self):
        last_updated = datetime.datetime.now()

        fake_build_key = BuildRun(buildnumber=1,
                                  result=1,
                                  time_finished=last_updated).put()

        flake_run_key = FlakyRun(failure_run=fake_build_key,
                                 success_run=fake_build_key,
                                 failure_run_time_finished=last_updated,
                                 flakes=[
                                     FlakeOccurrence(name='fake_step',
                                                     failure='fake_test_name'),
                                     FlakeOccurrence(name='fake_step2',
                                                     failure='fake_test_name')
                                 ]).put()

        null_flake_run_key = ndb.Key('FlakyRun', 'fake-key')

        Flake(issue_id=1,
              is_step=False,
              name='fake_test_name',
              issue_last_updated=last_updated,
              occurrences=[
                  flake_run_key,
                  null_flake_run_key,
              ]).put()

        self.test_app.get('/migrate')

        flake_types = FlakeType.query().fetch()
        self.assertEqual(len(flake_types), 2)

        flake_type_1 = flake_types[0]
        self.assertEqual(flake_type_1.project, 'chromium')
        self.assertEqual(flake_type_1.step_name, 'fake_step')
        self.assertEqual(flake_type_1.test_name, 'fake_test_name')
        self.assertIsNone(flake_type_1.config)
        self.assertEqual(flake_type_1.last_updated, last_updated)

        flake_type_2 = flake_types[1]
        self.assertEqual(flake_type_2.project, 'chromium')
        self.assertEqual(flake_type_2.step_name, 'fake_step2')
        self.assertEqual(flake_type_2.test_name, 'fake_test_name')
        self.assertIsNone(flake_type_2.config)
        self.assertEqual(flake_type_2.last_updated, last_updated)

        issues = Issue.query().fetch()
        self.assertEqual(len(issues), 1)

        issue = issues[0]
        self.assertEqual(issue.issue_id, 1)
        self.assertEqual(issue.project, 'chromium')
        self.assertEqual(sorted(issue.flake_type_keys),
                         sorted(flake_type.key for flake_type in flake_types))
 def _create_flake(self):
   tf = datetime.datetime(2016, 8, 6, 10, 20, 30)
   ts = tf - datetime.timedelta(hours=1)
   tf2 = tf - datetime.timedelta(days=5)
   ts2 = tf2 - datetime.timedelta(hours=1)
   p = PatchsetBuilderRuns(issue=123456, patchset=1, master='tryserver.test',
                           builder='test-builder').put()
   br_f0 = BuildRun(parent=p, buildnumber=0, result=2, time_started=ts2,
                    time_finished=tf2).put()
   br_f1 = BuildRun(parent=p, buildnumber=1, result=2, time_started=ts,
                    time_finished=tf).put()
   br_s1 = BuildRun(parent=p, buildnumber=2, result=0, time_started=ts,
                    time_finished=tf).put()
   br_f2 = BuildRun(parent=p, buildnumber=3, result=4, time_started=ts,
                    time_finished=tf).put()
   br_s2 = BuildRun(parent=p, buildnumber=4, result=0, time_started=ts,
                    time_finished=tf).put()
   occ_key1 = FlakyRun(failure_run=br_f0, success_run=br_s2,
                       flakes=[
                         FlakeOccurrence(name='foo (x)', failure='foo.bar'),
                         FlakeOccurrence(name='foo (x)', failure='other')],
                       failure_run_time_started=ts2,
                       failure_run_time_finished=tf2).put()
   occ_key2 = FlakyRun(failure_run=br_f1, success_run=br_s1,
                       flakes=[
                         FlakeOccurrence(name='bar (y)', failure='foo.bar')],
                       failure_run_time_started=ts,
                       failure_run_time_finished=tf).put()
   occ_key3 = FlakyRun(failure_run=br_f2, success_run=br_s2,
                       flakes=[
                         FlakeOccurrence(
                           name='foo (x)', failure='foo.bar', issue_id=100),
                         FlakeOccurrence(
                           name='bar (y)', failure='foo.bar', issue_id=200)],
                       failure_run_time_started=ts,
                       failure_run_time_finished=tf).put()
   return Flake(name='foo.bar', count_day=10, is_step=False,
                occurrences=[occ_key1, occ_key2, occ_key3])
Example #5
  def post(self):
    if (not self.request.get('failure_run_key') or
        not self.request.get('success_run_key')):
      self.response.set_status(400, 'Invalid request parameters')
      return

    failure_run = ndb.Key(urlsafe=self.request.get('failure_run_key')).get()
    success_run = ndb.Key(urlsafe=self.request.get('success_run_key')).get()

    flaky_run = FlakyRun(
        failure_run=failure_run.key,
        failure_run_time_started=failure_run.time_started,
        failure_run_time_finished=failure_run.time_finished,
        success_run=success_run.key)

    failure_time = failure_run.time_finished
    patchset_builder_runs = failure_run.key.parent().get()

    master = BuildRun.removeMasterPrefix(patchset_builder_runs.master)
    url = ('https://chrome-build-extract.appspot.com/p/' + master +
           '/builders/' + patchset_builder_runs.builder + '/builds/' +
           str(failure_run.buildnumber) + '?json=1')
    urlfetch.set_default_fetch_deadline(60)
    logging.info('get_flaky_run_reason ' + url)
    response = urlfetch.fetch(url)
    if 400 <= response.status_code <= 599:
      logging.error('The request to %s has returned %d: %s', url,
                    response.status_code, response.content)
      self.response.set_status(500, 'Failed to fetch build.')
      return
    json_result = json.loads(response.content)
    steps = json_result['steps']

    failed_steps = []
    passed_steps = []
    for step in steps:
      result = step['results'][0]
      if build_result.isResultSuccess(result):
        passed_steps.append(step)
        continue
      if not build_result.isResultFailure(result):
        continue
      step_name = step['name']
      step_text = ' '.join(step['text'])
      if step_name in IGNORED_STEPS:
        continue

      # Custom (non-trivial) rules for ignoring flakes in certain steps:
      #  - [swarming] ...: summary step would also be red (do not double count)
      #  - Patch failure: ignore non-infra failures as they are typically due to
      #    changes in the code on HEAD
      #  - bot_update PATCH FAILED: Duplicates failure in 'Patch failure' step.
      #  - ... (retry summary): this is an artificial step to fail the build due
      #    to another step that has failed earlier (do not double count).
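      # Hypothetical examples of steps the rules above would skip (names are
      # made up, not from a real build):
      #   '[swarming] browser_tests'                  -> summary step is red anyway
      #   'Patch failure' with a non-EXCEPTION result -> likely a real conflict with HEAD
      #   'bot_update' whose text has 'PATCH FAILED'  -> duplicates 'Patch failure'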
      if (step_name.startswith('[swarming]') or
          (step_name == 'Patch failure' and result != build_result.EXCEPTION) or
          (step_name == 'bot_update' and 'PATCH FAILED' in step_text)):
        continue

      failed_steps.append(step)

    steps_to_ignore = []
    for step in failed_steps:
      step_name = step['name']
      if '(with patch)' in step_name:
        # Ignore any steps from the same test suite, which is determined by the
        # normalized step name. Additionally, if the step fails without patch,
        # ignore the original step as well because tree is busted.
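        # For example (hypothetical step names): if both 'unit_tests (with
        # patch)' and 'unit_tests (without patch)' fail, both are ignored here.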
        normalized_step_name = normalize_test_type(step_name, True)
        for other_step in failed_steps:
          if other_step == step:
            continue
          normalized_other_step_name = normalize_test_type(
              other_step['name'], True)
          if normalized_other_step_name == normalized_step_name:
            steps_to_ignore.append(other_step['name'])
            if '(without patch)' in other_step['name']:
              steps_to_ignore.append(step['name'])

    flakes_to_update = []
    for step in failed_steps:
      step_name = step['name']
      if step_name in steps_to_ignore:
        continue
      flakes, is_step = self.get_flakes(
          master, patchset_builder_runs.builder, failure_run.buildnumber, step)
      for flake in flakes:
        flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
        flaky_run.flakes.append(flake_occurrence)
        flakes_to_update.append((flake, is_step))

    # Do not create FlakyRuns if all failed steps have been ignored.
    if not flaky_run.flakes:
      return

    flaky_run_key = flaky_run.put()
    for flake, is_step in flakes_to_update:
      self.add_failure_to_flake(flake, flaky_run_key, failure_time, is_step)
    self.flaky_runs.increment_by(1)
Example #6
def fetch_cq_status():
  """Fetches data from chromium-cq-status app and saves new data.

  Remembers old cursor and fetches new data.
  """

  fetch_status = FetchStatus.query().get()
  cursor = ''
  begin = ''
  end = ''
  retry_count = 0

  while True:
    if fetch_status:
      if fetch_status.done:
        logging.info('historical fetching done so fetch latest...')
        end = str(time_functions.timestamp.utcnow_ts())

        last_build_run_seen = BuildRun.query().order(
            -BuildRun.time_finished).fetch(1)
        begin = str(time_functions.timestamp.utctimestamp(
            last_build_run_seen[0].time_finished))
        cursor = ''
      else:
        begin = fetch_status.begin
        end = fetch_status.end
        cursor = fetch_status.cursor
    else:
      logging.info('did not find any historical information. fetching last week')
      begin = str(time_functions.timestamp.utctimestamp(
          datetime.datetime.utcnow() - datetime.timedelta(weeks=1)))
      end = str(time_functions.timestamp.utcnow_ts())

    if begin and end:
      logging.info('fetching from ' +
                   str(datetime.datetime.utcfromtimestamp(float(begin))) +
                   ' to ' +
                   str(datetime.datetime.utcfromtimestamp(float(end))) +
                   ' cursor: ' + cursor)
    else:
      logging.info('fetching with no begin/end and cursor: ' + cursor)

    url = "https://chromium-cq-status.appspot.com/query"
    params = []
    params.append('tags=action=verifier_jobs_update')
    if cursor:
      params.append('cursor=' + cursor)
    if begin:
      params.append('begin=' + begin)
    if end:
      params.append('end=' + end)
    # Tried count of 200 or more but would get OOM or deadline errors. Even 50
    # sometimes gives:
    # "Values may not be more than 1000000 bytes in length; received 2118015
    # bytes"
    params.append('count=10')

    url += '?' + '&'.join(params)
    logging.info('fetching url: ' + url)

    try:
      urlfetch.set_default_fetch_deadline(60)
      result = urlfetch.fetch(url).content

      timestamp_str = '"timestamp":"'
      logging_idx = result.find(timestamp_str)
      if logging_idx != -1:
        logging_idx += len(timestamp_str)
        logging_idx2 = result.find('"', logging_idx)
        logging.info(' current fetch has time of ' +
                     result[logging_idx:logging_idx2])

      try:
        json_result = json.loads(result)

        more = json_result['more']
        cursor = json_result['cursor']

        try:
          logging_output = parse_cq_data(json_result)
          if logging_output:
            logging.info('found flakes: ' + ' '.join(logging_output))
        except DeadlineExceededError:
          logging.info('got DeadlineExceededError during parse_cq_data, '
                       'catching to not show up as error')
          return
      except ValueError:
        requests_metric.increment_by(1, fields={'status': 'parse_error'})
        logging.exception('failed to parse CQ data from %s', url)
        if 'DeadlineExceededError' in result:
          logging.error('got deadline exceeded, trying again after 1s')
          time.sleep(1)
          continue
        elif retry_count < 3:
          retry_count += 1
          logging.error('will retry after sleeping ' + str(retry_count))
          time.sleep(retry_count)
          continue
        else:
          logging.error('giving up and will count current fetch as done')
          # Don't want to continue this as could be a bad cursor.
          more = False
      else:
        requests_metric.increment_by(1, fields={'status': 'success'})

      if not fetch_status:
        fetch_status = FetchStatus()
      fetch_status.done = not more
      if fetch_status.done:
        fetch_status.cursor = ''
        fetch_status.begin = ''
        fetch_status.end = ''
        retry_count = 0
        logging.info('finished fetching for current cursor')
      else:
        fetch_status.begin = begin
        fetch_status.end = end
        fetch_status.cursor = cursor
      fetch_status.put()

      if not more:
        return  # finish the cron job and wait for next iteration
    except urllib2.URLError as e:
      requests_metric.increment_by(1, fields={'status': 'fetch_error'})
      logging.warning('Failed to fetch CQ status: %s', e.reason)
Example #7
def parse_cq_data(json_data):
  logging_output = []
  for result in json_data.get('results', []):
    fields = result.get('fields', {})
    if 'action' not in fields:
      continue

    action = fields.get('action')
    if action != 'verifier_jobs_update':
      continue

    if fields.get('verifier') != 'try job':
      continue

    # At the moment, much of the parsing logic assumes this is a Chromium
    # tryjob.
    if fields.get('project') != 'chromium':
      continue

    job_states = fields.get('jobs', {})
    for state in job_states:
      # Just go by |result|.
      #if state not in ['JOB_SUCCEEDED', 'JOB_FAILED', 'JOB_TIMED_OUT']:
      #  continue

      for job in job_states[state]:
        build_properties = job.get('build_properties')
        if not build_properties:
          continue

        try:
          master = job['master']
          builder = job['builder']
          result = job['result']
          timestamp_tz = dateutil.parser.parse(job['timestamp'])
          # We assume timestamps from chromium-cq-status are already in UTC.
          timestamp = timestamp_tz.replace(tzinfo=None)
        except KeyError:
          continue

        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
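          # attempt_start_ts is expressed in microseconds since the epoch;
          # dividing by 1000000 (integer division in Python 2) yields the
          # whole seconds that utcfromtimestamp expects.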
          attempt_start_ts = get_int_value(build_properties, 'attempt_start_ts')
          time_started = datetime.datetime.utcfromtimestamp(
              attempt_start_ts / 1000000)
        except ValueError:
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(issue=issue,
                                                          patchset=patchset,
                                                          master=master,
                                                          builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_started=time_started,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        duplicate = False
        for previous_run in previous_runs:
          # We saw this build run already or there are multiple green runs,
          # in which case we ignore subsequent ones to avoid showing failures
          # multiple times.
          if (previous_run.buildnumber == buildnumber or
              (build_run.is_success and previous_run.is_success)):
            duplicate = True
            break

        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            failure_run = previous_run
            success_run = build_run
          else:
            # We saw the pass and then the failure. Could happen when fetching
            # historical data, or for the bot_update step (patch can't be
            # applied cleanly anymore).
            failure_run = build_run
            success_run = previous_run

          logging_output.append(failure_run.key.parent().get().builder +
                                str(failure_run.buildnumber))

          # Queue a task to fetch the error of this failure and create FlakyRun.
          flakes_metric.increment_by(1)
          taskqueue.add(
              queue_name='issue-updates',
              url='/issues/create_flaky_run',
              params={'failure_run_key': failure_run.key.urlsafe(),
                      'success_run_key': success_run.key.urlsafe()})

  return logging_output
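Example #8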
  def post(self):
    if (not self.request.get('failure_run_key') or
        not self.request.get('success_run_key')):
      self.response.set_status(400, 'Invalid request parameters')
      return

    failure_run = ndb.Key(urlsafe=self.request.get('failure_run_key')).get()
    success_run = ndb.Key(urlsafe=self.request.get('success_run_key')).get()

    flaky_run = FlakyRun(
        failure_run=failure_run.key,
        failure_run_time_started=failure_run.time_started,
        failure_run_time_finished=failure_run.time_finished,
        success_run=success_run.key)

    failure_time = failure_run.time_finished
    patchset_builder_runs = failure_run.key.parent().get()

    master = BuildRun.removeMasterPrefix(patchset_builder_runs.master)
    url = ('https://luci-milo.appspot.com/'
           'prpc/milo.Buildbot/GetBuildbotBuildJSON')
    request = json.dumps({
        'master': master,
        'builder': patchset_builder_runs.builder,
        'buildNum': failure_run.buildnumber,
    })
    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
    }
    urlfetch.set_default_fetch_deadline(60)
    logging.info('get_flaky_run_reason: %s, %s', url, request)
    response = urlfetch.fetch(
        url, payload=request, method=urlfetch.POST, headers=headers,
        validate_certificate=True)
    if response.status_code != 200:
      logging.error('The request to %s has returned %d: %s', url,
                    response.status_code, response.content)
      self.response.set_status(500, 'Failed to fetch build.')
      return
    content = response.content
    if content.startswith(_MILO_RESPONSE_PREFIX):
      content = content[len(_MILO_RESPONSE_PREFIX):]
    data = json.loads(content)['data']
    json_result = json.loads(base64.b64decode(data))
    steps = json_result['steps']

    failed_steps = []
    passed_steps = []
    for step in steps:
      result = step['results'][0]
      if build_result.isResultSuccess(result):
        passed_steps.append(step)
        continue
      if not build_result.isResultFailure(result):
        continue
      # For Luci builds, some steps don't have step text anymore. Such steps
      # include 'Failure reason', 'analyze', etc.
      step_text = ' '.join(step['text'] or [])
      step_name = step['name']
      if step_name in IGNORED_STEPS:
        continue

      # Custom (non-trivial) rules for ignoring flakes in certain steps:
      #  - [swarming] ...: summary step would also be red (do not double count)
      #  - Patch failure: ignore non-infra failures as they are typically due to
      #    changes in the code on HEAD
      #  - bot_update PATCH FAILED: Duplicates failure in 'Patch failure' step.
      #  - ... (retry summary): this is an artificial step to fail the build due
      #    to another step that has failed earlier (do not double count).
      if (step_name.startswith('[swarming]') or
          (step_name == 'Patch failure' and result != build_result.EXCEPTION) or
          (step_name == 'bot_update' and 'PATCH FAILED' in step_text)):
        continue

      failed_steps.append(step)

    steps_to_ignore = []
    for step in failed_steps:
      step_name = step['name']
      if '(with patch)' in step_name:
        # Ignore any steps from the same test suite, which is determined by the
        # normalized step name. Additionally, if the step fails without patch,
        # ignore the original step as well because tree is busted.
        normalized_step_name = normalize_test_type(step_name, True)
        for other_step in failed_steps:
          if other_step == step:
            continue
          normalized_other_step_name = normalize_test_type(
              other_step['name'], True)
          if normalized_other_step_name == normalized_step_name:
            steps_to_ignore.append(other_step['name'])
            if '(without patch)' in other_step['name']:
              steps_to_ignore.append(step['name'])

    flakes_to_update = []
    for step in failed_steps:
      step_name = step['name']
      if step_name in steps_to_ignore:
        continue
      flakes, is_step = self.get_flakes(
          master, patchset_builder_runs.builder, failure_run.buildnumber, step)
      if is_step and not is_infra_step_flake(step_name):
        continue  # Ignore flakes of non-infra steps.
      for flake in flakes:
        flake_occurrence = FlakeOccurrence(name=step_name, failure=flake)
        flaky_run.flakes.append(flake_occurrence)
        flakes_to_update.append((flake, is_step))

    # Do not create FlakyRuns if all failed steps have been ignored.
    if not flaky_run.flakes:
      return

    flaky_run_key = flaky_run.put()
    for flake, is_step in flakes_to_update:
      if self.is_duplicate_occurrence(flake, flaky_run):
        logging.info('Not adding duplicate occurrence for the same CL')
        continue
      self.add_failure_to_flake(flake, flaky_run_key, failure_time, is_step)
    self.flaky_runs.increment_by(1)
Example #9
def fetch_cq_status():
    """Fetches data from chromium-cq-status app and saves new data.

    Remembers old cursor and fetches new data.
    """

    fetch_status = FetchStatus.query().get()
    cursor = ''
    begin = ''
    end = ''
    retry_count = 0

    while True:
        if fetch_status:
            if fetch_status.done:
                logging.info('historical fetching done so fetch latest...')
                end = str(time_functions.timestamp.utcnow_ts())

                last_build_run_seen = BuildRun.query().order(
                    -BuildRun.time_finished).fetch(1)
                begin = str(
                    time_functions.timestamp.utctimestamp(
                        last_build_run_seen[0].time_finished))
                cursor = ''
            else:
                begin = fetch_status.begin
                end = fetch_status.end
                cursor = fetch_status.cursor
        else:
            logging.info(
                'did not find any historical information. fetching last week')
            begin = str(
                time_functions.timestamp.utctimestamp(
                    datetime.datetime.utcnow() - datetime.timedelta(weeks=1)))
            end = str(time_functions.timestamp.utcnow_ts())

        if begin and end:
            logging.info('fetching from %s to %s cursor: %s',
                         str(datetime.datetime.utcfromtimestamp(float(begin))),
                         str(datetime.datetime.utcfromtimestamp(float(end))),
                         cursor)
        else:
            logging.info('fetching with no begin/end and cursor: ' + cursor)

        url = "https://chromium-cq-status.appspot.com/query"
        params = []
        params.append('tags=action=verifier_jobs_update')
        if cursor:
            params.append('cursor=' + cursor)
        if begin:
            params.append('begin=' + begin)
        if end:
            params.append('end=' + end)
        # Tried count of 200 or more but would get OOM or deadline errors. Even 50
        # sometimes gives:
        # "Values may not be more than 1000000 bytes in length; received 2118015
        # bytes"
        params.append('count=10')

        url += '?' + '&'.join(params)
        logging.info('fetching url: ' + url)

        try:
            urlfetch.set_default_fetch_deadline(60)
            result = urlfetch.fetch(url).content

            timestamp_str = '"timestamp":"'
            logging_idx = result.find(timestamp_str)
            if logging_idx != -1:
                logging_idx += len(timestamp_str)
                logging_idx2 = result.find('"', logging_idx)
                logging.info(' current fetch has time of ' +
                             result[logging_idx:logging_idx2])

            try:
                json_result = json.loads(result)

                more = json_result['more']
                cursor = json_result['cursor']

                try:
                    logging_output = parse_cq_data(json_result)
                    if logging_output:
                        logging.info('found flakes: ' +
                                     ' '.join(logging_output))
                except DeadlineExceededError:
                    logging.info(
                        'got DeadlineExceededError during parse_cq_data, '
                        'catching to not show up as error')
                    return
            except ValueError:
                requests_metric.increment_by(1,
                                             fields={'status': 'parse_error'})
                logging.exception('failed to parse CQ data from %s', url)
                if 'DeadlineExceededError' in result:
                    logging.error(
                        'got deadline exceeded, trying again after 1s')
                    time.sleep(1)
                    continue
                elif retry_count < 3:
                    retry_count += 1
                    logging.error('will retry after sleeping ' +
                                  str(retry_count))
                    time.sleep(retry_count)
                    continue
                else:
                    logging.error(
                        'giving up and will count current fetch as done')
                    # Don't want to continue this as could be a bad cursor.
                    more = False
            else:
                requests_metric.increment_by(1, fields={'status': 'success'})

            if not fetch_status:
                fetch_status = FetchStatus()
            fetch_status.done = not more
            if fetch_status.done:
                fetch_status.cursor = ''
                fetch_status.begin = ''
                fetch_status.end = ''
                retry_count = 0
                logging.info('finished fetching for current cursor')
            else:
                fetch_status.begin = begin
                fetch_status.end = end
                fetch_status.cursor = cursor
            fetch_status.put()

            if not more:
                return  # finish the cron job and wait for next iteration
        except urllib2.URLError as e:
            requests_metric.increment_by(1, fields={'status': 'fetch_error'})
            logging.warning('Failed to fetch CQ status: %s', e.reason)
Example #10
def parse_cq_data(json_data):
    logging_output = []
    for result in json_data.get('results', []):
        fields = result.get('fields', {})
        if 'action' not in fields:
            logging.warning('Missing field action in status record')
            parsing_errors.increment_by(1)
            continue

        action = fields.get('action')
        if action != 'verifier_jobs_update':
            continue

        if fields.get('verifier') != 'try job':
            continue

        # At the moment, much of the parsing logic assumes this is a Chromium
        # tryjob.
        project = fields.get('project')
        if project != 'chromium/chromium/src':
            logging.info('project not chromium: %s', project)
            continue

        job_states = fields.get('jobs', {})

        for job in itertools.chain.from_iterable(job_states.values()):
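            # Each 'job' here comes from one of the per-state lists in 'jobs'
            # (e.g. JOB_SUCCEEDED, JOB_FAILED); chain.from_iterable flattens
            # them so every job is handled the same way regardless of state.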
            try:
                builder = job['builder']
                result = job['result']
                timestamp_tz = dateutil.parser.parse(
                    job.get('created_ts') or job['timestamp'])
                # We assume timestamps from chromium-cq-status are already in UTC.
                timestamp = timestamp_tz.replace(tzinfo=None)
            except KeyError:
                logging.warning('Failed to parse job details', exc_info=True)
                parsing_errors.increment_by(1)
                continue

            if build_result.isResultPending(result):
                continue

            build_properties = job.get('build_properties')
            if not build_properties:
                logging.warning(
                    'Missing field build_properties in job details')
                parsing_errors.increment_by(1)
                continue

            issue = -1
            patchset = -1
            time_started = 0

            try:
                buildnumber = get_int_value(build_properties, 'buildnumber')
                if 'patch_issue' in build_properties:
                    issue = get_int_value(build_properties, 'patch_issue')
                else:  # pragma: no cover
                    logging.warning('no issue')

                if 'patch_set' in build_properties:
                    patchset = get_int_value(build_properties, 'patch_set')
                else:  # pragma: no cover
                    logging.warning('no patchset')

                if 'attempt_start_ts' in build_properties:
                    attempt_start_ts = get_int_value(build_properties,
                                                     'attempt_start_ts')
                    time_started = datetime.datetime.utcfromtimestamp(
                        attempt_start_ts / 1000000)
                else:  # pragma: no cover
                    logging.warning('no attempt_start_ts')
                    continue

                # For builds through Buildbucket, job['master'] is actually the bucket
                # name. For buildbot-based builds, it just happens to be the same as the
                # master name. For Luci-based builds, it is different from the master
                # name, and the master name is set as a build property instead.
                # https://chromium.googlesource.com/chromium/src/+/infra/config/cr-buildbucket.cfg#115
                # So in either case, the "real" master name is in the build properties.
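                # A hypothetical build_properties payload, for illustration
                # only (field names match the lookups below, values are made
                # up):
                #   {'mastername': 'tryserver.chromium.linux',
                #    'buildnumber': 123, 'patch_issue': 2000001,
                #    'patch_set': 1, 'attempt_start_ts': 1446211062000000}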
                master = build_properties['mastername']

            except (ValueError, KeyError):
                logging.warning('Failed to parse build properties',
                                exc_info=True)
                parsing_errors.increment_by(1)
                continue

            # At this point, only success or failure.
            success = build_result.isResultSuccess(result)

            patchset_builder_runs = get_patchset_builder_runs(
                issue=issue, patchset=patchset, master=master, builder=builder)

            build_run = BuildRun(parent=patchset_builder_runs.key,
                                 buildnumber=buildnumber,
                                 result=result,
                                 time_started=time_started,
                                 time_finished=timestamp)

            previous_runs = BuildRun.query(
                ancestor=patchset_builder_runs.key).fetch()

            duplicate = False
            for previous_run in previous_runs:
                # We saw this build run already or there are multiple green runs,
                # in which case we ignore subsequent ones to avoid showing failures
                # multiple times.
                if (previous_run.buildnumber == buildnumber or
                    (build_run.is_success and previous_run.is_success)):
                    duplicate = True
                    break

            if duplicate:
                continue

            build_run.put()

            for previous_run in previous_runs:
                if previous_run.is_success == build_run.is_success:
                    continue
                if success:
                    # We saw the flake and then the pass.
                    failure_run = previous_run
                    success_run = build_run
                else:
                    # We saw the pass and then the failure. Could happen when fetching
                    # historical data, or for the bot_update step (patch can't be
                    # applied cleanly anymore).
                    failure_run = build_run
                    success_run = previous_run

                logging_output.append(failure_run.key.parent().get().builder +
                                      str(failure_run.buildnumber))

                # Queue a task to fetch the error of this failure and create FlakyRun.
                flakes_metric.increment_by(1)
                taskqueue.add(queue_name='issue-updates',
                              url='/issues/create_flaky_run',
                              params={
                                  'failure_run_key': failure_run.key.urlsafe(),
                                  'success_run_key': success_run.key.urlsafe()
                              })

    return logging_output
Example #11
    def _create_flakes(ts, tf, ts2, tf2):
        p = PatchsetBuilderRuns(issue=123456,
                                patchset=1,
                                master='tryserver.test',
                                builder='test-builder').put()
        br_f0 = BuildRun(parent=p,
                         buildnumber=0,
                         result=2,
                         time_started=ts2,
                         time_finished=tf2).put()
        br_f1 = BuildRun(parent=p,
                         buildnumber=1,
                         result=2,
                         time_started=ts,
                         time_finished=tf).put()
        br_s1 = BuildRun(parent=p,
                         buildnumber=2,
                         result=0,
                         time_started=ts,
                         time_finished=tf).put()
        br_f2 = BuildRun(parent=p,
                         buildnumber=3,
                         result=4,
                         time_started=ts,
                         time_finished=tf2).put()
        br_s2 = BuildRun(parent=p,
                         buildnumber=4,
                         result=0,
                         time_started=ts,
                         time_finished=tf2).put()
        occ_key1 = FlakyRun(failure_run=br_f0,
                            success_run=br_s2,
                            failure_run_time_started=ts2,
                            failure_run_time_finished=tf2).put()
        occ_key2 = FlakyRun(failure_run=br_f1,
                            success_run=br_s1,
                            failure_run_time_started=ts,
                            failure_run_time_finished=tf).put()
        occ_key3 = FlakyRun(failure_run=br_f2,
                            success_run=br_s2,
                            failure_run_time_started=ts,
                            failure_run_time_finished=tf).put()

        Flake(name='foo',
              last_hour=True,
              last_day=True,
              last_week=True,
              last_month=True).put()
        Flake(name='bar',
              last_hour=True,
              last_day=True,
              last_week=True,
              last_month=True,
              occurrences=[occ_key1, occ_key2]).put()
        Flake(name='baz',
              last_hour=True,
              last_day=True,
              last_week=True,
              last_month=True,
              occurrences=[occ_key3]).put()
        Flake(name='zee',
              last_hour=False,
              last_day=False,
              last_week=True,
              last_month=False).put()
Example #12
def parse_cq_data(json_data):
  logging_output = []
  for result in json_data['results']:
    fields = result['fields']
    if 'action' not in fields:
      continue

    action = fields['action']
    if action != 'verifier_jobs_update':
      continue

    if fields['verifier'] != 'try job':
      continue

    job_states = fields['jobs']
    for state in job_states:
      # Just go by |result|.
      #if state not in ['JOB_SUCCEEDED', 'JOB_FAILED', 'JOB_TIMED_OUT']:
      #  continue

      for job in job_states[state]:
        build_properties = job['build_properties']
        if not build_properties:
          continue

        master = job['master']
        builder = job['builder']
        result = job['result']
        timestamp = datetime.datetime.strptime(job['timestamp'],
                                               '%Y-%m-%d %H:%M:%S.%f')
        try:
          buildnumber = get_int_value(build_properties, 'buildnumber')
          issue = get_int_value(build_properties, 'issue')
          patchset = get_int_value(build_properties, 'patchset')
        except ValueError:
          continue

        if build_result.isResultPending(result):
          continue

        # At this point, only success or failure.
        success = build_result.isResultSuccess(result)

        patchset_builder_runs = get_patchset_builder_runs(issue=issue,
                                                          patchset=patchset,
                                                          master=master,
                                                          builder=builder)

        build_run = BuildRun(parent=patchset_builder_runs.key,
                             buildnumber=buildnumber,
                             result=result,
                             time_finished=timestamp)

        previous_runs = BuildRun.query(
            ancestor=patchset_builder_runs.key).fetch()

        duplicate = False
        for previous_run in previous_runs:
          # We saw this build run already or there are multiple green runs,
          # in which case we ignore subsequent ones to avoid showing failures
          # multiple times.
          if (previous_run.buildnumber == buildnumber or
              (build_run.is_success and previous_run.is_success)):
            duplicate = True
            break

        if duplicate:
          continue

        build_run.put()

        for previous_run in previous_runs:
          if previous_run.is_success == build_run.is_success:
            continue
          if success:
            # We saw the flake and then the pass.
            flaky_run = FlakyRun(
                failure_run=previous_run.key,
                failure_run_time_finished=previous_run.time_finished,
                success_run=build_run.key)
            flaky_run.put()
            logging_output.append(previous_run.key.parent().get().builder +
                                  str(previous_run.buildnumber))
          else:
            # We saw the pass and then the failure. Could happen when fetching
            # historical data.
            flaky_run = FlakyRun(
                failure_run=build_run.key,
                failure_run_time_finished=build_run.time_finished,
                success_run=previous_run.key)
            flaky_run.put()
            logging_output.append(build_run.key.parent().get().builder +
                                  str(build_run.buildnumber))

          # Queue a task to fetch the error of this failure.
          deferred.defer(get_flaky_run_reason, flaky_run.key)

  return logging_output