def send_email(bug, bug_template):
    """Send email to the owner and cc's to notify the TestBug.

    @param bug: TestBug instance.
    @param bug_template: A template dictionary specifying the default bug
                         filing options for failures in this suite.
    """
    autotest_stats.Counter(EMAIL_COUNT_KEY % 'total').increment()
    to_set = set(bug.cc) if bug.cc else set()
    if bug.owner:
        to_set.add(bug.owner)
    if bug_template.get('cc'):
        to_set = to_set.union(bug_template.get('cc'))
    if bug_template.get('owner'):
        to_set.add(bug_template.get('owner'))
    recipients = ', '.join(to_set)
    try:
        gmail_lib.send_email(
            recipients,
            bug.title(),
            bug.summary(),
            retry=False,
            creds_path=site_utils.get_creds_abspath(EMAIL_CREDS_FILE))
    except Exception:
        autotest_stats.Counter(EMAIL_COUNT_KEY % 'fail').increment()
        raise
Example #2
    def log_devserver_match_stats(dut_hostname, devserver_url):
        """Log stats whether host and devserver are in the same subnet.

        @param dut_hostname: Hostname of the dut.
        @param devserver_url: Url to the devserver.
        """
        try:
            devserver_name = dev_server.ImageServer.get_server_name(
                devserver_url)
            devserver_ip = socket.gethostbyname(devserver_name)
            dut_ip = socket.gethostbyname(dut_hostname)
        except socket.gaierror as e:
            logging.error('Failed to get IP address, error: %s', e)
            return

        # Take the first 2 octets as the indicator of subnet.
        devserver_subnet = '_'.join(devserver_ip.split('.')[0:2])
        dut_subnet = '_'.join(dut_ip.split('.')[0:2])
        if not utils.is_in_same_subnet(devserver_ip, dut_ip, 19):
            counter = ('devserver_mismatch.%s_to_%s' %
                       (devserver_subnet, dut_subnet))
            autotest_stats.Counter(counter).increment()
            counter = 'devserver_mismatch.%s' % devserver_subnet
        else:
            counter = 'devserver_match.%s' % devserver_subnet

        autotest_stats.Counter(counter).increment()
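
Note: the `utils.is_in_same_subnet` helper referenced above is not part of this excerpt. A minimal standalone sketch of such a /19 check, using the standard `ipaddress` module (Python 3 stdlib; a backport exists for Python 2) and a hypothetical function name, might look like:

import ipaddress

def in_same_subnet(ip_a, ip_b, mask_bits=19):
    """Return True if both addresses fall inside the same /mask_bits network."""
    # Build the network containing ip_a, then test ip_b for membership.
    network = ipaddress.ip_network(u'%s/%d' % (ip_a, mask_bits), strict=False)
    return ipaddress.ip_address(u'%s' % ip_b) in network

# e.g. in_same_subnet('100.107.160.5', '100.107.161.9') -> True (same /19)
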
def symbolicate_minidump_with_devserver(minidump_path, resultdir):
    """
    Generates a stack trace for the specified minidump by consulting devserver.

    This function assumes the debug symbols have been staged on the devserver.

    @param minidump_path: absolute path to the minidump to be symbolicated.
    @param resultdir: server job's result directory.
    @raise DevServerException upon failure, HTTP or otherwise.
    """
    # First, look up what build we tested.  If we can't find this, we can't
    # get the right debug symbols, so we might as well give up right now.
    keyvals = client_utils.read_keyval(resultdir)
    if JOB_BUILD_KEY not in keyvals:
        raise dev_server.DevServerException(
            'Cannot determine build being tested.')

    crashserver_name = dev_server.get_least_loaded_devserver(
        devserver_type=dev_server.CrashServer)
    if not crashserver_name:
        autotest_stats.Counter(CRASH_SERVER_OVERLOAD).increment()
        raise dev_server.DevServerException(
            'No crash server has the capacity to symbolicate the dump.')
    else:
        autotest_stats.Counter(CRASH_SERVER_FOUND).increment()
    devserver = dev_server.CrashServer(crashserver_name)
    trace_text = devserver.symbolicate_dump(minidump_path,
                                            keyvals[JOB_BUILD_KEY])
    if not trace_text:
        raise dev_server.DevServerException('Unknown error!!')
    with open(minidump_path + '.txt', 'w') as trace_file:
        trace_file.write(trace_text)
def main():
    try:
        autotest_stats.Counter(STATS_KEY).increment('starts')
        main_without_exception_handling()
    except Exception as e:
        message = 'Uncaught exception. Terminating shard_client.'
        email_manager.manager.log_stacktrace(message)
        logging.exception(message)
        autotest_stats.Counter(STATS_KEY).increment('uncaught_exceptions')
        raise
    finally:
        email_manager.manager.send_queued_emails()
Example #5
    def dispatchRequest(self, request):
        """
        Invoke a json RPC call from a decoded json request.
        @param request: a decoded json_request
        @returns a dictionary with keys id, result, err and err_traceback
        """
        results = self.blank_result_dict()

        try:
            results['id'] = self._getRequestId(request)
            methName = request['method']
            args = request['params']
        except KeyError:
            raise BadServiceRequest(request)

        autotest_stats.Counter('rpc').increment(methName)

        metadata = request.copy()
        metadata['_type'] = 'rpc'
        timer = autotest_stats.Timer('rpc', metadata=metadata)

        try:
            timer.start()
            meth = self.findServiceEndpoint(methName)
            results['result'] = self.invokeServiceEndpoint(meth, args)
        except Exception as err:
            results['err_traceback'] = traceback.format_exc()
            results['err'] = err
Example #6
    def run_with_retry(self, function, *args, **dargs):
        """Call function(*args, **dargs) until either it passes
        without an operational error, or a timeout is reached.
        This will re-connect to the database, so it is NOT safe
        to use this inside of a database transaction.

        It can be safely used with transactions, but the
        transaction start & end must be completely contained
        within the call to 'function'."""
        OperationalError = _get_error_class("OperationalError")

        success = False
        start_time = time.time()
        while not success:
            try:
                result = function(*args, **dargs)
            except OperationalError as e:
                self._log_operational_error(e)
                stop_time = time.time()
                elapsed_time = stop_time - start_time
                if elapsed_time > self.query_timeout:
                    raise
                else:
                    try:
                        self._random_delay()
                        self._init_db()
                        autotest_stats.Counter('tko_db_error').increment()
                    except OperationalError as e:
                        self._log_operational_error(e)
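
The method above relies on helpers (`_get_error_class`, `_random_delay`, `_init_db`) and the `self.query_timeout` attribute defined elsewhere in the class. Purely as an illustration of the same retry-until-deadline pattern, a self-contained sketch with hypothetical names could look like:

import random
import time

class TransientError(Exception):
    """Stand-in for the OperationalError class retried above (hypothetical)."""

def call_with_retry(function, timeout_sec=60, *args, **dargs):
    """Retry function(*args, **dargs) on TransientError until timeout_sec elapses."""
    deadline = time.time() + timeout_sec
    while True:
        try:
            return function(*args, **dargs)
        except TransientError:
            if time.time() > deadline:
                raise                              # give up once past the deadline
            time.sleep(random.uniform(0.5, 2.0))   # brief randomized back-off
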
    def _handle_pidfile_error(self, error, message=''):
        metadata = {'_type': 'scheduler_error',
                    'error': 'autoserv died without writing exit code',
                    'process': str(self._state.process),
                    'pidfile_id': str(self.pidfile_id)}
        autotest_stats.Counter('autoserv_died_without_writing_exit_code',
                               metadata=metadata).increment()
        self.on_lost_process(self._state.process)
def find_and_generate_minidump_stacktraces(host_resultdir):
    """
    Finds all minidump files and generates a stack trace for each.

    Enumerates all files under the test results directory (recursively)
    and generates a stack trace file for the minidumps.  Minidump files are
    identified as files with .dmp extension.  The stack trace filename is
    composed by appending the .txt extension to the minidump filename.

    @param host_resultdir: Directory to walk looking for dmp files.

    @returns The list of generated minidumps.
    """
    minidumps = []
    for dir, subdirs, files in os.walk(host_resultdir):
        for file in files:
            if not file.endswith('.dmp'):
                continue
            minidump = os.path.join(dir, file)

            # First, try to symbolicate locally.
            try:
                generate_minidump_stacktrace(minidump)
                logging.info('Generated stack trace for dump %s', minidump)
                minidumps.append(minidump)
                continue
            except client_utils.error.CmdError as err:
                logging.warning(
                    'Failed to generate stack trace locally for '
                    'dump %s (rc=%d):\n%r', minidump,
                    err.result_obj.exit_status, err)

            # If that did not succeed, try to symbolicate using the dev server.
            try:
                logging.info('Generating stack trace for %s', minidump)
                minidumps.append(minidump)
                is_timeout, _ = retry.timeout(
                    symbolicate_minidump_with_devserver,
                    args=(minidump, host_resultdir),
                    timeout_sec=600)
                if is_timeout:
                    logging.warn(
                        'Generating stack trace timed out for dump '
                        '%s', minidump)
                    autotest_stats.Counter(SYMBOLICATE_TIMEDOUT).increment()
                else:
                    logging.info('Generated stack trace for dump %s', minidump)
                continue
            except dev_server.DevServerException as e:
                logging.warning(
                    'Failed to generate stack trace on devserver for '
                    'dump %s:\n%r', minidump, e)
    return minidumps
    def _check_paired_results_exist(self):
        if not self._paired_with_monitor().has_process():
            metadata = {
                '_type': 'scheduler_error',
                'error': 'No paired results in task',
                'task': str(self),
                'pidfile_id': str(self._paired_with_monitor().pidfile_id)
            }
            autotest_stats.Counter('no_paired_results_in_task',
                                   metadata=metadata).increment()
            self.finished(False)
            return False
        return True
Example #10
    def _init_db(self):
        # make sure we clean up any existing connection
        if self.con:
            self.con.close()
            self.con = None

        try:
            # create the db connection and cursor
            self.con = self.connect(self.host, self.database, self.user,
                                    self.password, self.port)
        except:
            autotest_stats.Counter('tko_db_con_error').increment()
            raise
        self.cur = self.con.cursor()
Example #11
    def _find_installable_dir(cls, host):
        client_autodir_paths = cls.get_client_autodir_paths(host)
        for path in client_autodir_paths:
            try:
                host.run('mkdir -p %s' % utils.sh_escape(path))
                host.run('test -w %s' % utils.sh_escape(path))
                return path
            except error.AutoservRunError:
                logging.debug('Failed to create %s', path)
        metadata = {'_type': 'AutoservInstallError', 'hostname': host.hostname}
        autotest_stats.Counter('AutoservInstallError',
                               metadata=metadata).increment()
        raise error.AutoservInstallError(
            'Unable to find a place to install Autotest; tried %s' %
            ', '.join(client_autodir_paths))
    def _deserialize_many(self, serialized_list, djmodel, message):
        """Deserialize data in JSON format to database.

        Deserialize a list of JSON-formatted data to database using Django.

        @param serialized_list: A list of JSON-formatted data.
        @param djmodel: Django model type.
        @param message: A string to be used in a logging message.
        """
        for serialized in serialized_list:
            with transaction.commit_on_success():
                try:
                    djmodel.deserialize(serialized)
                except Exception as e:
                    logging.error('Deserializing a %s failed: %s, Error: %s',
                                  message, serialized, e)
                    autotest_stats.Counter(STATS_KEY).increment(
                        'deserialization_failures')
Example #13
def wait_for_machine_to_recover(host, hours_to_wait=HOURS_TO_WAIT):
    """Wait for a machine (possibly down) to become accessible again.

    @param host: A RemoteHost instance to wait on
    @param hours_to_wait: Number of hours to wait before giving up

    @returns: True if the machine comes back up, False otherwise
    """
    current_time = time.strftime("%b %d %H:%M:%S", time.localtime())
    if host.is_up():
        logging.info("%s already up, collecting crash info", host.hostname)
        return True

    logging.info("Waiting %s hours for %s to come up (%s)", hours_to_wait,
                 host.hostname, current_time)
    if not host.wait_up(timeout=hours_to_wait * 3600):
        autotest_stats.Counter('collect_crashinfo_timeout').increment()
        logging.warning("%s down, unable to collect crash info", host.hostname)
        return False
    else:
        logging.info("%s is back up, collecting crash info", host.hostname)
        return True
Example #14
    def _run(self, command, timeout, ignore_status, stdout, stderr,
             connect_timeout, env, options, stdin, args, ignore_timeout):
        """Helper function for run()."""
        ssh_cmd = self.ssh_command(connect_timeout, options)
        if not env.strip():
            env = ""
        else:
            env = "export %s;" % env
        for arg in args:
            command += ' "%s"' % utils.sh_escape(arg)
        full_cmd = '%s "%s %s"' % (ssh_cmd, env, utils.sh_escape(command))

        # TODO(jrbarnette):  crbug.com/484726 - When we're in an SSP
        # container, sometimes shortly after reboot we will see DNS
        # resolution errors on ssh commands; the problem never
        # occurs more than once in a row.  This especially affects
        # the autoupdate_Rollback test, but other cases have been
        # affected, too.
        #
        # We work around it by detecting the first DNS resolution error
        # and retrying exactly one time.
        dns_retry_count = 2
        while True:
            result = utils.run(full_cmd,
                               timeout,
                               True,
                               stdout,
                               stderr,
                               verbose=False,
                               stdin=stdin,
                               stderr_is_expected=ignore_status,
                               ignore_timeout=ignore_timeout)
            dns_retry_count -= 1
            if (result and result.exit_status == 255 and re.search(
                    r'^ssh: .*: Name or service not known', result.stderr)):
                if dns_retry_count:
                    logging.debug('Retrying because of DNS failure')
                    continue
                logging.debug('Retry failed.')
                autotest_stats.Counter('dns_retry_hack.fail').increment()
            elif not dns_retry_count:
                logging.debug('Retry succeeded.')
                autotest_stats.Counter('dns_retry_hack.pass').increment()
            break

        if ignore_timeout and not result:
            return None

        # The error messages will show up in band (indistinguishable
        # from stuff sent through the SSH connection), so we have the
        # remote computer echo the message "Connected." before running
        # any command.  Since the following 2 errors have to do with
        # connecting, it's safe to do these checks.
        if result.exit_status == 255:
            if re.search(
                    r'^ssh: connect to host .* port .*: '
                    r'Connection timed out\r$', result.stderr):
                raise error.AutoservSSHTimeout("ssh timed out", result)
            if "Permission denied." in result.stderr:
                msg = "ssh permission denied"
                raise error.AutoservSshPermissionDeniedError(msg, result)

        if not ignore_status and result.exit_status > 0:
            raise error.AutoservRunError("command execution error", result)

        return result
Example #15
    def offload_dir(dir_entry, dest_path):
        """Offload the specified directory entry to Google storage.

        @param dir_entry: Directory entry to offload.
        @param dest_path: Location in google storage where we will
                          offload the directory.

        """
        try:
            counter = autotest_stats.Counter(STATS_KEY)
            counter.increment('jobs_offload_started')

            sanitize_dir(dir_entry)

            if LIMIT_FILE_COUNT:
                limit_file_count(dir_entry)

            error = False
            stdout_file = tempfile.TemporaryFile('w+')
            stderr_file = tempfile.TemporaryFile('w+')
            process = None
            signal.alarm(OFFLOAD_TIMEOUT_SECS)
            gs_path = '%s%s' % (gs_uri, dest_path)
            process = subprocess.Popen(get_cmd_list(multiprocessing, dir_entry,
                                                    gs_path),
                                       stdout=stdout_file,
                                       stderr=stderr_file)
            process.wait()
            signal.alarm(0)

            if process.returncode == 0:
                dir_size = get_directory_size_kibibytes(dir_entry)

                counter.increment('kibibytes_transferred_total', dir_size)
                metadata = {
                    '_type': METADATA_TYPE,
                    'size_KB': dir_size,
                    'result_dir': dir_entry,
                    'drone': socket.gethostname().replace('.', '_')
                }
                autotest_stats.Gauge(STATS_KEY, metadata=metadata).send(
                    'kibibytes_transferred', dir_size)
                counter.increment('jobs_offloaded')
                shutil.rmtree(dir_entry)
            else:
                error = True
        except TimeoutException:
            # If we finished the call to Popen(), we may need to
            # terminate the child process.  We don't bother calling
            # process.poll(); that inherently races because the child
            # can die any time it wants.
            if process:
                try:
                    process.terminate()
                except OSError:
                    # We don't expect any error other than "No such
                    # process".
                    pass
            logging.error(
                'Offloading %s timed out after waiting %d '
                'seconds.', dir_entry, OFFLOAD_TIMEOUT_SECS)
            error = True
        except OSError as e:
            # Wrong file permissions can cause the call to
            # `shutil.rmtree(dir_entry)` to raise OSError with the message
            # 'Permission denied'. Details can be found in
            # crbug.com/536151.
            if e.errno == errno.EACCES:
                logging.warn('Try to correct file permission of %s.',
                             dir_entry)
                correct_results_folder_permission(dir_entry)
        finally:
            signal.alarm(0)
            if error:
                # Rewind the log files for stdout and stderr and log
                # their contents.
                stdout_file.seek(0)
                stderr_file.seek(0)
                stderr_content = stderr_file.read()
                logging.error('Error occurred when offloading %s:', dir_entry)
                logging.error('Stdout:\n%s \nStderr:\n%s', stdout_file.read(),
                              stderr_content)
                # Some result files may have the wrong file permissions. Try
                # to correct such errors so a later retry can succeed.
                # TODO(dshi): The code is added to correct result files
                # with wrong file permission caused by bug 511778. After
                # this code is pushed to lab and run for a while to
                # clean up these files, following code and function
                # correct_results_folder_permission can be deleted.
                if 'CommandException: Error opening file' in stderr_content:
                    logging.warn('Try to correct file permission of %s.',
                                 dir_entry)
                    correct_results_folder_permission(dir_entry)
            stdout_file.close()
            stderr_file.close()
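
The snippet above arms `signal.alarm(OFFLOAD_TIMEOUT_SECS)` and catches `TimeoutException`, but the SIGALRM handler that raises that exception is defined elsewhere. A minimal sketch of the assumed setup (the handler name is hypothetical; the exception name is taken from the snippet) could be:

import signal

class TimeoutException(Exception):
    """Raised when an offload exceeds its allotted time."""

def _alarm_handler(signum, frame):
    # Convert the SIGALRM delivered by signal.alarm() into an exception
    # so the offload code can unwind through its normal error handling.
    raise TimeoutException('Offload timed out.')

# Installed once at start-up; afterwards signal.alarm(n) arms a one-shot
# timer that interrupts the offload with TimeoutException after n seconds.
signal.signal(signal.SIGALRM, _alarm_handler)
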
    def _heartbeat_failure(self, log_message):
        logging.error("Heartbeat failed. %s", log_message)
        autotest_stats.Counter(STATS_KEY).increment('heartbeat_failures')