Example #1
0
def main(argv):
    """Standard main routine.

    Exits with status 1 when `_parse_command()` returns nothing usable.
    Otherwise the inventory reports are run, wrapped in a 'repair_loops'
    ts_mon setup unless `arguments.debug` is set without
    `arguments.debug_metrics`.  A `KeyboardInterrupt` is swallowed
    silently; every other exception is logged and not re-raised.

    @param argv  Command line arguments, including `sys.argv[0]`.
    """
    arguments = _parse_command(argv)
    if not arguments:
        sys.exit(1)
    _configure_logging(arguments)
    try:
        with_metrics = arguments.debug_metrics or not arguments.debug
        if not with_metrics:
            _perform_inventory_reports(arguments)
        else:
            # Under debug_metrics the ts_mon debug file is /dev/null;
            # otherwise no debug file is passed at all.
            if arguments.debug_metrics:
                metrics_file = '/dev/null'
            else:
                metrics_file = None
            ts_mon_state = site_utils.SetupTsMonGlobalState(
                'repair_loops',
                debug_file=metrics_file,
                auto_flush=False)
            with ts_mon_state:
                _perform_inventory_reports(arguments)
            # auto_flush is off, so flush by hand once the reports are done.
            metrics.Flush()
    except KeyboardInterrupt:
        pass
    except EnvironmentError as os_error:
        logging.exception('Unexpected OS error: %s', os_error)
    except Exception as error:
        logging.exception('Unexpected exception: %s', error)
Example #2
0
def _WaitToFlush(last_flush, reset_after=()):
  """Waits out the rest of FLUSH_INTERVAL since |last_flush|, then flushes.

  Args:
    last_flush: timestamp of the last flush, comparable to time.time().
    reset_after: A list of metrics to reset after the flush.
  """
  elapsed = time.time() - last_flush
  remaining = FLUSH_INTERVAL - elapsed
  # A non-positive remainder means the interval has already passed.
  time.sleep(max(0, remaining))
  metrics.Flush(reset_after=reset_after)
Example #3
0
def collect_metrics(cycles):
  """Runs every metric collector once, in order, then flushes the metrics.

  Args:
    cycles: OS info is collected only when this equals 0. Presumably the
      number of collection cycles completed so far -- confirm with caller.
  """
  system_metrics.get_uptime()
  system_metrics.get_cpu_info()
  system_metrics.get_disk_info()
  system_metrics.get_mem_info()
  system_metrics.get_net_info()
  system_metrics.get_proc_info()
  system_metrics.get_load_avg()
  puppet_metrics.get_puppet_summary()
  if cycles == 0:
    # OS info is gathered only on the cycle numbered 0.
    system_metrics.get_os_info()
  # Keep the unix-time collection as the last call ahead of the flush.
  system_metrics.get_unix_time()  # must be just before flush
  metrics.Flush()
Example #4
0
    def _FlushIfReady(self):
        """Call metrics.Flush() once FLUSH_INTERVAL seconds have passed.

        This allows us to only call flush every FLUSH_INTERVAL seconds.
        The decision is based on elapsed time alone; `self.pending` is
        cleared after a flush but is not consulted beforehand.

        Returns:
          Seconds elapsed since the last flush, or 0 when this call
          performed a flush.
        """
        now = time.time()
        elapsed = now - self.last_flush
        if not elapsed > FLUSH_INTERVAL:
            # Too soon since the previous flush; just report the elapsed time.
            return elapsed

        # The flush timestamp is recorded before the flush itself runs.
        self.last_flush = now
        metrics.Flush(reset_after=self.reset_after_flush)
        self.pending = False
        return 0
 def __call__(self):
     """Run every metric collector once, in order, then flush the metrics."""
     system_metrics.collect_uptime()
     system_metrics.collect_cpu_info()
     system_metrics.collect_disk_info()
     system_metrics.collect_mem_info()
     net_metrics.collect_net_info()
     proc_metrics.collect_proc_info()
     system_metrics.collect_load_avg()
     puppet_metrics.collect_puppet_summary()
     git_metrics.collect_git_metrics()
     self._collect_osinfo()
     # Keep the unix-time collection as the last call ahead of the flush.
     system_metrics.collect_unix_time()  # must be just before flush
     metrics.Flush()
def Main():
    """Sets up logging and ts_mon, then runs the matchers against stdin.

    Metrics are flushed explicitly once the matchers have finished, while
    the ts_mon state is still set up.
    """
    args = ParseArgs()
    log_daemon_common.SetupLogging(args)

    # Only hand a debug file to ts_mon when one was given on the command line.
    ts_mon_args = ({'debug_file': args.debug_metrics_file}
                   if args.debug_metrics_file else {})

    ts_mon_state = ts_mon_config.SetupTsMonGlobalState(
        'apache_error_log_metrics', **ts_mon_args)
    with ts_mon_state:
        log_daemon_common.RunMatchers(sys.stdin, MATCHERS)
        metrics.Flush()
def main():
    """tko_parse entry point.

    Parses the command line, runs `_main_with_options()` under a success
    counter, and flushes metrics on the way out whether or not the run
    raised.
    """
    options, args = parse_args()

    # tko/parse is expected to be a very short lived (<1 min) script when
    # working effectively.  We can neither wait up to 1min for metrics to
    # flush at the end nor drop metrics sent within the last minute of
    # execution, hence: indirect=False, no SetupTsMonGlobalState context
    # manager, and a manual flush in the finally clause below.
    site_utils.SetupTsMonGlobalState('tko_parse',
                                     indirect=False,
                                     short_lived=True)
    try:
        run_counter = metrics.SuccessCounter(
            'chromeos/autotest/tko_parse/runs')
        with run_counter:
            _main_with_options(options, args)
    finally:
        metrics.Flush()
def main(argv):
    """Standard main routine.

    Picks a metrics context manager (real ts_mon setup only for
    production runs), then runs the balancer over every inferred target
    in a process pool.  Metrics are flushed once the pool is done, also
    after a `KeyboardInterrupt`, which is otherwise ignored.

    @param argv  Command line arguments including `sys.argv[0]`.

    """
    arguments = _parse_command(argv)
    if not arguments.production:
        metrics_manager = site_utils.TrivialContextManager()
    else:
        metrics_manager = site_utils.SetupTsMonGlobalState(
            'balance_pools', indirect=False, auto_flush=False)

    with metrics_manager:
        # The time window handed to the balancer is the 24 hours ending now.
        end_time = time.time()
        start_time = end_time - 24 * 60 * 60
        afe = frontend.AFE(server=arguments.web)

        def balancer(pool, labels):
            """Balance the specified model.

            @param pool: The pool to rebalance for the model.
            @param labels: labels to restrict to balancing operations
                    within.
            """
            _balance_model(arguments, afe, pool, labels, start_time, end_time)
            _log_message('')

        if arguments.pool == _ALL_CRITICAL_POOLS:
            pools = lab_inventory.CRITICAL_POOLS
        else:
            pools = [arguments.pool]
        balancer_targets = infer_balancer_targets(afe, arguments, pools)
        try:
            parallel.RunTasksInProcessPool(balancer,
                                           balancer_targets,
                                           processes=8)
        except KeyboardInterrupt:
            pass
        finally:
            metrics.Flush()
Example #9
0
def _FlushIfReady(pending, last_flush, reset_after=()):
  """Call metrics.Flush() if we are ready and have pending metrics.

  This allows us to only call flush every FLUSH_INTERVAL seconds.

  Args:
    pending: bool indicating whether there are pending metrics to flush.
    last_flush: time stamp of the last time flush() was called.
    reset_after: A list of metrics to reset after the flush.

  Returns:
    A (pending, last_flush, time_delta) tuple. After a flush this is
    (False, <time of this call>, 0). Otherwise it is (True, the
    |last_flush| passed in, seconds elapsed since |last_flush|).
  """
  now = time.time()
  elapsed = now - last_flush
  should_flush = elapsed > FLUSH_INTERVAL and pending
  if not should_flush:
    # Either it is too early or nothing was pending; in both cases the
    # caller gets pending=True back.
    return True, last_flush, elapsed

  metrics.Flush(reset_after=reset_after)
  return False, now, 0
Example #10
0
def main():
    """Standard main routine.

    Parses the command line, configures logging and ts_mon, then calls
    `_assign_stable_images()` under a success counter.  Metrics are
    flushed on the way out whether or not that call raised.
    """
    parser = argparse.ArgumentParser(
        description='Update the stable repair version for all boards')
    parser.add_argument('-n', '--dry-run',
                        action='store_true',
                        help='print changes without executing them')
    loglib.add_logging_options(parser)
    # TODO(crbug/888046) Make these arguments required once puppet is updated to
    # pass them in.
    parser.add_argument('--web',
                        default='cautotest',
                        help='URL to the AFE to update.')

    arguments = parser.parse_args()
    loglib.configure_logging_with_args(parser, arguments)

    tsmon_args = dict(service_name=parser.prog,
                      indirect=False,
                      auto_flush=False)
    if arguments.dry_run:
        logging.info('DRYRUN: No changes will be made.')
        # metrics will be logged to logging stream anyway.
        tsmon_args['debug_file'] = '/dev/null'

    try:
        with ts_mon_config.SetupTsMonGlobalState(**tsmon_args):
            tick_counter = metrics.SuccessCounter(
                _METRICS_PREFIX + '/tick',
                fields={'afe': arguments.web})
            with tick_counter:
                _assign_stable_images(arguments)
    finally:
        # auto_flush is disabled above, so flush explicitly.
        metrics.Flush()
Example #11
0
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    Redirects stdin to the null device, installs the signal handlers,
    builds a `server_job` from the parsed options, runs the requested
    special task or control file, flushes metrics and then calls
    `sys.exit()`; this function does not return normally.  The exit code
    is 0 unless something raised while the job was running, in which case
    the traceback is printed and the exit code is 1.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows autoserv process to keep running after the caller
    # process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
        parser.options.results)
    container_id = lxc.ContainerId(job_or_task_id, time.time(), os.getpid())

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        """Record the abort, clean up the container, then kill the group.

        @param signum: Signal number passed in by the signal module (unused).
        @param frame: Stack frame passed in by the signal module (unused).
        """
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                'Destroy container %s before aborting the autoserv '
                'process.', container_id)
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get_container(container_id)
                if container:
                    container.destroy()
                else:
                    logging.debug('Container %s is not found.', container_id)
            except:
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_id)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        # SIGKILL the whole process group, this process included.
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        # NOTE: sys.exc_clear() only exists in Python 2.
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    # Pull the individual options out of the parser into locals.
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    execution_tag = parser.options.execution_tag
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
            "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo
                       or provision or reset)
    use_client_trampoline = False
    # Work out the control file: --control-name first, then the first
    # positional argument; special tasks may run without one.
    if parser.options.control_name:
        if use_ssp:
            # When use_ssp is True, autoserv will be re-executed inside a
            # container preserving the --control-name argument. Control file
            # will be staged inside the re-executed autoserv.
            control = None
        else:
            try:
                control = _stage_control_file(parser.options.control_name,
                                              results)
            except error.AutoservError as e:
                logging.info("Using client trampoline because of: %s", e)
                control = parser.options.control_name
                use_client_trampoline = True

    elif parser.args:
        control = parser.args[0]
    else:
        if not is_special_task:
            parser.parser.error("Missing argument: control file")
        control = None

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    # Keyword arguments for the server_job constructor.
    job_kwargs = {
        'control':
        control,
        'args':
        parser.args[1:],
        'resultdir':
        results,
        'label':
        label,
        'user':
        user,
        'machines':
        machines,
        'machine_dict_list':
        server_job.get_machine_dicts(
            machine_names=machines,
            store_dir=os.path.join(results, parser.options.host_info_subdir),
            in_lab=in_lab,
            use_shadow_store=not parser.options.local_only_host_info,
            host_attributes=parser.options.host_attributes,
        ),
        'client':
        client,
        'ssh_user':
        ssh_user,
        'ssh_port':
        ssh_port,
        'ssh_pass':
        ssh_pass,
        'ssh_verbosity_flag':
        ssh_verbosity_flag,
        'ssh_options':
        ssh_options,
        'group_name':
        group_name,
        'tag':
        execution_tag,
        'disable_sysinfo':
        parser.options.disable_sysinfo,
        'in_lab':
        in_lab,
        'use_client_trampoline':
        use_client_trampoline,
    }
    # These two keys are only passed along when the option was given.
    if parser.options.parent_job_id:
        job_kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        job_kwargs['control_filename'] = control_filename
    job = server_job.server_job(**job_kwargs)

    job.logging.start_logging()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value('AUTOSERV',
                                                 'auto_start_servod',
                                                 type=bool,
                                                 default=False)

    # ts_mon is set up without a context manager here; the outer finally
    # below flushes the metrics by hand.
    site_utils.SetupTsMonGlobalState('autoserv',
                                     indirect=False,
                                     short_lived=True)
    try:
        try:
            # At most one special task runs; the first matching flag wins.
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_id, job_or_task_id,
                                      results, parser, ssp_url, machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                        'in_container': utils.is_in_container(),
                        'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        # Only reached when job.run() did not raise.
                        c['success'] = True

        finally:
            job.close()
            # Special task doesn't run parse, so result summary needs to be
            # built here.
            if results and (repair or verify or reset or cleanup or provision):
                # Throttle the result on the server side.
                try:
                    result_utils.execute(
                        results, control_data.DEFAULT_MAX_RESULT_SIZE_KB)
                except:
                    logging.exception(
                        'Non-critical failure: Failed to throttle results '
                        'in directory %s.', results)
                # Build result view and report metrics for result sizes.
                site_utils.collect_result_sizes(results)
    except:
        # Anything raised above, including from the inner finally clause,
        # becomes exit code 1 after the traceback is printed.
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    sys.exit(exit_code)
def run_autoserv(pid_file_manager, results, parser, ssp_url, use_ssp):
    """Run server job with given options.

    Redirects stdin to the null device, installs the signal handlers,
    builds a `server_job` from the parsed options, runs the requested
    special task or control file, flushes metrics, records the outcome in
    the pid file (when a manager was given) and then calls `sys.exit()`;
    this function does not return normally.  The exit code is 0 unless
    something raised while the job was running, in which case the
    traceback is printed and the exit code is 1.

    @param pid_file_manager: PidFileManager used to monitor the autoserv process
    @param results: Folder to store results.
    @param parser: Parser for the command line arguments.
    @param ssp_url: Url to server-side package.
    @param use_ssp: Set to True to run with server-side packaging.
    """
    if parser.options.warn_no_ssp:
        # Post a warning in the log.
        logging.warn('Autoserv is required to run with server-side packaging. '
                     'However, no drone is found to support server-side '
                     'packaging. The test will be executed in a drone without '
                     'server-side packaging supported.')

    # send stdin to /dev/null
    dev_null = os.open(os.devnull, os.O_RDONLY)
    os.dup2(dev_null, sys.stdin.fileno())
    os.close(dev_null)

    # Create separate process group if the process is not a process group
    # leader. This allows autoserv process to keep running after the caller
    # process (drone manager call) exits.
    if os.getpid() != os.getpgid(0):
        os.setsid()

    # Container name is predefined so the container can be destroyed in
    # handle_sigterm.
    job_or_task_id = job_directories.get_job_id_or_task_id(
        parser.options.results)
    container_name = (lxc.TEST_CONTAINER_NAME_FMT %
                      (job_or_task_id, time.time(), os.getpid()))
    job_folder = job_directories.get_job_folder_name(parser.options.results)

    # Implement SIGTERM handler
    def handle_sigterm(signum, frame):
        """Record the abort, clean up the container, then kill the group.

        @param signum: Signal number passed in by the signal module (unused).
        @param frame: Stack frame passed in by the signal module (unused).
        """
        logging.debug('Received SIGTERM')
        if pid_file_manager:
            pid_file_manager.close_file(1, signal.SIGTERM)
        logging.debug('Finished writing to pid_file. Killing process.')

        # Update results folder's file permission. This needs to be done ASAP
        # before the parsing process tries to access the log.
        if use_ssp and results:
            correct_results_folder_permission(results)

        # TODO (sbasi) - remove the time.sleep when crbug.com/302815 is solved.
        # This sleep allows the pending output to be logged before the kill
        # signal is sent.
        time.sleep(.1)
        if use_ssp:
            logging.debug(
                'Destroy container %s before aborting the autoserv '
                'process.', container_name)
            # Outcome of the abort, posted via autotest_es below; starts out
            # as a success and is downgraded when the destroy fails.
            metadata = {
                'drone': socket.gethostname(),
                'job_id': job_or_task_id,
                'container_name': container_name,
                'action': 'abort',
                'success': True
            }
            try:
                bucket = lxc.ContainerBucket()
                container = bucket.get(container_name)
                if container:
                    container.destroy()
                else:
                    metadata['success'] = False
                    metadata['error'] = 'container not found'
                    logging.debug('Container %s is not found.', container_name)
            except:
                metadata['success'] = False
                metadata['error'] = 'Exception: %s' % str(sys.exc_info())
                # Handle any exception so the autoserv process can be aborted.
                logging.exception('Failed to destroy container %s.',
                                  container_name)
            autotest_es.post(use_http=True,
                             type_str=lxc.CONTAINER_RUN_TEST_METADB_TYPE,
                             metadata=metadata)
            # Try to correct the result file permission again after the
            # container is destroyed, as the container might have created some
            # new files in the result folder.
            if results:
                correct_results_folder_permission(results)

        # SIGKILL the whole process group, this process included.
        os.killpg(os.getpgrp(), signal.SIGKILL)

    # Set signal handler
    signal.signal(signal.SIGTERM, handle_sigterm)

    # faulthandler is only needed to debug in the Lab and is not available to
    # be imported in the chroot as part of VMTest, so Try-Except it.
    try:
        import faulthandler
        faulthandler.register(signal.SIGTERM, all_threads=True, chain=True)
        logging.debug('faulthandler registered on SIGTERM.')
    except ImportError:
        # NOTE: sys.exc_clear() only exists in Python 2.
        sys.exc_clear()

    # Ignore SIGTTOU's generated by output from forked children.
    signal.signal(signal.SIGTTOU, signal.SIG_IGN)

    # If we received a SIGALARM, let's be loud about it.
    signal.signal(signal.SIGALRM, log_alarm)

    # Server side tests that call shell scripts often depend on $USER being set
    # but depending on how you launch your autotest scheduler it may not be set.
    os.environ['USER'] = getpass.getuser()

    # Pull the individual options out of the parser into locals.
    label = parser.options.label
    group_name = parser.options.group_name
    user = parser.options.user
    client = parser.options.client
    server = parser.options.server
    install_before = parser.options.install_before
    install_after = parser.options.install_after
    verify = parser.options.verify
    repair = parser.options.repair
    cleanup = parser.options.cleanup
    provision = parser.options.provision
    reset = parser.options.reset
    job_labels = parser.options.job_labels
    no_tee = parser.options.no_tee
    parse_job = parser.options.parse_job
    execution_tag = parser.options.execution_tag
    # Fall back to the parse_job value when no execution tag was given.
    if not execution_tag:
        execution_tag = parse_job
    ssh_user = parser.options.ssh_user
    ssh_port = parser.options.ssh_port
    ssh_pass = parser.options.ssh_pass
    collect_crashinfo = parser.options.collect_crashinfo
    control_filename = parser.options.control_filename
    test_retry = parser.options.test_retry
    verify_job_repo_url = parser.options.verify_job_repo_url
    skip_crash_collection = parser.options.skip_crash_collection
    ssh_verbosity = int(parser.options.ssh_verbosity)
    ssh_options = parser.options.ssh_options
    no_use_packaging = parser.options.no_use_packaging
    host_attributes = parser.options.host_attributes
    in_lab = bool(parser.options.lab)

    # can't be both a client and a server side test
    if client and server:
        parser.parser.error(
            "Can not specify a test as both server and client!")

    if provision and client:
        parser.parser.error("Cannot specify provisioning and client!")

    is_special_task = (verify or repair or cleanup or collect_crashinfo
                       or provision or reset)
    if len(parser.args) < 1 and not is_special_task:
        parser.parser.error("Missing argument: control file")

    if ssh_verbosity > 0:
        # ssh_verbosity is an integer between 0 and 3, inclusive
        ssh_verbosity_flag = '-' + 'v' * ssh_verbosity
    else:
        ssh_verbosity_flag = ''

    # We have a control file unless it's just a verify/repair/cleanup job
    if len(parser.args) > 0:
        control = parser.args[0]
    else:
        control = None

    machines = _get_machines(parser)
    if group_name and len(machines) < 2:
        parser.parser.error('-G %r may only be supplied with more than one '
                            'machine.' % group_name)

    # Keyword arguments for server_job; the optional ones are added only
    # when the matching option was given.
    kwargs = {
        'group_name': group_name,
        'tag': execution_tag,
        'disable_sysinfo': parser.options.disable_sysinfo
    }
    if parser.options.parent_job_id:
        kwargs['parent_job_id'] = int(parser.options.parent_job_id)
    if control_filename:
        kwargs['control_filename'] = control_filename
    if host_attributes:
        kwargs['host_attributes'] = host_attributes
    kwargs['in_lab'] = in_lab
    job = server_job.server_job(control, parser.args[1:], results, label, user,
                                machines, client, parse_job, ssh_user,
                                ssh_port, ssh_pass, ssh_verbosity_flag,
                                ssh_options, test_retry, **kwargs)

    job.logging.start_logging()
    job.init_parser()

    # perform checks
    job.precheck()

    # run the job
    exit_code = 0
    auto_start_servod = _CONFIG.get_config_value('AUTOSERV',
                                                 'auto_start_servod',
                                                 type=bool,
                                                 default=False)

    # ts_mon is set up without a context manager here; the outer finally
    # below flushes the metrics by hand.
    site_utils.SetupTsMonGlobalState('autoserv',
                                     indirect=False,
                                     short_lived=True)
    try:
        try:
            # At most one special task runs; the first matching flag wins.
            if repair:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                job.repair(job_labels)
            elif verify:
                job.verify(job_labels)
            elif provision:
                job.provision(job_labels)
            elif reset:
                job.reset(job_labels)
            elif cleanup:
                job.cleanup(job_labels)
            else:
                if auto_start_servod and len(machines) == 1:
                    _start_servod(machines[0])
                if use_ssp:
                    try:
                        _run_with_ssp(job, container_name, job_or_task_id,
                                      results, parser, ssp_url, job_folder,
                                      machines)
                    finally:
                        # Update the ownership of files in result folder.
                        correct_results_folder_permission(results)
                else:
                    if collect_crashinfo:
                        # Update the ownership of files in result folder. If the
                        # job to collect crashinfo was running inside container
                        # (SSP) and crashed before correcting folder permission,
                        # the result folder might have wrong permission setting.
                        try:
                            correct_results_folder_permission(results)
                        except:
                            # Ignore any error as the user may not have root
                            # permission to run sudo command.
                            pass
                    metric_name = ('chromeos/autotest/experimental/'
                                   'autoserv_job_run_duration')
                    f = {
                        'in_container': utils.is_in_container(),
                        'success': False
                    }
                    with metrics.SecondsTimer(metric_name, fields=f) as c:
                        job.run(install_before,
                                install_after,
                                verify_job_repo_url=verify_job_repo_url,
                                only_collect_crashinfo=collect_crashinfo,
                                skip_crash_collection=skip_crash_collection,
                                job_labels=job_labels,
                                use_packaging=(not no_use_packaging))
                        # Only reached when job.run() did not raise.
                        c['success'] = True

        finally:
            # Close every host still attached to the job.
            while job.hosts:
                host = job.hosts.pop()
                host.close()
    except:
        # Anything raised above, including from the inner finally clause,
        # becomes exit code 1 after the traceback is printed.
        exit_code = 1
        traceback.print_exc()
    finally:
        metrics.Flush()

    # Record the failure count and exit code in the pid file, if any.
    if pid_file_manager:
        pid_file_manager.num_tests_failed = job.num_tests_failed
        pid_file_manager.close_file(exit_code)
    job.cleanup_parser()

    sys.exit(exit_code)
Example #13
0
 def _WaitToFlush(self):
     """Waits out the rest of FLUSH_INTERVAL since the last flush, then flushes."""
     elapsed = time.time() - self.last_flush
     remaining = FLUSH_INTERVAL - elapsed
     # A non-positive remainder means the interval has already passed.
     time.sleep(max(0, remaining))
     metrics.Flush(reset_after=self.reset_after_flush)
def main():
    """Entry point: parse options and clean labels from the AFE database.

    Reads the command line, configures logging, sets up ts_mon (pointed at a
    temporary debug file when --dry-run is given), runs clean_labels() and
    increments the '/tick' counter with the outcome before flushing metrics.
    Any exception raised by clean_labels() is re-raised after being counted.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    # (flags, keyword arguments) for every command line argument, in the
    # order they are registered with the parser.
    argument_table = (
        (('--db',),
         dict(dest='db_server', help='Database server', default=DB_SERVER)),
        (('--db-user',),
         dict(dest='db_user', help='Database user', default=USER)),
        (('--db-password',),
         dict(dest='db_password', help='Database password', default=PASSWD)),
        (('-p',),
         dict(dest='prefix',
              action='store_true',
              help=('Use argument <label> as a prefix for matching. '
                    'For example, when the argument <label> is "cros-version" '
                    'and this option is enabled, then labels whose name '
                    'beginning with "cros-version" are matched. When this '
                    'option is disabled, we match labels whose name is '
                    'exactly same as the argument <label>.'))),
        (('-n',),
         dict(dest='max_delete',
              type=int,
              help='Max number of records to delete in each query.',
              default=100)),
        (('-s',),
         dict(dest='check_status',
              action='store_true',
              help='Enforce to run only in a server that has primary status')),
        (('--dry-run',),
         dict(dest='dry_run',
              action='store_true',
              help='Dry run mode. Do not actually delete any labels.')),
        (('label',),
         dict(help='Label name to delete')),
    )
    for flags, settings in argument_table:
        parser.add_argument(*flags, **settings)
    options = parser.parse_args()

    logging_config.LoggingConfig().configure_logging(
        datefmt='%Y-%m-%d %H:%M:%S', verbose=True)

    ts_mon_kwargs = {'auto_flush': False}
    if options.dry_run:
        # In dry-run mode ts_mon is given a temp file as its debug_file; the
        # path is logged once the metrics have been flushed.
        handle, metrics_file = tempfile.mkstemp()
        os.close(handle)
        ts_mon_kwargs['debug_file'] = metrics_file

    def _count_tick(succeeded):
        """Increment the '/tick' counter tagged with the run's outcome."""
        metrics.Counter(_METRICS_PREFIX + '/tick').increment(
            fields={
                'target_db': options.db_server,
                'success': succeeded
            })

    with ts_mon_config.SetupTsMonGlobalState('afe_label_cleaner',
                                             **ts_mon_kwargs):
        try:
            clean_labels(options)
        except:
            # Deliberately bare: every failure is counted, then re-raised.
            _count_tick(False)
            raise
        else:
            _count_tick(True)
        finally:
            metrics.Flush()
            if options.dry_run:
                logging.info('Dumped ts_mon metrics to %s', metrics_file)
Example #15
0
def main():
    """Main entrance.

    Parses the command line, optionally detaches from the parent process,
    and parses every job directory found under the results directory given
    as the first positional argument (or just that directory when
    `options.singledir` is set). Each job directory is guarded by an
    exclusive flock on its `.parse.lock` file while it is being parsed.

    The parser pid file is closed with exit status 1 if anything raises
    (the exception is then re-raised) and with 0 otherwise; metrics are
    flushed in both cases. When `options.record_duration` is set, the total
    duration is passed to record_parsing() together with the processed jobs.
    """
    start_time = datetime.datetime.now()
    # Record the processed jobs so that
    # we can send the duration of parsing to metadata db.
    processed_jobs = set()

    options, args = parse_args()

    if options.detach:
        _detach_from_parent_process()

    parse_options = _ParseOptions(options.reparse, options.mailit,
                                  options.dry_run, options.suite_report,
                                  options.datastore_creds,
                                  options.export_to_gcloud_path)
    results_dir = os.path.abspath(args[0])
    assert os.path.exists(results_dir)

    site_utils.SetupTsMonGlobalState('tko_parse',
                                     indirect=False,
                                     short_lived=True)

    pid_file_manager = pidfile.PidFileManager("parser", results_dir)

    if options.write_pidfile:
        pid_file_manager.open_file()

    try:
        # build up the list of job dirs to parse
        if options.singledir:
            jobs_list = [results_dir]
        else:
            jobs_list = [
                os.path.join(results_dir, subdir)
                for subdir in os.listdir(results_dir)
            ]

        # build up the database
        db = tko_db.db(autocommit=False,
                       host=options.db_host,
                       user=options.db_user,
                       password=options.db_pass,
                       database=options.db_name)

        # parse all the jobs
        for path in jobs_list:
            lockfile = open(os.path.join(path, ".parse.lock"), "w")
            flags = fcntl.LOCK_EX
            if options.noblock:
                flags |= fcntl.LOCK_NB
            try:
                fcntl.flock(lockfile, flags)
            except IOError as e:
                # The lock was not acquired, so the handle is of no further
                # use on either path; close it so it does not leak.
                lockfile.close()
                # lock is not available and nonblock has been requested
                if e.errno == errno.EWOULDBLOCK:
                    continue
                raise  # something unexpected happened
            try:
                new_jobs = parse_path(db, path, options.level, parse_options)
                processed_jobs.update(new_jobs)

            finally:
                # Always release the lock and the handle, even if parsing
                # this path failed.
                fcntl.flock(lockfile, fcntl.LOCK_UN)
                lockfile.close()

    except Exception:
        pid_file_manager.close_file(1)
        raise
    else:
        pid_file_manager.close_file(0)
    finally:
        metrics.Flush()
    duration_secs = (datetime.datetime.now() - start_time).total_seconds()
    if options.record_duration:
        record_parsing(processed_jobs, duration_secs)