Example #1
0
def run(treadmill_root, upload_url=None):
    """Generate a postmortem archive and optionally upload it.

    The archive base path is ``<tempdir>/<hostname>-<timestamp>.tar``, where
    the timestamp comes from ``utils.datetime_utcnow()``. The path is handed
    to ``collect``, and whatever path ``collect`` returns is treated as the
    final postmortem file.

    :param treadmill_root: passed through unchanged to ``collect``.
    :param upload_url: forwarded to ``_UPLOADER.upload``; only used when the
        module-level ``_UPLOADER`` is not ``None``.
    """
    timestamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
    archive_name = '{0}-{1}.tar'.format(socket.gethostname(), timestamp)
    archive_base = os.path.join(tempfile.gettempdir(), archive_name)

    archive_path = collect(treadmill_root, archive_base)

    # Permission bits are only adjusted on POSIX platforms.
    if os.name == 'posix':
        os.chmod(archive_path, 0o644)
    _LOGGER.info('generated postmortem file: %r', archive_path)

    # Nothing more to do when no uploader is configured.
    if _UPLOADER is None:
        return
    _UPLOADER.upload(archive_path, upload_url)
Example #2
0
    def collect(install_dir, upload_script, upload_args):
        """Collect Treadmill node data into a postmortem file.

        The base path is ``/tmp/<hostname>-<timestamp>.tar``; the path
        returned by ``postmortem.collect`` is logged, set to mode 0o644 and,
        if an upload script was supplied, passed to that script.

        :param install_dir: passed through to ``postmortem.collect``.
        :param upload_script: executable invoked with the postmortem file as
            its first argument, or ``None`` to skip the upload.
        :param upload_args: shell-style string of extra arguments for the
            upload script (split with ``shlex.split``), or ``None``.
        """
        timestamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
        archive_name = '{0}-{1}.tar'.format(socket.gethostname(), timestamp)
        archive_base = os.path.join('/tmp', archive_name)

        archive_path = postmortem.collect(install_dir, archive_base)
        _LOGGER.info('generated postmortem file: %r', archive_path)

        # Set the file mode to 644. Only the mode is changed here; no
        # ownership change is performed by this function.
        os.chmod(archive_path, 0o644)

        # Without an upload script there is nothing left to do.
        if upload_script is None:
            return

        if upload_args is None:
            extra_args = []
        else:
            extra_args = shlex.split(upload_args)

        utils.check_call([upload_script, archive_path] + extra_args)
Example #3
0
def run(treadmill_root):
    """Write a gzipped postmortem archive and prune older entries.

    The archive is created as
    ``<treadmill_root>/postmortem/<hostname>-<timestamp>.tar.gz`` and filled
    by calling ``collect`` with the open tar file. Afterwards every entry in
    the postmortem directory is listed, sorted by path, and all but the
    trailing ``_MAX_ARCHIVES`` entries are removed.

    :param treadmill_root: root directory; also passed to ``collect``.
    """
    timestamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
    archive_name = '{0}-{1}.tar.gz'.format(socket.gethostname(), timestamp)

    archive_dir = os.path.join(treadmill_root, 'postmortem')
    fs.mkdir_safe(archive_dir)
    archive_path = os.path.join(archive_dir, archive_name)

    _LOGGER.info('Collection postmortem: %s', archive_path)

    with tarfile.open(archive_path, 'w:gz') as archive:
        collect(treadmill_root, archive)

    # Permission bits are only adjusted on POSIX platforms.
    if os.name == 'posix':
        os.chmod(archive_path, 0o644)

    # Keep only the last _MAX_ARCHIVES entries in sorted path order; this
    # matches anything in the directory, not just archives written here.
    everything = sorted(glob.glob(os.path.join(archive_dir, '*')))
    for stale_path in everything[:-_MAX_ARCHIVES]:
        _LOGGER.info('Removing old archive: %s', stale_path)
        fs.rm_safe(stale_path)
Example #4
0
def _cleanup(tm_env, zkclient, container_dir, app):
    """Tear down a container that actually ran and archive its data.

    Steps, in order: kill processes found via the container root, archive
    the root filesystem, delete the localdisk volume, clean up the network
    (unless the app uses a shared network), flush and copy the metrics RRD,
    delete the cgroup resources, archive local logs, then tar the container
    directory and hand the tarball to ``_send_container_archive``.

    Archiving, log storage and the final tar/send step are best-effort:
    their failures are logged and do not abort the cleanup. Errors from the
    localdisk and cgroup deletions propagate unless they are ENOENT.

    :param tm_env: environment object providing the ``svc_cgroup``,
        ``svc_localdisk`` and ``svc_network`` services and ``metrics_dir``.
    :param zkclient: forwarded to ``_send_container_archive``.
    :param container_dir: directory holding this container's data.
    :param app: application manifest object (reads ``name``, ``archive``,
        ``shared_network``, ``app``, ``task`` and ``uniqueid``).
    """
    # Too many branches.
    #
    # pylint: disable=R0912

    container_root = os.path.join(container_dir, 'root')
    # Unique name under which this app's resources are registered.
    unique_name = appmgr.app_unique_name(app)

    # One client per resource service, each rooted in the container dir.
    cgroup_client = tm_env.svc_cgroup.make_client(
        os.path.join(container_dir, 'cgroups'))
    localdisk_client = tm_env.svc_localdisk.make_client(
        os.path.join(container_dir, 'localdisk'))
    network_client = tm_env.svc_network.make_client(
        os.path.join(container_dir, 'network'))

    # Make sure all processes are killed
    # FIXME(boysson): Should we use `kill_apps_in_cgroup` instead?
    _kill_apps_by_root(container_root)

    # Name of the tarball that will hold this container's data.
    timestamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%S%f')
    tarball_name = '{instance_name}_{hostname}_{timestamp}.tar'.format(
        instance_name=appmgr.appname_task_id(app.name),
        hostname=sysinfo.hostname(),
        timestamp=timestamp)
    archive_filename = os.path.join(container_dir, tarball_name)

    # Archive the container root filesystem (best-effort).
    try:
        disk_info = localdisk_client.get(unique_name)
        fs.archive_filesystem(disk_info['block_dev'], container_root,
                              archive_filename, app.archive)
    except services.ResourceServiceError:
        _LOGGER.warning('localdisk never allocated')
    except subprocess.CalledProcessError:
        _LOGGER.exception('Unable to archive root device of %r', unique_name)
    except:  # pylint: disable=W0702
        _LOGGER.exception('Unknown exception while archiving %r', unique_name)

    # Destroy the volume; a missing entry (ENOENT) is not an error.
    try:
        localdisk_client.delete(unique_name)
    except (IOError, OSError) as err:
        if err.errno != errno.ENOENT:
            raise

    # Apps on a shared network skip the network cleanup.
    if not app.shared_network:
        _cleanup_network(tm_env, app, network_client)

    # Flush the app's metrics file and copy it into the container dir so
    # that it ends up in the archive.
    rrd_name = '{name}-{instanceid}-{uniqueid}.rrd'.format(
        name=app.app,
        instanceid=app.task,
        uniqueid=app.uniqueid,
    )
    rrd_path = os.path.join(tm_env.metrics_dir, 'apps', rrd_name)
    rrdutils.flush_noexc(rrd_path)
    _copy_metrics(rrd_path, container_dir)

    # Release cgroup resources; a missing entry (ENOENT) is not an error.
    try:
        cgroup_client.delete(unique_name)
    except (IOError, OSError) as err:
        if err.errno != errno.ENOENT:
            raise

    # Storing local logs is best-effort.
    try:
        _archive_logs(tm_env, container_dir)
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Unexpected exception storing local logs.')

    # Tar the container directory into the archive with gzip compression,
    # then send the resulting tarball (best-effort).
    try:
        tarball = fs.tar(sources=container_dir,
                         target=archive_filename,
                         compression='gzip')
        archive_filename = tarball.name
        _send_container_archive(zkclient, app, archive_filename)
    except:  # pylint: disable=W0702
        _LOGGER.exception("Failed to update archive")