def run(treadmill_root, upload_url=None):
    """Generate a postmortem archive in the temp directory and upload it.

    The archive base name is ``<hostname>-<UTC timestamp>.tar`` under
    ``tempfile.gettempdir()``; the final path is whatever ``collect``
    returns for that base name.

    :param treadmill_root:
        Treadmill root directory, handed to ``collect``.
    :param upload_url:
        Destination forwarded unchanged to the uploader (may be ``None``).
    """
    stamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
    node = socket.gethostname()
    tmp_dir = tempfile.gettempdir()
    archive_base = os.path.join(tmp_dir, '{0}-{1}.tar'.format(node, stamp))

    archive = collect(treadmill_root, archive_base)

    # File modes are only adjusted on POSIX hosts.
    if os.name == 'posix':
        os.chmod(archive, 0o644)

    _LOGGER.info('generated postmortem file: %r', archive)

    # Nothing more to do when no uploader is configured.
    if _UPLOADER is None:
        return

    _UPLOADER.upload(archive, upload_url)
def collect(install_dir, upload_script, upload_args):
    """Collect Treadmill node data into a postmortem file and upload it.

    The file base name is ``/tmp/<hostname>-<UTC timestamp>.tar``; the
    final path is whatever ``postmortem.collect`` returns for it.

    :param install_dir:
        Treadmill install directory, handed to ``postmortem.collect``.
    :param upload_script:
        Executable invoked with the postmortem file as its first argument,
        or ``None`` to skip the upload step.
    :param upload_args:
        Extra arguments for the upload script as a single shell-style
        string (split with ``shlex.split``), or ``None`` for no extras.
    """
    stamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
    node = socket.gethostname()
    archive_base = os.path.join('/tmp', '{0}-{1}.tar'.format(node, stamp))

    archive = postmortem.collect(install_dir, archive_base)
    _LOGGER.info('generated postmortem file: %r', archive)

    # Make the file readable by everyone (mode 644).
    # NOTE(review): the earlier comment also mentioned changing the owner to
    # the treadmill proid, but no chown is performed here.
    os.chmod(archive, 0o644)

    # Without an upload script there is nothing left to do.
    if upload_script is None:
        return

    command = [upload_script, archive]
    if upload_args is not None:
        command.extend(shlex.split(upload_args))
    utils.check_call(command)
def run(treadmill_root):
    """Write a gzipped postmortem archive and prune older ones.

    The archive is created as
    ``<treadmill_root>/postmortem/<hostname>-<UTC timestamp>.tar.gz`` and
    filled by ``collect``. Afterwards the directory entries are sorted by
    name and all but the last ``_MAX_ARCHIVES`` are removed.

    :param treadmill_root:
        Treadmill root directory; also passed to ``collect``.
    """
    stamp = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%SUTC')
    node = socket.gethostname()

    archive_dir = os.path.join(treadmill_root, 'postmortem')
    fs.mkdir_safe(archive_dir)

    archive_path = os.path.join(
        archive_dir, '{0}-{1}.tar.gz'.format(node, stamp))
    _LOGGER.info('Collection postmortem: %s', archive_path)

    with tarfile.open(archive_path, 'w:gz') as tar_handle:
        collect(treadmill_root, tar_handle)

    # File modes are only adjusted on POSIX hosts.
    if os.name == 'posix':
        os.chmod(archive_path, 0o644)

    # Everything in the directory is a pruning candidate; names are ordered
    # lexicographically and only the trailing _MAX_ARCHIVES entries survive.
    candidates = glob.glob(os.path.join(archive_dir, '*'))
    candidates.sort()
    stale = candidates[:-_MAX_ARCHIVES]
    for old_path in stale:
        _LOGGER.info('Removing old archive: %s', old_path)
        fs.rm_safe(old_path)
def _cleanup(tm_env, zkclient, container_dir, app):
    """Cleanup a container that actually ran.

    Steps, in order: kill processes under the container root, archive the
    root filesystem, delete the localdisk volume, clean up the network
    (unless the app uses a shared network), flush and copy the metrics RRD
    file, delete the cgroup resources, archive local logs, and finally tar
    the container directory and send the resulting archive.

    Archiving, log storing and the final tar/send are best-effort: their
    failures are logged and do not abort the cleanup. When deleting the
    localdisk volume or the cgroup resources, an ``IOError``/``OSError``
    with ``errno.ENOENT`` is ignored; any other error propagates.

    :param tm_env:
        Environment object; this function uses its ``svc_cgroup``,
        ``svc_localdisk`` and ``svc_network`` service handles and its
        ``metrics_dir`` attribute.
    :param zkclient:
        Passed through to ``_send_container_archive``.
    :param container_dir:
        Container directory; the ``root``, ``cgroups``, ``localdisk`` and
        ``network`` sub-paths are derived from it and the archive is
        written inside it.
    :param app:
        Application manifest object; this function reads ``name``,
        ``app``, ``task``, ``uniqueid``, ``archive`` and
        ``shared_network``.
    """
    # Too many branches.
    #
    # pylint: disable=R0912

    rootdir = os.path.join(container_dir, 'root')
    # Generate a unique name for the app
    unique_name = appmgr.app_unique_name(app)
    # Create service clients, each rooted in its own sub-directory of the
    # container directory.
    cgroup_client = tm_env.svc_cgroup.make_client(
        os.path.join(container_dir, 'cgroups'))
    localdisk_client = tm_env.svc_localdisk.make_client(
        os.path.join(container_dir, 'localdisk'))
    network_client = tm_env.svc_network.make_client(
        os.path.join(container_dir, 'network'))

    # Make sure all processes are killed
    # FIXME(boysson): Should we use `kill_apps_in_cgroup` instead?
    _kill_apps_by_root(rootdir)

    # Setup the archive filename that will hold this container's data.
    # The timestamp includes microseconds (%f).
    filetime = utils.datetime_utcnow().strftime('%Y%m%d_%H%M%S%f')
    archive_filename = os.path.join(
        container_dir,
        '{instance_name}_{hostname}_{timestamp}.tar'.format(
            instance_name=appmgr.appname_task_id(app.name),
            hostname=sysinfo.hostname(),
            timestamp=filetime))

    # Tar up container root filesystem if archive list is in manifest.
    # Best-effort: every failure here is logged and cleanup continues.
    try:
        localdisk = localdisk_client.get(unique_name)
        fs.archive_filesystem(localdisk['block_dev'], rootdir,
                              archive_filename, app.archive)
    except services.ResourceServiceError:
        _LOGGER.warning('localdisk never allocated')
    except subprocess.CalledProcessError:
        _LOGGER.exception('Unable to archive root device of %r', unique_name)
    except:  # pylint: disable=W0702
        _LOGGER.exception('Unknown exception while archiving %r', unique_name)

    # Destroy the volume. A missing entry (ENOENT) is not an error.
    # NOTE(review): the value assigned to `localdisk` here is not used
    # afterwards in this function.
    try:
        localdisk = localdisk_client.delete(unique_name)
    except (IOError, OSError) as err:
        if err.errno == errno.ENOENT:
            pass
        else:
            raise

    # Network cleanup is skipped for apps that use a shared network.
    if not app.shared_network:
        _cleanup_network(tm_env, app, network_client)

    # Add metrics to archive: flush the app's RRD file, then copy it into
    # the container directory so it ends up in the tarball built below.
    rrd_file = os.path.join(
        tm_env.metrics_dir, 'apps',
        '{name}-{instanceid}-{uniqueid}.rrd'.format(
            name=app.app,
            instanceid=app.task,
            uniqueid=app.uniqueid,
        ))
    rrdutils.flush_noexc(rrd_file)
    _copy_metrics(rrd_file, container_dir)

    # Cleanup our cgroup resources. A missing entry (ENOENT) is not an error.
    try:
        cgroup_client.delete(unique_name)
    except (IOError, OSError) as err:
        if err.errno == errno.ENOENT:
            pass
        else:
            raise

    # Best-effort: a failure to store local logs is logged and ignored.
    try:
        _archive_logs(tm_env, container_dir)
    except Exception:  # pylint: disable=W0703
        _LOGGER.exception('Unexpected exception storing local logs.')

    # Append or create the tarball with folders outside of container
    # Compress and send the tarball to HCP
    # Best-effort: any failure is logged and swallowed.
    try:
        archive_filename = fs.tar(sources=container_dir,
                                  target=archive_filename,
                                  compression='gzip').name
        _send_container_archive(zkclient, app, archive_filename)
    except:  # pylint: disable=W0702
        _LOGGER.exception("Failed to update archive")