def resource_usage(bcbio_log, cluster, rawdir, verbose):
    """Collect per-host collectl statistics for a bcbio run.

    Every ``*.raw.gz`` file in *rawdir* that ``rawfile_within_timeframe``
    accepts for the time frame parsed from *bcbio_log* is loaded with
    ``load_collectl``. Files that yield no rows are skipped. Data from
    several files belonging to the same host is concatenated in sorted
    filename order.

    :param bcbio_log: local path to the bcbio log file written by the run
    :param cluster: not used by this function
    :param rawdir: directory containing the raw collectl ``*.raw.gz`` files
    :param verbose: not used by this function
    :return: a tuple ``(data_frames, hardware_info, steps)`` where
        ``data_frames`` maps host name to the loaded data (presumably a
        :class:`pandas.DataFrame`, since it is passed to ``pd.concat``),
        ``hardware_info`` maps host name to the hardware description
        returned by ``load_collectl`` for the last non-empty file of that
        host, and ``steps`` is the ``steps`` attribute of the log time frame.
    :rtype: tuple
    """
    window = log_time_frame(bcbio_log)
    frames_by_host = {}
    hardware_by_host = {}

    raw_names = [
        name for name in sorted(os.listdir(rawdir))
        if name.endswith('.raw.gz')
    ]

    for raw_name in raw_names:
        # Only load files whose name falls inside the sampling time range
        # gathered from the bcbio log.
        if not rawfile_within_timeframe(raw_name, window):
            continue

        samples, hardware = load_collectl(
            os.path.join(rawdir, raw_name), window.start, window.end)

        # Nothing recorded inside the window for this file: ignore it.
        # (len() is used on purpose; DataFrame truthiness is ambiguous.)
        if len(samples) == 0:
            continue

        # The host name is the file name minus its "-YYYYMMDD-HHMMSS.raw.gz"
        # suffix.
        host = re.sub(r'-\d{8}-\d{6}\.raw\.gz$', '', raw_name)
        hardware_by_host[host] = hardware

        if host in frames_by_host:
            frames_by_host[host] = pd.concat([frames_by_host[host], samples])
        else:
            frames_by_host[host] = samples

    return (frames_by_host, hardware_by_host, window.steps)
def _save_host_graph(graph, outdir, host, suffix):
    """Write *graph*'s figure to ``<outdir>/<host>_<suffix>.png`` and close it."""
    graph.get_figure().savefig(
        os.path.join(outdir, '{}_{}.png'.format(host, suffix)),
        bbox_inches='tight', pad_inches=0.25)
    pylab.close()


def _device_names(df, prefixes):
    """Return the set of device names (text before the first ``_``) among
    the series of *df* whose name starts with one of *prefixes*."""
    return {
        series.split('_')[0]
        for series in df.keys()
        if series.startswith(prefixes)
    }


def generate_graphs(collectl_datadir, bcbio_log_path, outdir, verbose=False):
    """Generate all graphs for a bcbio run.

    Reads the step timings from the bcbio log, loads every ``*.raw.gz``
    collectl file in *collectl_datadir* restricted to the time span between
    the earliest and latest timing key, groups the data per host and writes
    five PNG files per host into *outdir*: ``<host>_cpu.png``,
    ``<host>_net_bytes.png``, ``<host>_net_pkts.png``, ``<host>_memory.png``
    and ``<host>_disk_io.png``.

    :param collectl_datadir: directory containing raw collectl ``*.raw.gz``
        files
    :param bcbio_log_path: path to the bcbio log file to read timings from
    :param outdir: directory the PNG graphs are written to
    :param verbose: when true, print progress messages
    :raises ValueError: if no timings could be read from the bcbio log
    """
    if verbose:
        print('Reading timings from bcbio log...')
    steps = get_bcbio_timings(bcbio_log_path)
    if not steps:
        # min()/max() would raise an opaque ValueError on an empty mapping;
        # keep the exception type but explain what went wrong.
        raise ValueError(
            'No timings found in bcbio log {}'.format(bcbio_log_path))
    start_time = min(steps.keys())
    end_time = max(steps.keys())

    if verbose:
        print('Parsing performance data...')

    dfs = {}
    hardware_info = {}
    for item in sorted(os.listdir(collectl_datadir)):
        if not item.endswith('.raw.gz'):
            continue

        df, hardware = load_collectl(
            os.path.join(collectl_datadir, item), start_time, end_time)
        # Skip files with no samples inside the run's time span.
        if len(df) == 0:
            continue

        # Host name is the file name minus "-YYYYMMDD-HHMMSS.raw.gz".
        host = re.sub(r'-\d{8}-\d{6}\.raw\.gz$', '', item)
        hardware_info[host] = hardware
        if host in dfs:
            dfs[host] = pd.concat([dfs[host], df])
        else:
            dfs[host] = df

    # dict.items() works on both Python 2 and 3; dict.iteritems() was
    # removed in Python 3.
    for host, df in dfs.items():
        if verbose:
            print('Generating CPU graph for {}...'.format(host))
        graph = graph_cpu(df, steps, hardware_info[host]['num_cpus'])
        _save_host_graph(graph, outdir, host, 'cpu')

        ifaces = _device_names(df, ('eth', 'ib'))

        if verbose:
            print('Generating network graphs for {}...'.format(host))
        graph = graph_net_bytes(df, steps, ifaces)
        _save_host_graph(graph, outdir, host, 'net_bytes')

        graph = graph_net_pkts(df, steps, ifaces)
        _save_host_graph(graph, outdir, host, 'net_pkts')

        if verbose:
            print('Generating memory graph for {}...'.format(host))
        graph = graph_memory(df, steps, hardware_info[host]["memory"])
        _save_host_graph(graph, outdir, host, 'memory')

        if verbose:
            print('Generating storage I/O graph for {}...'.format(host))
        drives = _device_names(df, ('sd', 'vd', 'hd', 'xvd'))
        graph = graph_disk_io(df, steps, drives)
        _save_host_graph(graph, outdir, host, 'disk_io')