def __init__(self, query_id):
    """
    Set configuration options for the queries to ElasticSearch
    Establish connection to the server
    - query_id: (int) arbitrary identification number of the query
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Set query ID
    try:
        self.id = str(query_id)
    except ValueError as err:
        error_exit(self.log, err)

    try:
        # URL of the ElasticSearch instance
        self.servers = MainConf.get('elasticsearch', 'server_url').split(',')
        # Index parameters
        self.index = {
            'name': MainConf.get('elasticsearch', 'index_name'),
            'freq': MainConf.get('elasticsearch', 'index_freq'),
            'walltime': MainConf.get('elasticsearch', 'max_walltime'),
        }
    except KeyError as err:
        error_exit(self.log, err)

    # Default field to retrieve and format of timestamps
    self.fields = ['@timestamp']
    self.timeformat = '%Y-%m-%dT%H:%M:%S.%fZ'

    try:
        self.client = Elasticsearch(hosts=self.servers)
        self.search = Search(using=self.client)
        es_cluster = self.client.cluster.health()
    except ConnectionTimeout as err:
        # Catch timeouts before the broader connection errors (ConnectionTimeout is a ConnectionError)
        error_exit(
            self.log, f"ES query [{self.id}] connection to ElasticSearch server timed out: {err}"
        )
    except (ConnectionError, TransportError) as err:
        error_exit(
            self.log, f"ES query [{self.id}] connection to ElasticSearch server failed: {err}"
        )
    else:
        dbgmsg = "ES query [%s] connection established with ES cluster: %s"
        self.log.debug(dbgmsg, self.id, es_cluster['cluster_name'])
        self.log.debug("ES query [%s] status of ES cluster is %s", self.id, es_cluster['status'])
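# Illustrative sketch (not part of this module): the constructor above reads the keys
# server_url, index_name, index_freq and max_walltime from an [elasticsearch] section of
# the main configuration. The values below are made-up examples, only the key names are
# taken from the code:
#
#   [elasticsearch]
#   server_url = https://es-node1:9200,https://es-node2:9200
#   index_name = torque-*
#   index_freq = 1d
#   max_walltime = 72:00:00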
def init_db_cache(self):
    """
    Return an empty cache with a 'db' placeholder and default metadata
    """
    try:
        valid_days = MainConf.get_digit('userdb', 'default_valid_days', fallback=30, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        empty_db = {'valid_days': valid_days, 'db': dict()}
        self.log.info("Initialized empty database of users with a validity of %s days", valid_days)
        return empty_db
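# For reference, a minimal sketch of the structure returned by init_db_cache()
# (30 is the fallback for 'default_valid_days'; the nested 'db' dict is filled later
# with one entry per user):
#
#   {'valid_days': 30, 'db': {}}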
def load_db_cache(self):
    """
    Read contents of the user database from the local cache file
    If the cache does not exist, initialize it
    """
    # Use local cache file defined in configuration
    cache_file = MainConf.get('userdb', 'cache_file', fallback='userdb-cache.json', mandatory=False)
    cache = DataFile(cache_file, mandatory=False)

    if hasattr(cache, 'contents'):
        self.log.debug("Database of users populated from local cache")
    else:
        self.log.warning("Database of users not found in local cache: %s", cache.datafile)
        cache.contents = self.init_db_cache()

    return cache
def main():
    # Core command line arguments
    cli_core = argparse.ArgumentParser(prog='accounting-report', add_help=False)
    cli_core.add_argument(
        '-v', '--version', action='version', version='%(prog)s from vsc-accounting-brussel v{}'.format(VERSION)
    )
    cli_core.add_argument(
        '-d', dest='debug', help='use debug log level', required=False, action='store_true'
    )
    cli_core.add_argument(
        '-i',
        dest='force_install',
        help='force (re)installation of any data files needed from package resources',
        required=False,
        action='store_true',
    )
    cli_core.add_argument(
        '-c',
        dest='config_file',
        help='path to configuration file (default: ~/.config/vsc-accounting/vsc-accounting.ini)',
        default='vsc-accounting.ini',
        required=False,
    )
    cli_core_args, cli_extra_args = cli_core.parse_known_args()

    # Debug level logs
    if cli_core_args.debug:
        fancylogger.setLogLevelDebug()
        logger.debug("Switched logging to debug verbosity")

    # Load configuration
    MainConf.load(cli_core_args.config_file)

    # Enforce (re)installation of data files
    if cli_core_args.force_install:
        dataparser.FORCE_INSTALL = True

    # Read nodegroup specs and default values
    try:
        nodegroups_spec = MainConf.get('nodegroups', 'specsheet')
        nodegroups_default = MainConf.get('nodegroups', 'default').split(',')
    except KeyError as err:
        error_exit(logger, err)
    else:
        nodegroups = DataFile(nodegroups_spec).contents

    # Reporting command line arguments
    cli = argparse.ArgumentParser(
        description='Generate accurate accounting reports about the computational resources used in an HPC cluster',
        parents=[cli_core],
    )
    cli.add_argument(
        '-s',
        dest='start_date',
        help='data retrieved from START_DATE [YYYY-MM-DD] at 00:00',
        required=True,
        type=valid_isodate,
    )
    cli.add_argument(
        '-e',
        dest='end_date',
        help='data retrieved until END_DATE [YYYY-MM-DD] at 00:00 (default: today)',
        default=date.today(),
        required=False,
        type=valid_isodate,
    )
    cli.add_argument(
        '-r',
        dest='resolution',
        help='time resolution of the accounting (default: day)',
        choices=['year', 'quarter', 'month', 'week', 'day'],
        default='day',
        required=False,
    )
    cli.add_argument(
        '-f',
        dest='report_format',
        help='format of the report document (default: SVG)',
        choices=['html', 'pdf', 'png', 'svg'],
        default='svg',
        required=False,
    )
    cli.add_argument(
        '-t', dest='csv', help='write report data table in a CSV file', required=False, action='store_true'
    )
    cli.add_argument(
        '-o',
        dest='output_dir',
        help='path to store output files (default: current working directory)',
        default=None,
        required=False,
        type=valid_dirpath,
    )
    cli.add_argument(
        '-u',
        dest='compute_units',
        help='compute time units (default: corehours)',
        choices=['corehours', 'coredays'],
        default='corehours',
        required=False,
    )
    cli.add_argument(
        '-n',
        dest='node_groups',
        help='node groups to include in the accounting report',
        choices=[*nodegroups],
        nargs='*',
        default=nodegroups_default,
        required=False,
    )
    cli.add_argument(
        'reports',
        help='accounting reports to generate',
        choices=[
            'compute-time',
            'compute-percent',
            'running-jobs',
            'unique-users',
            'peruser-compute',
            'peruser-percent',
            'peruser-jobs',
            'perfield-compute',
            'perfield-percent',
            'perfield-jobs',
            'persite-compute',
            'persite-percent',
            'persite-jobs',
            'top-users',
            'top-users-percent',
            'top-fields',
            'top-fields-percent',
            'top-sites',
            'top-sites-percent',
        ],
        nargs='+',
    )

    # Read command line arguments
    cli_args = cli.parse_args()

    # Set absolute path of output directory
    if cli_args.output_dir:
        basedir = os.path.abspath(os.path.expanduser(cli_args.output_dir))
    else:
        basedir = os.getcwd()
    logger.debug("Output directory set to: %s", basedir)
    # Convert time resolution to pandas DateOffset format
    pd_date_offsets = {'day': 'D', 'week': 'W-MON', 'month': 'MS', 'quarter': 'QS', 'year': 'AS'}
    date_offset = pd_date_offsets[cli_args.resolution]

    # Selection of node groups
    nodegroup_list = list(set(cli_args.node_groups))  # go through a set to remove duplicates

    # Account compute time on each node group in the requested period
    ComputeTime = ComputeTimeCount(
        cli_args.start_date, cli_args.end_date, date_offset, compute_units=cli_args.compute_units
    )
    for ng in nodegroup_list:
        logger.info("Processing jobs on %s nodes...", ng)
        ComputeTime.add_nodegroup(ng, nodegroups[ng]['cores'], nodegroups[ng]['hosts'])

    # Colors of each nodegroup
    plot_colors = {ng: nodegroups[ng]['color'] for ng in nodegroup_list}

    # Generate requested accounting reports
    report_save = [basedir, cli_args.report_format, cli_args.csv]
    report_generators = {
        'compute-time': (report.compute_time, [ComputeTime, plot_colors] + report_save),
        'compute-percent': (report.compute_percent, [ComputeTime, plot_colors] + report_save),
        'running-jobs': (report.global_measure, [ComputeTime, 'Running Jobs', plot_colors] + report_save),
        'unique-users': (report.global_measure, [ComputeTime, 'Unique Users', plot_colors] + report_save),
        'peruser-compute': (report.aggregates, [ComputeTime, 'User', 'Compute', False, plot_colors] + report_save),
        'peruser-percent': (report.aggregates, [ComputeTime, 'User', 'Compute', True, plot_colors] + report_save),
        'peruser-jobs': (report.aggregates, [ComputeTime, 'User', 'Jobs', False, plot_colors] + report_save),
        'perfield-compute': (report.aggregates, [ComputeTime, 'Field', 'Compute', False, plot_colors] + report_save),
        'perfield-percent': (report.aggregates, [ComputeTime, 'Field', 'Compute', True, plot_colors] + report_save),
        'perfield-jobs': (report.aggregates, [ComputeTime, 'Field', 'Jobs', False, plot_colors] + report_save),
        'persite-compute': (report.aggregates, [ComputeTime, 'Site', 'Compute', False, plot_colors] + report_save),
        'persite-percent': (report.aggregates, [ComputeTime, 'Site', 'Compute', True, plot_colors] + report_save),
        'persite-jobs': (report.aggregates, [ComputeTime, 'Site', 'Jobs', False, plot_colors] + report_save),
        'top-users': (report.top_users, [ComputeTime, False] + report_save),
        'top-users-percent': (report.top_users, [ComputeTime, True] + report_save),
        'top-fields': (report.top_fields, [ComputeTime, False] + report_save),
        'top-fields-percent': (report.top_fields, [ComputeTime, True] + report_save),
        'top-sites': (report.top_sites, [ComputeTime, False] + report_save),
        'top-sites-percent': (report.top_sites, [ComputeTime, True] + report_save),
    }

    for requested_report in cli_args.reports:
        report_generators[requested_report][0](*report_generators[requested_report][1])
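# Illustrative invocation of the script above (dates, node group name and report names are
# example values; the available node groups actually come from the 'specsheet' data file):
#
#   accounting-report -s 2023-01-01 -e 2023-04-01 -r month -f pdf -n skylake compute-time top-users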
def __init__(self, date_start, date_end, date_freq, compute_units='corehours'):
    """
    Initialize data frames for the provided period of time
    - date_start, date_end: (date) limits of the period of time
    - date_freq: (string) pandas date offset alias defining the frequency of time entries
    - compute_units: (string) units used to account compute time
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Set global compute units and save them in here
    ComputeUnits.set_units(compute_units)
    self.compute_units = ComputeUnits.active_units

    # Use global date format
    self.dateformat = DATE_FORMAT

    # Set range of dates
    try:
        self.dates = self.set_dates(date_start, date_end, date_freq)
    except ValueError as err:
        error_exit(self.log, err)

    # Set number of procs for parallel processing from configuration file
    try:
        self.max_procs = MainConf.get_digit('nodegroups', 'max_procs', fallback=None, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        self.log.debug("Maximum number of processors set to %s", self.max_procs)

    # Specifications of each group of nodes
    self.NG = dict()

    # Index both dates and nodegroups (empty unless nodegroups are added)
    self.index = pd.MultiIndex.from_product([self.dates, []], names=['date', 'nodegroup'])

    # Compute time indexing both dates and nodegroups
    self.GlobalStats = pd.DataFrame(
        columns=['capacity', 'compute_time', 'running_jobs', 'unique_users'], index=self.index
    )

    # Aggregate stats (columns are dynamically added for each section)
    for section in ['User', 'Field', 'Site']:
        setattr(self, section + 'List', set())
        setattr(self, section + 'Compute', pd.DataFrame({}, index=self.index))
        setattr(self, section + 'Jobs', pd.DataFrame({}, index=self.index))

    # User account data
    self.UserAccounts = pd.DataFrame(columns=['user', 'field', 'site', 'updated'])
    self.UserAccounts = self.UserAccounts.set_index('user')

    self.log.debug("Global and aggregate data structures initialized")
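# Minimal usage sketch (example values, not taken from this file): main() builds this object
# with a pandas offset alias as date_freq, e.g. 'MS' for monthly entries, and then registers
# node groups from the spec sheet. The cores/hosts values below are made up:
#
#   ComputeTime = ComputeTimeCount(date(2023, 1, 1), date(2023, 4, 1), 'MS', compute_units='coredays')
#   ComputeTime.add_nodegroup('skylake', 28, 'node3*.cluster')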
def __init__(self, title, table, ymax=None, colors=None, legend=None):
    """
    Initialize plot including axes, labels and legend
    > Plot object (matplotlib) is accessible in self.fig and self.ax
    > HTML page (beautifulsoup) is accessible in self.html_page
    - title: (string) main title of the plot
    - table: (DataFrame) data source for the plot
    - ymax: (numeric) maximum value of the Y axis
    - colors: (list of strings) color codes for each plot element
    - legend: (list of strings) alternative text elements of the legend
    Note: No default render() function defined. It is declared on child classes depending on the plot type.
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Plot title
    try:
        cluster_name = MainConf.get('nodegroups', 'cluster_name')
    except KeyError as err:
        error_exit(self.log, err)
    else:
        self.title = f"{cluster_name}: {title}"

    # General plot format settings
    format_configs = dict()
    for format_config in ['plot_dpi', 'plot_fontsize']:
        try:
            format_value = MainConf.get_digit('reports', format_config)
        except (KeyError, ValueError) as err:
            error_exit(self.log, err)
        else:
            format_configs.update({format_config: format_value})

    # Font sizes are relative to 'plot_fontsize' configuration
    format_fontsize_mod = {
        'axes.titlesize': 4,
        'axes.labelsize': 0,
        'xtick.labelsize': -2,
        'ytick.labelsize': -2,
        'legend.fontsize': -4,
    }
    format_params = {fp: format_configs['plot_fontsize'] + fmod for fp, fmod in format_fontsize_mod.items()}
    # Add DPI setting
    format_params.update({'figure.dpi': format_configs['plot_dpi']})
    # Apply formats globally
    plt.rcParams.update(format_params)
    self.log.debug("Plot formatting set successfully: %s", format_params)

    # Make local copy of data for the plot
    try:
        self.check_df(table)
    except TypeError as err:
        error_exit(self.log, err)
    else:
        self.table = table.copy()
        self.log.debug("Plot data table copied successfully")

    # Plot date range
    if 'date' in self.table.index.names:
        dateidx = self.table.index.get_level_values('date').unique()
        self.datelim = (dateidx[0].date(), dateidx[-1].date())
        self.log.debug("Plot date range: %s to %s", *self.datelim)
    else:
        self.datelim = None

    # Plot measure is first column in index level 0
    if table.columns.nlevels > 1:
        self.ylab = self.table.columns.get_level_values(0)[0]
    else:
        self.ylab = self.table.columns[0]

    # Y axis scale and labels
    self.ymax = ymax
    self.yunits = re.search(r'\((.*?)\)', self.ylab)
    if self.yunits:
        self.yunits = self.yunits.group(1)

    # X axis labels
    self.xfreq = self.date_freq()
    self.xlab = f"Date ({self.xfreq})"
    self.log.debug("Plot labels: [X] %s [Y] %s", self.xlab, self.ylab)

    # Plot legend
    self.colors = colors
    self.legend = legend

    # Set plot ID from plot title plus index interval
    self.set_id()

    # Make the plot
    self.render()
    self.set_xaxis()
    self.set_yaxis()
    self.add_legend()
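# Worked example of the font sizing above, taking plot_fontsize = 12 as an illustrative value:
# 'axes.titlesize' -> 16, 'axes.labelsize' -> 12, 'xtick.labelsize'/'ytick.labelsize' -> 10,
# 'legend.fontsize' -> 8, plus 'figure.dpi' taken verbatim from the 'plot_dpi' setting.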
def html_dataframe(self, table):
    """
    Format DataFrame into an HTML table, generating a complete HTML document
    - table: (DataFrame) source data for the HTML table
    """
    # Work on a local copy of data table
    table = table.copy()

    # Format any Datetime indexes to ISO format
    for level in range(table.index.nlevels):
        idx = table.index.unique(level=level)
        if isinstance(idx, pd.DatetimeIndex):
            idx = idx.strftime('%Y-%m-%d')
            if table.index.nlevels > 1:
                table.index = table.index.set_levels(idx, level=level)
            else:
                table = table.set_index(idx)
            self.log.debug("HTML page: dates in index formatted in ISO format")

    # CSS style: take from file defined in configuration
    table_css_file = MainConf.get(
        'reports', 'html_table_cssfile', fallback='html_table_style.json', mandatory=False
    )
    table_css = DataFile(table_css_file, mandatory=True).contents
    self.log.debug("HTML page: added style rules to table from file: %s", table_css_file)

    # CSS style: table zebra pattern
    zebra_bg = ('background', 'whitesmoke')
    if table.index.nlevels == 1:
        # Intermittent shading of single rows
        zebra_css = [{'selector': 'tbody tr:nth-of-type(odd)', 'props': [zebra_bg]}]
        self.log.debug("HTML page: applied zebra shading to every other row")
    else:
        # Intermittent shading of all rows belonging to each element in root index level
        rows = np.prod([len(level) for level in table.index.levels[1:]])
        zebra_css = [
            {'selector': f"tbody tr:nth-of-type({rows * 2}n-{shift})", 'props': [zebra_bg]}
            for shift in range(rows)
        ]
        self.log.debug("HTML page: applied zebra shading to every %s rows", rows)
    table_css.extend(zebra_css)

    # Delete names of each index level as it adds a second TH row
    table.index.names = [None for name in table.index.names]
    # Delete names of each column level as those would also be printed along the column headers
    table.columns.names = [None for name in table.columns.names]

    # Format numbers
    table_format = dict()
    for column in table.columns:
        # Use names from all column levels
        if table.columns.nlevels > 1:
            column_name = " ".join(column)
        else:
            column_name = column

        if re.search(r'\(coredays.*\)', column_name):
            table_format.update({column: '{:.1f}'})
        elif re.search(r'\(.*%\)', column_name):
            table_format.update({column: '{:.2%}'})
        elif re.search(r'\(.*\)', column_name):
            # by default display data with units as integers
            table_format.update({column: '{:.0f}'})
        else:
            # data without units are treated as is
            table_format.update({column: '{}'})
    self.log.debug("HTML page: number formatting set per column of table to %s", table_format)

    # Get extra padding from configuration setting
    try:
        column_xtrlen = MainConf.get_digit('reports', 'html_table_extrapadding', fallback=2, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        self.log.debug("HTML page: table cells extra padding set to %s", column_xtrlen)

    # Set lengths for each column based on formatted maximum value
    column_maxlen = [len(table_format[col].format(val)) for col, val in table.max(axis=0).to_dict().items()]
    column_width = [
        {'selector': f".col{col}", 'props': [('width', f"{column_maxlen[col] + column_xtrlen}em")]}
        for col in range(table.shape[1])
    ]
    table_css.extend(column_width)
    self.log.debug("HTML page: table column widths adjusted to %s", column_width)

    # Heatmap for data corresponding with the plot
    if self.yunits == '%':
        # color grade all columns with percentual data
        unitlabel = f"({self.yunits})"
        if table.columns.nlevels > 1:
            graded_cols = [col for col in table.columns if unitlabel in ''.join(col)]
        else:
            graded_cols = [col for col in table.columns if unitlabel in col]
        self.log.debug("HTML page: color graded all columns in table")
    elif self.ylab in table.columns:
        # color grade columns with data of plot
        graded_cols = [self.ylab]
        self.log.debug("HTML page: color graded column '%s'", self.ylab)
    else:
        graded_cols = None
        self.log.debug("HTML page: no color grading applied")

    # Data table printout
    table_styled = table.style.format(table_format).set_table_styles(table_css)
    self.log.debug("HTML page: table CSS style applied")

    if graded_cols:
        # Note: background_gradient accepts axis=None in pandas 0.25 and vmax in pandas 1.0
        # .background_gradient(cmap='YlGnBu', axis=None, subset=dataframe_slice, vmax=num)
        table_styled = table_styled.background_gradient(cmap='YlGnBu', axis='index', subset=graded_cols)
        self.log.debug("HTML page: table color gradient applied")

    table_html = table_styled.render()

    # Parse table html
    table_soup = BeautifulSoup(table_html, 'lxml')

    # Merge cells with equal total values for all nodegroups
    th0 = table_soup.tbody.select('th.row_heading.level0')
    rowspan = int(th0[0]['rowspan']) if th0[0].has_attr('rowspan') else 1
    ngtotals = [f"col{col}" for col, name in enumerate(table.columns) if 'Total' in name]

    # Only proceed if level 0 index has rowspan and columns named 'Total' exist
    if rowspan > 1 and len(ngtotals) > 0:
        for ngtotal in ngtotals:
            column_total = table_soup.tbody.find_all('td', ngtotal)
            # Check if values in first group of rows are equal (assumes same topology across the column)
            firstrow = [cell.string for cell in column_total[0:rowspan]]
            if all(cell == firstrow[0] for cell in firstrow):
                # Add rowspan to each top cell
                for row in range(0, len(column_total), rowspan):
                    column_total[row]['rowspan'] = rowspan
                    # Delete redundant cells
                    for span in range(1, rowspan):
                        column_total[row + span].decompose()
                self.log.debug("HTML page: cells in column '%s' merged successfully", ngtotal[3:])

    return table_soup
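# Worked example of the zebra shading above: with a two-level index whose inner level spans
# 3 rows per top-level entry, rows = 3 and the generated selectors are
# 'tbody tr:nth-of-type(6n-0)', 'tbody tr:nth-of-type(6n-1)' and 'tbody tr:nth-of-type(6n-2)',
# i.e. every other group of 3 rows gets the 'whitesmoke' background.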
def html_makepage(self, plot_title=None, plot_notes=None):
    """
    Generate HTML document from scratch with plot image and store it in self.html_page
    - plot_title: (string) alternative title for the plot
    - plot_notes: (list of string) optional text to add below plot_title
    """
    # Path to image file of the plot
    # Use SVG file for better scaling quality
    try:
        img_source_path = self.output_path['svg']
    except AttributeError as err:
        errmsg = "Path to plot render for HTML page not found. Method self.set_output_paths() not called yet."
        error_exit(self.log, errmsg)

    # Main titles
    page_title = self.title
    if self.datelim and plot_title is None:
        plot_title = "Graph from {} to {}".format(*self.datelim)

    # Head and title of the HTML page
    head = "<head><meta /><title>{}</title></head>".format(page_title)
    page = BeautifulSoup(head, 'lxml')
    page.insert(0, Doctype('html'))
    page.html['lang'] = 'en'
    page.head.meta['charset'] = 'utf-8'

    # CSS style: take from file defined in configuration
    html_css_file = MainConf.get('reports', 'html_main_cssfile', fallback='html_main_style.html', mandatory=False)
    css_style = DataFile(html_css_file, mandatory=True).contents
    page.head.append(css_style.find('style'))
    self.log.debug("HTML page: added CSS style from file: %s", html_css_file)

    # Body and main title
    newobj = page.html
    for tag in ['body', 'h1']:
        newtag = page.new_tag(tag)
        newobj.append(newtag)
        newobj = newobj.contents[-1]
    page.h1.string = page_title

    # Render plot in SVG format
    img_block = page.new_tag('div')
    img_block['class'] = 'blockcard'

    if plot_title is not None:
        img_block.append(page.new_tag('h2'))
        img_block.h2.string = plot_title
        self.log.debug("HTML page: plot sub-title added")

    if plot_notes is not None:
        if not isinstance(plot_notes, list):
            plot_notes = [plot_notes]
        for note in plot_notes:
            img_block.append(page.new_tag('p'))
            p_block = img_block.contents[-1]
            p_block.string = note
        self.log.debug("HTML page: %s notes added", len(plot_notes))

    img_block.append(page.new_tag('img'))
    img_block.img['class'] = 'plotrender'
    img_block.img['src'] = img_source_path
    img_block.img['alt'] = self.title
    page.body.append(img_block)
    self.log.info("HTML page: plot render '%s' added to report page", img_block.img['src'])

    # Render container for tables
    tables_block = page.new_tag('div')
    tables_block['id'] = 'tablescontainer'
    tables_block['class'] = 'blockcard'
    page.body.append(tables_block)

    self.html_page = page
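# For orientation, the document produced above has roughly this skeleton (titles and the
# image file name are placeholders, not values from this module):
#
#   <!DOCTYPE html>
#   <html lang="en">
#     <head><meta charset="utf-8"/><title>Cluster: Report title</title><style>...</style></head>
#     <body>
#       <h1>Cluster: Report title</h1>
#       <div class="blockcard"><h2>Graph from ... to ...</h2><img class="plotrender" src="plot.svg" alt="..."/></div>
#       <div id="tablescontainer" class="blockcard"></div>
#     </body>
#   </html>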
def __init__(self, requested_users):
    """
    Generate database with user account data
    - requested_users: (iterable) list of usernames to include in the database
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Set number of procs for parallel processing from configuration file
    try:
        self.max_procs = MainConf.get_digit('nodegroups', 'max_procs', fallback=None, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        self.log.debug("Maximum number of processors set to %s", self.max_procs)

    # Check requested list of users
    try:
        self.users = list(requested_users)
    except TypeError as err:
        errmsg = f"Cannot generate user database from non-iterable user list: {requested_users}"
        error_exit(self.log, errmsg)

    # Get token from configuration file to access VSC account page
    TokenConfig = ConfigFile()
    try:
        vsc_token_file = MainConf.get('userdb', 'vsc_token_file', fallback='vsc-access.ini', mandatory=False)
        self.vsc_token = TokenConfig.load(vsc_token_file).get('MAIN', 'access_token')
    except KeyError as err:
        error_exit(self.log, err)

    # Load user database from local cache
    self.cache = self.load_db_cache()

    # Update list of users with their records in the cache
    for n, user in enumerate(self.users):
        if user in self.cache.contents['db']:
            self.users[n] = (user, self.cache.contents['db'][user])
        else:
            self.users[n] = (user, None)

    # Retrieve account data of requested users
    self.log.info("Retrieving %s user account records...", len(self.users))
    requested_records = parallel_exec(
        get_updated_record,  # worker function
        "User account retrieval",  # label prefixing log messages
        self.users,  # stack of items to process
        self.cache.contents['valid_days'],  # record_validity: forwarded to worker function
        self.vsc_token,  # vsc_token: forwarded to worker function
        procs=self.max_procs,
        logger=self.log,
    )

    # Generate dict of user accounts and update cache
    self.records = dict()
    for user_record in requested_records:
        self.records.update(user_record)
        self.cache.contents['db'].update(user_record)

    # Save local cache
    self.cache.save_data()
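# Sketch of the user/record pairing built above (usernames and record fields are made-up
# examples; the exact record layout is defined by the worker function and the cache):
#
#   self.users = [
#       ('vsc10001', {'field': 'Chemistry', 'site': 'Brussels', 'updated': '2023-01-15'}),  # found in cache
#       ('vsc10002', None),  # not in cache, record fetched from the VSC account page
#   ]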