def html_addtable(self, table_data, table_title=None):
    """
    Convert DataFrame into an HTML table and add it to self.html_page
    - table_data: (DataFrame) source data for the HTML table
    - table_title: (string) title for the table in the HTML document
    """
    # Check provided data table
    try:
        self.check_df(table_data)
    except TypeError as err:
        error_exit(self.log, err)

    # Convert DataFrame to HTML
    table_page = self.html_dataframe(table_data)

    # Add CSS to HTML document
    table_css = table_page.head.style
    self.html_page.head.append(table_css)

    # Add table to HTML document
    table_container = self.html_page.find(id='tablescontainer')
    table_block = table_page.new_tag('div')
    table_block['class'] = 'tablecard'
    if table_title:
        table_block.append(table_page.new_tag('h2'))
        table_block.h2.string = table_title
    table_block.append(table_page.body.table)
    table_container.append(table_block)

    self.log.info("HTML page: data table added to report page")
def locate_config(self, configfile):
    """
    Determine location of config file, create it if it does not exist
    - configfile: (string) name of the config file
    """
    if os.path.isabs(configfile):
        # Use config file from absolute path
        self.usercfg = {
            'name': os.path.basename(configfile),
            'path': configfile,
        }
    elif configfile:
        # Locate config file and install it if necessary
        self.usercfg = {'name': configfile}
        # Check existence of config file in default directories
        tentative_configs = [
            os.path.join(confdir, self.usercfg['name']) for confdir in self.default_config_dirs
        ]
        existing_configs = [
            config_path for config_path in tentative_configs if os.path.isfile(config_path)
        ]
        if len(existing_configs) > 0:
            # Use config file from top hit
            self.usercfg.update({'path': existing_configs[0]})
            self.log.debug("Found existing configuration file: %s", self.usercfg['path'])
        else:
            # Install default config file in user's dir if config file is not found
            self.usercfg.update({'path': os.path.join(appdirs.user_config_dir(CONFIG_DIR), self.usercfg['name'])})
            self.copy_pkgdefault()
    else:
        error_exit(self.log, "Name of configuration file is needed")
def find_available_path(filepath):
    """
    Check if given path does exist. If path exists, generate a sensible
    variant that does not exist. Make parent folders if necessary.
    - filepath: (string) absolute path to an existing or non-existing file
    """
    try:
        check_abspath(filepath)
    except ValueError as err:
        error_exit(logger, err)

    # Make parent directories as needed
    parent_dir = os.path.dirname(filepath)
    make_dir(parent_dir)

    # Look for a file name that does not exist
    # Make variants of file name until we find one available
    replica = 0
    tentative_path = filepath
    while os.path.lexists(tentative_path):
        replica += 1
        if replica < 10000:
            # Prepend replica number to extension
            tentative_pathcut = filepath.split('.')
            tentative_pathcut.insert(-1, f"{replica:04}")
            tentative_path = '.'.join(tentative_pathcut)
        else:
            errmsg = f"Reached maximum number of replicas. Cannot find an available file name in path: {filepath}"
            raise FileExistsError(errmsg)

    logger.debug("Found available file name at path: %s", tentative_path)
    return tentative_path
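# A minimal usage sketch of find_available_path (the path is hypothetical):
# if '/tmp/report.csv' already exists, the replica number is inserted before
# the file extension until a free name is found.
#
#   find_available_path('/tmp/report.csv')
#   # -> '/tmp/report.0001.csv' if '/tmp/report.csv' exists,
#   #    '/tmp/report.0002.csv' if that one exists too, and so on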
def copy_file(source, destination, force=False):
    """
    Copy file from source to destination if file in destination is missing
    Returns success of the copy operation
    - source: (string) absolute path to source file
    - destination: (string) absolute path to destination file
    - force: (boolean) copy file regardless of existence of destination
    """
    for filepath in [source, destination]:
        try:
            check_abspath(filepath)
        except ValueError as err:
            error_exit(logger, err)

    # Copy files if destination does not exist or force is enabled
    if not os.path.exists(destination) or force:
        try:
            shutil.copyfile(source, destination)
        except FileNotFoundError:
            logger.warning("Copy failed due to missing file: %s", source)
            return False
        except PermissionError:
            error_exit(logger, f"Permission denied to copy file '{source}' to '{destination}'")
        else:
            logger.debug("File '%s' successfully copied to '%s'", source, destination)
            return True
    else:
        logger.debug("Nothing to copy, file already exists: %s", destination)
        return None
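# Usage sketch showing the three possible return values of copy_file (the
# paths are hypothetical): True for a successful copy, None when the
# destination already exists and force is off, False when the source is missing.
#
#   copy_file('/opt/app/spec.json', '/home/user/spec.json')                  # -> True (copied)
#   copy_file('/opt/app/spec.json', '/home/user/spec.json')                  # -> None (already there)
#   copy_file('/opt/app/missing.json', '/home/user/spec.json', force=True)   # -> False (source missing)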
def set_output_paths(self, savedir=None):
    """
    Set output directory and define the path of output files for this object
    Paths will be based on the object ID, which will be modified as necessary
    to avoid filename collisions
    WARNING: do not set these paths too much in advance of any write operation
    - savedir: (string) path to directory to save output data
    """
    # Set output directory
    if savedir is not None and check_dir(savedir):
        self.savedir = savedir
    else:
        self.savedir = os.getcwd()

    # Use ID to set default paths for each output file
    self.output_path = dict()
    output_exts = ['html', 'pdf', 'png', 'svg', 'csv']
    for ext in output_exts:
        try:
            filepath = os.path.join(self.savedir, f"{self.id}.{ext}")
            filepath = find_available_path(filepath)
        except FileExistsError as err:
            error_exit(self.log, err)
        else:
            self.output_path.update({ext: filepath})
            self.log.debug("Default output path for %s files set to %s", ext.upper(), self.output_path[ext])
def output_csv(self, table=None, filename=None):
    """
    Save data frame in CSV format to the default CSV output path
    - table: (DataFrame or Series) alternative source data to save in the CSV
    - filename: (string) alternative name of the CSV file
    """
    if not isinstance(table, (pd.DataFrame, pd.Series)):
        table = self.table.copy()

    if filename is None:
        csvpath = self.output_path['csv']
    else:
        try:
            csvpath = os.path.join(self.savedir, f"{filename}.csv")
            csvpath = find_available_path(csvpath)
        except FileExistsError as err:
            error_exit(self.log, err)
        else:
            self.log.debug("Using alternative path for CSV file output: %s", csvpath)

    # Add index names to the header row
    index_header = [idx.replace('_', ' ').title() for idx in table.index.names if idx]

    # Output data to CSV
    try:
        table.to_csv(csvpath, float_format='%.2f', header=True, index_label=index_header)
    except PermissionError:
        error_exit(self.log, f"Permission denied to save data in CSV format to {csvpath}")
    else:
        self.log.info(f"Data for '{self.title}' saved in CSV format to {csvpath}")
def output_html(self):
    """
    Save HTML document in self.html_page into a file
    """
    try:
        with open(self.output_path['html'], 'w') as htmlfile:
            htmlfile.write(self.html_page.prettify())
    except PermissionError:
        error_exit(self.log, f"Permission denied to write HTML file: {self.output_path['html']}")
    else:
        self.log.info(f"Report for '{self.title}' saved in HTML format to {self.output_path['html']}")
def parallel_exec(task, label, stack, *args, procs=None, logger=None, **kwargs):
    """
    Execute task on each item of stack in parallel
    Returns list with the resulting data
    - task: (method) function to pass to the parallel executor
    - label: (string) name of the task, used in log messages
    - stack: (iterable) list of items to be processed by task
    - procs: (int) number of processors
    - logger: (object) fancylogger object of the caller
    """
    if logger is None:
        logger = fancylogger.getLogger()

    # In Python 3.6, passing the logger to worker functions makes them non-picklable by ProcessPoolExecutor
    worker_logger = logger if sys.version_info >= (3, 7) else None

    data_collection = list()

    # Start process pool to execute all items in the stack
    with futures.ProcessPoolExecutor(max_workers=procs) as executor:
        task_pool = {
            executor.submit(task, item, *args, logger=worker_logger, **kwargs): item for item in stack
        }
        for pid, completed_task in enumerate(futures.as_completed(task_pool)):
            try:
                data_batch = completed_task.result()
            except futures.process.BrokenProcessPool:
                # In Python 3.8+ there is also the exception futures.BrokenExecutor to consider
                error_exit(logger, f"{label}: process pool executor failed")
            except futures.CancelledError:
                # Child processes will be cancelled if any ends in error. Ignore error.
                logger.debug(f"{label}: process {pid} cancelled successfully")
            except SystemExit as exc:
                if exc.code == 1:
                    # Child process ended in error. Cancel all remaining processes in the pool.
                    cancel_process_pool(task_pool, pid, logger)
                    # Abort execution
                    errmsg = f"{label}: process {pid} failed. Aborting!"
                    error_exit(logger, errmsg)
            else:
                # Add counters to list
                data_collection.append(data_batch)

    return data_collection
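# A minimal usage sketch of parallel_exec. The worker 'scale_item' is
# hypothetical; any picklable function that takes the stack item as first
# argument and accepts a 'logger' keyword fits this executor.
#
#   def scale_item(item, factor, logger=None):
#       return item * factor
#
#   results = parallel_exec(scale_item, 'item scaler', range(10), 2, procs=4)
#   # -> list with each item of range(10) multiplied by 2, in completion order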
def get_updated_record(user_record, record_validity, vsc_token, logger=None):
    """
    Return user record with up-to-date information
    First check local cache. If missing or outdated, check VSC account page
    - user_record: (tuple) username and its account record
    - record_validity: (int) number of days that user records are valid
    - vsc_token: (string) access token to VSC account page
    - logger: (object) fancylogger object of the caller
    """
    if logger is None:
        logger = fancylogger.getLogger()

    # Unpack user record
    (username, record_data) = user_record

    # Existing user
    if record_data:
        logger.debug(f"[{username}] user account record exists in local cache")
        try:
            # Calculate age of existing record
            # Once we can use Python 3.7+, the following can be replaced with date.fromisoformat()
            record_date = datetime.strptime(record_data['updated'], '%Y-%m-%d').date()
        except ValueError as err:
            errmsg = f"[{username}] user account record in local cache is malformed"
            error_exit(logger, errmsg)
        else:
            record_age = date.today() - record_date
            if record_age.days > record_validity:
                fresh_record = get_vsc_record(username, vsc_token)
                if fresh_record:
                    # Update outdated record with data from VSC account page
                    record_data.update(fresh_record)
                    logger.debug(f"[{username}] user account record updated from VSC account page")
                else:
                    # Account missing in VSC account page, keep existing record in our database
                    record_data['updated'] = date.today().isoformat()
    # New user
    else:
        # Retrieve full record from VSC account page
        record_data = get_vsc_record(username, vsc_token)
        if not record_data:
            # Generate a default record for users not present in VSC account page
            record_data = user_basic_record(username)
        logger.debug(f"[{username}] new user account registered as member of {record_data['site']}")

    return {username: record_data}
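# Sketch of the expected shape of a user record handled by get_updated_record,
# inferred from this function and the UserAccounts columns elsewhere in the
# code base; the username and values are hypothetical.
#
#   user_record = ('vsc10001', {
#       'field': 'Chemistry',      # research field
#       'site': 'VUB',             # home institution
#       'updated': '2020-05-04',   # ISO date of last refresh
#   })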
def save_json(self):
    """
    Save contents to data file in JSON format
    """
    try:
        with open(self.datafile, 'w', encoding='utf8') as jsonfile:
            json.dump(self.contents, jsonfile, indent=4, ensure_ascii=False)
    except FileNotFoundError:
        error_exit(self.log, f"Data file not found: {self.datafile}")
    else:
        self.log.debug("Data saved to file: %s", self.datafile)
        return True
def set_units(self, units):
    """
    Change active compute units
    """
    try:
        self.active_units = self.known_units[units]
    except KeyError as err:
        errmsg = f"Unknown compute units {units}: {err}"
        error_exit(self.log, errmsg)
    else:
        self.log.debug("Compute units set to '%s'", self.active_units['name'])
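# Hedged sketch of the structure self.known_units plausibly has, based on the
# keys used here and in job_seconds_to_compute ('name', 'factor', 'norm');
# the concrete factor values are assumptions, not taken from this code.
#
#   self.known_units = {
#       'corehours': {'name': 'corehours', 'factor': 3600, 'norm': False},
#       'coredays': {'name': 'coredays', 'factor': 86400, 'norm': False},
#   }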
def setattr(self, target_name, local_data):
    """
    Wrapper around setattr with error handling
    - target_name: (string) name of attribute in ComputeTimeCount
    - local_data: (object) data to be saved in target attribute
    """
    try:
        setattr(self, target_name, local_data)
    except AttributeError as err:
        errmsg = f"Attribute {target_name} could not be set in ComputeTimeCount object"
        error_exit(self.log, errmsg)
    else:
        return True
def getattr(self, target_name):
    """
    Wrapper around getattr with error handling
    Returns existing attribute in ComputeTimeCount
    - target_name: (string) name of attribute in ComputeTimeCount
    """
    try:
        target_attr = getattr(self, target_name)
    except AttributeError as err:
        errmsg = f"Attribute {target_name} not found in ComputeTimeCount object"
        error_exit(self.log, errmsg)
    else:
        return target_attr
def __init__(self, datafile, mandatory=True, force_install=None):
    """
    Determine location of data file and read its contents
    Data files from package resources will be copied to user's data dir if needed
    - datafile: (string) name of the data file or full path to data file
    - mandatory: (boolean) mandatory files must already exist in user's data dir (or be installed)
    - force_install: (boolean) force copy of data file from package resources, supersedes FORCE_INSTALL
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Fall back to FORCE_INSTALL if force_install is not set
    if force_install is None:
        force_install = FORCE_INSTALL
    if force_install:
        self.log.debug("Installation of data files is enforced")

    # Define paths holding package data files by order of preference
    # The 'data' folder in the package resources is set as a fallback location
    self.sys_data_dirs = (
        f'/etc/{DATA_DIR}',
        '/etc',
    )

    datafile = os.path.expanduser(datafile)

    if os.path.isabs(datafile):
        # Directly read data from absolute path
        self.datafile = datafile
        readable_file = True
    else:
        # Use datafile in user data directory
        self.datafile = os.path.join(appdirs.user_data_dir(DATA_DIR), datafile)
        # Copy data file from package contents (if it is missing in user's data dir or manually forced)
        # Failed copies are only fatal for mandatory data files
        try:
            self.install_pkgdata(datafile, force=force_install)
        except FileNotFoundError as err:
            readable_file = False
            if mandatory:
                error_exit(self.log, err)
        else:
            readable_file = True

    if readable_file:
        # Read contents of data file
        try:
            self.read_data()
        except ValueError as err:
            error_exit(self.log, err)
def read_html(self):
    """
    Return contents of HTML file
    """
    try:
        with open(self.datafile, 'r') as htmlfile:
            htmldump = htmlfile.read()
            htmldata = BeautifulSoup(htmldump, 'lxml')
    # There are no other exceptions to check: bs4 will make anything you throw at it HTML-compliant
    except FileNotFoundError:
        error_exit(self.log, f"Data file not found: {self.datafile}")
    else:
        self.log.debug("Data read from file: %s", self.datafile)
        return htmldata
def read_json(self):
    """
    Return contents of JSON file
    """
    try:
        with open(self.datafile, 'r') as jsonfile:
            jsondata = json.load(jsonfile)
    except FileNotFoundError:
        error_exit(self.log, f"Data file not found: {self.datafile}")
    except json.decoder.JSONDecodeError:
        error_exit(self.log, f"Data file in JSON format is malformed: {self.datafile}")
    else:
        self.log.debug("Data read from file: %s", self.datafile)
        return jsondata
def load(self, configfile):
    """
    Load contents of configuration file
    - configfile: (string) name or path of the config file
    """
    # Determine location of config file
    self.locate_config(configfile)

    # Read contents of config file
    self.opts = configparser.ConfigParser()
    try:
        self.read()
    except FileNotFoundError as err:
        error_exit(self.log, err)

    return self
def init_db_cache(self):
    """
    Returns empty cache with db placeholder and default meta data
    """
    try:
        valid_days = MainConf.get_digit('userdb', 'default_valid_days', fallback=30, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        empty_db = {'valid_days': valid_days, 'db': dict()}
        self.log.info("Initialized empty database of users with a validity of %s days", valid_days)
        return empty_db
def output_img(self, imgfmt='svg'):
    """
    Save plot image in 'imgfmt' format to the default output path
    The matplotlib object is closed after saving, as it is no longer needed and
    there is a limit on the number of plot objects that can be open at the same time
    - imgfmt: (string) file format of the image
    """
    # Work with lowercase format extensions
    imgfmt = imgfmt.lower()

    # Save image file
    try:
        self.fig.savefig(self.output_path[imgfmt], format=imgfmt, bbox_inches='tight')
    except PermissionError:
        error_exit(self.log, f"Permission denied to save plot render: {self.output_path[imgfmt]}")
    else:
        self.log.info(f"Report for '{self.title}' saved in {imgfmt.upper()} format to {self.output_path[imgfmt]}")

    # Delete plot render
    plt.close(self.fig)
def job_seconds_to_compute(self, job_time, used_cores, days):
    """
    Returns compute time per day using the active compute units
    Warning: this function is structured to work with individual variables,
    pd.Series or pd.DataFrames that contain the following numerical parameters
    - job_time: (float) real used time in seconds
    - used_cores: (int) number of cores used during job_time
    - days: (int) number of days (used in normalized units)
    """
    try:
        total_compute_units = job_time * used_cores / self.active_units['factor']
        if self.active_units['norm']:
            daily_compute_units = total_compute_units / days
        else:
            # Non-normalized units return the total compute time as-is
            daily_compute_units = total_compute_units
    except ValueError as err:
        error_exit(self.log, f"Compute time unit conversion to {self.active_units['name']} failed: {err}")
    else:
        return daily_compute_units
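# Worked example, assuming active units with factor=3600 (seconds per
# corehour) and norm=False: a job that ran 7200 seconds on 4 cores converts to
# 7200 * 4 / 3600 = 8.0 corehours. With norm=True and days=2, the result would
# instead be normalized to 4.0 corehours per day.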
def __init__(self, query_id):
    """
    Set configuration options for the queries to ElasticSearch
    Establish connection to the server
    - query_id: (int) arbitrary identification number of the query
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Set query ID
    try:
        self.id = str(query_id)
    except ValueError as err:
        error_exit(self.log, err)

    try:
        # URL of the ElasticSearch instance
        self.servers = MainConf.get('elasticsearch', 'server_url').split(',')
        # Index parameters
        self.index = {
            'name': MainConf.get('elasticsearch', 'index_name'),
            'freq': MainConf.get('elasticsearch', 'index_freq'),
            'walltime': MainConf.get('elasticsearch', 'max_walltime'),
        }
    except KeyError as err:
        error_exit(self.log, err)

    # Default field to retrieve and format of timestamps
    self.fields = ['@timestamp']
    self.timeformat = '%Y-%m-%dT%H:%M:%S.%fZ'

    try:
        self.client = Elasticsearch(hosts=self.servers)
        self.search = Search(using=self.client)
        es_cluster = self.client.cluster.health()
    except (ConnectionError, TransportError) as err:
        error_exit(self.log, f"ES query [{self.id}] connection to ElasticSearch server failed: {err}")
    except ConnectionTimeout:
        error_exit(self.log, f"ES query [{self.id}] connection to ElasticSearch server timed out")
    else:
        dbgmsg = "ES query [%s] connection established with ES cluster: %s"
        self.log.debug(dbgmsg, self.id, es_cluster['cluster_name'])
        self.log.debug("ES query [%s] status of ES cluster is %s", self.id, es_cluster['status'])
def aggregate_perdate(self, source, selection, destination=None):
    """
    Aggregate data in selected column per each date in time interval
    Add/update the aggregation to destination data frame as a new column prefixed with "total"
    - source: (string) name of ComputeTimeCounter attribute with the source data
    - selection: (string) name of column to aggregate
    - destination: (string) name of ComputeTimeCounter attribute to store aggregation
    """
    if not destination:
        destination = source

    source_data = self.getattr(source)
    dest_data = self.getattr(destination)

    # Execute aggregation per date
    try:
        aggregate = source_data.loc[:, selection].groupby('date').sum()
    except KeyError:
        errmsg = f"Aggregation per date failed: {selection} data not found in {source}"
        error_exit(self.log, errmsg)

    aggregate_name = 'total_{}'.format(selection)
    if aggregate_name in dest_data.columns:
        # Update existing data in destination
        dest_data.update(aggregate.rename(aggregate_name))
        aggregate_action = 'Updated'
    else:
        # Add aggregation as new data to destination
        dest_data = dest_data.join(aggregate.rename(aggregate_name))
        aggregate_action = 'Added'

    self.setattr(destination, dest_data)
    self.log.debug("%s aggregation of %s per date in %s successfully", aggregate_action, selection, destination)
    return True
def make_dir(dirpath):
    """
    Create directory in dir path if it does not exist
    - dirpath: (string) absolute path to directory
    """
    try:
        check_abspath(dirpath)
    except ValueError as err:
        error_exit(logger, err)

    try:
        os.makedirs(dirpath)
    except FileExistsError:
        if os.path.isdir(dirpath):
            logger.debug("Folder already exists: %s", dirpath)
            return False
        else:
            error_exit(logger, f"Path '{dirpath}' exists but is not a folder")
    except PermissionError:
        error_exit(logger, f"Permission denied to create folder: {dirpath}")
    else:
        logger.debug("Folder successfully created: %s", dirpath)
        return True
def main():
    # Core command line arguments
    cli_core = argparse.ArgumentParser(prog='accounting-report', add_help=False)
    cli_core.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s from vsc-accounting-brussel v{}'.format(VERSION),
    )
    cli_core.add_argument(
        '-d', dest='debug', help='use debug log level', required=False, action='store_true'
    )
    cli_core.add_argument(
        '-i',
        dest='force_install',
        help='force (re)installation of any data files needed from package resources',
        required=False,
        action='store_true',
    )
    cli_core.add_argument(
        '-c',
        dest='config_file',
        help='path to configuration file (default: ~/.config/vsc-accounting/vsc-accounting.ini)',
        default='vsc-accounting.ini',
        required=False,
    )
    cli_core_args, cli_extra_args = cli_core.parse_known_args()

    # Debug level logs
    if cli_core_args.debug:
        fancylogger.setLogLevelDebug()
        logger.debug("Switched logging to debug verbosity")

    # Load configuration
    MainConf.load(cli_core_args.config_file)

    # Enforce (re)installation of data files
    if cli_core_args.force_install:
        dataparser.FORCE_INSTALL = True

    # Read nodegroup specs and default values
    try:
        nodegroups_spec = MainConf.get('nodegroups', 'specsheet')
        nodegroups_default = MainConf.get('nodegroups', 'default').split(',')
    except KeyError as err:
        error_exit(logger, err)
    else:
        nodegroups = DataFile(nodegroups_spec).contents

    # Reporting command line arguments
    cli = argparse.ArgumentParser(
        description='Generate accurate accounting reports about the computational resources used in an HPC cluster',
        parents=[cli_core],
    )
    cli.add_argument(
        '-s',
        dest='start_date',
        help='data retrieved from START_DATE [YYYY-MM-DD] at 00:00',
        required=True,
        type=valid_isodate,
    )
    cli.add_argument(
        '-e',
        dest='end_date',
        help='data retrieved until END_DATE [YYYY-MM-DD] at 00:00 (default: today)',
        default=date.today(),
        required=False,
        type=valid_isodate,
    )
    cli.add_argument(
        '-r',
        dest='resolution',
        help='time resolution of the accounting (default: day)',
        choices=['year', 'quarter', 'month', 'week', 'day'],
        default='day',
        required=False,
    )
    cli.add_argument(
        '-f',
        dest='report_format',
        help='format of the report document (default: SVG)',
        choices=['html', 'pdf', 'png', 'svg'],
        default='svg',
        required=False,
    )
    cli.add_argument(
        '-t', dest='csv', help='write report data table in a CSV file', required=False, action='store_true'
    )
    cli.add_argument(
        '-o',
        dest='output_dir',
        help='path to store output files (default: current working directory)',
        default=None,
        required=False,
        type=valid_dirpath,
    )
    cli.add_argument(
        '-u',
        dest='compute_units',
        help='compute time units (default: corehours)',
        choices=['corehours', 'coredays'],
        default='corehours',
        required=False,
    )
    cli.add_argument(
        '-n',
        dest='node_groups',
        help='node groups to include in the accounting report',
        choices=[*nodegroups],
        nargs='*',
        default=nodegroups_default,
        required=False,
    )
    cli.add_argument(
        'reports',
        help='accounting reports to generate',
        choices=[
            'compute-time',
            'compute-percent',
            'running-jobs',
            'unique-users',
            'peruser-compute',
            'peruser-percent',
            'peruser-jobs',
            'perfield-compute',
            'perfield-percent',
            'perfield-jobs',
            'persite-compute',
            'persite-percent',
            'persite-jobs',
            'top-users',
            'top-users-percent',
            'top-fields',
            'top-fields-percent',
            'top-sites',
            'top-sites-percent',
        ],
        nargs='+',
    )

    # Read command line arguments
    cli_args = cli.parse_args()

    # Set absolute path of output directory
    if cli_args.output_dir:
        basedir = os.path.abspath(os.path.expanduser(cli_args.output_dir))
    else:
        basedir = os.getcwd()
    logger.debug("Output directory set to: %s", basedir)

    # Convert time resolution to pandas DateOffset format
    pd_date_offsets = {'day': 'D', 'week': 'W-MON', 'month': 'MS', 'quarter': 'QS', 'year': 'AS'}
    date_offset = pd_date_offsets[cli_args.resolution]

    # Selection of node groups
    nodegroup_list = list(set(cli_args.node_groups))  # go through a set to remove duplicates

    # Account compute time on each node group in the requested period
    ComputeTime = ComputeTimeCount(
        cli_args.start_date, cli_args.end_date, date_offset, compute_units=cli_args.compute_units
    )
    for ng in nodegroup_list:
        logger.info("Processing jobs on %s nodes...", ng)
        ComputeTime.add_nodegroup(ng, nodegroups[ng]['cores'], nodegroups[ng]['hosts'])

    # Colors of each nodegroup
    plot_colors = {ng: nodegroups[ng]['color'] for ng in nodegroup_list}

    # Generate requested accounting reports
    report_save = [basedir, cli_args.report_format, cli_args.csv]
    report_generators = {
        'compute-time': (report.compute_time, [ComputeTime, plot_colors] + report_save),
        'compute-percent': (report.compute_percent, [ComputeTime, plot_colors] + report_save),
        'running-jobs': (report.global_measure, [ComputeTime, 'Running Jobs', plot_colors] + report_save),
        'unique-users': (report.global_measure, [ComputeTime, 'Unique Users', plot_colors] + report_save),
        'peruser-compute': (report.aggregates, [ComputeTime, 'User', 'Compute', False, plot_colors] + report_save),
        'peruser-percent': (report.aggregates, [ComputeTime, 'User', 'Compute', True, plot_colors] + report_save),
        'peruser-jobs': (report.aggregates, [ComputeTime, 'User', 'Jobs', False, plot_colors] + report_save),
        'perfield-compute': (report.aggregates, [ComputeTime, 'Field', 'Compute', False, plot_colors] + report_save),
        'perfield-percent': (report.aggregates, [ComputeTime, 'Field', 'Compute', True, plot_colors] + report_save),
        'perfield-jobs': (report.aggregates, [ComputeTime, 'Field', 'Jobs', False, plot_colors] + report_save),
        'persite-compute': (report.aggregates, [ComputeTime, 'Site', 'Compute', False, plot_colors] + report_save),
        'persite-percent': (report.aggregates, [ComputeTime, 'Site', 'Compute', True, plot_colors] + report_save),
        'persite-jobs': (report.aggregates, [ComputeTime, 'Site', 'Jobs', False, plot_colors] + report_save),
        'top-users': (report.top_users, [ComputeTime, False] + report_save),
        'top-users-percent': (report.top_users, [ComputeTime, True] + report_save),
        'top-fields': (report.top_fields, [ComputeTime, False] + report_save),
        'top-fields-percent': (report.top_fields, [ComputeTime, True] + report_save),
        'top-sites': (report.top_sites, [ComputeTime, False] + report_save),
        'top-sites-percent': (report.top_sites, [ComputeTime, True] + report_save),
    }
    for requested_report in cli_args.reports:
        report_generators[requested_report][0](*report_generators[requested_report][1])
def html_dataframe(self, table):
    """
    Format DataFrame into an HTML table, generating a complete HTML document
    - table: (DataFrame) source data for the HTML table
    """
    # Work on a local copy of data table
    table = table.copy()

    # Format any Datetime indexes to ISO format
    for level in range(table.index.nlevels):
        idx = table.index.unique(level=level)
        if isinstance(idx, pd.DatetimeIndex):
            idx = idx.strftime('%Y-%m-%d')
            if table.index.nlevels > 1:
                table.index = table.index.set_levels(idx, level=level)
            else:
                table = table.set_index(idx)
            self.log.debug("HTML page: dates in index formatted in ISO format")

    # CSS style: take from file defined in configuration
    table_css_file = MainConf.get(
        'reports', 'html_table_cssfile', fallback='html_table_style.json', mandatory=False
    )
    table_css = DataFile(table_css_file, mandatory=True).contents
    self.log.debug(f"HTML page: added style rules to table from file: {table_css_file}")

    # CSS style: table zebra pattern
    zebra_bg = ('background', 'whitesmoke')
    if table.index.nlevels == 1:
        # Intermittent shading of single rows
        zebra_css = [{'selector': 'tbody tr:nth-of-type(odd)', 'props': [zebra_bg]}]
        self.log.debug("HTML page: applied zebra shading to every other row")
    else:
        # Intermittent shading of all rows belonging to each element in root index level
        rows = np.prod([len(level) for level in table.index.levels[1:]])
        zebra_css = [
            {'selector': f"tbody tr:nth-of-type({rows * 2}n-{shift})", 'props': [zebra_bg]}
            for shift in range(rows)
        ]
        self.log.debug("HTML page: applied zebra shading to every %s rows", rows)
    table_css.extend(zebra_css)

    # Delete names of each index level as they add a second TH row
    table.index.names = [None for name in table.index.names]
    # Delete names of each column level as those would also be printed along the column headers
    table.columns.names = [None for name in table.columns.names]

    # Format numbers
    table_format = dict()
    for column in table.columns:
        # Use names from all column levels
        if table.columns.nlevels > 1:
            column_name = " ".join(column)
        else:
            column_name = column
        if re.search(r'\(coredays.*\)', column_name):
            table_format.update({column: '{:.1f}'})
        elif re.search(r'\(.*%\)', column_name):
            table_format.update({column: '{:.2%}'})
        elif re.search(r'\(.*\)', column_name):
            # By default display data with units as integers
            table_format.update({column: '{:.0f}'})
        else:
            # Data without units are treated as-is
            table_format.update({column: '{}'})
    self.log.debug("HTML page: number formatting set per column of table to %s", table_format)

    # Get extra padding from configuration setting
    try:
        column_xtrlen = MainConf.get_digit('reports', 'html_table_extrapadding', fallback=2, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        self.log.debug("HTML page: table cells extra padding set to %s", column_xtrlen)

    # Set lengths for each column based on formatted maximum value
    column_maxlen = [len(table_format[col].format(val)) for col, val in table.max(axis=0).to_dict().items()]
    column_width = [
        {'selector': f".col{col}", 'props': [('width', f"{column_maxlen[col] + column_xtrlen}em")]}
        for col in range(table.shape[1])
    ]
    table_css.extend(column_width)
    self.log.debug("HTML page: table column widths adjusted to %s", column_width)

    # Heatmap for data corresponding with the plot
    if self.yunits == '%':
        # Color grade all columns with percentual data
        unitlabel = f"({self.yunits})"
        if table.columns.nlevels > 1:
            graded_cols = [col for col in table.columns if unitlabel in ''.join(col)]
        else:
            graded_cols = [col for col in table.columns if unitlabel in col]
        self.log.debug("HTML page: color graded all columns in table")
    elif self.ylab in table.columns:
        # Color grade columns with data of plot
        graded_cols = [self.ylab]
        self.log.debug("HTML page: color graded column '%s'", self.ylab)
    else:
        graded_cols = None
        self.log.debug("HTML page: no color grading applied")

    # Data table printout
    table_styled = table.style.format(table_format).set_table_styles(table_css)
    self.log.debug("HTML page: table CSS style applied")
    if graded_cols:
        # Note: background_gradient accepts axis=None in pandas 0.25 and vmax in pandas 1.0
        # .background_gradient(cmap='YlGnBu', axis=None, subset=dataframe_slice, vmax=num)
        table_styled = table_styled.background_gradient(cmap='YlGnBu', axis='index', subset=graded_cols)
        self.log.debug("HTML page: table color gradient applied")
    table_html = table_styled.render()

    # Parse table html
    table_soup = BeautifulSoup(table_html, 'lxml')

    # Merge cells with equal total values for all nodegroups
    th0 = table_soup.tbody.select('th.row_heading.level0')
    rowspan = int(th0[0]['rowspan']) if th0[0].has_attr('rowspan') else 1
    ngtotals = [f"col{col}" for col, name in enumerate(table.columns) if 'Total' in name]
    # Only proceed if level 0 index has rowspan and columns named 'Total' exist
    if rowspan > 1 and len(ngtotals) > 0:
        for ngtotal in ngtotals:
            column_total = table_soup.tbody.find_all('td', ngtotal)
            # Check if values in first group of rows are equal (assumes same topology across the column)
            firstrow = [cell.string for cell in column_total[0:rowspan]]
            if all(cell == firstrow[0] for cell in firstrow):
                # Add rowspan to each top cell
                for row in range(0, len(column_total), rowspan):
                    column_total[row]['rowspan'] = rowspan
                    # Delete redundant cells
                    for span in range(1, rowspan):
                        column_total[row + span].decompose()
                self.log.debug("HTML page: cells in column '%s' merged successfully", ngtotal[3:])

    return table_soup
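# Worked example of the zebra selectors built in html_dataframe above: with a
# two-level index holding 3 nodegroups per date, rows = 3 and the generated
# selectors are 'tbody tr:nth-of-type(6n-0)', '(6n-1)' and '(6n-2)', which
# together shade rows 4-6, 10-12, ... so consecutive dates alternate shading.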
def __init__(self, title, table, ymax=None, colors=None, legend=None):
    """
    Initialize plot including axes, labels and legend
    > Plot object (matplotlib) is accessible in self.fig and self.ax
    > HTML page (beautifulsoup) is accessible in self.html_page
    - title: (string) main title of the plot
    - table: (DataFrame) data source for the plot
    - ymax: (numeric) maximum value of the Y axis
    - colors: (list of strings) color codes for each plot element
    - legend: (list of strings) alternative text elements of the legend
    Note: no default render() function is defined here; it is declared in
    child classes depending on the plot type.
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Plot title
    try:
        cluster_name = MainConf.get('nodegroups', 'cluster_name')
    except KeyError as err:
        error_exit(self.log, err)
    else:
        self.title = f"{cluster_name}: {title}"

    # General plot format settings
    format_configs = dict()
    for format_config in ['plot_dpi', 'plot_fontsize']:
        try:
            format_value = MainConf.get_digit('reports', format_config)
        except (KeyError, ValueError) as err:
            error_exit(self.log, err)
        else:
            format_configs.update({format_config: format_value})

    # Font sizes are relative to 'plot_fontsize' configuration
    format_fontsize_mod = {
        'axes.titlesize': 4,
        'axes.labelsize': 0,
        'xtick.labelsize': -2,
        'ytick.labelsize': -2,
        'legend.fontsize': -4,
    }
    format_params = {fp: format_configs['plot_fontsize'] + fmod for fp, fmod in format_fontsize_mod.items()}
    # Add DPI setting
    format_params.update({'figure.dpi': format_configs['plot_dpi']})
    # Apply formats globally
    plt.rcParams.update(format_params)
    self.log.debug("Plot formatting set successfully: %s", format_params)

    # Make local copy of data for the plot
    try:
        self.check_df(table)
    except TypeError as err:
        error_exit(self.log, err)
    else:
        self.table = table.copy()
        self.log.debug("Plot data table copied successfully")

    # Plot date range
    if 'date' in self.table.index.names:
        dateidx = self.table.index.get_level_values('date').unique()
        self.datelim = (dateidx[0].date(), dateidx[-1].date())
        self.log.debug("Plot data range: %s to %s", *self.datelim)
    else:
        self.datelim = None

    # Plot measure is the first column in column level 0
    if table.columns.nlevels > 1:
        self.ylab = self.table.columns.get_level_values(0)[0]
    else:
        self.ylab = self.table.columns[0]

    # Y axis scale and labels
    self.ymax = ymax
    self.yunits = re.search(r'\((.*?)\)', self.ylab)
    if self.yunits:
        self.yunits = self.yunits.group(1)

    # X axis labels
    self.xfreq = self.date_freq()
    self.xlab = f"Date ({self.xfreq})"
    self.log.debug("Plot labels: [X] %s [Y] %s", self.xlab, self.ylab)

    # Plot legend
    self.colors = colors
    self.legend = legend

    # Set plot ID from plot title plus index interval
    self.set_id()

    # Make the plot
    self.render()
    self.set_xaxis()
    self.set_yaxis()
    self.add_legend()
def global_measure(ComputeTime, selection, colorlist, savedir, plotformat, csv=False):
    """
    Number of total measures in GlobalStats in the given period
    Plot upper limit is maximum total 'measures' in the period
    - ComputeTime: (ComputeTimeFrame) source data for the plot
    - selection: (string) matching name of column to be plotted
    - colorlist: (dict) colors for each plot stack
    - savedir: (string) path of directory to store output
    - plotformat: (string) image format of the plot
    - csv: (boolean) save data used for the plot in CSV format
    """
    logger.info("Generating accounting report on %s...", selection.replace('_', ' '))
    plot = dict()

    # Name of selection has to be capitalized
    selection = selection.title()

    # Sum jobs and users per time period
    ComputeTime.aggregate_perdate('GlobalStats', 'running_jobs')
    ComputeTime.aggregate_perdate('GlobalStats', 'unique_users')

    # Full data table for the plot
    table_columns = ['compute_time', 'running_jobs', 'total_running_jobs', 'unique_users', 'total_unique_users']
    table = ComputeTime.GlobalStats.loc[:, table_columns]

    # Format columns in the table
    units = [ComputeTime.compute_units['name'], 'jobs/day', 'jobs/day', 'users/day', 'users/day']
    table = table.rename(columns=simple_names_units(table_columns, units))
    logger.debug("Data included in the report: %s", ", ".join(table.columns))

    # Data selection for the plot
    plot_data = [column for column in table.columns if re.match(selection, column)]
    try:
        plot['table'] = table.loc[:, plot_data]
    except KeyError:
        error_exit(logger, f"Data column for '{selection}' not found in GlobalStats")
    else:
        logger.debug("Data used in the plot: %s", ", ".join(plot['table'].columns))

    plot['ymax'] = max(table.loc[:, plot_data[0]].groupby('date').sum())
    logger.debug("Maximum value of the plot: %s %s", '{:.2f}'.format(plot['ymax']), plot['table'].columns[0])

    # Set colors for each nodegroup in the stack plot
    plot['colors'] = [colorlist[ng] for ng in ComputeTime.GlobalStats.index.unique(level='nodegroup')]

    # Plot title: first column name without units
    plot['title'] = re.sub(r'\((.*?)\)', '', plot['table'].columns[0]).rstrip()

    # Render plot
    stackplot = PlotterStack(**plot)

    # Output: file paths
    stackplot.set_output_paths(savedir)

    # Output: render HTML document including plot data table
    if plotformat == 'html':
        table_title = "{} stats per nodegroup".format(stackplot.xfreq.capitalize())
        stackplot.html_makepage()
        stackplot.html_addtable(table, table_title)

    # Output: save files
    stackplot.save_plot(plotformat)
    if csv:
        stackplot.output_csv(table)
def add_nodegroup(self, nodegroup, cores, hostlist):
    """
    Add the definition of a new node group to the accounting of stats
    - nodegroup: (string) name of the new group of nodes
    - cores: (integer) number of cores per node
    - hostlist: (list of dicts) each element should include
      {regex: pattern of hostnames, n: number of nodes, start: date string, end: date string}
    """
    # Check number of cores
    if str(cores).isdigit():
        self.log.debug("'%s' cores per host: %s", nodegroup, cores)
    else:
        errmsg = f"Cores per host of nodegroup '{nodegroup}' are not a positive integer"
        error_exit(self.log, errmsg)

    # Update nodegroup host list with cores per node and add missing start and end datetimes
    for n, host in enumerate(hostlist):
        hostlist[n].update({'cores': cores})
        try:
            hostlist[n]['start'] = pd.Timestamp(host.get('start', date(2018, 1, 1)))
            hostlist[n]['end'] = pd.Timestamp(host.get('end', date.today()))
        except ValueError as err:
            errmsg = f"Dates of host {n} in nodegroup '{nodegroup}' are not in ISO format"
            error_exit(self.log, errmsg)
        else:
            dates_str = (
                hostlist[n]['start'].strftime(self.dateformat),
                hostlist[n]['end'].strftime(self.dateformat),
            )
            self.log.debug("'%s' host %s active period: %s to %s", nodegroup, n, *dates_str)

    # Add group of nodes
    self.NG.update({nodegroup: hostlist})
    self.log.debug("'%s' nodegroup successfully defined", nodegroup)

    # Create corresponding indexes for this group of nodes
    multidx = ['date', 'nodegroup']
    ng_index = pd.MultiIndex.from_product([self.dates, [nodegroup]], names=multidx)
    self.index = self.index.append(ng_index)

    # Start with capacity stats of this nodegroup
    ng_capacity = pd.DataFrame([self.update_capacity(*dt) for dt in ng_index])
    ng_capacity = ng_capacity.set_index(multidx)
    self.log.debug("'%s' updated %s capacity records", nodegroup, ng_capacity.shape[0])

    # Retrieve compute stats of this nodegroup
    ng_compute = parallel_exec(
        count_computejobsusers,  # worker function
        f"'{nodegroup}' compute/job counter",  # label prefixing log messages
        ng_index.levels[0],  # stack of items to process
        (nodegroup, self.NG[nodegroup]),  # nodegroup_spec: forwarded to worker function
        procs=self.max_procs,
        logger=self.log,
        peruser=True,  # forwarded to worker function
    )
    # Serial version
    # ng_compute = [count_computejobsusers(n, *dt, peruser=True) for (n, dt) in enumerate(ng_index)]
    self.log.debug("'%s' retrieved %s compute time data records", nodegroup, len(ng_compute))

    # Unpack compute stats and create data frame with global compute stats
    ng_global, ng_peruser = zip(*ng_compute)
    ng_global = pd.DataFrame(ng_global).set_index(multidx)
    ng_global = pd.merge(ng_capacity, ng_global, left_index=True, right_index=True, sort=True)
    self.GlobalStats = self.GlobalStats.combine_first(ng_global)
    self.log.debug("'%s' Global stats completed with %s data records", nodegroup, self.GlobalStats.shape[0])

    # Unpack user stats and create data frame with user compute time and jobs
    ng_peruser = [(record['compute'], record['jobs']) for record in ng_peruser]
    ng_peruser_compute, ng_peruser_jobs = zip(*ng_peruser)
    ng_peruser_compute = pd.DataFrame(ng_peruser_compute).set_index(multidx)
    ng_peruser_jobs = pd.DataFrame(ng_peruser_jobs).set_index(multidx)
    ng_peruser_counters = [('Compute', ng_peruser_compute), ('Jobs', ng_peruser_jobs)]

    # Update list of active users with users from this nodegroup
    ng_users = set(ng_peruser_compute.columns)
    self.UserList.update(ng_users)
    self.log.debug("'%s' %s unique users added to accounting", nodegroup, len(ng_users))

    # Retrieve account data for users in this nodegroup
    ng_user_accounts = pd.DataFrame.from_dict(UserDB(ng_users).records, orient='index')
    ng_user_accounts.index.name = 'user'
    self.UserAccounts = self.UserAccounts.combine_first(ng_user_accounts)

    # Update user data and generate aggregates per field and site
    for counter_name, counter_data in ng_peruser_counters:
        # Order data by date
        counter_data.sort_index(level='date', ascending=True, inplace=True)
        # Add to respective data frame
        UserCounts = self.getattr('User' + counter_name)
        UserCounts = UserCounts.combine_first(counter_data).fillna(0)
        self.setattr('User' + counter_name, UserCounts)
        dbgmsg = "'%s' User %s stats completed with %s data records for %s users"
        self.log.debug(dbgmsg, nodegroup, counter_name.lower(), len(counter_data.index), len(counter_data.columns))

        for category in ['Field', 'Site']:
            # Aggregate user data per category
            ng_percategory = self.aggregate_account_category(counter_data, ng_user_accounts, category)
            aggregate_counts = (len(counter_data.columns), len(ng_percategory.columns))
            infomsg = "'%s' adding %s aggregates for %s users in %s '%s' categories"
            self.log.info(infomsg, nodegroup, counter_name.lower(), *aggregate_counts, category)
            # Add aggregate to global data structure
            CategoryCounts = self.getattr(category + counter_name)
            CategoryCounts = CategoryCounts.combine_first(ng_percategory).fillna(0)
            self.setattr(category + counter_name, CategoryCounts)
            # Update list of categories
            CategoryList = self.getattr(category + 'List')
            CategoryList.update(ng_percategory.columns)
            self.setattr(category + 'List', CategoryList)
def __init__(self, date_start, date_end, date_freq, compute_units='corehours'):
    """
    Initialize data frames for the provided period of time
    - date_start, date_end: (date) limits of the period of time
    - date_freq: (pd.timedelta) string defining the frequency of time entries
    - compute_units: (string) units used to account compute time
    """
    self.log = fancylogger.getLogger(name=self.__class__.__name__)

    # Set global compute units and save them here
    ComputeUnits.set_units(compute_units)
    self.compute_units = ComputeUnits.active_units

    # Use global date format
    self.dateformat = DATE_FORMAT

    # Set range of dates
    try:
        self.dates = self.set_dates(date_start, date_end, date_freq)
    except ValueError as err:
        error_exit(self.log, err)

    # Set number of procs for parallel processing from configuration file
    try:
        self.max_procs = MainConf.get_digit('nodegroups', 'max_procs', fallback=None, mandatory=False)
    except (KeyError, ValueError) as err:
        error_exit(self.log, err)
    else:
        self.log.debug("Maximum number of processors set to %s", self.max_procs)

    # Specifications of each group of nodes
    self.NG = dict()

    # Index both dates and nodegroups (empty unless nodegroups are added)
    self.index = pd.MultiIndex.from_product([self.dates, []], names=['date', 'nodegroup'])

    # Compute time indexing both dates and nodegroups
    self.GlobalStats = pd.DataFrame(
        columns=['capacity', 'compute_time', 'running_jobs', 'unique_users'], index=self.index
    )

    # Aggregate stats (columns are dynamically added for each section)
    for section in ['User', 'Field', 'Site']:
        self.setattr(section + 'List', set())
        self.setattr(section + 'Compute', pd.DataFrame({}, index=self.index))
        self.setattr(section + 'Jobs', pd.DataFrame({}, index=self.index))

    # User account data
    self.UserAccounts = pd.DataFrame(columns=['user', 'field', 'site', 'updated'])
    self.UserAccounts = self.UserAccounts.set_index('user')

    self.log.debug("Global and aggregate data structures initialized")
def aggregates(ComputeTime, aggregate, selection, percent, colorlist, savedir, plotformat, csv=False):
    """
    Compute time used by each entity in the chosen aggregate during the time period
    Gives insight on resources used by each entity
    Plot upper limit is maximum compute time of entity over all nodegroups
    - ComputeTime: (ComputeTimeFrame) source data for the plot
    - aggregate: (string) name of the aggregate data
    - selection: (string) name of the accounted data
    - percent: (boolean) plot percentual compute time
    - colorlist: (dict) colors for each plot stack
    - savedir: (string) path of directory to store output
    - plotformat: (string) image format of the plot
    - csv: (boolean) save data used for the plot in CSV format
    """
    # Names of aggregate and selection have to be capitalized
    aggregate = aggregate.title()
    selection = selection.title()

    # Source data for selected accounting and aggregate
    try:
        sources = source_data(selection, aggregate, ComputeTime.compute_units['name'])
    except AttributeError as err:
        error_exit(logger, err)

    # List of entities in this aggregation
    aggregate_list = sorted(ComputeTime.getattr(aggregate + 'List'))

    # Add total compute time per time interval
    ComputeTime.aggregate_perdate('GlobalStats', sources['reference'], sources['aggregate'])

    # Calculate percentage compute time per entity
    for entity in aggregate_list:
        ComputeTime.add_percentage(sources['aggregate'], entity, sources['total'], f"{entity} - percent")

    # Grab stats for this aggregate
    AggregateStats = ComputeTime.getattr(sources['aggregate'])

    # Render plots for each entity
    plot = dict()

    # Set colors for each nodegroup in the stack plot
    plot['colors'] = [colorlist[ng] for ng in ComputeTime.GlobalStats.index.unique(level='nodegroup')]

    # Iterate over each entity
    for entity in aggregate_list:
        logger.info("Generating accounting report on %s by %s: %s...", selection, aggregate, entity)

        # Full data table for the plot
        entity_perc = f"{entity} - percent"
        table = AggregateStats.loc[:, [entity, entity_perc, sources['total']]]

        # Format columns in the table
        counter_name = sources['reference'].replace('_', ' ').title()
        column_names = {
            entity: f"{counter_name} of {entity} ({sources['units']})",
            entity_perc: f"{counter_name} of {entity} (%)",
            sources['total']: f"Total {counter_name} ({sources['units']})",
        }
        table = table.rename(columns=column_names)
        logger.debug("Data included in the report: %s", ", ".join(table.columns))

        # Plot title and data selection
        if percent:
            plot['title'] = f"Relative {counter_name} of {entity}"
            plot['table'] = table.loc[:, [column_names[entity_perc]]]
        else:
            plot['title'] = f"{counter_name} of {entity}"
            plot['table'] = table.loc[:, [column_names[entity]]]
        logger.debug("Data used in the plot: %s", ", ".join(plot['table'].columns))

        # Max value is set to the maximum in the plot to avoid empty plots due to exaggerated scales
        plot['ymax'] = plot['table'].iloc[:, 0].groupby('date').sum().max()
        ymax_fmt = '{:.2%}' if percent else '{:.2f}'
        logger.debug("Maximum value of the plot: %s %s", ymax_fmt.format(plot['ymax']), plot['table'].columns[0])

        # Render plot
        stackplot = PlotterStack(**plot)

        # Output: file paths
        stackplot.set_output_paths(savedir)

        # Output: render HTML document including plot data table
        if plotformat == 'html':
            table_title = "{} stats per nodegroup".format(stackplot.xfreq.capitalize())
            stackplot.html_makepage()
            stackplot.html_addtable(table, table_title)

        # Output: save files
        stackplot.save_plot(plotformat)
        if csv:
            stackplot.output_csv(table)