def get_next_charter_period(start, end):
    """Return the next g5k charter time period.

    :param start: timestamp in a type supported by
      `execo.time_utils.get_unixts` from which to start searching for
      the next g5k charter time period. If start is in a g5k charter
      time period, the returned g5k charter time period starts at start.

    :param end: timestamp in a type supported by
      `execo.time_utils.get_unixts` until which to search for the next
      g5k charter time period. If end is in the g5k charter time
      period, the returned g5k charter time period ends at end.

    :returns: a tuple (charter_start, charter_end) of unix timestamps.
      (None, None) if no g5k charter time period found.
    """
    start = unixts_to_oar_datetime(get_unixts(start))
    end = unixts_to_oar_datetime(get_unixts(end))
    if end <= start:
        return None, None
    elif g5k_charter_time(start):
        charter_end = start.replace(hour=19, minute=0, second=0, microsecond=0)
        return oar_datetime_to_unixts(start), datetime_to_unixts(min(end, charter_end))
    else:
        if start.hour < 9 and _work_day(start.date()):
            charter_start = start.replace(hour=9, minute=0, second=0, microsecond=0)
        else:
            charter_start = datetime.datetime.combine(_next_work_day(start.date()),
                                                      datetime.time(9, 0, 0))
        if charter_start > end:
            return None, None
        charter_end = charter_start.replace(hour=19, minute=0, second=0, microsecond=0)
        return oar_datetime_to_unixts(charter_start), datetime_to_unixts(min(end, charter_end))
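
# Usage sketch (not part of the original module; the _demo helper name is
# illustrative): find the next charter period within the coming week and print
# its bounds as unix timestamps.
def _demo_next_charter_period():
    import time
    now = time.time()
    charter_start, charter_end = get_next_charter_period(now, now + 7 * 24 * 3600)
    if charter_start is None:
        print("no g5k charter period found in the next 7 days")
    else:
        print("next charter period: %s -> %s" % (charter_start, charter_end))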
def get_current_oargrid_jobs(start_between=None,
                             end_between=None,
                             frontend_connection_params=None,
                             timeout=False):
    """Return a list of current active oargrid job ids.

    :param start_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose start date is in between these endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [get_unixts(t) for t in start_between]
    if end_between:
        end_between = [get_unixts(t) for t in end_between]
    process = get_process("oargridstat",
                          host=get_frontend_host(),
                          connection_params=make_connection_params(
                              frontend_connection_params,
                              default_frontend_connection_params))
    process.timeout = timeout
    process.pty = True
    process.run()
    if process.ok:
        jobs = re.findall(r"Reservation # (\d+):", process.stdout, re.MULTILINE)
        oargrid_job_ids = [int(j) for j in jobs]
        if start_between or end_between:
            filtered_job_ids = []
            for job in oargrid_job_ids:
                info = get_oargrid_job_info(job, timeout)
                if (_date_in_range(info['start_date'], start_between)
                        and _date_in_range(info['start_date'] + info['walltime'],
                                           end_between)):
                    filtered_job_ids.append(job)
            oargrid_job_ids = filtered_job_ids
        return oargrid_job_ids
    else:
        raise ProcessesFailed([process])
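
# Usage sketch (illustrative, assumes a reachable g5k frontend and valid
# connection params): list the oargrid jobs whose start date falls within the
# last 24 hours.
def _demo_recent_oargrid_jobs():
    import time
    now = time.time()
    job_ids = get_current_oargrid_jobs(start_between=(now - 24 * 3600, now))
    print("oargrid jobs started in the last 24h: %s" % (job_ids,))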
def get_hosts_metric(hosts, metric, from_ts=None, to_ts=None, resolution=None):
    """Get metric values from the Grid'5000 metrology API.

    :param hosts: list of hosts

    :param metric: Grid'5000 metrology metric to fetch (e.g. "power",
      "cpu_user")

    :param from_ts: time from which the metric is collected, in any type
      supported by `execo.time_utils.get_unixts`, optional.

    :param to_ts: time until which the metric is collected, in any type
      supported by `execo.time_utils.get_unixts`, optional.

    :param resolution: time resolution, in any type supported by
      `execo.time_utils.get_seconds`, optional.

    :return: a dict of host -> dict with entries 'from' (unix timestamp
      in seconds, as returned from the g5k api), 'to' (unix timestamp in
      seconds, as returned from the g5k api), 'resolution' (in seconds,
      as returned from the g5k api), 'type' (the type of metric, as
      returned by the g5k api), 'values': a list of tuples (timestamp,
      metric value). Some g5k metrics (the kwapi ones) return both the
      timestamps and values as separate lists, in which case this
      function only takes care of gathering them in tuples (note also
      that for these metrics, it seems that 'from', 'to', 'resolution'
      returned by the g5k api are inconsistent with the timestamps
      list; in this case this function makes no correction and returns
      everything as is). Some other g5k metrics (the ganglia ones) only
      return the values, in which case this function generates the
      timestamps of the tuples from 'from', 'to', 'resolution'.
    """
    if from_ts is not None:
        from_ts = int(get_unixts(from_ts))
    if to_ts is not None:
        to_ts = int(get_unixts(to_ts))
    if resolution is not None:
        resolution = get_seconds(resolution)
    grouped_hosts = group_hosts(hosts)
    res = {}
    site_threads = []
    for site in grouped_hosts:
        site_th = threading.Thread(target=__get_site_metrics,
                                   args=(site, grouped_hosts, metric,
                                         from_ts, to_ts, resolution))
        site_th.start()
        site_threads.append(site_th)
    for site_th in site_threads:
        site_th.join()
        res.update(site_th.res)
    return res
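
# Usage sketch (illustrative; the metric name "power" is just one example of a
# g5k metrology metric, and `hosts` is whatever host list group_hosts accepts):
# fetch one hour of measurements with a 30 second resolution and summarize the
# per-host results using the return structure documented above.
def _demo_power_metric(hosts):
    import time
    now = time.time()
    data = get_hosts_metric(hosts, "power", from_ts=now - 3600, to_ts=now,
                            resolution=30)
    for host, metric_data in data.items():
        print("%s: %d samples of type %s"
              % (host, len(metric_data["values"]), metric_data["type"]))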
def format_oar_date(ts):
    """Return a string with the formatted date (year, month, day, hour,
    min, sec, ms) formatted for oar/oargrid.

    timezone is forced to Europe/Paris, and timezone info is discarded,
    for g5k oar/oargrid.

    :param ts: a date in one of the formats handled.
    """
    # forking code because modifying os.environ["TZ"] and calling
    # time.tzset() is not thread-safe
    ts = int(get_unixts(ts))
    rend, wend = os.pipe()
    pid = os.fork()
    if pid == 0:
        # child process: switch to Europe/Paris and format the timestamp
        os.environ["TZ"] = "Europe/Paris"
        time.tzset()
        t = time.localtime(ts)
        formatted_time = time.strftime("%Y-%m-%d %H:%M:%S", t)
        os.write(wend, codecs.encode(formatted_time))
        os._exit(0)
    else:
        # parent process: read the formatted date back from the pipe
        os.close(wend)
        f = os.fdopen(rend, 'rb')
        formatted_time = codecs.decode(f.read())
        f.close()
        os.waitpid(pid, 0)
        return formatted_time
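
# Usage sketch (illustrative): format "now" as an OAR reservation date string;
# the result follows the "%Y-%m-%d %H:%M:%S" format in Europe/Paris time.
def _demo_format_oar_date():
    import time
    print(format_oar_date(time.time()))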
def get_charter_el_planning(start_time, end_time):
    """Return the list of tuples (start, end) of g5k charter time
    periods between start_time and end_time.

    :param start_time: a date in one of the types supported by
      `execo.time_utils.get_unixts`

    :param end_time: a date in one of the types supported by
      `execo.time_utils.get_unixts`
    """
    start_time = unixts_to_datetime(get_unixts(start_time))
    end_time = unixts_to_datetime(get_unixts(end_time))
    el_planning = []
    while True:
        charter_start, charter_end = get_next_charter_period(start_time, end_time)
        if charter_start is None:
            break
        el_planning.append((int(charter_start), int(charter_end)))
        start_time = charter_end
    return el_planning
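
# Usage sketch (illustrative): list all charter periods over the next two weeks
# and print each (start, end) pair of unix timestamps.
def _demo_charter_planning():
    import time
    now = time.time()
    for period_start, period_end in get_charter_el_planning(now, now + 14 * 24 * 3600):
        print("charter period: %s -> %s" % (period_start, period_end))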
def g5k_charter_time(t):
    """Is the given date in a g5k charter time period?

    Returns a boolean: True if the given date is in a period where the
    g5k charter needs to be respected, False if it is in a period where
    the charter is not applicable (nights, weekends, non-working days).

    :param t: a date in a type supported by `execo.time_utils.get_unixts`
    """
    dt = unixts_to_oar_datetime(get_unixts(t))
    if dt.hour < 9 or dt.hour >= 19:
        return False
    return _work_day(dt.date())
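
# Usage sketch (illustrative): check whether the current time falls inside a
# g5k charter period (working days, 09:00-19:00 Europe/Paris, per the checks
# above).
def _demo_charter_check():
    import time
    if g5k_charter_time(time.time()):
        print("charter applies now")
    else:
        print("charter does not apply now")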
def get_hosts_metric(hosts, metric, from_ts=None, to_ts=None, resolution=1):
    """Get metric values from the Grid'5000 metrology API.

    :param hosts: list of hosts

    :param metric: Grid'5000 metrology metric to fetch (e.g. "power",
      "cpu_user")

    :param from_ts: time from which the metric is collected, in any type
      supported by `execo.time_utils.get_unixts`, optional.

    :param to_ts: time until which the metric is collected, in any type
      supported by `execo.time_utils.get_unixts`, optional.

    :param resolution: time resolution, in any type supported by
      `execo.time_utils.get_seconds`, default 1 second.

    :return: a dict of host -> list of (timestamp, metric value)
      retrieved from the API
    """
    # from_ts and to_ts are documented as optional, so only convert them
    # when they are actually given
    if from_ts is not None:
        from_ts = get_unixts(from_ts)
    if to_ts is not None:
        to_ts = get_unixts(to_ts)
    resolution = get_seconds(resolution)
    grouped_hosts = group_hosts(hosts)
    res = {}
    site_threads = []
    for site in grouped_hosts:
        site_th = threading.Thread(target=__get_site_metrics,
                                   args=(site, grouped_hosts, metric,
                                         from_ts, to_ts, resolution))
        site_th.start()
        site_threads.append(site_th)
    for site_th in site_threads:
        site_th.join()
        res.update(site_th.res)
    return res
def get_current_oar_jobs(frontends=None,
                         start_between=None,
                         end_between=None,
                         frontend_connection_params=None,
                         timeout=False,
                         abort_on_error=False):
    """Return a list of current active oar job ids.

    The list contains tuples (oarjob id, frontend).

    :param frontends: an iterable of frontends to connect to. A frontend
      with value None means the default frontend. If frontends == None,
      get current oar jobs only for the default frontend.

    :param start_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose start date is in between these endpoints.

    :param end_between: a tuple (low, high) of endpoints. Filters and
      returns only jobs whose end date is in between these endpoints.

    :param frontend_connection_params: connection params for connecting
      to frontends if needed. Values override those in
      `execo_g5k.config.default_frontend_connection_params`.

    :param timeout: timeout for retrieving. Default is False, which
      means use ``execo_g5k.config.g5k_configuration['default_timeout']``.
      None means no timeout.

    :param abort_on_error: default False. If True, raises an exception
      on any error. If False, returns the list of jobs retrieved, even
      if incomplete (some frontends may have failed to answer).
    """
    if isinstance(timeout, bool) and timeout == False:
        timeout = g5k_configuration.get('default_timeout')
    if start_between:
        start_between = [get_unixts(t) for t in start_between]
    if end_between:
        end_between = [get_unixts(t) for t in end_between]
    processes = []
    if frontends == None:
        frontends = [None]
    for frontend in frontends:
        p = get_process("oarstat -u",
                        host=get_frontend_host(frontend),
                        connection_params=make_connection_params(
                            frontend_connection_params,
                            default_frontend_connection_params))
        p.timeout = timeout
        p.pty = True
        p.frontend = frontend
        processes.append(p)
    oar_job_ids = []
    if len(processes) == 0:
        return oar_job_ids
    for process in processes:
        process.start()
    for process in processes:
        process.wait()
    failed_processes = []
    for process in processes:
        if process.ok:
            jobs = re.findall(r"^(\d+)\s", process.stdout, re.MULTILINE)
            oar_job_ids.extend([(int(jobid), process.frontend) for jobid in jobs])
        else:
            failed_processes.append(process)
    if len(failed_processes) > 0 and abort_on_error:
        raise ProcessesFailed(failed_processes)
    else:
        if start_between or end_between:
            filtered_job_ids = []
            for jobfrontend in oar_job_ids:
                info = get_oar_job_info(jobfrontend[0], jobfrontend[1],
                                        frontend_connection_params, timeout,
                                        nolog_exit_code=True,
                                        nolog_timeout=True,
                                        nolog_error=True)
                if (_date_in_range(info['start_date'], start_between)
                        and _date_in_range(info['start_date'] + info['walltime'],
                                           end_between)):
                    filtered_job_ids.append(jobfrontend)
            oar_job_ids = filtered_job_ids
        return oar_job_ids
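
# Usage sketch (illustrative; the frontend names are placeholders): collect the
# user's current oar jobs on two site frontends plus the default frontend.
def _demo_list_my_jobs():
    jobs = get_current_oar_jobs(frontends=["rennes", "nancy", None])
    for job_id, frontend in jobs:
        print("job %s on frontend %s" % (job_id, frontend if frontend else "default"))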
def get_planning(elements=['grid5000'], vlan=False, subnet=False, storage=False,
                 out_of_chart=False, starttime=None, endtime=None,
                 ignore_besteffort=True, queues='default'):
    """Retrieve the planning of the elements (site, cluster) and other
    resources.

    Element planning structure is
    ``{'busy': [(123456, 123457), ...], 'free': [(123457, 123460), ...]}``.

    :param elements: a list of Grid'5000 elements ('grid5000', <site>,
      <cluster>)

    :param vlan: a boolean to ask for KaVLAN computation

    :param subnet: a boolean to ask for subnets computation

    :param storage: a boolean to ask for storage computation

    :param out_of_chart: if True, consider that days outside weekends
      are busy

    :param starttime: start of the time period for which to compute the
      planning, defaults to now + 1 minute

    :param endtime: end of the time period for which to compute the
      planning, defaults to 4 weeks from now

    :param ignore_besteffort: True by default, to consider resources
      with besteffort jobs as available

    :param queues: list of oar queues for which to get the planning

    Return a dict whose keys are sites, whose values are dicts whose
    keys are cluster, subnets, kavlan or storage, whose values are
    planning dicts, whose keys are hosts, subnet address ranges, vlan
    numbers or chunk ids respectively.
    """
    if not starttime:
        starttime = int(time() + timedelta_to_seconds(timedelta(minutes=1)))
    starttime = int(get_unixts(starttime))
    if not endtime:
        endtime = int(starttime + timedelta_to_seconds(timedelta(weeks=4, minutes=1)))
    endtime = int(get_unixts(endtime))
    if 'grid5000' in elements:
        sites = elements = get_g5k_sites()
    else:
        sites = list(set([site for site in elements
                          if site in get_g5k_sites()] +
                         [get_cluster_site(cluster) for cluster in elements
                          if cluster in get_g5k_clusters(queues=queues)] +
                         [get_host_site(host) for host in elements
                          if host in get_g5k_hosts()
                          or get_host_shortname(host) in get_g5k_hosts()]))
    if len(sites) == 0:
        logger.error('Wrong elements given: %s' % (elements,))
        return None
    planning = {}
    for site in sites:
        planning[site] = {}
        for cluster in get_site_clusters(site, queues=queues):
            planning[site][cluster] = {}
    for site in sites:
        if vlan:
            planning[site].update({'vlans': {}})
        if subnet:
            planning[site].update({'subnets': {}})
        if storage:
            planning[site].update({'storage': {}})
    if _retrieve_method == 'API':
        _get_planning_API(planning, ignore_besteffort)
    elif _retrieve_method == 'PostgreSQL':
        _get_planning_PGSQL(planning, ignore_besteffort)
    if out_of_chart:
        _add_charter_to_planning(planning, starttime, endtime)
    for site_pl in planning.values():
        for res_pl in site_pl.values():
            for el_planning in res_pl.values():
                el_planning['busy'].sort()
                _merge_el_planning(el_planning['busy'])
                _trunc_el_planning(el_planning['busy'], starttime, endtime)
                _fill_el_planning_free(el_planning, starttime, endtime)
    # cleaning
    real_planning = deepcopy(planning)
    for site, site_pl in planning.items():
        for cl, cl_pl in site_pl.items():
            if cl in ['vlans']:
                continue
            keep_cluster = False
            for h in cl_pl:
                if not (get_host_site(h) in elements
                        or get_host_cluster(h) in elements
                        or get_host_shortname(h) in elements
                        or h in elements):
                    del real_planning[site][cl][h]
                else:
                    keep_cluster = True
            if not keep_cluster:
                del real_planning[site][cl]
    return real_planning
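
# Usage sketch (illustrative; "paravance" is a placeholder cluster name):
# compute the planning of one cluster for the next two days and print the free
# slots of each host, following the return structure documented above.
def _demo_cluster_planning():
    import time
    now = time.time()
    planning = get_planning(elements=["paravance"],
                            starttime=now, endtime=now + 2 * 24 * 3600)
    for site, site_pl in planning.items():
        for cluster, cluster_pl in site_pl.items():
            for host, host_pl in cluster_pl.items():
                print("%s free slots: %s" % (host, host_pl["free"]))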
def make_reservation(self):
    """Perform a reservation of the required number of nodes."""
    if self.oar_result:
        message = "Validated OAR_JOB_ID:"
        for job_id, site in self.oar_result:
            message += "\n%s: %s" % (site, job_id)
        logger.info(message)
        message = "The list of hosts:"
        for job_id, site in self.oar_result:
            hosts = get_oar_job_nodes(oar_job_id=job_id, frontend=site)
            message += "\n--- %s: %s nodes ---" % (site, len(hosts))
            for host in hosts:
                message += "\n%s" % (host.address)
        logger.info(message)
        return

    if self.configs['walltime'] <= 99 * 3600 + 99 * 60 + 99:
        walltime = time.strftime('%H:%M:%S',
                                 time.gmtime(self.configs['walltime']))
    else:
        walltime = '%s seconds' % self.configs['walltime']
    message = 'You are requesting %s nodes for %s:' % (
        sum(self.clusters.values()), walltime)
    for cluster, n_nodes in self.clusters.items():
        message += "\n%s: %s nodes" % (cluster, n_nodes)
    logger.info(message)

    logger.info('Performing reservation .......')
    if 'starttime' not in self.configs or self.configs['starttime'] is None:
        self.configs['starttime'] = int(
            time.time() + timedelta_to_seconds(datetime.timedelta(minutes=1)))

    starttime = int(get_unixts(self.configs['starttime']))
    endtime = int(
        starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
    startdate = self._get_nodes(starttime, endtime)
    while startdate is None:
        logger.info('Not enough nodes found between %s and %s,\n' +
                    'increasing the window time....',
                    format_date(starttime), format_date(endtime))
        starttime = endtime
        endtime = int(
            starttime + timedelta_to_seconds(datetime.timedelta(days=3, minutes=1)))
        startdate = self._get_nodes(starttime, endtime)
        if starttime > int(self.configs['starttime'] +
                           timedelta_to_seconds(datetime.timedelta(weeks=6))):
            logger.error(
                'What a pity! There is no slot which satisfies your request until %s :('
                % format_date(endtime))
            exit()

    jobs_specs = get_jobs_specs(self.clusters, name=self.job_name)
    for job_spec, site_name in jobs_specs:
        tmp = str(job_spec.resources).replace('\\', '')
        job_spec.resources = 'slash_22=4+' + tmp.replace('"', '')
        job_spec.walltime = self.configs['walltime']
        # -t deploy to reserve nodes without deploying an OS
        job_spec.additional_options = '-t deploy'
        job_spec.reservation_date = startdate + 10

    self.oar_result = oarsub(jobs_specs)

    for oar_job_id, _ in self.oar_result:
        if oar_job_id is None:
            logger.info('Performing reservation FAILED')
            exit()

    message = "Reserved nodes successfully!!! \nOAR JOB ID:\n"
    for each in self.oar_result:
        message += "%s:%s," % (each[1], each[0])
    logger.info(message)
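
# Hedged sketch (not part of the original class): the core reservation pattern
# used by make_reservation() above, shown standalone as an immediate submission.
# The cluster name "paravance", the node count and the job name are
# placeholders; get_jobs_specs and oarsub are the execo_g5k helpers already
# used above.
def _demo_simple_reservation():
    jobs_specs = get_jobs_specs({"paravance": 2}, name="demo_job")
    for job_spec, _site in jobs_specs:
        job_spec.walltime = "1:00:00"
        # -t deploy reserves the nodes without deploying an OS, as in
        # make_reservation() above
        job_spec.additional_options = "-t deploy"
    oar_result = oarsub(jobs_specs)
    for job_id, site in oar_result:
        if job_id is not None:
            print("reserved oar job %s on %s" % (job_id, site))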