def _add_xml_elements(self):
    """Populate ``self.state`` with site, cluster and host elements.

    One ``site`` child is created per entry of ``self.sites`` and one
    ``cluster`` child per entry of ``self.clusters``; in both cases an extra
    element whose id is ``'unknown'`` is appended afterwards.  Every host of
    ``self.hosts`` that is listed by ``get_g5k_hosts()`` is then attached to
    the element of its cluster, with state ``'Undeployed'`` and the CPU and
    RAM values returned by ``get_CPU_RAM_FLOPS``.
    """
    root = self.state
    logger.debug('Initial state \n %s', prettify(root))

    for site_id in self.sites:
        SubElement(root, 'site', attrib={'id': site_id})
    # Appended unconditionally once the real sites are in place.
    parent_site = SubElement(root, 'site', attrib={'id': 'unknown'})
    logger.debug('Sites added \n %s', prettify(root))

    for cluster_id in self.clusters:
        site_xpath = "./site[@id='" + get_cluster_site(cluster_id) + "']"
        parent_site = root.find(site_xpath)
        SubElement(parent_site, 'cluster', attrib={'id': cluster_id})
    # NOTE(review): the 'unknown' cluster is attached to the site of the last
    # cluster processed (or to the 'unknown' site when self.clusters is
    # empty) -- confirm this is the intended parent.
    SubElement(parent_site, 'cluster', attrib={'id': 'unknown'})
    logger.debug('Clusters added \n %s', prettify(root))

    attributes = get_CPU_RAM_FLOPS(self.hosts)
    for host in self.hosts:
        if host not in get_g5k_hosts():
            continue
        cluster_xpath = ".//cluster/[@id='" + get_host_cluster(host) + "']"
        parent_cluster = root.find(cluster_xpath)
        host_attrib = {
            'id': host,
            'state': 'Undeployed',
            'cpu': str(attributes[host]['CPU']),
            'mem': str(attributes[host]['RAM']),
        }
        SubElement(parent_cluster, 'host', attrib=host_attrib)
    logger.debug('Hosts added \n %s', prettify(root))
def get_fastest_host(hosts):
    """Return the host with the highest ``node_flops`` among ``hosts``.

    Host attributes are fetched with ``get_host_attributes`` only once per
    cluster (the first host met for a cluster stands for all of them).

    :param hosts: iterable of host addresses or ``Host`` objects; ``Host``
      objects are replaced by their ``address``.

    :return: the address of the first host whose cluster has the largest
      ``attr['performance']['node_flops']``, or ``None`` when ``hosts`` is
      empty.
    """
    cluster_flops = {}
    fastest_host = None
    max_flops = -1
    for host in hosts:
        if isinstance(host, Host):
            host = host.address
        cluster = get_host_cluster(host)
        if cluster not in cluster_flops:
            attr = get_host_attributes(host)
            cluster_flops[cluster] = attr['performance']['node_flops']
        flops = cluster_flops[cluster]
        # Strict comparison: on a tie the earliest host is kept.
        if flops > max_flops:
            max_flops = flops
            fastest_host = host
    return fastest_host
def _add_xml_elements(self):
    """Add sites, clusters, hosts to self.state.

    A ``site`` element is created for each entry of ``self.sites`` and a
    ``cluster`` element for each entry of ``self.clusters``; each series is
    followed by an element with id ``'unknown'``.  Hosts of ``self.hosts``
    that belong to ``get_g5k_hosts()`` are added under their cluster element
    with state ``'Undeployed'`` and their CPU / RAM values taken from
    ``get_CPU_RAM_FLOPS``.
    """
    _state = self.state
    logger.debug('Initial state \n %s', prettify(_state))

    for site in self.sites:
        SubElement(_state, 'site', attrib={'id': site})
    # The original ``for ... else`` had no ``break``: this always runs.
    el_site = SubElement(_state, 'site', attrib={'id': 'unknown'})
    logger.debug('Sites added \n %s', prettify(_state))

    for cluster in self.clusters:
        el_site = _state.find("./site[@id='" + get_cluster_site(cluster)
                              + "']")
        SubElement(el_site, 'cluster', attrib={'id': cluster})
    # NOTE(review): el_site is the site of the last cluster here (or the
    # 'unknown' site when there is no cluster) -- confirm the intended parent.
    SubElement(el_site, 'cluster', attrib={'id': 'unknown'})
    logger.debug('Clusters added \n %s', prettify(_state))

    # Diagnostic output goes through the logger instead of a stray print.
    logger.debug('Hosts to add %s', self.hosts)
    hosts_attr = get_CPU_RAM_FLOPS(self.hosts)
    # Loop-invariant lookup, evaluated once instead of once per host.
    g5k_hosts = get_g5k_hosts()
    for host in self.hosts:
        if host in g5k_hosts:
            el_cluster = _state.find(".//cluster/[@id='" +
                                     get_host_cluster(host) + "']")
            SubElement(el_cluster, 'host', attrib={
                'id': host,
                'state': 'Undeployed',
                'cpu': str(hosts_attr[host]['CPU']),
                'mem': str(hosts_attr[host]['RAM'])})
    logger.debug('Hosts added \n %s', prettify(_state))
def get_fastest_host(hosts):
    """Pick the host whose cluster reports the largest ``node_flops``.

    ``get_host_attributes`` is queried once per cluster; the value found for
    the first host of a cluster is reused for the other hosts of that cluster.

    :param hosts: iterable of host addresses or ``Host`` objects (``Host``
      objects are converted to their ``address``).

    :return: the address of the fastest host (the earliest one on a tie), or
      ``None`` if ``hosts`` is empty.
    """
    flops_by_cluster = {}
    best_host = None
    best_flops = -1
    for host in hosts:
        if isinstance(host, Host):
            host = host.address
        cluster = get_host_cluster(host)
        if cluster not in flops_by_cluster:
            flops_by_cluster[cluster] = \
                get_host_attributes(host)['performance']['node_flops']
        if flops_by_cluster[cluster] > best_flops:
            best_flops = flops_by_cluster[cluster]
            best_host = host
    return best_host
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host): The hosts to be assigned a topology.
      topo_list (list of str, optional): The racks to be assigned to each
        host. len(hosts) should be equal to len(topo_list).
      config_file (str, optional): The path of the config file to be used.
    """

    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Close the file handle as soon as it has been parsed.
        with open(config_file) as config_fp:
            config.readfp(config_fp)

    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir also points to "/bin" -- presumably deliberate
    # for this layout; confirm before changing.
    self.sbin_dir = self.base_dir + "/bin"

    # Configure master and slaves
    self.hosts = hosts
    self.master = hosts[0]

    # Create topology
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information: hosts grouped by their cluster
    self.host_clusters = {}
    for h in self.hosts:
        self.host_clusters.setdefault(get_host_cluster(h), []).append(h)

    # Create a string to display the topology
    t = {v: [] for v in self.topology.topology.values()}
    for key, value in self.topology.topology.iteritems():
        t[value].append(key.address)
    log_topo = ', '.join(
        [style.user2(k) + ': ' +
         ' '.join([style.host(x.split('.')[0]) for x in v])
         for k, v in t.iteritems()])

    logger.info("Hadoop cluster created with master %s, hosts %s and topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
def _check_xml_elements(self, xml, resources, strict=False):
    """Check that the elements found in ``xml`` match those of the instance.

    Sites and clusters returned by ``self._get_xml_elements(xml)`` must be
    equal to ``self.sites`` and ``self.clusters``.  With ``strict`` the host
    lists must be equal too; otherwise only the number of hosts per cluster
    is compared and, when it matches, the ``id`` of each host element of
    ``xml`` is replaced by the host of ``self.hosts`` at the same position.

    ``resources`` is not used by this method.

    Returns True when every check passed, False otherwise (each mismatch is
    reported with ``logger.error``).
    """
    xml_sites, xml_clusters, xml_hosts = self._get_xml_elements(xml)
    consistent = True

    if xml_sites != self.sites:
        logger.error('List of sites from resources differs from infile'
                     '\n resource %s \n infile %s', self.sites, xml_sites)
        consistent = False

    if xml_clusters != self.clusters:
        logger.error('List of clusters from resources differs from infile'
                     '\n resource %s \n infile %s',
                     self.clusters, xml_clusters)
        consistent = False

    if strict:
        if xml_hosts != self.hosts:
            logger.error('List of hosts from resources differs from infile'
                         '\n resource %s \n infile %s',
                         self.hosts, xml_hosts)
            consistent = False
        return consistent

    def hosts_per_cluster(names):
        # Number of hosts found for each cluster name.
        counts = {}
        for name in names:
            cluster = get_host_cluster(name)
            counts[cluster] = counts.get(cluster, 0) + 1
        return counts

    expected_counts = hosts_per_cluster(self.hosts)
    found_counts = hosts_per_cluster(xml_hosts)
    if expected_counts != found_counts:
        logger.error('List of hosts from resources differs from infile'
                     '\n resource %s \n infile %s', self.hosts, xml_hosts)
        return False

    # Same population per cluster: rename the XML hosts positionally.
    for old_id, new_id in zip(xml_hosts, self.hosts):
        host_element = xml.find(".//host/[@id='" + old_id + "']")
        host_element.attrib['id'] = new_id
    return consistent
def _get_resources_elements(self, resources=None):
    """Fill ``self.sites``, ``self.hosts`` and ``self.clusters``.

    :param resources: dict whose keys are site names (the ``'global'`` key
      is ignored) and whose values hold a ``'hosts'`` list.  ``None`` (the
      default) is treated as an empty dict.

    ``self.sites`` is the sorted list of site keys.  ``self.hosts`` gathers
    the hosts of every site, converted with ``get_kavlan_host_name`` when
    ``self.kavlan`` is set, sorted by the text before the first ``-`` of the
    short name and then by the integer after it.  ``self.clusters`` is the
    sorted list of distinct clusters of those hosts.
    """
    if resources is None:
        # The declared default used to crash on ``resources.keys()``.
        resources = {}

    def cluster_then_index(host):
        # 'name-12.site...' sorts as ('name', 12)
        parts = host.split('.', 1)[0].split('-')
        return parts[0], int(parts[1])

    self.sites = sorted(site for site in resources if site != 'global')
    self.hosts = []
    for site in self.sites:
        site_hosts = resources[site]['hosts']
        if self.kavlan:
            site_hosts = [get_kavlan_host_name(host, self.kavlan)
                          for host in site_hosts]
        self.hosts += site_hosts
    self.hosts.sort(key=cluster_then_index)
    self.clusters = sorted(set(get_host_cluster(host)
                               for host in self.hosts))
def _get_resources_elements(self, resources=None):
    """Derive the sites, hosts and clusters of the instance from resources.

    :param resources: dict keyed by site name, each value providing a
      ``'hosts'`` list; the ``'global'`` key is skipped.  When omitted (or
      ``None``) no site, host or cluster is recorded.

    Sets ``self.sites`` (sorted site names), ``self.hosts`` (hosts of all
    sites, renamed with ``get_kavlan_host_name`` if ``self.kavlan`` is set,
    sorted by name prefix then numeric index) and ``self.clusters`` (sorted
    distinct clusters of ``self.hosts``).
    """
    # Guard the declared default: None has no keys to iterate over.
    resources = {} if resources is None else resources

    self.sites = sorted(
        [site for site in resources.keys() if site != 'global'])

    self.hosts = []
    for site in self.sites:
        if self.kavlan:
            self.hosts += [get_kavlan_host_name(host, self.kavlan)
                           for host in resources[site]['hosts']]
        else:
            self.hosts += resources[site]['hosts']

    def sort_key(host):
        # Short name is '<prefix>-<index>[-...]'; order by prefix, then by
        # the index compared as a number.
        fields = host.split('.', 1)[0].split('-')
        return (fields[0], int(fields[1]))

    self.hosts.sort(key=sort_key)

    self.clusters = list(
        set([get_host_cluster(host) for host in self.hosts]))
    self.clusters.sort()
def __init__(self, hosts, config_file=None):
    """Create a new Cassandra cluster with the given hosts.

    Args:
      hosts (list of Host): The hosts that conform the cluster.
      config_file (str, optional): The path of the config file to be used.
    """

    # Load cluster properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Use a context manager so the file handle is not leaked.
        with open(config_file) as config_fp:
            config.readfp(config_fp)

    self.base_dir = config.get("cluster", "cassandra_base_dir")
    self.conf_dir = config.get("cluster", "cassandra_conf_dir")
    self.logs_dir = config.get("cluster", "cassandra_logs_dir")
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")

    self.bin_dir = self.base_dir + "/bin"

    # Configure nodes and seeds
    self.hosts = hosts
    self.seeds = self.hosts[0:3]  # TODO: Temporary
    self.master = self.hosts[0]

    # Store cluster information: hosts grouped by their cluster
    self.host_clusters = {}
    for h in self.hosts:
        self.host_clusters.setdefault(get_host_cluster(h), []).append(h)

    logger.info("Cassandra cluster created with hosts " + str(self.hosts))
def get_CPU_RAM_FLOPS(hosts):
    """Return the number of CPU and amount RAM for a host list.

    :param hosts: iterable of host addresses or ``Host`` objects (``Host``
      objects are replaced by their ``address``).

    :return: a dict mapping each host to ``{'CPU', 'RAM', 'flops'}`` (built
      from ``nb_cores``, ``ram_size / 10 ** 6`` and ``node_flops`` of the
      host attributes) plus a ``'TOTAL'`` entry holding the sums of CPU and
      RAM over all hosts.  Attributes are fetched once per cluster and the
      same dict object is shared by all hosts of a cluster.
    """
    hosts_attr = {'TOTAL': {'CPU': 0, 'RAM': 0}}
    cluster_attr = {}
    for host in hosts:
        if isinstance(host, Host):
            host = host.address
        cluster = get_host_cluster(host)
        if cluster not in cluster_attr:
            attr = get_host_attributes(host)
            cluster_attr[cluster] = {
                'CPU': attr['architecture']['nb_cores'],
                'RAM': int(attr['main_memory']['ram_size'] / 10 ** 6),
                'flops': attr['performance']['node_flops']}
        host_attr = cluster_attr[cluster]
        hosts_attr[host] = host_attr
        # Sum the values of *this* host's cluster; reading ``attr`` here
        # would reuse whichever cluster was fetched last.
        hosts_attr['TOTAL']['CPU'] += host_attr['CPU']
        hosts_attr['TOTAL']['RAM'] += host_attr['RAM']

    logger.debug(hosts_list(hosts_attr))
    return hosts_attr
def get_CPU_RAM_FLOPS(hosts):
    """Return the number of CPU and amount RAM for a host list.

    :param hosts: iterable of host addresses or ``Host`` objects (``Host``
      objects are replaced by their ``address``).

    :return: a dict mapping each host to ``{'CPU', 'RAM', 'flops'}`` (built
      from ``smt_size``, ``ram_size / 10 ** 6`` and ``node_flops`` of the
      host attributes) plus a ``'TOTAL'`` entry holding the sums of CPU and
      RAM over all hosts.  Attributes are fetched once per cluster and the
      same dict object is shared by all hosts of a cluster.
    """
    hosts_attr = {'TOTAL': {'CPU': 0, 'RAM': 0}}
    cluster_attr = {}
    total = hosts_attr['TOTAL']
    for host in hosts:
        if isinstance(host, Host):
            host = host.address
        cluster = get_host_cluster(host)
        if cluster not in cluster_attr:
            attr = get_host_attributes(host)
            cluster_attr[cluster] = {
                'CPU': attr['architecture']['smt_size'],
                'RAM': int(attr['main_memory']['ram_size'] / 10 ** 6),
                'flops': attr['performance']['node_flops']}
        hosts_attr[host] = cluster_attr[cluster]
        # Use the cached values of the current host's cluster: ``attr`` may
        # still describe a different cluster fetched earlier.
        total['CPU'] += cluster_attr[cluster]['CPU']
        total['RAM'] += cluster_attr[cluster]['RAM']

    logger.debug(hosts_list(hosts_attr))
    return hosts_attr
def _get_site_planning_PGSQL(site, site_planning, ignore_besteffort):
    """Fill ``site_planning`` with the busy slots read from the OAR database.

    A connection to ``oardb.<site>.grid5000.fr`` is opened through a
    ``G5kAutoPortForwarder`` using the read-only credentials of
    ``g5k_configuration``.  Resources that are neither dead nor in
    maintenance get an empty ``{'busy': [], 'free': []}`` entry, then the
    slot of every launching, running or waiting job is appended to the
    ``'busy'`` list of the hosts, routed vlans and subnets it uses.

    :param site: name of the site to query.

    :param site_planning: dict of the site, updated in place.  Only the
      clusters already present as keys are filled; vlans and subnets are
      filled only when the ``'vlans'`` / ``'subnets'`` keys are present.

    :param ignore_besteffort: when true, jobs of the ``besteffort`` queue are
      left out of the query.

    Any exception is logged and flags the current thread with
    ``broken = True`` instead of being propagated.
    """
    empty_slot_keys = ('busy', 'free')
    try:
        with G5kAutoPortForwarder(
                site,
                'oardb.' + site + '.grid5000.fr',
                g5k_configuration['oar_pgsql_ro_port']) as (host, port):
            conn = psycopg2.connect(
                host=host,
                port=port,
                user=g5k_configuration['oar_pgsql_ro_user'],
                password=g5k_configuration['oar_pgsql_ro_password'],
                database=g5k_configuration['oar_pgsql_ro_db'])
            try:
                cur = conn.cursor()
                # Retrieving alive resources
                sql = (
                    """SELECT DISTINCT R.type, R.network_address, R.vlan, """
                    """R.subnet_address FROM resources R """
                    """WHERE state <> 'Dead' AND R.maintenance <> 'YES';""")
                cur.execute(sql)

                for data in cur.fetchall():
                    if data[0] == "default":
                        cluster = get_host_cluster(data[1])
                        if cluster in site_planning:
                            site_planning[cluster][data[1]] = dict(
                                (key, []) for key in empty_slot_keys)
                    if data[0] in ['kavlan', 'kavlan-global'] \
                            and 'vlans' in site_planning:
                        site_planning['vlans']['kavlan-' + data[2]] = dict(
                            (key, []) for key in empty_slot_keys)
                    # The planning key is 'subnets' (plural); testing for
                    # 'subnet' meant this branch never ran.
                    if data[0] == "subnet" and 'subnets' in site_planning:
                        site_planning['subnets'][data[3]] = dict(
                            (key, []) for key in empty_slot_keys)

                # Retrieving the jobs and the resources they hold
                sql = (
                    """SELECT J.job_id, J.state, """
                    """GJP.start_time AS start_time, """
                    """GJP.start_time+MJD.moldable_walltime, """
                    """array_agg(DISTINCT R.network_address) AS hosts, """
                    """array_agg(DISTINCT R.vlan) AS vlan, """
                    """array_agg(DISTINCT R.subnet_address) AS subnets """
                    """FROM jobs J """
                    """LEFT JOIN moldable_job_descriptions MJD """
                    """ON MJD.moldable_job_id=J.job_id """
                    """LEFT JOIN gantt_jobs_predictions GJP """
                    """ON GJP.moldable_job_id=MJD.moldable_id """
                    """INNER JOIN gantt_jobs_resources AR """
                    """ON AR.moldable_job_id=MJD.moldable_id """
                    """LEFT JOIN resources R """
                    """ON AR.resource_id=R.resource_id """
                    """WHERE ( J.state='Launching' OR J.state='Running' """
                    """OR J.state='Waiting') """
                    + (""" AND queue_name<>'besteffort'"""
                       if ignore_besteffort else """""")
                    + """GROUP BY J.job_id, GJP.start_time, """
                      """MJD.moldable_walltime ORDER BY J.start_time""")
                cur.execute(sql)

                for job in cur.fetchall():
                    start_time = job[2]
                    end_time = job[3]
                    start_time, end_time = _fix_job(start_time, end_time)
                    slot = (start_time, end_time)
                    if len(job[4]) > 0:
                        for host in job[4]:
                            if host != '':
                                cluster = get_host_cluster(host)
                                if cluster in site_planning:
                                    if host in site_planning[cluster]:
                                        site_planning[cluster][host][
                                            'busy'].append(slot)
                    if job[5][0] and 'vlans' in site_planning:
                        for vlan in job[5]:
                            if isinstance(vlan, str) and int(vlan) > 3:
                                # only routed vlan
                                site_planning['vlans'][
                                    'kavlan-' + vlan]['busy'].append(slot)
                    if len(job[6]) > 0 and 'subnets' in site_planning:
                        for subnet in job[6]:
                            # Skip values with no alive subnet resource
                            # registered above.
                            if subnet in site_planning['subnets']:
                                site_planning['subnets'][subnet][
                                    'busy'].append(slot)
            finally:
                conn.close()
    except Exception:
        logger.warn(
            'error connecting to oar database / getting planning from ' +
            site)
        logger.detail("exception:\n" + format_exc())
        currentThread().broken = True
def __init__(self, hosts, topo_list=None, config_file=None):
    """Create a new Hadoop cluster with the given hosts and topology.

    Args:
      hosts (list of Host): The hosts to be assigned a topology.
      topo_list (list of str, optional): The racks to be assigned to each
        host. len(hosts) should be equal to len(topo_list).
      config_file (str, optional): The path of the config file to be used.
    """

    # Load properties
    config = ConfigParser(self.defaults)
    config.add_section("cluster")
    config.add_section("local")

    if config_file:
        # Close the file handle as soon as it has been parsed.
        with open(config_file) as config_fp:
            config.readfp(config_fp)

    # Deployment properties
    self.local_base_conf_dir = config.get("local", "local_base_conf_dir")
    self.init_conf_dir = tempfile.mkdtemp("", "hadoop-init-", "/tmp")
    self.conf_mandatory_files = [CORE_CONF_FILE,
                                 HDFS_CONF_FILE,
                                 MR_CONF_FILE]

    # Node properties
    self.base_dir = config.get("cluster", "hadoop_base_dir")
    self.conf_dir = config.get("cluster", "hadoop_conf_dir")
    self.logs_dir = config.get("cluster", "hadoop_logs_dir")
    self.hadoop_temp_dir = config.get("cluster", "hadoop_temp_dir")
    self.hdfs_port = config.getint("cluster", "hdfs_port")
    self.mapred_port = config.getint("cluster", "mapred_port")

    self.bin_dir = self.base_dir + "/bin"
    # NOTE(review): sbin_dir also points to "/bin" -- presumably deliberate
    # for this layout; confirm before changing.
    self.sbin_dir = self.base_dir + "/bin"

    self.java_home = None

    # Configure master and slaves
    self.hosts = list(hosts)
    self.master = self.hosts[0]

    # Create topology
    # NOTE(review): the topology receives the original ``hosts`` argument,
    # not the ``self.hosts`` copy -- confirm this is intended.
    self.topology = HadoopTopology(hosts, topo_list)

    # Store cluster information
    self.hw = G5kDeploymentHardware()
    self.hw.add_hosts(self.hosts)
    self.master_cluster = self.hw.get_cluster(get_host_cluster(self.master))

    # Create a string to display the topology
    t = {v: [] for v in self.topology.topology.values()}
    for key, value in self.topology.topology.iteritems():
        t[value].append(key.address)
    log_topo = ', '.join(
        [style.user2(k) + ': ' +
         ' '.join([style.host(x.split('.')[0]) for x in v])
         for k, v in t.iteritems()])

    logger.info("Hadoop cluster created with master %s, hosts %s and topology %s",
                style.host(self.master.address),
                ' '.join([style.host(h.address.split('.')[0])
                          for h in self.hosts]),
                log_topo)
def get_planning(elements=['grid5000'], vlan=False, subnet=False,
                 storage=False, out_of_chart=False, starttime=None,
                 endtime=None, ignore_besteffort=True, queues='default'):
    """Retrieve the planning of the elements (site, cluster) and others resources.

    Element planning structure is ``{'busy': [(123456,123457), ... ],
    'free': [(123457,123460), ... ]}.``

    :param elements: a list of Grid'5000 elements ('grid5000', <site>,
      <cluster>)

    :param vlan: a boolean to ask for KaVLAN computation

    :param subnet: a boolean to ask for subnets computation

    :param storage: a boolean to ask for storage computation

    :param out_of_chart: if True, consider that days outside weekends are
      busy

    :param starttime: start of time period for which to compute the
      planning, defaults to now + 1 minute

    :param endtime: end of time period for which to compute the planning,
      defaults to 4 weeks from now

    :param ignore_besteffort: True by default, to consider the resources
      with besteffort jobs as available

    :param queues: list of oar queues for which to get the planning

    Return a dict whose keys are sites, whose values are dict whose keys
    are cluster, subnets, kavlan or storage, whose values are planning
    dicts, whose keys are hosts, subnet address range, vlan number or chunk
    id planning respectively.  ``None`` is returned (after logging an error)
    when no site matches ``elements``.
    """
    # Time window, as integer unix timestamps
    if not starttime:
        one_minute = timedelta_to_seconds(timedelta(minutes=1))
        starttime = int(time() + one_minute)
    starttime = int(get_unixts(starttime))
    if not endtime:
        four_weeks = timedelta_to_seconds(timedelta(weeks=4, minutes=1))
        endtime = int(starttime + four_weeks)
    endtime = int(get_unixts(endtime))

    # Sites concerned by the requested elements
    if 'grid5000' in elements:
        sites = elements = get_g5k_sites()
    else:
        named_sites = [el for el in elements if el in get_g5k_sites()]
        cluster_sites = [get_cluster_site(el) for el in elements
                         if el in get_g5k_clusters(queues=queues)]
        host_sites = [get_host_site(el) for el in elements
                      if el in get_g5k_hosts()
                      or get_host_shortname(el) in get_g5k_hosts()]
        sites = list(set(named_sites + cluster_sites + host_sites))
    if len(sites) == 0:
        logger.error('Wrong elements given: %s' % (elements, ))
        return None

    # Empty planning skeleton: clusters first, then the optional resources
    planning = {}
    for site in sites:
        site_skeleton = planning[site] = {}
        for cluster in get_site_clusters(site, queues=queues):
            site_skeleton[cluster] = {}
    optional_resources = ((vlan, 'vlans'), (subnet, 'subnets'),
                          (storage, 'storage'))
    for site in sites:
        for wanted, key in optional_resources:
            if wanted:
                planning[site][key] = {}

    # Busy slots
    if _retrieve_method == 'API':
        _get_planning_API(planning, ignore_besteffort)
    elif _retrieve_method == 'PostgreSQL':
        _get_planning_PGSQL(planning, ignore_besteffort)

    if out_of_chart:
        _add_charter_to_planning(planning, starttime, endtime)

    # Normalise the busy slots and derive the free ones
    for site_pl in planning.values():
        for res_pl in site_pl.values():
            for el_planning in res_pl.values():
                busy = el_planning['busy']
                busy.sort()
                _merge_el_planning(busy)
                _trunc_el_planning(busy, starttime, endtime)
                _fill_el_planning_free(el_planning, starttime, endtime)

    # cleaning: work on a copy, dropping what was not asked for
    real_planning = deepcopy(planning)
    for site, site_pl in planning.items():
        for res_name, res_pl in site_pl.items():
            if res_name == 'vlans':
                continue
            kept_any = False
            for name in res_pl:
                requested = (get_host_site(name) in elements
                             or get_host_cluster(name) in elements
                             or get_host_shortname(name) in elements
                             or name in elements)
                if requested:
                    kept_any = True
                else:
                    del real_planning[site][res_name][name]
            if not kept_any:
                del real_planning[site][res_name]

    return real_planning
def _get_site_planning_API(site, site_planning, ignore_besteffort):
    """Fill ``site_planning`` with the busy slots read from the REST API.

    Alive nodes (type ``default``, not dead, not in maintenance) of the
    clusters already present in ``site_planning`` get an empty
    ``{'busy': [], 'free': []}`` entry, and so do the vlans returned by
    ``_get_vlans_API`` when the ``'vlans'`` key is present.  The details of
    every waiting, launching or running job are then fetched, one thread per
    job, and the slot of each job is appended to the ``'busy'`` list of the
    nodes, vlan and subnets it holds.

    :param site: name of the site to query.

    :param site_planning: dict of the site, updated in place.

    :param ignore_besteffort: unless it equals ``False``, jobs of the
      ``besteffort`` queue are skipped.

    Any exception is logged and flags the current thread with
    ``broken = True`` instead of being propagated.
    """
    try:
        alive_nodes = set([
            str(node['network_address'])
            for node in get_resource_attributes(
                '/sites/' + site +
                '/internal/oarapi/resources/details.json?limit=2^30')['items']
            if node['type'] == 'default' and node['state'] != 'Dead'
            and node['maintenance'] != 'YES'])

        for host in alive_nodes:
            host_cluster = get_host_cluster(str(host))
            if host_cluster in site_planning:
                site_planning[host_cluster].update(
                    {host: {'busy': [], 'free': []}})
        if 'vlans' in site_planning:
            site_planning['vlans'] = {}
            for vlan in _get_vlans_API(site):
                site_planning['vlans'][vlan] = {'busy': [], 'free': []}
        # STORAGE AND SUBNETS MISSING
        # Retrieving jobs
        site_jobs = get_resource_attributes(
            '/sites/' + site +
            '/jobs?limit=1073741824&state=waiting,launching,running')['items']
        jobs_links = [
            link['href'] for job in site_jobs for link in job['links']
            if link['rel'] == 'self'
            and (ignore_besteffort == False
                 or job['queue'] != 'besteffort')]

        # One worker thread per job; each one stores its result (or its
        # failure) as attributes of its own Thread object.
        threads = []
        for link in jobs_links:
            t = Thread(target=_get_job_link_attr_API,
                       args=('/' + str(link).split('/', 2)[2], ))
            t.broken = False
            t.attr = None
            t.ex = None
            threads.append(t)
            t.start()

        for t in threads:
            t.join()
            if t.broken:
                raise t.ex
            attr = t.attr
            try:
                start_time = attr['started_at'] \
                    if attr['started_at'] != 0 else attr['scheduled_at']
                end_time = start_time + attr['walltime']
            except Exception:
                # Job without usable timing information: skip it.  Unlike a
                # bare ``except``, this lets KeyboardInterrupt / SystemExit
                # through.
                continue
            start_time, end_time = _fix_job(start_time, end_time)
            slot = (start_time, end_time)

            nodes = attr['assigned_nodes']
            for node in nodes:
                cluster = node.split('.', 1)[0].split('-')[0]
                if cluster in site_planning \
                        and node in site_planning[cluster]:
                    site_planning[cluster][node]['busy'].append(slot)

            resources_by_type = attr['resources_by_type']
            if 'vlans' in site_planning and 'vlans' in resources_by_type \
                    and int(resources_by_type['vlans'][0]) > 3:
                kavname = 'kavlan-' + str(resources_by_type['vlans'][0])
                site_planning['vlans'][kavname]['busy'].append(slot)

            if 'subnets' in site_planning \
                    and 'subnets' in resources_by_type:
                for subnet in resources_by_type['subnets']:
                    if subnet not in site_planning['subnets']:
                        site_planning['subnets'][subnet] = {
                            'busy': [], 'free': []}
                    site_planning['subnets'][subnet]['busy'].append(slot)
        # STORAGE IS MISSING
    except Exception:
        logger.warn(
            'error connecting to oar database / getting planning from ' +
            site)
        logger.detail("exception:\n" + format_exc())
        currentThread().broken = True