def _actions_hosts(self, action):
    """Split the hosts of ``action`` into OK / KO and record the result.

    Every process of ``action.processes`` contributes its host to the OK
    group when ``process.ok`` is true and to the KO group otherwise (a
    warning is logged for each KO process).  Each host is listed at most
    once per group; the two lists are passed to
    ``self._update_hosts_state``.
    """
    succeeded, failed = set(), set()
    for process in action.processes:
        if not process.ok:
            logger.warn('%s is KO', process.host)
            failed.add(process.host)
            continue
        succeeded.add(process.host)
    self._update_hosts_state(list(succeeded), list(failed))
def _get_target_lc_and_port(self, equip, linecard_index, port_index, site):
    """Return the ``(linecard_index, port_index)`` at the far end of a link.

    Given the port ``<linecard_index>:<port_index>`` of ``equip`` on
    ``site``, find the port it is connected to on the target equipment.
    Two sources are consulted and compared / complemented:

    * the (optional) ``port`` specification of the local port, parsed with
      ``_parse_port_port``;
    * a search in the network description of the target equipment, done by
      ``self._get_target_linecard_and_port_from_api``.

    When the local port has no ``port`` entry, only the second source is
    used.  When both are available and disagree, a warning is logged and
    the port specification wins, unless it gave no linecard, in which case
    the API lookup is used.

    Returns:
        A tuple ``(linecard_index, port_index)``; the API lookup yields
        ``(None, None)`` when it finds nothing.
    """
    port = self.data['network'][site][equip]['linecards'][linecard_index][
        'ports'][port_index]
    uid = _parse_port_uid(port['uid'])
    if 'port' not in port:
        # no local port spec: the target description is the only source
        return self._get_target_linecard_and_port_from_api(
            equip, linecard_index, port_index, site)
    target_lc1, target_port1 = _parse_port_port(port['port'])
    if target_lc1 is None:
        logger.warn(
            'unable to get from the port spec %s the target linecard of link between %s(%s:%s %s) and %s'
            % (port['port'], equip, linecard_index, port_index, port, uid))
    target_lc2, target_port2 = self._get_target_linecard_and_port_from_api(
        equip, linecard_index, port_index, site)
    if target_lc2 is None:
        logger.warn(
            'unable to get from %s the target linecard of link between %s(%s:%s %s) and %s'
            % (uid, equip, linecard_index, port_index, port, uid))
    if (target_lc1 != target_lc2) or (target_port1 != target_port2):
        if target_lc1 is not None:
            target_lc, target_port = target_lc1, target_port1
        else:
            target_lc, target_port = target_lc2, target_port2
        logger.warn(
            'mismatch between the linecards of link between %s(%s:%s %s) and %s: %s:%s vs %s:%s. Using the "less unlikely one" %s:%s'
            % (equip, linecard_index, port_index, port, uid, target_lc1,
               target_port1, target_lc2, target_port2, target_lc,
               target_port))
    else:
        # Both sources agree.  The result used to be left unbound on this
        # path, which made the return below raise UnboundLocalError.
        target_lc, target_port = target_lc1, target_port1
    return (target_lc, target_port)
def _parse_port_uid(uid):
    """Return the part of ``uid`` that follows its last space.

    Anything before the last space is treated as a prefix: it is dropped
    and a warning is logged.  A ``uid`` without a non-empty prefix is
    returned without logging.
    """
    prefix, _sep, bare_uid = uid.rpartition(' ')
    if not prefix:
        return bare_uid
    logger.warn('uid %s prefixed with %s' % (bare_uid, prefix))
    return bare_uid
def add_equip(self, equip, site):
    """Add a network equipment and the links leaving its ports.

    ``equip`` is looked up in ``self.data['network'][site]``; if it is not
    described there a warning is logged and nothing is added.  Otherwise
    the equipment node is created if missing, and every port carrying a
    ``uid`` is examined to add edges towards hosts, other switches/routers
    and renater nodes.  Neighbouring switches/routers are added
    recursively, but only when ``equip`` itself was newly added by this
    call (``recurse``).

    When the equipment uses multiple linecards (as reported by
    ``self._equip_uses_multiple_linecards``), links go through an
    intermediate linecard node: ``equip -- linecard`` at the equipment
    backplane bandwidth, then ``linecard -- target`` at the port rate.
    Otherwise a direct edge is added with the minimum of the port rate and
    the equipment backplane bandwidth.
    """
    if equip not in self.data['network'][site]:
        logger.warn('Equipment %s not described in API' % (equip, ))
        return
    data = self.data['network'][site][equip]
    if self.has_node(equip):
        # already in the graph: still (re)visit its ports, but do not
        # recurse into neighbouring equipments
        recurse = False
    else:
        logger.debug('Adding equipment %s', equip)
        self.add_node(equip, kind=data['kind'],
                      backplane=data['backplane_bps'])
        recurse = True
    lc_data = data['linecards']
    multiple_linecards = self._equip_uses_multiple_linecards(equip, site)
    equip_bw = data['backplane_bps']
    for i_lc, lc in enumerate(lc_data):
        lc_node = _get_linecard_name(equip, i_lc)
        if 'ports' in lc:
            for i_port, port in enumerate(lc['ports']):
                if 'uid' in port:
                    uid = _parse_port_uid(port['uid'])
                    if not self._is_in_api(site, uid):
                        # NOTE(review): do_once presumably emits the
                        # message only once per key -- confirm
                        do_once(
                            (site, uid), logger.warn,
                            'unable to get kind of %s in %s, is it in g5k api?'
                            % (uid, site))
                        continue
                    # Reconcile the kind declared on the port with the
                    # kind reported for the target itself; the latter
                    # wins on mismatch.
                    kind = port.get('kind')
                    kind2 = self._get_node_kind(site, uid)
                    if not kind:
                        kind = kind2
                        if kind != 'node':
                            do_once(
                                (equip, i_lc, i_port), logger.warn,
                                'missing kind in port %s:%s %s of %s, using %s from %s'
                                % (i_lc, i_port, port, equip, kind, uid))
                    elif not kind2:
                        logger.warn('missing kind in %s' % (uid, ))
                    elif kind != kind2:
                        logger.warn(
                            'mismatching kind %s in port %s:%s %s of %s and kind %s from %s. Using %s'
                            % (kind, i_lc, i_port, port, equip, kind2, uid,
                               kind2))
                        kind = kind2
                    if not kind:
                        logger.error('unable to find kind of %s' % (uid, ))
                    # a rate set on the port overrides the linecard rate
                    port_bw = lc['rate'] if 'rate' not in port else port[
                        'rate']
                    if kind == 'virtual':
                        # in this situation, we don't know what
                        # kind is the target equipment, we need to
                        # discover it
                        if uid in self.data['network'][site]:
                            pass
                        elif uid in self.data['hosts']:
                            kind = 'virtual-node'
                            logger.warn(
                                'virtual link from %s(%s:%s %s) to node %s'
                                % (equip, i_lc, i_port, port, uid))
                        else:
                            pass
                    if self.has_node(uid):
                        # the target is already in the graph: link to it
                        if kind in ['node', 'virtual-node']:
                            # one edge per host adapter plugged into this
                            # equipment; the edge is active only if the
                            # adapter is mounted
                            for e in self.get_host_adapters(uid):
                                if e['switch'] == equip:
                                    if multiple_linecards:
                                        self._checked_add_linecard(
                                            lc_node,
                                            lc.get('backplane_bps',
                                                   data['backplane_bps']))
                                        self._checked_add_edge(
                                            equip, lc_node,
                                            _unique_link_key(
                                                equip, lc_node),
                                            bandwidth=equip_bw,
                                            active=True)
                                        self._checked_add_edge(
                                            lc_node, uid,
                                            _unique_link_key(
                                                lc_node,
                                                uid + '-' + e['device']),
                                            bandwidth=port_bw,
                                            active=e['mounted'])
                                    else:
                                        self._checked_add_edge(
                                            equip, uid,
                                            _unique_link_key(
                                                equip,
                                                uid + '-' + e['device']),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=e['mounted'])
                        elif kind in ['switch', 'router'] and recurse:
                            # link towards an equipment that is already in
                            # the graph; the far end is its linecard node
                            # when it uses multiple linecards too
                            if multiple_linecards:
                                self._checked_add_linecard(
                                    lc_node,
                                    lc.get('backplane_bps',
                                           data['backplane_bps']))
                                self._checked_add_edge(
                                    equip, lc_node,
                                    _unique_link_key(equip, lc_node),
                                    bandwidth=equip_bw, active=True)
                                target_lc, target_port = self._get_target_lc_and_port(
                                    equip, i_lc, i_port, site)
                                if not target_lc is None:
                                    if self._equip_uses_multiple_linecards(
                                            uid, site):
                                        self._checked_add_edge(
                                            lc_node,
                                            _get_linecard_name(
                                                uid, target_lc),
                                            _unique_link_key(
                                                lc_node,
                                                _get_linecard_name(
                                                    uid, target_lc)),
                                            bandwidth=port_bw,
                                            active=True)
                                    else:
                                        self._checked_add_edge(
                                            lc_node, uid,
                                            _unique_link_key(lc_node, uid),
                                            bandwidth=port_bw,
                                            active=True)
                                else:
                                    logger.error(
                                        'unable to find the target linecard of link between %s(%s:%s %s) and %s. Skipping this link!'
                                        % (equip, i_lc, i_port, port, uid))
                            else:
                                target_lc, target_port = self._get_target_lc_and_port(
                                    equip, i_lc, i_port, site)
                                if not target_lc is None:
                                    if self._equip_uses_multiple_linecards(
                                            uid, site):
                                        self._checked_add_edge(
                                            equip,
                                            _get_linecard_name(
                                                uid, target_lc),
                                            _unique_link_key(
                                                equip,
                                                _get_linecard_name(
                                                    uid, target_lc)),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=True)
                                    else:
                                        self._checked_add_edge(
                                            equip, uid,
                                            _unique_link_key(equip, uid),
                                            bandwidth=min(
                                                port_bw, equip_bw),
                                            active=True)
                                else:
                                    logger.error(
                                        'unable to find the target linecard of link between %s(%s:%s %s) and %s. Skipping this link!'
                                        % (equip, i_lc, i_port, port, uid))
                    if 'renater' in uid:
                        # Disabled check, kept for reference:
                        # if uid != 'renater-' + site:
                        #     logger.error('renater node in %s has name %s which is not of the form renater-%s. Forcing to renater-%s' % (site, uid, site, site))
                        #     uid = 'renater-' + site
                        self.add_node(uid, kind='renater')
                        if multiple_linecards:
                            self._checked_add_linecard(
                                lc_node,
                                lc.get('backplane_bps',
                                       data['backplane_bps']))
                            self._checked_add_edge(
                                equip, lc_node,
                                _unique_link_key(equip, lc_node),
                                bandwidth=equip_bw, active=True)
                            self._checked_add_edge(
                                lc_node, uid,
                                _unique_link_key(lc_node, uid),
                                bandwidth=port_bw, active=True)
                        else:
                            self._checked_add_edge(
                                equip, uid, _unique_link_key(equip, uid),
                                bandwidth=min(port_bw, equip_bw),
                                active=True)
                    elif kind in ['switch', 'router']:
                        if multiple_linecards:
                            self._checked_add_linecard(
                                lc_node,
                                lc.get('backplane_bps',
                                       data['backplane_bps']))
                            self._checked_add_edge(
                                equip, lc_node,
                                _unique_link_key(equip, lc_node),
                                bandwidth=equip_bw, active=True)
                        if recurse:
                            # walk the topology: add the neighbouring
                            # equipment and, through it, its own links
                            self.add_equip(uid, site)
def _get_target_linecard_and_port_from_api(self, equip, linecard_index,
                                           port_index, site):
    """Locate the far end of a link in the target's network description.

    Given the port ``<linecard_index>:<port_index>`` of ``equip`` on
    ``site``, search the linecards of the equipment named by the port's
    ``uid`` for ports that point back at ``equip``.  When several such
    ports exist, their ``port`` entries are used to keep only those that
    designate ``<linecard_index>:<port_index>``.

    Returns:
        The tuple ``(linecard_index, port_index)`` on the target
        equipment.  When disambiguation leaves no match, the first
        candidate is returned (with warnings); when there is no candidate
        at all, ``(None, None)``.

    Raises:
        Exception: if the target equipment is not in the network
            description of ``site``.
    """
    site_network = self.data['network'][site]
    port = site_network[equip]['linecards'][linecard_index]['ports'][
        port_index]
    uid = _parse_port_uid(port['uid'])
    if uid not in site_network:
        raise Exception(
            'trying to find a linecard of equipment %s which is not in the network description of %s'
            % (uid, site))
    remote_linecards = site_network[uid]['linecards']

    # every port of the target equipment that points back at ``equip``
    possible_targets = []
    for remote_lc_idx, remote_lc in enumerate(remote_linecards):
        if 'ports' not in remote_lc:
            continue
        for remote_port_idx, remote_port in enumerate(remote_lc['ports']):
            if 'uid' not in remote_port:
                continue
            if _parse_port_uid(remote_port['uid']) == equip:
                possible_targets.append((remote_lc_idx, remote_port_idx))

    if len(possible_targets) <= 1:
        targets = possible_targets
    else:
        # need to disambiguate
        targets = []
        for candidate in possible_targets:
            cand_lc, cand_port = candidate[0], candidate[1]
            candidate_data = remote_linecards[cand_lc]['ports'][cand_port]
            if 'port' not in candidate_data:
                logger.warn(
                    'no "port" entry in api network/%s/%s/linecards[%s]/ports[%s]'
                    % (site, uid, cand_lc, cand_port))
                continue
            back_lc, back_port = _parse_port_port(candidate_data['port'])
            if back_lc is None or back_port is None:
                logger.warn(
                    'unable to parse port spec %s of port %s:%s on %s of link from between %s(%s:%s %s)'
                    % (candidate_data['port'], cand_lc, cand_port, uid,
                       equip, linecard_index, port_index, port))
            if back_lc == linecard_index and back_port == port_index:
                targets.append(candidate)

    if targets:
        if len(targets) > 1:
            logger.warn(
                'unable to disambiguate between multiple links to %s from %s(%s:%s %s). candidate linecards:ports are %s. Using the first possible one %s'
                % (uid, equip, linecard_index, port_index, port, targets,
                   targets[0]))
        return targets[0]

    logger.warn(
        'unable to find the target linecard on %s of link from %s(%s:%s %s)'
        % (uid, equip, linecard_index, port_index, port))
    if possible_targets:
        logger.warn(
            'there are %s candidates %s, use the first possible one: %s'
            % (len(possible_targets), possible_targets,
               possible_targets[0]))
        return possible_targets[0]
    return (None, None)
def _get_site_planning_PGSQL(site, site_planning, ignore_besteffort):
    """Fill ``site_planning`` for ``site`` from the OAR PostgreSQL database.

    A read-only connection is opened through ``G5kAutoPortForwarder``.
    A first query lists the resources that are neither dead nor in
    maintenance and creates an empty ``{'busy': [], 'free': []}`` entry for
    each known host, kavlan and subnet.  A second query lists the
    launching / running / waiting jobs and appends their
    ``(start_time, end_time)`` slot to the ``'busy'`` list of every
    resource they use.

    Args:
        site: name of the site to query.
        site_planning: dict updated in place.  Hosts are stored under
            their cluster key; vlans and subnets are only handled when the
            ``'vlans'`` / ``'subnets'`` keys are already present.
        ignore_besteffort: when true, jobs of the ``besteffort`` queue are
            left out.

    Any exception is logged and reported by setting ``broken = True`` on
    the current thread; nothing is raised to the caller.
    """
    try:
        with G5kAutoPortForwarder(
                site, 'oardb.' + site + '.grid5000.fr',
                g5k_configuration['oar_pgsql_ro_port']) as (host, port):
            conn = psycopg2.connect(
                host=host,
                port=port,
                user=g5k_configuration['oar_pgsql_ro_user'],
                password=g5k_configuration['oar_pgsql_ro_password'],
                database=g5k_configuration['oar_pgsql_ro_db'])
            try:
                cur = conn.cursor()
                # Retrieving alive resources
                sql = ("SELECT DISTINCT R.type, R.network_address, R.vlan, "
                       "R.subnet_address "
                       "FROM resources R "
                       "WHERE state <> 'Dead' AND R.maintenance <> 'YES';")
                cur.execute(sql)
                for res_type, address, vlan, subnet in cur.fetchall():
                    if res_type == "default":
                        cluster = get_host_cluster(address)
                        if cluster in site_planning:
                            site_planning[cluster][address] = {
                                'busy': [],
                                'free': []
                            }
                    if res_type in ['kavlan', 'kavlan-global'] \
                            and 'vlans' in site_planning:
                        site_planning['vlans']['kavlan-' + vlan] = {
                            'busy': [],
                            'free': []
                        }
                    # The key tested used to be 'subnet' while the entry is
                    # stored under 'subnets', so subnets were never filled.
                    if res_type == "subnet" and 'subnets' in site_planning:
                        site_planning['subnets'][subnet] = {
                            'busy': [],
                            'free': []
                        }

                # Retrieving jobs
                sql = (
                    "SELECT J.job_id, J.state, GJP.start_time AS start_time, "
                    "GJP.start_time+MJD.moldable_walltime, "
                    "array_agg(DISTINCT R.network_address) AS hosts, "
                    "array_agg(DISTINCT R.vlan) AS vlan, "
                    "array_agg(DISTINCT R.subnet_address) AS subnets "
                    "FROM jobs J "
                    "LEFT JOIN moldable_job_descriptions MJD "
                    "ON MJD.moldable_job_id=J.job_id "
                    "LEFT JOIN gantt_jobs_predictions GJP "
                    "ON GJP.moldable_job_id=MJD.moldable_id "
                    "INNER JOIN gantt_jobs_resources AR "
                    "ON AR.moldable_job_id=MJD.moldable_id "
                    "LEFT JOIN resources R ON AR.resource_id=R.resource_id "
                    "WHERE ( J.state='Launching' OR J.state='Running' "
                    "OR J.state='Waiting') "
                    + (" AND queue_name<>'besteffort'"
                       if ignore_besteffort else "")
                    + "GROUP BY J.job_id, GJP.start_time, "
                      "MJD.moldable_walltime ORDER BY J.start_time")
                cur.execute(sql)
                for job in cur.fetchall():
                    job_hosts, job_vlans, job_subnets = job[4], job[5], job[6]
                    start_time, end_time = _fix_job(job[2], job[3])
                    for job_host in job_hosts:
                        if job_host != '':
                            cluster = get_host_cluster(job_host)
                            if cluster in site_planning \
                                    and job_host in site_planning[cluster]:
                                site_planning[cluster][job_host][
                                    'busy'].append((start_time, end_time))
                    if job_vlans[0] and 'vlans' in site_planning:
                        for vlan in job_vlans:
                            if isinstance(vlan, str) and int(vlan) > 3:
                                # only routed vlan
                                site_planning['vlans'][
                                    'kavlan-' + vlan]['busy'].append(
                                        (start_time, end_time))
                    if 'subnets' in site_planning:
                        for subnet in job_subnets:
                            # array_agg yields a NULL element for jobs
                            # without subnet; like hosts, subnets that are
                            # not listed as alive are skipped
                            if subnet in site_planning['subnets']:
                                site_planning['subnets'][subnet][
                                    'busy'].append((start_time, end_time))
            finally:
                conn.close()
    except Exception:
        logger.warn(
            'error connecting to oar database / getting planning from ' +
            site)
        logger.detail("exception:\n" + format_exc())
        currentThread().broken = True
def _get_site_planning_API(site, site_planning, ignore_besteffort):
    """Fill ``site_planning`` for ``site`` from the Grid'5000 / OAR REST API.

    Alive nodes (type ``default``, not dead, not in maintenance) get an
    empty ``{'busy': [], 'free': []}`` entry under their cluster key, and
    the ``'vlans'`` entry (when present) is rebuilt from
    ``_get_vlans_API``.  The waiting / launching / running jobs are then
    fetched, one thread per job, and their ``(start_time, end_time)`` slot
    is appended to the ``'busy'`` list of the nodes, vlan and subnets they
    use.  Storage is not handled.

    Args:
        site: name of the site to query.
        site_planning: dict updated in place; vlans and subnets are only
            handled when the ``'vlans'`` / ``'subnets'`` keys are present.
        ignore_besteffort: when true, jobs of the ``besteffort`` queue are
            left out.

    Any exception is logged and reported by setting ``broken = True`` on
    the current thread; nothing is raised to the caller.
    """
    try:
        resources = get_resource_attributes(
            '/sites/' + site +
            '/internal/oarapi/resources/details.json?limit=2^30')['items']
        alive_nodes = set(
            str(node['network_address']) for node in resources
            if node['type'] == 'default' and node['state'] != 'Dead'
            and node['maintenance'] != 'YES')
        for host in alive_nodes:
            host_cluster = get_host_cluster(host)
            if host_cluster in site_planning:
                site_planning[host_cluster][host] = {'busy': [], 'free': []}
        if 'vlans' in site_planning:
            site_planning['vlans'] = {}
            for vlan in _get_vlans_API(site):
                site_planning['vlans'][vlan] = {'busy': [], 'free': []}
        # STORAGE AND SUBNETS MISSING
        # Retrieving jobs
        site_jobs = get_resource_attributes(
            '/sites/' + site +
            '/jobs?limit=1073741824&state=waiting,launching,running')['items']
        # Kept as an equality test on purpose: a non-bool falsy value such
        # as None still filters out besteffort jobs, as it always did.
        keep_besteffort = (ignore_besteffort == False)
        jobs_links = [
            link['href'] for job in site_jobs for link in job['links']
            if link['rel'] == 'self' and
            (keep_besteffort or job['queue'] != 'besteffort')
        ]
        # one thread per job, all started before any is joined
        threads = []
        for link in jobs_links:
            t = Thread(target=_get_job_link_attr_API,
                       args=('/' + str(link).split('/', 2)[2], ))
            t.broken = False
            t.attr = None
            t.ex = None
            threads.append(t)
            t.start()
        for t in threads:
            t.join()
            if t.broken:
                raise t.ex
            attr = t.attr
            try:
                start_time = attr['started_at'] if attr[
                    'started_at'] != 0 else attr['scheduled_at']
                end_time = start_time + attr['walltime']
            except Exception:
                # Best effort: a job without usable timing information is
                # skipped.  This was a bare ``except:``, which also
                # swallowed KeyboardInterrupt and SystemExit.
                continue
            start_time, end_time = _fix_job(start_time, end_time)
            for node in attr['assigned_nodes']:
                cluster = node.split('.', 1)[0].split('-')[0]
                if cluster in site_planning and node in site_planning[cluster]:
                    site_planning[cluster][node]['busy'].append(
                        (start_time, end_time))
            resources_by_type = attr['resources_by_type']
            if 'vlans' in site_planning and 'vlans' in resources_by_type \
                    and int(resources_by_type['vlans'][0]) > 3:
                kavname = 'kavlan-' + str(resources_by_type['vlans'][0])
                site_planning['vlans'][kavname]['busy'].append(
                    (start_time, end_time))
            if 'subnets' in site_planning and 'subnets' in resources_by_type:
                for subnet in resources_by_type['subnets']:
                    if subnet not in site_planning['subnets']:
                        site_planning['subnets'][subnet] = {
                            'busy': [],
                            'free': []
                        }
                    site_planning['subnets'][subnet]['busy'].append(
                        (start_time, end_time))
            # STORAGE IS MISSING
    except Exception:
        logger.warn(
            'error connecting to oar database / getting planning from ' +
            site)
        logger.detail("exception:\n" + format_exc())
        currentThread().broken = True