def do_pxe_discovery(cfg, handler, info, manual, nodename, policies):
    """Record a PXE-detected system against ``nodename``.

    When the discovery is manual, or the node policy allows it ('open' or
    'pxe'), identity data learned from the PXE request (uuid, serial, model,
    and the hwaddr for any interface flagged bootable) is written to the
    node, but only where it differs from what is already stored.

    :param cfg: configuration manager used to read and update attributes
    :param handler: discovery handler, only ``devname`` is used (logging)
    :param info: discovery data; ``uuid`` and ``hwaddr`` are read directly,
                 ``serialnumber`` and ``modelnumber`` are optional
    :param manual: true if this discovery was explicitly requested
    :param nodename: name of the node the system was matched to
    :param policies: set of discovery policy names applicable to the node
    :returns: True if the uuid is already tracked or was just recorded,
              None when the uuid is not valid
    """
    # use uuid based scheme in lieu of tls cert, ideally only
    # for stateless 'discovery' targets like pxe, where data does not
    # change
    uuidinfo = cfg.get_node_attributes(
        nodename, ['id.uuid', 'id.serial', 'id.model', 'net*.bootable'])
    if manual or policies & set(('open', 'pxe')):
        enrich_pxe_info(info)
        current = uuidinfo.get(nodename, {})

        def stored(attrname):
            # Attributes are returned as {'value': ...} wrappers; compare the
            # payload, otherwise every detection looks like a change and
            # triggers a needless attribute rewrite.
            return current.get(attrname, {}).get('value', None)

        attribs = {}
        uuid = info.get('uuid', None)
        if uuid and uuid != stored('id.uuid'):
            attribs['id.uuid'] = info['uuid']
        sn = info.get('serialnumber', None)
        mn = info.get('modelnumber', None)
        if sn and sn != stored('id.serial'):
            attribs['id.serial'] = sn
        if mn and mn != stored('id.model'):
            attribs['id.model'] = mn
        for attrname in current:
            if (attrname.endswith('.bootable') and
                    current[attrname].get('value', None)):
                # net.bootable -> net.hwaddr (strip the 8 chars of 'bootable')
                newattrname = attrname[:-8] + 'hwaddr'
                attribs[newattrname] = info['hwaddr']
        if attribs:
            cfg.set_node_attributes({nodename: attribs})
    if info['uuid'] in known_pxe_uuids:
        return True
    if uuid_is_valid(info['uuid']):
        known_pxe_uuids[info['uuid']] = nodename
        log.log({'info': 'Detected {0} ({1} with mac {2})'.format(
            nodename, handler.devname, info['hwaddr'])})
        return True
def become_leader(connection):
    """Assume leadership of the collective and start recruiting members.

    Any follower activity and pending retry are stopped, the local address
    of ``connection`` is recorded as the current leader, and an assimilation
    attempt is spawned toward every configured member that is not ourselves,
    not the connecting peer and not already in ``cfm.cfgstreams``.

    :param connection: socket of the peer whose request made us leader
    """
    global currentleader
    global follower
    global retrythread
    log.log({'info': 'Becoming leader of collective',
             'subsystem': 'collective'})
    # Test against None instead of truthiness: a greenthread that has been
    # scheduled but has not started yet evaluates as False, so a truthiness
    # check would leave a pending retry timer armed.
    if follower is not None:
        follower.kill()
        cfm.stop_following()
        follower = None
    if retrythread is not None:
        retrythread.cancel()
        retrythread = None
    currentleader = connection.getsockname()[0]
    skipaddr = connection.getpeername()[0]
    myname = get_myname()
    skipem = set(cfm.cfgstreams)
    skipem.add(currentleader)
    skipem.add(skipaddr)
    for member in cfm.list_collective():
        dronecandidate = cfm.get_collective_member(member)['address']
        if dronecandidate in skipem or member == myname:
            continue
        eventlet.spawn_n(try_assimilate, dronecandidate)
    schedule_rebalance()
def start_collective():
    """Attempt to join the collective by contacting the other members.

    Nothing is attempted when we already hold quorum as leader or while
    ``leader_init`` is active.  Otherwise every other member is tried in
    sorted order until ``connect_to_leader`` succeeds; if the loop finishes
    without a success, another attempt is scheduled about 30 seconds later.
    """
    global follower
    global retrythread
    # Explicit None test: greenthread truthiness reflects whether it is
    # currently running, not whether one is being tracked.
    if follower is not None:
        follower.kill()
        cfm.stop_following()
        follower = None
    try:
        if cfm.cfgstreams:
            cfm.check_quorum()
            # Do not start if we have quorum and are leader
            return
    except exc.DegradedCollective:
        pass
    if leader_init.active:  # do not start trying to connect if we are
        # xmitting data to a follower
        return
    myname = get_myname()
    for member in sorted(cfm.list_collective()):
        if member == myname:
            continue
        if cfm.cfgleader is None:
            cfm.stop_following(True)
        ldrcandidate = cfm.get_collective_member(member)['address']
        log.log({'info': 'Performing startup attempt to {0}'.format(
            ldrcandidate), 'subsystem': 'collective'})
        if connect_to_leader(name=myname, leader=ldrcandidate):
            break
    else:
        # Reached only when no candidate accepted us (loop did not break)
        retrythread = eventlet.spawn_after(30 + random.random(),
                                           start_collective)
def _start_debug_backdoor():
    """Serve an eventlet backdoor on the confluent debug unix socket.

    The umask is tightened to 0o077 while the socket is created and put back
    afterwards.
    """
    oumask = os.umask(0o077)
    try:
        os.remove('/var/run/confluent/dbg.sock')
    except OSError:
        pass  # We are not expecting the file to exist
    try:
        dbgsock = eventlet.listen("/var/run/confluent/dbg.sock",
                                   family=socket.AF_UNIX)
        eventlet.spawn_n(backdoor.backdoor_server, dbgsock)
    except AttributeError:
        pass  # Windows...
    os.umask(oumask)


def run():
    """Start the confluent service and then idle forever.

    Startup order: resource limits, trace signal handler, pid file check,
    configuration and credential store, plugins, daemonization, termination
    signal handlers, collective, optional debug socket, HTTP API, discovery,
    socket API, and finally the console sessions.
    """
    setlimits()
    try:
        signal.signal(signal.SIGUSR1, dumptrace)
    except AttributeError:
        pass   # silly windows
    if havefcntl:
        _checkpidfile()
    conf.init_config()
    try:
        _initsecurity(conf.get_config())
    except:
        sys.stderr.write("Error unlocking credential store\n")
        doexit()
        sys.exit(1)
    try:
        confluentcore.load_plugins()
    except:
        doexit()
        raise
    try:
        log.log({'info': 'Confluent management service starting'},
                flush=True)
    except (OSError, IOError) as e:
        print(repr(e))
        sys.exit(1)
    _daemonize()
    if havefcntl:
        _updatepidfile()
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, terminate)
    collective.startup()
    if dbgif:
        _start_debug_backdoor()
    http_bind_host, http_bind_port = _get_connector_config('http')
    sock_bind_host, sock_bind_port = _get_connector_config('socket')
    httpapi.HttpApi(http_bind_host, http_bind_port).start()
    disco.start_detection()
    try:
        sockapi.SockApi(sock_bind_host, sock_bind_port).start()
    except NameError:
        pass
    atexit.register(doexit)
    eventlet.sleep(1)
    consoleserver.start_console_sessions()
    # Everything else runs in greenthreads; keep the main one parked
    while True:
        eventlet.sleep(100)
def _recheck_nodes_backend(nodeattribs, configmanager):
    """Re-evaluate discovery state after node attributes have changed.

    Known systems belonging to the changed nodes are moved back to the
    unidentified pool, every unidentified system is rechecked, and every
    pending node is handed to ``eval_node`` through the discovery pool.

    :param nodeattribs: iterable of node names whose attributes changed
    :param configmanager: configuration manager passed on to the handlers
    """
    _map_unique_ids(nodeattribs)
    # for the nodes whose attributes have changed, consider them as potential
    # strangers
    for node in nodeattribs:
        if node in known_nodes:
            for somemac in known_nodes[node]:
                unknown_info[somemac] = known_nodes[node][somemac]
                unknown_info[somemac]['discostatus'] = 'unidentified'
    # Now we go through ones we did not find earlier
    for mac in list(unknown_info):
        try:
            _recheck_single_unknown(configmanager, mac)
        except Exception:
            traceback.print_exc()
            continue
    # now we go through ones that were identified, but could not pass
    # policy or hadn't been able to verify key
    # Iterate a snapshot, spawned work may alter pending_nodes meanwhile
    for nodename, info in list(pending_nodes.items()):
        try:
            if info['handler'] is None:
                # Nothing can evaluate this entry; skip it rather than
                # failing on None.NodeHandler and logging a bogus error
                continue
            handler = info['handler'].NodeHandler(info, configmanager)
            discopool.spawn_n(eval_node, configmanager, handler, info,
                              nodename)
        except Exception:
            traceback.print_exc()
            log.log({'error': 'Unexpected error during discovery of {0}, '
                              'check debug logs'.format(nodename)})
def _affluent_map_switch(args):
    """Fill the mac tables for one switch from its affluent web service.

    The per-port mac listing is fetched over HTTPS and merged into
    ``_macsbyswitch``, ``_macmap`` and ``_nodesbymac``.

    :param args: tuple of (switch, password, user, configmanager)
    """
    switch, password, user, cfm = args
    verifier = util.TLSCertVerifier(cfm, switch,
                                    'pubkeys.tls_hardwaremanager')
    client = webclient.SecureHTTPConnection(
        switch, 443, verifycallback=verifier.verify_cert, timeout=5)
    client.set_basic_credentials(user, password)
    macsbyport = client.grab_json_response('/affluent/macs/by-port')
    _macsbyswitch[switch] = macsbyport
    for iface in macsbyport:
        portmacs = macsbyport[iface]
        nummacs = len(portmacs)
        for mac in portmacs:
            _macmap.setdefault(mac, []).append((switch, iface, nummacs))
            nodename = _nodelookup(switch, iface)
            if nodename is None:
                continue
            if mac not in _nodesbymac or _nodesbymac[mac][0] == nodename:
                _nodesbymac[mac] = (nodename, nummacs)
                continue
            # For example, listed on both a real edge port
            # and by accident a trunk port
            log.log({'error': '{0} and {1} described by ambiguous'
                              ' switch topology values'.format(
                                  nodename, _nodesbymac[mac][0])})
            _nodesbymac[mac] = (None, None)
def follow_leader(remote, leader):
    """Follow the leader's channel until it ends, then react.

    If the greenthread was killed the exit is merely logged.  Any other
    exit is treated as loss of the leader: following is stopped, the known
    leader is cleared and collective startup is spawned again.

    :param remote: connection to the leader, handed to cfm.follow_channel
    :param leader: leader identification, used for the log message only
    """
    global currentleader
    killed = False
    try:
        cfm.follow_channel(remote)
    except greenlet.GreenletExit:
        killed = True
    finally:
        if killed:
            log.log({'info': 'Previous following cleanly closed',
                     'subsystem': 'collective'})
        else:
            log.log({'info': 'Current leader ({0}) has disappeared, '
                             'restarting collective membership'.format(
                                 leader),
                     'subsystem': 'collective'})
            # The leader has folded, time to startup again...
            cfm.stop_following()
            currentleader = None
            eventlet.spawn_n(start_collective)
def _full_updatemacmap(configmanager):
    """Rebuild the mac tables from scratch, yielding after each switch.

    While holding ``mapupdating`` the module level tables are reset, the
    switch/port to node map is rebuilt from the net*.switch and
    net*.switchport attributes, and every referenced switch is walked
    through a green pool.  Afterwards ``switchbackoff`` is raised to 15
    times the duration of the walk if that exceeds its current value.

    :param configmanager: configuration manager, must not be tenant scoped
    :raises exc.ForbiddenRequest: when called for a tenant
    """
    global vintage, _macmap, _nodesbymac, _switchportmap, _macsbyswitch
    global switchbackoff
    walkstart = util.monotonic_time()
    with mapupdating:
        vintage = util.monotonic_time()
        # Clear all existing entries
        _macmap = {}
        _nodesbymac = {}
        _switchportmap = {}
        _macsbyswitch = {}
        if configmanager.tenant is not None:
            raise exc.ForbiddenRequest(
                'Network topology not available to tenants')
        # here's a list of switches... need to add nodes that are switches
        nodelocations = configmanager.get_node_attributes(
            configmanager.list_nodes(), ('net*.switch', 'net*.switchport'))
        switches = set()
        for node in nodelocations:
            nodecfg = nodelocations[node]
            for attr in nodecfg:
                if not (attr.endswith('.switch') and
                        'value' in nodecfg[attr]):
                    continue
                curswitch = nodecfg[attr].get('value', None)
                if not curswitch:
                    continue
                switches.add(curswitch)
                portattr = attr + 'port'
                if portattr not in nodecfg:
                    continue
                portname = nodecfg[portattr].get('value', '')
                if not portname:
                    continue
                switchports = _switchportmap.setdefault(curswitch, {})
                if portname not in switchports:
                    switchports[portname] = node
                    continue
                # Two claims on one port: report it and poison the entry
                log.log({'error': 'Duplicate switch topology config '
                                  'for {0} and {1}'.format(
                                      node, switchports[portname])})
                switchports[portname] = None
        switchauth = get_switchcreds(configmanager, switches)
        pool = GreenPool(64)
        for ans in pool.imap(_map_switch, switchauth):
            vintage = util.monotonic_time()
            yield ans
    # wait 15 times as long as it takes to walk
    # avoid spending a large portion of the time hitting switches with snmp
    # requests
    backoff = (util.monotonic_time() - walkstart) * 15
    if backoff > switchbackoff:
        switchbackoff = backoff
def get_nodename(cfg, handler, info):
    """Work out which node a detected system corresponds to.

    Lookups are tried in order: TLS certificate fingerprint, uuid,
    enclosure data, chained SMM data, and finally the switch mac tables.
    ``info['verified']`` is reset to False and only set again when a chained
    SMM is disambiguated.

    :param cfg: configuration manager
    :param handler: discovery handler for the detected system
    :param info: discovery data for the detected system
    :returns: tuple of (nodename, maccount); maccount is only populated when
              the switch lookup was consulted
    """
    info['verified'] = False
    if not handler:
        return None, None
    nodename = None
    if handler.https_supported:
        currcert = handler.https_cert
        if not currcert:
            info['discofailure'] = 'nohttps'
            return None, None
        # sha256 first, then (per original note) try SHA512 as well
        for digestargs in (('sha256',), ()):
            currprint = util.get_fingerprint(currcert, *digestargs)
            nodename = nodes_by_fprint.get(currprint, None)
            if nodename:
                break
    if not nodename:
        curruuid = info.get('uuid', None)
        if uuid_is_valid(curruuid):
            nodename = nodes_by_uuid.get(curruuid, None)
            if nodename is None:
                # refresh the id maps once and look again
                _map_unique_ids()
                nodename = nodes_by_uuid.get(curruuid, None)
    if not nodename:
        # Ok, see if it is something with a chassis-uuid and discover by
        # chassis
        nodename = get_nodename_from_enclosures(cfg, info)
    if not nodename and handler.devname == 'SMM':
        nodename = get_nodename_from_chained_smms(cfg, handler, info)
    if nodename:
        return nodename, None
    # as a last resort, search switches for info
    # This is the slowest potential operation, so we hope for the
    # best to occur prior to this
    nodename, macinfo = macmap.find_nodeinfo_by_mac(info['hwaddr'], cfg)
    maccount = macinfo['maccount']
    if not nodename:
        return nodename, maccount
    if handler.devname == 'SMM':
        nl = list(cfg.filter_node_attributes(
            'enclosure.extends=' + nodename))
        if nl:
            # We found an SMM, and it's in a chain per configuration
            # we need to ask the switch for the fingerprint to see
            # if we have a match or not
            newnodename, v = get_chained_smm_name(nodename, cfg,
                                                  handler, nl)
            if newnodename:
                # while this started by switch, it was disambiguated
                info['verified'] = v
                return newnodename, None
    if not handler.discoverable_by_switch(maccount):
        if handler.devname == 'SMM':
            errorstr = 'Attempt to discover SMM by switch, but chained ' \
                       'topology or incorrect net attributes detected, ' \
                       'which is not compatible with switch discovery ' \
                       'of SMM, nodename would have been ' \
                       '{0}'.format(nodename)
            log.log({'error': errorstr})
        return None, None
    return nodename, maccount
def update_macmap(configmanager):
    """Interrogate switches to build/update mac table

    Begin a rebuild process.  This process is a generator that will yield
    as each switch interrogation completes, allowing a caller to
    recheck the cache as results become possible, rather
    than having to wait for the process to complete to interrogate.

    :param configmanager: configuration manager, must not be tenant scoped
    :raises exc.ForbiddenRequest: when called for a tenant
    """
    global _macmap
    global _nodesbymac
    global _switchportmap
    # Clear all existing entries
    _macmap = {}
    _nodesbymac = {}
    _switchportmap = {}
    if configmanager.tenant is not None:
        raise exc.ForbiddenRequest('Network topology not available to tenants')
    nodelocations = configmanager.get_node_attributes(
        configmanager.list_nodes(), ('hardwaremanagement.switch',
                                     'hardwaremanagement.switchport'))
    switches = set()
    for node in nodelocations:
        cfg = nodelocations[node]
        # An attribute can be present without a usable value; skip those
        # instead of raising KeyError or queueing an empty switch name
        curswitch = cfg.get('hardwaremanagement.switch', {}).get(
            'value', None)
        if not curswitch:
            continue
        switches.add(curswitch)
        portname = cfg.get('hardwaremanagement.switchport', {}).get(
            'value', None)
        if not portname:
            continue
        if curswitch not in _switchportmap:
            _switchportmap[curswitch] = {}
        if portname in _switchportmap[curswitch]:
            log.log({'warning': 'Duplicate switch topology config for '
                                '{0} and {1}'.format(
                                    node,
                                    _switchportmap[curswitch][portname])})
        _switchportmap[curswitch][portname] = node
    switchcfg = configmanager.get_node_attributes(
        switches, ('secret.hardwaremanagementuser',
                   'secret.hardwaremanagementpassword'), decrypt=True)
    switchauth = []
    for switch in switches:
        password = '******'
        user = None
        if (switch in switchcfg and
                'secret.hardwaremanagementpassword' in switchcfg[switch]):
            password = switchcfg[switch]['secret.hardwaremanagementpassword'][
                'value']
            # a user is only considered alongside an explicit password
            if 'secret.hardwaremanagementuser' in switchcfg[switch]:
                user = switchcfg[switch]['secret.hardwaremanagementuser'][
                    'value']
        switchauth.append((switch, password, user))
    pool = GreenPool()
    # The former trailing print(repr(_macmap)) debug dump is dropped; a
    # library generator should not write the whole table to stdout
    for res in pool.imap(_map_switch, switchauth):
        yield res
def update_macmap(configmanager):
    """Interrogate switches to build/update mac table

    Begin a rebuild process.  This process is a generator that will yield
    as each switch interrogation completes, allowing a caller to
    recheck the cache as results become possible, rather
    than having to wait for the process to complete to interrogate.

    :param configmanager: configuration manager, must not be tenant scoped
    :raises exc.ForbiddenRequest: when called for a tenant
    """
    global _macmap
    global _nodesbymac
    global _switchportmap
    # Clear all existing entries
    _macmap = {}
    _nodesbymac = {}
    _switchportmap = {}
    if configmanager.tenant is not None:
        raise exc.ForbiddenRequest('Network topology not available to tenants')
    nodelocations = configmanager.get_node_attributes(
        configmanager.list_nodes(), ('hardwaremanagement.switch',
                                     'hardwaremanagement.switchport'))
    switches = set()
    for node in nodelocations:
        cfg = nodelocations[node]
        # Tolerate attributes that exist but carry no value rather than
        # failing the whole rebuild with a KeyError
        curswitch = cfg.get('hardwaremanagement.switch', {}).get(
            'value', None)
        if not curswitch:
            continue
        switches.add(curswitch)
        portname = cfg.get('hardwaremanagement.switchport', {}).get(
            'value', None)
        if not portname:
            continue
        if curswitch not in _switchportmap:
            _switchportmap[curswitch] = {}
        if portname in _switchportmap[curswitch]:
            log.log({'warning': 'Duplicate switch topology config for '
                                '{0} and {1}'.format(
                                    node,
                                    _switchportmap[curswitch][portname])})
        _switchportmap[curswitch][portname] = node
    switchcfg = configmanager.get_node_attributes(
        switches, ('secret.hardwaremanagementuser',
                   'secret.hardwaremanagementpassword'), decrypt=True)
    switchauth = []
    for switch in switches:
        password = '******'
        user = None
        if (switch in switchcfg and
                'secret.hardwaremanagementpassword' in switchcfg[switch]):
            password = switchcfg[switch]['secret.hardwaremanagementpassword'][
                'value']
            # the user attribute is only honored together with a password
            if 'secret.hardwaremanagementuser' in switchcfg[switch]:
                user = switchcfg[switch]['secret.hardwaremanagementuser'][
                    'value']
        switchauth.append((switch, password, user))
    pool = GreenPool()
    # No debug dump of _macmap once the walk ends; callers consume the
    # yielded results and the module level tables instead
    for res in pool.imap(_map_switch, switchauth):
        yield res
def get_leader(connection):
    """Return the current leader, taking leadership first if warranted.

    Leadership is assumed when no leader is known, or when the peer on
    ``connection`` is itself the node we believed to be leader.

    :param connection: socket of the peer asking for the leader
    :returns: the value of ``currentleader`` after any leadership change
    """
    if currentleader is None:
        msg = 'Becoming leader as no leader known'
    elif connection.getpeername()[0] == currentleader:
        msg = 'Becoming leader because {0} attempted to connect and it ' \
              'is current leader'.format(currentleader)
    else:
        return currentleader
    log.log({'info': msg, 'subsystem': 'collective'})
    become_leader(connection)
    return currentleader
def start_collective():
    """Attempt to join the collective, retrying later when unsuccessful.

    Connections to all other members are opened in parallel; the first
    usable one is offered to ``connect_to_leader`` and the remaining ones
    are closed.  Whatever happens, if there is still no follower and no
    retry pending at the end, another attempt is scheduled in about five
    seconds.  ``initting`` is True for the duration of the attempt.
    """
    global follower
    global retrythread
    global initting
    initting = True
    retrythread = None
    try:
        cfm.membership_callback = schedule_rebalance
        if follower is not None:
            initting = False
            return
        try:
            if cfm.cfgstreams:
                cfm.check_quorum()
                # Do not start if we have quorum and are leader
                return
        except exc.DegradedCollective:
            pass
        if leader_init.active:  # do not start trying to connect if we are
            # xmitting data to a follower
            return
        myname = get_myname()
        connecto = []
        for member in sorted(cfm.list_collective()):
            if member == myname:
                continue
            if cfm.cfgleader is None:
                cfm.stop_following(True)
            ldrcandidate = cfm.get_collective_member(member)['address']
            connecto.append(ldrcandidate)
        conpool = greenpool.GreenPool(64)
        for member, remote in conpool.imap(create_connection, connecto):
            if isinstance(remote, Exception):
                continue
            if follower is None:
                log.log({'info': 'Performing startup attempt to {0}'.format(
                    member), 'subsystem': 'collective'})
                if not connect_to_leader(
                        name=myname, leader=member, remote=remote):
                    remote.close()
            else:
                # already following someone, this connection is surplus
                remote.close()
    except Exception:
        # Startup stays best effort (the finally clause arranges a retry),
        # but record what went wrong instead of discarding it silently
        log.logtrace()
    finally:
        if retrythread is None and follower is None:
            retrythread = eventlet.spawn_after(5 + random.random(),
                                               start_collective)
        initting = False
def _recheck_single_unknown(configmanager, mac):
    """Try again to identify one previously unidentified system.

    Systems lacking address data are skipped (unless handled by ``pxeh``).
    Systems expected to offer https but without a certificate are either
    stripped of the unreachable address or scheduled for an accelerated
    recheck.  Otherwise the node name is resolved; a system whose
    certificate matches the stored one is marked discovered, anything else
    that resolved to a node is queued for ``eval_node``.

    :param configmanager: configuration manager
    :param mac: hardware address key into ``unknown_info``
    """
    global rechecker
    global rechecktime
    info = unknown_info.get(mac, None)
    if not info:
        return
    if info['handler'] != pxeh and not info.get('addresses', None):
        # log.log({'info': 'Missing address information in ' + repr(info)})
        return
    handler = info['handler'].NodeHandler(info, configmanager)
    if handler.https_supported and not handler.https_cert:
        if handler.cert_fail_reason == 'unreachable':
            log.log({'info': '{0} with hwaddr {1} is not reachable at {2}'
                             ''.format(handler.devname, info['hwaddr'],
                                       handler.ipaddr)})
            # addresses data is bad, delete the offending ip
            info['addresses'] = [x for x in info.get('addresses', [])
                                 if x != handler.ipaddr]
            # TODO(jjohnson2):  rescan due to bad peer addr data?
            # not just wait around for the next announce
            return
        log.log({'info': '{0} with hwaddr {1} at address {2} is not yet '
                         'running https, will examine later'.format(
                             handler.devname, info['hwaddr'],
                             handler.ipaddr)})
        if rechecker is not None and (
                rechecktime > util.monotonic_time() + 300):
            rechecker.cancel()
        # if cancel did not result in dead, then we are in progress
        if rechecker is None or rechecker.dead:
            rechecktime = util.monotonic_time() + 300
            rechecker = eventlet.spawn_after(300, _periodic_recheck,
                                             configmanager)
        return
    # get_nodename returns (nodename, maccount); without unpacking, the
    # tuple is always truthy and would be used as if it were a node name
    nodename, maccount = get_nodename(configmanager, handler, info)
    # NOTE(review): stored with the discovery data on the assumption that
    # later policy evaluation wants it - confirm against eval_node
    info['maccount'] = maccount
    if nodename:
        if handler.https_supported:
            dp = configmanager.get_node_attributes(
                [nodename], ('pubkeys.tls_hardwaremanager',))
            lastfp = dp.get(nodename, {}).get(
                'pubkeys.tls_hardwaremanager', {}).get('value', None)
            if util.cert_matches(lastfp, handler.https_cert):
                info['nodename'] = nodename
                known_nodes[nodename][info['hwaddr']] = info
                info['discostatus'] = 'discovered'
                return  # already known, no need for more
        discopool.spawn_n(eval_node, configmanager, handler, info, nodename)
def try_assimilate(drone):
    """Ask ``drone`` to join the collective with us as its leader.

    A connection failure is silently ignored.  If the reply carries a
    'txcount' we defer and connect to that peer as our leader instead.

    :param drone: address of the member to assimilate
    """
    try:
        remote = connect_to_collective(None, drone)
    except socket.error:
        # Oh well, unable to connect, hopefully the rest will be
        # in order
        return
    request = {'collective': {'operation': 'assimilate',
                              'name': get_myname(),
                              'txcount': cfm._txcount}}
    tlvdata.send(remote, request)
    tlvdata.recv(remote)  # the banner
    tlvdata.recv(remote)  # authpassed... 0..
    answer = tlvdata.recv(remote)
    if not answer:
        log.log({'error': 'No answer from {0} while trying to '
                          'assimilate'.format(drone),
                 'subsystem': 'collective'})
    elif 'txcount' in answer:
        log.log({'info': 'Deferring to {0} due to transaction '
                         'count'.format(drone),
                 'subsystem': 'collective'})
        connect_to_leader(None, None, leader=remote.getpeername()[0])
    elif 'error' in answer:
        log.log({'error': 'Error encountered while attempting to '
                          'assimilate {0}: {1}'.format(drone,
                                                       answer['error']),
                 'subsystem': 'collective'})
    else:
        log.log({'info': 'Assimilated {0} into collective'.format(drone),
                 'subsystem': 'collective'})
def try_assimilate(drone, followcount, remote):
    """Ask ``drone`` to join the collective with us as its leader.

    :param drone: address of the member to assimilate
    :param followcount: follower count reported in the request
    :param remote: connection handed on to connect_to_collective
    :returns: None if the connection failed, False if we deferred to the
              drone as the better leader, True in every other case
    """
    global retrythread
    try:
        remote = connect_to_collective(None, drone, remote)
    except socket.error:
        # Oh well, unable to connect, hopefully the rest will be
        # in order
        return
    request = {'collective': {'operation': 'assimilate',
                              'name': get_myname(),
                              'followcount': followcount,
                              'txcount': cfm._txcount}}
    tlvdata.send(remote, request)
    tlvdata.recv(remote)  # the banner
    tlvdata.recv(remote)  # authpassed... 0..
    answer = tlvdata.recv(remote)
    if not answer:
        log.log({'error': 'No answer from {0} while trying to '
                          'assimilate'.format(drone),
                 'subsystem': 'collective'})
        return True
    if 'txcount' in answer:
        log.log({'info': 'Deferring to {0} due to target being a better '
                         'leader'.format(drone),
                 'subsystem': 'collective'})
        retire_as_leader(drone)
        joined = connect_to_leader(None, None,
                                   leader=remote.getpeername()[0])
        if not joined and retrythread is None:
            retrythread = eventlet.spawn_after(random.random(),
                                               start_collective)
        return False
    if 'leader' in answer:
        # Will wait for leader to see about assimilation
        return True
    if 'error' in answer:
        log.log({'error': 'Error encountered while attempting to '
                          'assimilate {0}: {1}'.format(drone,
                                                       answer['error']),
                 'subsystem': 'collective'})
    else:
        log.log({'info': 'Assimilated {0} into collective'.format(drone),
                 'subsystem': 'collective'})
    return True
def _periodic_recheck(configmanager):
    """Run a full discovery recheck and schedule the next one.

    The next run is scheduled 900 seconds out, unless the recheck itself
    already arranged a ``rechecker``.

    :param configmanager: configuration manager handed to the recheck
    """
    global rechecker, rechecktime
    rechecker = None
    try:
        _recheck_nodes((), configmanager)
    except Exception:
        traceback.print_exc()
        log.log({'error':
                 'Unexpected error during discovery, check debug logs'})
    # if rechecker is set, it means that an accelerated schedule
    # for rechecker was requested in the course of recheck_nodes
    if rechecker is not None:
        return
    interval = 900
    rechecktime = util.monotonic_time() + interval
    rechecker = eventlet.spawn_after(interval, _periodic_recheck,
                                     configmanager)
def become_leader(connection):
    """Assume leadership of the collective.

    Follower activity and any pending retry are stopped and the local
    address of ``connection`` becomes the current leader.  When
    ``_assimilate_missing`` reports success, a rebalance is scheduled and
    the background reassimilation greenthread is (re)started.

    :param connection: socket of the peer whose request made us leader
    """
    global currentleader, follower, retrythread, reassimilate
    log.log({'info': 'Becoming leader of collective',
             'subsystem': 'collective'})
    if follower is not None:
        follower.kill()
        cfm.stop_following()
        follower = None
    if retrythread is not None:
        retrythread.cancel()
        retrythread = None
    currentleader = connection.getsockname()[0]
    peeraddr = connection.getpeername()[0]
    if _assimilate_missing(peeraddr):
        schedule_rebalance()
        if reassimilate is not None:
            # only one reassimilation loop at a time
            reassimilate.kill()
        reassimilate = eventlet.spawn(reassimilate_missing)
def _map_switch(args):
    """Walk one switch, turning any failure into a log entry.

    :param args: argument tuple for _map_switch_backend, args[0] being the
                 switch name
    :returns: whatever _map_switch_backend returns, or None on failure
    """
    try:
        return _map_switch_backend(args)
    except (UnicodeError, socket.gaierror):
        complaint = "Cannot resolve switch '{0}' to an address"
    except exc.TargetEndpointUnreachable:
        complaint = ("Timeout or bad SNMPv1 community string trying to "
                     "reach switch '{0}'")
    except exc.TargetEndpointBadCredentials:
        complaint = "Bad SNMPv3 credentials for '{0}'"
    except Exception:
        log.log({'error': 'Unexpected condition trying to reach switch '
                          '"{0}" check trace log for more'.format(args[0])})
        # must run inside the handler so the active exception is recorded
        log.logtrace()
        return None
    log.log({'error': complaint.format(args[0])})
    return None
def follow_leader(remote, leader):
    """Follow the leader's channel until it ends, then react.

    A killed greenthread is just logged.  If the channel ended with a
    'newleader' hint we try to join that leader.  In every other case (or
    if joining the new leader fails) the leader is considered lost: the
    follower state is cleared and, unless a retry is already pending,
    collective startup is scheduled again.

    :param remote: connection to the leader, handed to cfm.follow_channel
    :param leader: leader identification, used for the log message only
    """
    global currentleader
    global retrythread
    global follower
    cleanexit = False
    newleader = None
    try:
        exitcause = cfm.follow_channel(remote)
        # Tolerate a channel that ends without giving any cause
        newleader = (exitcause or {}).get('newleader', None)
    except greenlet.GreenletExit:
        cleanexit = True
    finally:
        # No return statements in here: a return inside finally would also
        # discard any exception still propagating out of the try block
        if cleanexit:
            log.log({'info': 'Previous following cleanly closed',
                     'subsystem': 'collective'})
        else:
            rejoined = False
            if newleader:
                log.log({'info': 'Previous leader directed us to join new '
                                 'leader {}'.format(newleader),
                         'subsystem': 'collective'})
                rejoined = bool(
                    connect_to_leader(None, get_myname(), newleader))
            if not rejoined:
                log.log({'info': 'Current leader ({0}) has disappeared, '
                                 'restarting collective membership'.format(
                                     leader),
                         'subsystem': 'collective'})
                # The leader has folded, time to startup again...
                follower = None
                cfm.stop_following()
                currentleader = None
                if retrythread is None:  # start a recovery
                    retrythread = eventlet.spawn_after(random.random(),
                                                       start_collective)
def _full_updatemacmap(configmanager):
    """Rebuild the mac tables from scratch, yielding after each switch.

    While holding ``mapupdating`` the module level tables are reset, the
    switch/port to node map is rebuilt from the net*.switch and
    net*.switchport attributes, credentials are collected for every
    referenced switch and each switch is walked through a green pool.

    :param configmanager: configuration manager, must not be tenant scoped
    :raises exc.ForbiddenRequest: when called for a tenant
    """
    global vintage, _macmap, _nodesbymac, _switchportmap, _macsbyswitch
    with mapupdating:
        vintage = util.monotonic_time()
        # Clear all existing entries
        _macmap = {}
        _nodesbymac = {}
        _switchportmap = {}
        _macsbyswitch = {}
        if configmanager.tenant is not None:
            raise exc.ForbiddenRequest(
                'Network topology not available to tenants')
        nodelocations = configmanager.get_node_attributes(
            configmanager.list_nodes(), ('net*.switch', 'net*.switchport'))
        switches = set()
        for node in nodelocations:
            nodecfg = nodelocations[node]
            for attr in nodecfg:
                if not (attr.endswith('.switch') and
                        'value' in nodecfg[attr]):
                    continue
                curswitch = nodecfg[attr].get('value', None)
                if not curswitch:
                    continue
                switches.add(curswitch)
                portattr = attr + 'port'
                if portattr not in nodecfg:
                    continue
                portname = nodecfg[portattr].get('value', '')
                if not portname:
                    continue
                switchports = _switchportmap.setdefault(curswitch, {})
                if portname not in switchports:
                    switchports[portname] = node
                    continue
                # Two claims on one port: report it and poison the entry
                log.log({'error': 'Duplicate switch topology config '
                                  'for {0} and {1}'.format(
                                      node, switchports[portname])})
                switchports[portname] = None
        switchcfg = configmanager.get_node_attributes(
            switches, ('secret.hardwaremanagementuser',
                       'secret.snmpcommunity',
                       'secret.hardwaremanagementpassword'), decrypt=True)
        switchauth = []
        for switch in switches:
            if not switch:
                continue
            switchparms = switchcfg.get(switch, {})
            user = None
            password = switchparms.get(
                'secret.snmpcommunity', {}).get('value', None)
            if not password:
                # no community string, fall back to user/password
                password = switchparms.get(
                    'secret.hardwaremanagementpassword', {}).get(
                        'value', 'public')
                user = switchparms.get(
                    'secret.hardwaremanagementuser', {}).get('value', None)
            switchauth.append((switch, password, user))
        pool = GreenPool()
        for ans in pool.imap(_map_switch, switchauth):
            vintage = util.monotonic_time()
            yield ans
def _map_switch_backend(args):
    """Manipulate portions of mac address map relevant to a given switch

    :param args: (switch, password, user) or (switch, password)
    """
    # 1.3.6.1.2.1.17.7.1.2.2.1.2 - mactoindex (qbridge - preferred)
    #  if not, check for cisco and if cisco, build list of all relevant vlans:
    #  .1.3.6.1.4.1.9.9.46.1.6.1.1.5 - trunk port vlan map (cisco only)
    #  .1.3.6.1.4.1.9.9.68.1.2.2.1.2 - access port vlan map (cisco only)
    # if cisco, vlan community string indexed or snmpv3 contest for:
    # 1.3.6.1.2.1.17.4.3.1.2 - mactoindx (bridge - low-end switches and cisco)
    #     .1.3.6.1.2.1.17.1.4.1.2 - bridge index to if index map
    # no vlan index or context for:
    #  .1.3.6.1.2.1.31.1.1.1.1 - ifName... but some switches don't do it
    #  .1.3.6.1.2.1.2.2.1.2 - ifDescr, usually useless, but a
    #   fallback if ifName is empty
    #
    global _macmap
    if len(args) == 3:
        switch, password, user = args
        user = user or None
    else:
        switch, password = args
        user = None
    macfmt = '{0:02x}:{1:02x}:{2:02x}:{3:02x}:{4:02x}:{5:02x}'

    def mac_from_oid(oid):
        # the last six oid components are the mac octets
        # if 7, then oid[1] would be vlan id
        parts = str(oid).rsplit('.', 6)
        return macfmt.format(*[int(x) for x in parts[-6:]])

    def trailing_index(oid):
        return int(str(oid).rsplit('.', 1)[1])

    haveqbridge = False
    mactobridge = {}
    conn = snmp.Session(switch, password, user)
    for oid, bridgeport in conn.walk('1.3.6.1.2.1.17.7.1.2.2.1.2'):
        haveqbridge = True
        if not bridgeport:
            continue
        macaddr = mac_from_oid(oid)
        mactobridge[macaddr] = int(bridgeport)
    if not haveqbridge:
        for oid, bridgeport in conn.walk('1.3.6.1.2.1.17.4.3.1.2'):
            if not bridgeport:
                continue
            macaddr = mac_from_oid(oid)
            mactobridge[macaddr] = int(bridgeport)
    bridgetoifmap = {}
    for bridgeport, ifidx in conn.walk('1.3.6.1.2.1.17.1.4.1.2'):
        bridgeport = trailing_index(bridgeport)
        try:
            bridgetoifmap[bridgeport] = int(ifidx)
        except ValueError:
            # ifidx might be '', skip in such a case
            continue
    ifnamemap = {}
    havenames = False
    for ifidx, ifname in conn.walk('1.3.6.1.2.1.31.1.1.1.1'):
        if not ifname:
            continue
        havenames = True
        ifnamemap[trailing_index(ifidx)] = str(ifname)
    if not havenames:
        for ifidx, ifname in conn.walk('1.3.6.1.2.1.2.2.1.2'):
            ifnamemap[trailing_index(ifidx)] = str(ifname)
    maccounts = {}
    bridgetoifvalid = False
    for mac in mactobridge:
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            continue
        bridgetoifvalid = True
        maccounts[ifname] = maccounts.get(ifname, 0) + 1
    if not bridgetoifvalid:
        bridgetoifmap = {}
        # Not a single mac address resolved to an interface index, chances are
        # that the switch is broken, and the mactobridge is reporting ifidx
        # instead of bridge port index
        # try again, skipping the bridgetoifmap lookup
        for mac in mactobridge:
            ifidx = mactobridge[mac]
            try:
                ifname = ifnamemap[ifidx]
            except KeyError:
                continue
            bridgetoifmap[ifidx] = ifidx
            maccounts[ifname] = maccounts.get(ifname, 0) + 1
    _macsbyswitch[switch] = {}
    for mac in mactobridge:
        # We want to merge it so that when a mac appears in multiple
        # places, it is captured.
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            continue
        _macmap.setdefault(mac, []).append(
            (switch, ifname, maccounts[ifname]))
        _macsbyswitch[switch].setdefault(ifname, []).append(mac)
        nodename = _nodelookup(switch, ifname)
        if nodename is None:
            continue
        if mac in _nodesbymac and _nodesbymac[mac] != nodename:
            # For example, listed on both a real edge port
            # and by accident a trunk port
            log.log({'error': '{0} and {1} described by ambiguous'
                              ' switch topology values'.format(
                                  nodename, _nodesbymac[mac])})
            _nodesbymac[mac] = None
        else:
            _nodesbymac[mac] = nodename
def _map_switch_backend(args):
    """Manipulate portions of mac address map relevant to a given switch

    The affluent interface is tried first unless the switch is already in
    ``noaffluent``; on any failure the offloaded walk is used and the
    switch is added to ``noaffluent``.

    :param args: (switch, password, user, extra) or (switch, password)
    """
    # 1.3.6.1.2.1.17.7.1.2.2.1.2 - mactoindex (qbridge - preferred)
    #  if not, check for cisco and if cisco, build list of all relevant vlans:
    #  .1.3.6.1.4.1.9.9.46.1.6.1.1.5 - trunk port vlan map (cisco only)
    #  .1.3.6.1.4.1.9.9.68.1.2.2.1.2 - access port vlan map (cisco only)
    # if cisco, vlan community string indexed or snmpv3 contest for:
    # 1.3.6.1.2.1.17.4.3.1.2 - mactoindx (bridge - low-end switches and cisco)
    #     .1.3.6.1.2.1.17.1.4.1.2 - bridge index to if index map
    # no vlan index or context for:
    #  .1.3.6.1.2.1.31.1.1.1.1 - ifName... but some switches don't do it
    #  .1.3.6.1.2.1.2.2.1.2 - ifDescr, usually useless, but a
    #   fallback if ifName is empty
    #
    global _macmap
    if len(args) == 4:
        switch, password, user, _ = args  # 4th arg is for affluent only
        user = user or None
    else:
        switch, password = args
        user = None
    if switch not in noaffluent:
        try:
            return _affluent_map_switch(args)
        except Exception:
            pass
    mactobridge, ifnamemap, bridgetoifmap = _offload_map_switch(
        switch, password, user)
    maccounts = {}
    bridgetoifvalid = False
    for mac in mactobridge:
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            continue
        bridgetoifvalid = True
        maccounts[ifname] = maccounts.get(ifname, 0) + 1
    if not bridgetoifvalid:
        bridgetoifmap = {}
        # Not a single mac address resolved to an interface index, chances are
        # that the switch is broken, and the mactobridge is reporting ifidx
        # instead of bridge port index
        # try again, skipping the bridgetoifmap lookup
        for mac in mactobridge:
            ifidx = mactobridge[mac]
            try:
                ifname = ifnamemap[ifidx]
            except KeyError:
                continue
            bridgetoifmap[ifidx] = ifidx
            maccounts[ifname] = maccounts.get(ifname, 0) + 1
    newmacs = {}
    noaffluent.add(switch)
    for mac in mactobridge:
        # We want to merge it so that when a mac appears in multiple
        # places, it is captured.
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            continue
        portcount = maccounts[ifname]
        _macmap.setdefault(mac, []).append((switch, ifname, portcount))
        newmacs.setdefault(ifname, []).append(mac)
        nodename = _nodelookup(switch, ifname)
        if nodename is None:
            continue
        if mac in _nodesbymac and _nodesbymac[mac][0] != nodename:
            # For example, listed on both a real edge port
            # and by accident a trunk port
            log.log({'error': '{0} and {1} described by ambiguous'
                              ' switch topology values'.format(
                                  nodename, _nodesbymac[mac][0])})
            _nodesbymac[mac] = (None, None)
        else:
            _nodesbymac[mac] = (nodename, portcount)
    _macsbyswitch[switch] = newmacs
def _map_switch_backend(args):
    """Manipulate portions of mac address map relevant to a given switch

    ``args`` is a ``(switch, password, user)`` sequence used to open an SNMP
    session.  The Q-BRIDGE forwarding table is walked to learn the bridge
    port of each mac address, bridge ports are translated to interface
    names, and the results are merged into the module level ``_macmap``,
    ``_macsbyswitch`` and ``_nodesbymac`` tables.

    Mac addresses whose bridge port cannot be translated to an interface
    name are skipped rather than aborting the scan of the whole switch.

    Raises ``exc.NotImplementedException`` when the Q-BRIDGE walk yields
    nothing at all.
    """

    # 1.3.6.1.2.1.17.7.1.2.2.1.2 - mactoindex (qbridge - preferred)
    #  if not, check for cisco and if cisco, build list of all relevant vlans:
    #  .1.3.6.1.4.1.9.9.46.1.6.1.1.5 - trunk port vlan map (cisco only)
    #  .1.3.6.1.4.1.9.9.68.1.2.2.1.2 - access port vlan map (cisco only)
    # if cisco, vlan community string indexed or snmpv3 contest for:
    # 1.3.6.1.2.1.17.4.3.1.2 - mactoindx (bridge - low-end switches and cisco)
    #     .1.3.6.1.2.1.17.1.4.1.2 - bridge index to if index map
    # no vlan index or context for:
    #  .1.3.6.1.2.1.31.1.1.1.1 - ifName... but some switches don't do it
    #  .1.3.6.1.2.1.2.2.1.2 - ifDescr, usually useless, but a
    #   fallback if ifName is empty
    #
    global _macmap
    switch, password, user = args
    haveqbridge = False
    mactobridge = {}
    conn = snmp.Session(switch, password, user)
    for vb in conn.walk('1.3.6.1.2.1.17.7.1.2.2.1.2'):
        haveqbridge = True
        oid, bridgeport = vb
        if not bridgeport:
            continue
        oid = str(oid).rsplit('.', 6)  # if 7, then oid[1] would be vlan id
        macaddr = '{0:02x}:{1:02x}:{2:02x}:{3:02x}:{4:02x}:{5:02x}'.format(
            *([int(x) for x in oid[-6:]]))
        mactobridge[macaddr] = int(bridgeport)
    if not haveqbridge:
        raise exc.NotImplementedException('TODO: Bridge-MIB without QBRIDGE')
    bridgetoifmap = {}
    for vb in conn.walk('1.3.6.1.2.1.17.1.4.1.2'):
        bridgeport, ifidx = vb
        bridgeport = int(str(bridgeport).rsplit('.', 1)[1])
        bridgetoifmap[bridgeport] = int(ifidx)
    ifnamemap = {}
    havenames = False
    for vb in conn.walk('1.3.6.1.2.1.31.1.1.1.1'):
        ifidx, ifname = vb
        if not ifname:
            continue
        havenames = True
        ifidx = int(str(ifidx).rsplit('.', 1)[1])
        ifnamemap[ifidx] = str(ifname)
    if not havenames:
        # ifName was empty everywhere, fall back to ifDescr
        for vb in conn.walk('1.3.6.1.2.1.2.2.1.2'):
            ifidx, ifname = vb
            ifidx = int(str(ifidx).rsplit('.', 1)[1])
            ifnamemap[ifidx] = str(ifname)
    # Resolve every mac to an interface name exactly once.  A bridge port
    # with no ifIndex, or an ifIndex with no name, must not take down the
    # processing of every other mac on the switch, so those are skipped.
    mactoifname = {}
    maccounts = {}
    for mac in mactobridge:
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            continue
        mactoifname[mac] = ifname
        maccounts[ifname] = maccounts.get(ifname, 0) + 1
    _macsbyswitch[switch] = {}
    for mac in mactoifname:
        # We want to merge it so that when a mac appears in multiple
        # places, it is captured.
        ifname = mactoifname[mac]
        if mac in _macmap:
            _macmap[mac].append((switch, ifname, maccounts[ifname]))
        else:
            _macmap[mac] = [(switch, ifname, maccounts[ifname])]
        if ifname in _macsbyswitch[switch]:
            _macsbyswitch[switch][ifname].append(mac)
        else:
            _macsbyswitch[switch][ifname] = [mac]
        nodename = _nodelookup(switch, ifname)
        if nodename is not None:
            if mac in _nodesbymac and _nodesbymac[mac] != nodename:
                log.log({'warning': '{0} and {1} described by ambiguous'
                                    ' switch topology values'.format(
                                        nodename, _nodesbymac[mac])})
            _nodesbymac[mac] = nodename
def connect_to_leader(cert=None, name=None, leader=None):
    """Attempt to become a follower of the given collective leader.

    :param cert: handed unchanged to ``connect_to_collective``
    :param name: member name to announce; ``get_myname()`` when omitted
    :param leader: address to contact; ``currentleader`` when omitted
    :returns: ``False`` if the connection fails, the peer sends nothing, or
              the peer refuses without a usable redirect; the result of the
              nested attempt when redirected to another leader; the result
              of ``become_leader`` when the peer reports it has the inferior
              transaction count; ``True`` once the configuration has been
              loaded and the ``follow_leader`` greenthread is spawned.
    :raises Exception: on a protocol version other than ``v2``, on being
                       redirected to ourselves, or when the database
                       transfer ends early.  Errors while applying the
                       received configuration are re-raised after the
                       clear has been rolled back.
    """
    global currentleader
    global follower
    if leader is None:
        leader = currentleader
    log.log({'info': 'Attempting connection to leader {0}'.format(leader),
             'subsystem': 'collective'})
    try:
        remote = connect_to_collective(cert, leader)
    except socket.error as e:
        log.log({'error': 'Collective connection attempt to {0} failed: {1}'
                          ''.format(leader, str(e)),
                 'subsystem': 'collective'})
        return False
    with connecting:
        with cfm._initlock:
            banner = tlvdata.recv(remote)  # the banner
            vers = banner.split()[2]
            if vers != b'v2':
                raise Exception('This instance only supports protocol 2, synchronize versions between collective members')
            tlvdata.recv(remote)  # authpassed... 0..
            if name is None:
                name = get_myname()
            tlvdata.send(remote, {'collective': {'operation': 'connect',
                                                 'name': name,
                                                 'txcount': cfm._txcount}})
            keydata = tlvdata.recv(remote)
            if not keydata:
                return False
            if 'error' in keydata:
                if 'backoff' in keydata:
                    log.log({
                        'info': 'Collective initialization in progress on '
                                '{0}'.format(leader),
                        'subsystem': 'collective'})
                    return False
                if 'leader' in keydata:
                    log.log(
                        {'info': 'Prospective leader {0} has redirected this '
                                 'member to {1}'.format(leader,
                                                        keydata['leader']),
                         'subsystem': 'collective'})
                    ldrc = cfm.get_collective_member_by_address(
                        keydata['leader'])
                    if ldrc and ldrc['name'] == name:
                        raise Exception("Redirected to self")
                    return connect_to_leader(name=name,
                                             leader=keydata['leader'])
                if 'txcount' in keydata:
                    log.log({'info':
                                 'Prospective leader {0} has inferior '
                                 'transaction count, becoming leader'
                                 ''.format(leader),
                             'subsystem': 'collective'})
                    return become_leader(remote)
                # an error we have no specific handling for
                return False
            if follower:
                # drop any previous follower before taking on the new leader
                follower.kill()
                cfm.stop_following()
                follower = None
            log.log({'info': 'Following leader {0}'.format(leader),
                     'subsystem': 'collective'})
            colldata = tlvdata.recv(remote)
            # the protocol transmits global data, but for now we ignore it;
            # it still has to be read to stay in step with the sender
            tlvdata.recv(remote)
            dbi = tlvdata.recv(remote)
            dbsize = dbi['dbsize']
            dbjson = b''
            while (len(dbjson) < dbsize):
                ndata = remote.recv(dbsize - len(dbjson))
                if not ndata:
                    # peer went away mid transfer
                    try:
                        remote.close()
                    except Exception:
                        pass
                    raise Exception("Error doing initial DB transfer")
                dbjson += ndata
            cfm.clear_configuration()
            try:
                cfm._restore_keys(keydata, None, sync=False)
                for c in colldata:
                    cfm._true_add_collective_member(c, colldata[c]['address'],
                                                    colldata[c]['fingerprint'],
                                                    sync=False)
                cfm._txcount = dbi.get('txcount', 0)
                cfm.ConfigManager(tenant=None)._load_from_json(dbjson,
                                                               sync=False)
                cfm.commit_clear()
            except Exception:
                # put the previous configuration back before propagating
                cfm.stop_following()
                cfm.rollback_clear()
                raise
            currentleader = leader
        #spawn this as a thread...
        follower = eventlet.spawn(follow_leader, remote, leader)
    return True
def handle_connection(connection, cert, request, local=False):
    """Service one collective request received on ``connection``.

    :param connection: socket the request arrived on; replies are written
                       to it with ``tlvdata.send``
    :param cert: peer certificate object, or a false value when the peer
                 presented none.  When given it is converted to ASN.1 bytes
                 before being compared against stored fingerprints.
    :param request: dict carrying at least ``'operation'``
    :param local: must be true for a request that arrives without a
                  certificate to be serviced at all

    Without a certificate (and only when ``local``) the operations ``show``,
    ``delete``, ``invite`` and ``join`` are served.  The operations
    ``enroll``, ``assimilate``, ``getinfo`` and ``connect`` are checked
    regardless of how the request arrived.
    """
    global currentleader
    global retrythread
    operation = request['operation']
    if cert:
        cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    else:
        if not local:
            return
        if operation in ('show', 'delete'):
            if not list(cfm.list_collective()):
                tlvdata.send(connection,
                             {'collective': {'error': 'Collective mode not '
                                                      'enabled on this '
                                                      'system'}})
                return
            if follower:
                # we are a follower, ask the leader for the authoritative view
                linfo = cfm.get_collective_member_by_address(currentleader)
                remote = socket.create_connection((currentleader, 13001))
                remote = ssl.wrap_socket(remote, cert_reqs=ssl.CERT_NONE,
                                         keyfile='/etc/confluent/privkey.pem',
                                         certfile='/etc/confluent/srvcert.pem')
                cert = remote.getpeercert(binary_form=True)
                if not (linfo and util.cert_matches(
                        linfo['fingerprint'], cert)):
                    remote.close()
                    tlvdata.send(connection,
                                 {'error': 'Invalid certificate, '
                                           'redo invitation process'})
                    connection.close()
                    return
                tlvdata.recv(remote)  # ignore banner
                tlvdata.recv(remote)  # ignore authpassed: 0
                tlvdata.send(remote,
                             {'collective': {'operation': 'getinfo',
                                             'name': get_myname()}})
                collinfo = tlvdata.recv(remote)
            else:
                collinfo = {}
                populate_collinfo(collinfo)
            try:
                cfm.check_quorum()
                collinfo['quorum'] = True
            except exc.DegradedCollective:
                collinfo['quorum'] = False
            if operation == 'show':
                tlvdata.send(connection, {'collective': collinfo})
            elif operation == 'delete':
                todelete = request['member']
                if (todelete == collinfo['leader'] or
                        todelete in collinfo['active']):
                    tlvdata.send(connection, {'collective': {
                        'error': '{0} is still active, stop the confluent service to remove it'.format(todelete)}})
                    return
                if todelete not in collinfo['offline']:
                    tlvdata.send(connection, {'collective': {
                        'error': '{0} is not a recognized collective member'.format(todelete)}})
                    return
                cfm.del_collective_member(todelete)
                tlvdata.send(connection, {'collective': {
                    'status': 'Successfully deleted {0}'.format(todelete)}})
                connection.close()
            return
        if 'invite' == operation:
            try:
                cfm.check_quorum()
            except exc.DegradedCollective:
                tlvdata.send(connection,
                             {'collective':
                                  {'error': 'Collective does not have quorum'}})
                return
            #TODO(jjohnson2): Cannot do the invitation if not the head node, the certificate hand-carrying
            #can't work in such a case.
            name = request['name']
            invitation = invites.create_server_invitation(name)
            tlvdata.send(connection,
                         {'collective': {'invitation': invitation}})
            connection.close()
        if 'join' == operation:
            invitation = request['invitation']
            try:
                invitation = base64.b64decode(invitation)
                name, invitation = invitation.split(b'@', 1)
                name = util.stringify(name)
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective':
                         {'status': 'Invalid token format'}})
                connection.close()
                return
            host = request['server']
            try:
                remote = socket.create_connection((host, 13001))
                # This isn't what it looks like.  We do CERT_NONE to disable
                # openssl verification, but then use the invitation as a
                # shared secret to validate the certs as part of the join
                # operation
                remote = ssl.wrap_socket(remote, cert_reqs=ssl.CERT_NONE,
                                         keyfile='/etc/confluent/privkey.pem',
                                         certfile='/etc/confluent/srvcert.pem')
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective':
                         {'status': 'Failed to connect to {0}'.format(host)}})
                connection.close()
                return
            mycert = util.get_certificate_from_file(
                '/etc/confluent/srvcert.pem')
            cert = remote.getpeercert(binary_form=True)
            proof = base64.b64encode(invites.create_client_proof(
                invitation, mycert, cert))
            tlvdata.recv(remote)  # ignore banner
            tlvdata.recv(remote)  # ignore authpassed: 0
            tlvdata.send(remote, {'collective': {'operation': 'enroll',
                                                 'name': name, 'hmac': proof}})
            rsp = tlvdata.recv(remote)
            if 'error' in rsp:
                tlvdata.send(connection, {'collective':
                                              {'status': rsp['error']}})
                connection.close()
                return
            proof = rsp['collective']['approval']
            proof = base64.b64decode(proof)
            j = invites.check_server_proof(invitation, mycert, cert, proof)
            if not j:
                remote.close()
                tlvdata.send(connection, {'collective':
                                              {'status': 'Bad server token'}})
                connection.close()
                return
            tlvdata.send(connection, {'collective': {'status': 'Success'}})
            connection.close()
            currentleader = rsp['collective']['leader']
            # context manager so the handle is released even if write fails
            with open('/etc/confluent/cfg/myname', 'w') as f:
                f.write(name)
            log.log({'info': 'Connecting to collective due to join',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader,
                             rsp['collective']['fingerprint'], name)
    if 'enroll' == operation:
        #TODO(jjohnson2): error appropriately when asked to enroll, but the master is elsewhere
        mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem')
        proof = base64.b64decode(request['hmac'])
        myrsp = invites.check_client_proof(request['name'], mycert,
                                           cert, proof)
        if not myrsp:
            tlvdata.send(connection, {'error': 'Invalid token'})
            connection.close()
            return
        myrsp = base64.b64encode(myrsp)
        fprint = util.get_fingerprint(cert)
        myfprint = util.get_fingerprint(mycert)
        cfm.add_collective_member(get_myname(),
                                  connection.getsockname()[0], myfprint)
        cfm.add_collective_member(request['name'],
                                  connection.getpeername()[0], fprint)
        myleader = get_leader(connection)
        ldrfprint = cfm.get_collective_member_by_address(
            myleader)['fingerprint']
        tlvdata.send(connection,
                     {'collective': {'approval': myrsp,
                                     'fingerprint': ldrfprint,
                                     'leader': get_leader(connection)}})
    if 'assimilate' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not droneinfo:
            tlvdata.send(connection,
                         {'error': 'Unrecognized leader, '
                                   'redo invitation process'})
            return
        if not util.cert_matches(droneinfo['fingerprint'], cert):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            return
        if request['txcount'] < cfm._txcount:
            tlvdata.send(connection,
                         {'error': 'Refusing to be assimilated by inferior '
                                   'transaction count',
                          'txcount': cfm._txcount})
            return
        if connecting.active:
            # don't try to connect while actively already trying to connect
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        if (currentleader == connection.getpeername()[0] and
                follower and not follower.dead):
            # if we are happily following this leader already, don't stir
            # the pot
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        log.log({'info': 'Connecting in response to assimilation',
                 'subsystem': 'collective'})
        eventlet.spawn_n(connect_to_leader, None, None,
                         leader=connection.getpeername()[0])
        tlvdata.send(connection, {'status': 0})
        connection.close()
    if 'getinfo' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            connection.close()
            return
        collinfo = {}
        populate_collinfo(collinfo)
        tlvdata.send(connection, collinfo)
    if 'connect' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(connection,
                         {'error': 'Invalid certificate, '
                                   'redo invitation process'})
            connection.close()
            return
        myself = connection.getsockname()[0]
        if connecting.active:
            tlvdata.send(connection, {'error': 'Connecting right now',
                                      'backoff': True})
            connection.close()
            return
        if myself != get_leader(connection):
            tlvdata.send(
                connection,
                {'error': 'Cannot assimilate, our leader is '
                          'in another castle', 'leader': currentleader})
            connection.close()
            return
        if request['txcount'] > cfm._txcount:
            retire_as_leader()
            tlvdata.send(connection,
                         {'error': 'Client has higher tranasaction count, '
                                   'should assimilate me, connecting..',
                          'txcount': cfm._txcount})
            # 'collective' must be the string literal used by every other
            # log call here; the bare name is not defined
            log.log({'info': 'Connecting to leader due to superior '
                             'transaction count',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader, None, None,
                             connection.getpeername()[0])
            connection.close()
            return
        if retrythread:
            retrythread.cancel()
            retrythread = None
        with leader_init:
            cfm.update_collective_address(request['name'],
                                          connection.getpeername()[0])
            tlvdata.send(connection, cfm._dump_keys(None, False))
            tlvdata.send(connection, cfm._cfgstore['collective'])
            tlvdata.send(connection, {})  # cfm.get_globals())
            cfgdata = cfm.ConfigManager(None)._dump_to_json()
            tlvdata.send(connection, {'txcount': cfm._txcount,
                                      'dbsize': len(cfgdata)})
            connection.sendall(cfgdata)
        #tlvdata.send(connection, {'tenants': 0}) # skip the tenants for now,
        # so far unused anyway
        if not cfm.relay_slaved_requests(drone, connection):
            if not retrythread:  # start a recovery if everyone else seems
                # to have disappeared
                retrythread = eventlet.spawn_after(30 + random.random(),
                                                   start_collective)
def _map_switch_backend(args):
    """Manipulate portions of mac address map relevant to a given switch

    ``args`` unpacks to ``(switch, password, user)``, which are used to open
    an SNMP session.  The Q-BRIDGE forwarding table gives the bridge port of
    each mac address; bridge ports are mapped to interface indexes and then
    to interface names (ifName, or ifDescr if no ifName was returned).  The
    outcome is merged into ``_macmap``, ``_macsbyswitch`` and
    ``_nodesbymac``.

    A mac address whose bridge port has no interface index, or whose
    interface has no name, is left out instead of raising ``KeyError`` and
    discarding the data for the entire switch.

    Raises ``exc.NotImplementedException`` if the Q-BRIDGE table walk
    returns no rows.
    """

    # 1.3.6.1.2.1.17.7.1.2.2.1.2 - mactoindex (qbridge - preferred)
    #  if not, check for cisco and if cisco, build list of all relevant vlans:
    #  .1.3.6.1.4.1.9.9.46.1.6.1.1.5 - trunk port vlan map (cisco only)
    #  .1.3.6.1.4.1.9.9.68.1.2.2.1.2 - access port vlan map (cisco only)
    # if cisco, vlan community string indexed or snmpv3 contest for:
    # 1.3.6.1.2.1.17.4.3.1.2 - mactoindx (bridge - low-end switches and cisco)
    #     .1.3.6.1.2.1.17.1.4.1.2 - bridge index to if index map
    # no vlan index or context for:
    #  .1.3.6.1.2.1.31.1.1.1.1 - ifName... but some switches don't do it
    #  .1.3.6.1.2.1.2.2.1.2 - ifDescr, usually useless, but a
    #   fallback if ifName is empty
    #
    global _macmap
    switch, password, user = args
    haveqbridge = False
    mactobridge = {}
    conn = snmp.Session(switch, password, user)
    for vb in conn.walk('1.3.6.1.2.1.17.7.1.2.2.1.2'):
        haveqbridge = True
        oid, bridgeport = vb
        if not bridgeport:
            continue
        oid = str(oid).rsplit('.', 6)  # if 7, then oid[1] would be vlan id
        macaddr = '{0:02x}:{1:02x}:{2:02x}:{3:02x}:{4:02x}:{5:02x}'.format(
            *([int(x) for x in oid[-6:]])
        )
        mactobridge[macaddr] = int(bridgeport)
    if not haveqbridge:
        raise exc.NotImplementedException('TODO: Bridge-MIB without QBRIDGE')
    bridgetoifmap = {}
    for vb in conn.walk('1.3.6.1.2.1.17.1.4.1.2'):
        bridgeport, ifidx = vb
        bridgeport = int(str(bridgeport).rsplit('.', 1)[1])
        bridgetoifmap[bridgeport] = int(ifidx)
    ifnamemap = {}
    havenames = False
    for vb in conn.walk('1.3.6.1.2.1.31.1.1.1.1'):
        ifidx, ifname = vb
        if not ifname:
            continue
        havenames = True
        ifidx = int(str(ifidx).rsplit('.', 1)[1])
        ifnamemap[ifidx] = str(ifname)
    if not havenames:
        # no usable ifName anywhere, use ifDescr instead
        for vb in conn.walk('1.3.6.1.2.1.2.2.1.2'):
            ifidx, ifname = vb
            ifidx = int(str(ifidx).rsplit('.', 1)[1])
            ifnamemap[ifidx] = str(ifname)
    maccounts = {}
    for mac in mactobridge:
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            # incomplete port/interface tables, ignore this mac
            continue
        if ifname not in maccounts:
            maccounts[ifname] = 1
        else:
            maccounts[ifname] += 1
    _macsbyswitch[switch] = {}
    for mac in mactobridge:
        # We want to merge it so that when a mac appears in multiple
        # places, it is captured.
        try:
            ifname = ifnamemap[bridgetoifmap[mactobridge[mac]]]
        except KeyError:
            # same macs that were skipped while counting
            continue
        if mac in _macmap:
            _macmap[mac].append((switch, ifname, maccounts[ifname]))
        else:
            _macmap[mac] = [(switch, ifname, maccounts[ifname])]
        if ifname in _macsbyswitch[switch]:
            _macsbyswitch[switch][ifname].append(mac)
        else:
            _macsbyswitch[switch][ifname] = [mac]
        nodename = _nodelookup(switch, ifname)
        if nodename is not None:
            if mac in _nodesbymac and _nodesbymac[mac] != nodename:
                log.log({'warning': '{0} and {1} described by ambiguous'
                                    ' switch topology values'.format(
                                        nodename, _nodesbymac[mac])})
            _nodesbymac[mac] = nodename
def check_reply(node, info, packet, sock, cfg, reqview):
    """Build and send the DHCP reply for a node with a pending profile.

    :param node: name of the node the request was matched to
    :param info: dict describing the request; ``'architecture'`` and
                 ``'netinfo'`` (with ``'recvip'`` and ``'ifidx'``) are read
                 here
    :param packet: parsed DHCP options of the request, indexed by option
                   number (53 and 97 are read)
    :param sock: not used by this function
    :param cfg: configuration manager used for attribute and nic lookups
    :param reqview: memoryview over the raw request, from which the
                    transaction header and chaddr are copied

    Nothing is sent when the node has no ``deployment.pendingprofile``, when
    insecure protocols are disabled and the client is not doing UEFI HTTP
    boot, or when the receiving nic has no usable ipv4 configuration.
    """
    httpboot = info['architecture'] == 'uefi-httpboot'
    replen = 275  # default is going to be 286
    cfd = cfg.get_node_attributes(node, ('deployment.*'))
    profile = cfd.get(node, {}).get(
        'deployment.pendingprofile', {}).get('value', None)
    myipn = info['netinfo']['recvip']
    myipn = socket.inet_aton(myipn)
    if not profile:
        return
    rqtype = packet[53][0]
    # Attribute entries are dicts carrying the setting under 'value' (as
    # with pendingprofile above).  Comparing the entry itself to 'never'
    # could never match, which let an explicit 'never' through.
    insecuremode = cfd.get(node, {}).get(
        'deployment.useinsecureprotocols', {}).get('value', 'never')
    if not insecuremode:
        insecuremode = 'never'
    if insecuremode == 'never' and not httpboot:
        if rqtype == 1 and info['architecture']:
            log.log(
                {'info': 'Boot attempt by {0} detected in insecure mode, but '
                         'insecure mode is disabled.  Set the attribute '
                         '`deployment.useinsecureprotocols` to `firmware` or '
                         '`always` to enable support, or use UEFI HTTP boot '
                         'with HTTPS.'.format(node)})
        return
    reply = bytearray(512)
    repview = memoryview(reply)
    repview[:20] = iphdr
    repview[12:16] = myipn
    repview[20:28] = udphdr
    # from here until the checksums, offsets are relative to the DHCP payload
    repview = repview[28:]
    repview[0:1] = b'\x02'
    repview[1:10] = reqview[1:10]  # duplicate txid, hwlen, and others
    repview[10:11] = b'\x80'  # always set broadcast
    repview[28:44] = reqview[28:44]  # copy chaddr field
    if httpboot:
        proto = 'https' if insecuremode == 'never' else 'http'
        bootfile = '{0}://{1}/confluent-public/os/{2}/boot.img'.format(
            proto, info['netinfo']['recvip'], profile)
        if not isinstance(bootfile, bytes):
            bootfile = bootfile.encode('utf8')
        repview[108:108 + len(bootfile)] = bootfile
    repview[20:24] = myipn
    gateway = None
    netmask = None
    niccfg = netutil.get_nic_config(cfg, node, ifidx=info['netinfo']['ifidx'])
    if niccfg.get('ipv4_broken', False):
        # Received a request over a nic with no ipv4 configured, ignore it
        return
    clipn = None
    if niccfg['ipv4_address']:
        clipn = socket.inet_aton(niccfg['ipv4_address'])
        repview[16:20] = clipn
        gateway = niccfg['ipv4_gateway']
        if gateway:
            gateway = socket.inet_aton(gateway)
        netmask = niccfg['prefix']
        # prefix length to a packed 32 bit mask
        netmask = (2**32 - 1) ^ (2**(32 - netmask) - 1)
        netmask = struct.pack('!I', netmask)
    repview[236:240] = b'\x63\x82\x53\x63'
    repview[240:242] = b'\x35\x01'
    if rqtype == 1:  # if discover, then offer
        repview[242:243] = b'\x02'
    elif rqtype == 3:  # if request, then ack
        repview[242:243] = b'\x05'
    repview[243:245] = b'\x36\x04'  # DHCP server identifier
    repview[245:249] = myipn
    repview[249:255] = b'\x33\x04\x00\x00\x00\xf0'  # fixed short lease time
    repview[255:257] = b'\x61\x11'
    repview[257:274] = packet[97]
    # Note that sending PXEClient kicks off the proxyDHCP procedure, ignoring
    # boot filename and such in the DHCP packet
    # we will simply always do it to provide the boot payload in a consistent
    # matter to both dhcp-elsewhere and fixed ip clients
    if info['architecture'] == 'uefi-httpboot':
        repview[replen - 1:replen + 11] = b'\x3c\x0aHTTPClient'
        replen += 12
    else:
        repview[replen - 1:replen + 10] = b'\x3c\x09PXEClient'
        replen += 11
    hwlen = bytearray(reqview[2:3].tobytes())[0]
    fulladdr = repview[28:28 + hwlen].tobytes()
    myipbypeer[fulladdr] = myipn
    if hwlen == 8:  # omnipath may present a mangled proxydhcp request later
        shortaddr = bytearray(6)
        shortaddr[0] = 2
        shortaddr[1:] = fulladdr[3:]
        myipbypeer[bytes(shortaddr)] = myipn
    if netmask:
        repview[replen - 1:replen + 1] = b'\x01\x04'
        repview[replen + 1:replen + 5] = netmask
        replen += 6
    if gateway:
        repview[replen - 1:replen + 1] = b'\x03\x04'
        repview[replen + 1:replen + 5] = gateway
        replen += 6
    repview[replen - 1:replen] = b'\xff'  # end of options, should always be last byte
    # back to the whole datagram to fill in lengths and checksums
    repview = memoryview(reply)
    pktlen = struct.pack('!H', replen + 28)  # ip+udp = 28
    repview[2:4] = pktlen
    curripsum = ~(_ipsum(constiphdrsum + pktlen + myipn)) & 0xffff
    repview[10:12] = struct.pack('!H', curripsum)
    repview[24:26] = struct.pack('!H', replen + 8)
    datasum = _ipsum(b'\x00\x11' + repview[24:26].tobytes() +
                     repview[12:replen + 28].tobytes())
    datasum = ~datasum & 0xffff
    repview[26:28] = struct.pack('!H', datasum)
    if clipn:
        staticassigns[fulladdr] = (clipn, repview[:replen + 28].tobytes())
    elif fulladdr in staticassigns:
        del staticassigns[fulladdr]
    send_raw_packet(repview, replen + 28, reqview, info)
def discover_node(cfg, handler, info, nodename, manual):
    """Apply the discovery policy of ``nodename`` to a detected device.

    The device is first recorded in ``known_nodes`` as 'identified'.  The
    node's ``discovery.policy`` and stored ``pubkeys.tls_hardwaremanager``
    fingerprint then decide whether the handler is asked to configure it.

    :returns: the result of ``do_pxe_discovery`` for pxe handlers, ``True``
              when the node is (or already was) discovered, ``False`` when
              policy, a fingerprint mismatch or a configuration error
              prevented discovery.
    :raises: whatever ``handler.config`` raised, when ``manual`` is set
    """
    known_nodes[nodename][info['hwaddr']] = info
    unknown_info.pop(info['hwaddr'], None)
    info['discostatus'] = 'identified'
    nodeattrs = cfg.get_node_attributes(
        [nodename], ('discovery.policy',
                     'pubkeys.tls_hardwaremanager')).get(nodename, {})
    policy = nodeattrs.get('discovery.policy', {}).get('value', None)
    if policy is None:
        policy = ''
    policies = set(policy.split(','))
    lastfp = nodeattrs.get('pubkeys.tls_hardwaremanager', {}).get(
        'value', None)
    # TODO(jjohnson2): permissive requires we guarantee storage of
    # the pubkeys, which is deferred for a little bit
    # Also, 'secure', when we have the needed infrastructure done
    # in some product or another.
    curruuid = info.get('uuid', False)
    if 'pxe' in policies and info['handler'] == pxeh:
        return do_pxe_discovery(cfg, handler, info, manual, nodename,
                                policies)
    if ('permissive' in policies and handler.https_supported and lastfp
            and not util.cert_matches(lastfp, handler.https_cert)
            and not manual):
        # With a permissive policy, do not discover new
        info['discofailure'] = 'fingerprint'
        log.log({'info': 'Detected replacement of {0} with existing '
                         'fingerprint and permissive discovery policy, not '
                         'doing discovery unless discovery.policy=open or '
                         'pubkeys.tls_hardwaremanager attribute is cleared '
                         'first'.format(nodename)})
        return False
    if not (policies & {'open', 'permissive'} or manual):
        log.log({'info': 'Detected {0}, but discovery.policy is not set to a '
                         'value allowing discovery (open or permissive)'.format(
                             nodename)})
        info['discofailure'] = 'policy'
        return False
    info['nodename'] = nodename
    if info['handler'] == pxeh:
        return do_pxe_discovery(cfg, handler, info, manual, nodename,
                                policies)
    if manual or not util.cert_matches(lastfp, handler.https_cert):
        # only 'discover' if it is not the same as last time
        try:
            handler.config(nodename)
        except Exception as e:
            info['discofailure'] = 'bug'
            if manual:
                raise
            log.log({'error':
                         'Error encountered trying to set up {0}, {1}'.format(
                             nodename, str(e))})
            traceback.print_exc()
            return False
        gathered = {}
        for infokey, attrname in (('uuid', 'id.uuid'),
                                  ('serialnumber', 'id.serial'),
                                  ('modelnumber', 'id.model')):
            if infokey in info:
                gathered[attrname] = info[infokey]
        if handler.https_cert:
            gathered['pubkeys.tls_hardwaremanager'] = util.get_fingerprint(
                handler.https_cert, 'sha256')
        if gathered:
            cfg.set_node_attributes({nodename: gathered})
        log.log({'info': 'Discovered {0} ({1})'.format(nodename,
                                                      handler.devname)})
    info['discostatus'] = 'discovered'
    # anything that was waiting on this uuid gets another look
    for waiting in pending_by_uuid.get(curruuid, []):
        eventlet.spawn_n(_recheck_single_unknown_info, cfg, waiting)
    return True
def eval_node(cfg, handler, info, nodename, manual=False):
    """Decide which node a detected device really is and try to discover it.

    :param cfg: configuration manager used for attribute filtering
    :param handler: node handler instance for the detected device
    :param info: dict describing the detection; updated in place with
                 ``discostatus``/``discofailure``
    :param nodename: candidate node, or the enclosure containing the device
    :param manual: when true, problems raise
                   ``exc.InvalidArgumentException`` instead of only being
                   logged

    When ``nodename`` is an enclosure manager and the device is not an
    enclosure, the bay number (and, for chained enclosures, the chassis
    uuid) is used to find the actual node.  Devices that cannot be resolved
    are stored in ``unknown_info``; nodes whose discovery was declined are
    stored in ``pending_nodes``.
    """
    try:
        handler.probe()  # unicast interrogation as possible to get more data
        # switch concurrently
        # do some preconfig, for example, to bring a SMM online if applicable
        handler.preconfig()
    except Exception as e:
        unknown_info[info['hwaddr']] = info
        info['discostatus'] = 'unidentified'
        errorstr = 'An error occured during discovery, check the ' \
                   'trace and stderr logs, mac was {0} and ip was {1}' \
                   ', the node or the containing enclosure was {2}' \
                   ''.format(info['hwaddr'], handler.ipaddr, nodename)
        traceback.print_exc()
        if manual:
            raise exc.InvalidArgumentException(errorstr)
        log.log({'error': errorstr})
        return
    # first, if had a bay, it was in an enclosure.  If it was discovered by
    # switch, it is probably the enclosure manager and not
    # the node directly.  switch is ambiguous and we should leave it alone
    if 'enclosure.bay' in info and handler.is_enclosure:
        unknown_info[info['hwaddr']] = info
        info['discostatus'] = 'unidentified'
        log.log({'error': 'Something that is an enclosure reported a bay, '
                          'not possible'})
        if manual:
            raise exc.InvalidArgumentException()
        return
    nl = list(cfg.filter_node_attributes('enclosure.manager=' + nodename))
    if not handler.is_enclosure and nl:
        # The specified node is an enclosure (has nodes mapped to it), but
        # what we are talking to is *not* an enclosure
        # might be ambiguous, need to match chassis-uuid as well..
        if 'enclosure.bay' not in info:
            unknown_info[info['hwaddr']] = info
            info['discostatus'] = 'unidentified'
            errorstr = '{2} with mac {0} is in {1}, but unable to ' \
                       'determine bay number'.format(info['hwaddr'],
                                                     nodename,
                                                     handler.ipaddr)
            if manual:
                raise exc.InvalidArgumentException(errorstr)
            log.log({'error': errorstr})
            return
        enl = list(cfg.filter_node_attributes('enclosure.extends=' + nodename))
        if enl:
            # ambiguous SMM situation according to the configuration, we need
            # to match uuid
            encuuid = info['attributes'].get('chasis-uuid', None)
            if encuuid:
                enl = list(cfg.filter_node_attributes('id.uuid=' + encuuid))
                if len(enl) != 1:
                    # errorstr = 'No SMM by given UUID known, *yet*'
                    # if manual:
                    #     raise exc.InvalidArgumentException(errorstr)
                    # log.log({'error': errorstr})
                    # info is used as a mapping throughout and so cannot be
                    # a set member; queue it in a list instead.
                    # NOTE(review): confirm no other writer of
                    # pending_by_uuid relies on set semantics
                    pending_by_uuid.setdefault(encuuid, []).append(info)
                    return
                # We found the real smm, replace the list with the actual smm
                # to continue
                nl = list(cfg.filter_node_attributes(
                    'enclosure.manager=' + enl[0]))
            else:
                errorstr = 'Chained SMM configuration with older XCC, ' \
                           'unable to perform zero power discovery'
                if manual:
                    raise exc.InvalidArgumentException(errorstr)
                log.log({'error': errorstr})
                return
        # search for nodes fitting our description using filters
        # lead with the most specific to have a small second pass
        nl = list(cfg.filter_node_attributes(
            'enclosure.bay={0}'.format(info['enclosure.bay']), nl))
        if len(nl) != 1:
            info['discofailure'] = 'ambigconfig'
            if len(nl):
                errorstr = 'The following nodes have duplicate ' \
                           'enclosure attributes: ' + ','.join(nl)
            else:
                errorstr = 'The {0} in enclosure {1} bay {2} does not ' \
                           'seem to be a defined node ({3})'.format(
                               handler.devname, nodename,
                               info['enclosure.bay'], handler.ipaddr,
                           )
            if manual:
                raise exc.InvalidArgumentException(errorstr)
            log.log({'error': errorstr})
            unknown_info[info['hwaddr']] = info
            info['discostatus'] = 'unidentified'
            return
        nodename = nl[0]
        if not discover_node(cfg, handler, info, nodename, manual):
            # store it as pending, assuming blocked on enclosure
            # assurance...
            pending_nodes[nodename] = info
    else:
        # we can and did accurately discover by switch or in enclosure
        # but... is this really ok?  could be on an upstream port or
        # erroneously put in the enclosure with no nodes yet
        # so first, see if the candidate node is a chain host
        if info['maccount']:
            # discovery happened through switch
            nl = list(cfg.filter_node_attributes(
                'enclosure.extends=' + nodename))
            if nl:
                # The candidate nodename is the head of a chain, we must
                # validate the smm certificate by the switch
                fprint = macmap.get_node_fingerprint(nodename, cfg)
                # NOTE(review): previously the lookup result was dropped and
                # an unassigned name was compared, so this path could only
                # raise.  Continuing on a match is inferred from the comment
                # above - confirm the intended outcome.
                if not util.cert_matches(fprint, handler.https_cert):
                    return
        if (info['maccount'] and
                not handler.discoverable_by_switch(info['maccount'])):
            errorstr = 'The detected node {0} was detected using switch, ' \
                       'however the relevant port has too many macs learned ' \
                       'for this type of device ({1}) to be discovered by ' \
                       'switch.'.format(nodename, handler.devname)
            if manual:
                raise exc.InvalidArgumentException(errorstr)
            log.log({'error': errorstr})
            return
        if not discover_node(cfg, handler, info, nodename, manual):
            pending_nodes[nodename] = info
def detected(info):
    """Process a freshly detected device description.

    ``info`` is a dict that must carry ``'services'`` and ``'hwaddr'``;
    ``'attributes'``, ``'addresses'`` and ``'uuid'`` are used when present.
    The first service with an entry in ``nodehandlers`` selects the handler;
    without one the detection is ignored.  The dict is recorded in the
    module level tracking tables and, when a node name can be determined,
    handed to ``eval_node``.  ``info['discostatus']`` is set to
    'discovered' or 'unidentified' on the paths that reach a conclusion
    here.
    """
    global rechecker
    global rechecktime
    # later, manual and CMM discovery may act on SN and/or UUID
    for service in info['services']:
        if service in nodehandlers:
            if service not in known_services:
                known_services[service] = set([])
            handler = nodehandlers[service]
            info['handler'] = handler
            break
    else:  # no nodehandler, ignore for now
        return
    try:
        snum = info['attributes']['enclosure-serial-number'][0].strip()
        if snum:
            info['serialnumber'] = snum
            known_serials[info['serialnumber']] = info
    except (KeyError, IndexError):
        pass
    try:
        info['modelnumber'] = info['attributes'][
            'enclosure-machinetype-model'][0]
        known_services[service].add(info['modelnumber'])
    except (KeyError, IndexError):
        pass
    if info['hwaddr'] in known_info and 'addresses' in info:
        # we should tee these up for parsing when an enclosure comes up
        # also when switch config parameters change, should discard
        # and there's also if wiring is fixed...
        # of course could periodically revisit known_nodes
        # replace potentially stale address info
        #TODO(jjohnson2): remove this
        # temporary workaround for XCC not doing SLP DA over dedicated port
        # bz 93219, fix submitted, but not in builds yet
        # strictly speaking, going ipv4 only legitimately is mistreated here,
        # but that should be an edge case
        oldaddr = known_info[info['hwaddr']].get('addresses', [])
        for addr in info['addresses']:
            if addr[0].startswith('fe80::'):
                break
        else:
            # no link local address this time, carry over the old ones
            for addr in oldaddr:
                if addr[0].startswith('fe80::'):
                    info['addresses'].append(addr)
        if known_info[info['hwaddr']].get(
                'addresses', []) == info['addresses']:
            # if the ip addresses match, then assume no changes
            # now something resetting to defaults could, in theory
            # have the same address, but need to be reset
            # in that case, however, a user can clear pubkeys to force a check
            return
    known_info[info['hwaddr']] = info
    cfg = cfm.ConfigManager(None)
    if handler:
        # from here on, handler is an instance bound to this detection
        handler = handler.NodeHandler(info, cfg)
        handler.scan()
    uuid = info.get('uuid', None)
    if uuid_is_valid(uuid):
        known_uuids[uuid][info['hwaddr']] = info
    if handler and handler.https_supported and not handler.https_cert:
        if handler.cert_fail_reason == 'unreachable':
            log.log(
                {'info': '{0} with hwaddr {1} is not reachable by https '
                         'at address {2}'.format(
                             handler.devname, info['hwaddr'],
                             handler.ipaddr)})
            # NOTE(review): entries are indexed as addr[0] above, so they may
            # never compare equal to handler.ipaddr - confirm the types
            info['addresses'] = [x for x in info.get('addresses', [])
                                 if x != handler.ipaddr]
            return
        log.log(
            {'info': '{0} with hwaddr {1} at address {2} is not yet running '
                     'https, will examine later'.format(
                         handler.devname, info['hwaddr'], handler.ipaddr)})
        if rechecker is not None and rechecktime > util.monotonic_time() + 300:
            rechecker.cancel()
        if rechecker is None or rechecker.dead:
            rechecktime = util.monotonic_time() + 300
            rechecker = eventlet.spawn_after(300, _periodic_recheck, cfg)
        unknown_info[info['hwaddr']] = info
        # spelled the same as every other place that sets this status
        info['discostatus'] = 'unidentified'
        #TODO, eventlet spawn after to recheck sooner, or somehow else
        # influence periodic recheck to shorten delay?
        return
    nodename, info['maccount'] = get_nodename(cfg, handler, info)
    if nodename and handler and handler.https_supported:
        dp = cfg.get_node_attributes([nodename],
                                     ('pubkeys.tls_hardwaremanager',))
        lastfp = dp.get(nodename, {}).get('pubkeys.tls_hardwaremanager',
                                          {}).get('value', None)
        if util.cert_matches(lastfp, handler.https_cert):
            info['nodename'] = nodename
            known_nodes[nodename][info['hwaddr']] = info
            info['discostatus'] = 'discovered'
            return  # already known, no need for more
    #TODO(jjohnson2): We might have to get UUID for certain searches...
    #for now defer probe until inside eval_node.  We might not have
    #a nodename without probe in the future.
    if nodename and handler:
        eval_node(cfg, handler, info, nodename)
    elif handler:
        log.log(
            {'info': 'Detected unknown {0} with hwaddr {1} at '
                     'address {2}'.format(
                         handler.devname, info['hwaddr'], handler.ipaddr)})
        info['discostatus'] = 'unidentified'
        unknown_info[info['hwaddr']] = info
def eval_node(cfg, handler, info, nodename, manual=False):
    """Probe a detected device and attempt to discover it as a node.

    ``nodename`` may name the node itself or the enclosure that contains
    it; when nodes are mapped to ``nodename`` through ``enclosure.manager``
    and the device is not itself an enclosure, the actual node is resolved
    from the bay number in ``info['enclosure.bay']``.

    :param cfg: configuration manager used to filter node attributes
    :param handler: instantiated node handler for the device
    :param info: discovery dict for the device; updated in place with
                 ``discostatus`` / ``discofailure`` on failure
    :param nodename: candidate node (or enclosure manager) name
    :param manual: when True, failures raise
                   ``exc.InvalidArgumentException`` instead of only being
                   logged
    :raises exc.InvalidArgumentException: on any failure when ``manual``
    """
    try:
        handler.probe()  # unicast interrogation as possible to get more data
        # for now, we search switch only, ideally we search cmm, smm, and
        # switch concurrently
        # do some preconfig, for example, to bring a SMM online if applicable
        handler.preconfig()
    except Exception:
        # Handler failures are reported rather than propagated; the trace
        # goes to stderr and the device is left as unidentified.
        unknown_info[info['hwaddr']] = info
        info['discostatus'] = 'unidentified'
        errorstr = (
            'An error occured during discovery, check the '
            'trace and stderr logs, mac was {0} and ip was {1}'
            ', the node or the containing enclosure was {2}'
            ''.format(info['hwaddr'], handler.ipaddr, nodename))
        traceback.print_exc()
        if manual:
            raise exc.InvalidArgumentException(errorstr)
        log.log({'error': errorstr})
        return
    # first, if had a bay, it was in an enclosure.  If it was discovered by
    # switch, it is probably the enclosure manager and not
    # the node directly.  switch is ambiguous and we should leave it alone
    if 'enclosure.bay' in info and handler.is_enclosure:
        unknown_info[info['hwaddr']] = info
        info['discostatus'] = 'unidentified'
        errorstr = ('Something that is an enclosure reported a bay, '
                    'not possible')
        log.log({'error': errorstr})
        if manual:
            # previously raised with no message, leaving manual callers
            # without any explanation of the failure
            raise exc.InvalidArgumentException(errorstr)
        return
    nl = list(cfg.filter_node_attributes('enclosure.manager=' + nodename))
    if not handler.is_enclosure and nl:
        # The specified node is an enclosure (has nodes mapped to it), but
        # what we are talking to is *not* an enclosure
        if 'enclosure.bay' not in info:
            unknown_info[info['hwaddr']] = info
            info['discostatus'] = 'unidentified'
            errorstr = ('{2} with mac {0} is in {1}, but unable to '
                        'determine bay number'.format(info['hwaddr'],
                                                      nodename,
                                                      handler.ipaddr))
            if manual:
                raise exc.InvalidArgumentException(errorstr)
            log.log({'error': errorstr})
            return
        # search for nodes fitting our description using filters
        # lead with the most specific to have a small second pass
        nl = list(cfg.filter_node_attributes(
            'enclosure.bay={0}'.format(info['enclosure.bay']), nl))
        if len(nl) != 1:
            info['discofailure'] = 'ambigconfig'
            if nl:
                errorstr = ('The following nodes have duplicate '
                            'enclosure attributes: ' + ','.join(nl))
            else:
                errorstr = ('The {0} in enclosure {1} bay {2} does not '
                            'seem to be a defined node ({3})'.format(
                                handler.devname,
                                nodename,
                                info['enclosure.bay'],
                                handler.ipaddr,
                            ))
            # NOTE(review): unlike the other failure paths, a manual
            # failure here raises before the device is recorded in
            # unknown_info -- confirm whether that is intended.
            if manual:
                raise exc.InvalidArgumentException(errorstr)
            log.log({'error': errorstr})
            unknown_info[info['hwaddr']] = info
            info['discostatus'] = 'unidentified'
            return
        nodename = nl[0]
        if not discover_node(cfg, handler, info, nodename, manual):
            # store it as pending, assuming blocked on enclosure
            # assurance...
            pending_nodes[nodename] = info
    else:
        # we can and did accurately discover by switch or in enclosure
        if not discover_node(cfg, handler, info, nodename, manual):
            pending_nodes[nodename] = info