def get_nodename(cfg, handler, info):
    """Work out which configured node a detected device corresponds to.

    Lookups are attempted in order, stopping at the first that yields a name:
    TLS certificate fingerprint, UUID, enclosure membership, chained SMM
    lookup (SMM devices only), and finally the switch MAC map.

    :param cfg: configuration manager handed to the lookup helpers
    :param handler: device handler; a falsy handler short-circuits the lookup
    :param info: discovery info dict; ``info['verified']`` is always reset to
                 False here and ``info['discofailure']`` is set to 'nohttps'
                 when an HTTPS capable handler has no certificate
    :returns: ``(nodename, maccount)``; ``maccount`` is only populated when
              the name came from the switch MAC map
    """
    info['verified'] = False
    if not handler:
        return None, None
    nodename = None
    maccount = None
    if handler.https_supported:
        cert = handler.https_cert
        if not cert:
            info['discofailure'] = 'nohttps'
            return None, None
        nodename = nodes_by_fprint.get(
            util.get_fingerprint(cert, 'sha256'), None)
        if not nodename:
            # Fall back to the default digest (SHA512 per the original note)
            nodename = nodes_by_fprint.get(util.get_fingerprint(cert), None)
    if not nodename:
        nodeuuid = info.get('uuid', None)
        if uuid_is_valid(nodeuuid):
            nodename = nodes_by_uuid.get(nodeuuid, None)
            if nodename is None:
                # Refresh the id maps once and retry before giving up on uuid
                _map_unique_ids()
                nodename = nodes_by_uuid.get(nodeuuid, None)
    if not nodename:
        # Perhaps it carries a chassis uuid and can be found via enclosure
        nodename = get_nodename_from_enclosures(cfg, info)
    if not nodename and handler.devname == 'SMM':
        nodename = get_nodename_from_chained_smms(cfg, handler, info)
    if nodename:
        # Resolved without consulting switches, so there is no mac count
        return nodename, maccount

    # Last resort: ask the switches.  This is the slowest option, which is
    # why everything else is attempted first.
    nodename, macinfo = macmap.find_nodeinfo_by_mac(info['hwaddr'], cfg)
    maccount = macinfo['maccount']
    if not nodename:
        return nodename, maccount
    if handler.devname == 'SMM':
        extenders = list(
            cfg.filter_node_attributes('enclosure.extends=' + nodename))
        if extenders:
            # The SMM is configured as part of a chain, so the fingerprint
            # has to be checked to tell which chain member this really is
            chained_name, verified = get_chained_smm_name(
                nodename, cfg, handler, extenders)
            if chained_name:
                # Started from the switch, but has been disambiguated
                info['verified'] = verified
                return chained_name, None
    if not handler.discoverable_by_switch(macinfo['maccount']):
        if handler.devname == 'SMM':
            message = (
                'Attempt to discover SMM by switch, but chained '
                'topology or incorrect net attributes detected, '
                'which is not compatible with switch discovery '
                'of SMM, nodename would have been '
                '{0}'.format(nodename))
            log.log({'error': message})
        return None, None
    return nodename, maccount
def get_nodename(cfg, handler, info):
    """Return the node name matching a detected device, or None.

    HTTPS capable handlers are matched by certificate fingerprint; all others
    are matched by ``info['uuid']``.  Either path falls back to a MAC address
    lookup against the switches when the primary identifier is unknown.

    :param cfg: configuration manager passed to the MAC lookup
    :param handler: device handler exposing ``https_supported`` and
                    ``https_cert``
    :param info: discovery info dict; ``info['discofailure']`` is set to
                 'nohttps' if an HTTPS handler has no certificate
    """
    if handler.https_supported:
        cert = handler.https_cert
        if not cert:
            info['discofailure'] = 'nohttps'
            return None
        found = nodes_by_fprint.get(util.get_fingerprint(cert), None)
        if found:
            return found
        return macmap.find_node_by_mac(info['hwaddr'], cfg)

    nodeuuid = info['uuid']
    found = nodes_by_uuid.get(nodeuuid, None)
    if found is None:
        # Rebuild the id maps once and retry before resorting to switches
        _map_unique_ids()
        found = nodes_by_uuid.get(nodeuuid, None)
        if found is None:
            # TODO: if there are too many matches on port for a
            # given type, error!  Can't just arbitrarily limit,
            # shared nic with vms is possible and valid
            found = macmap.find_node_by_mac(info['hwaddr'], cfg)
    return found
def handle_connection(connection, cert, request, local=False):
    """Service one collective request that arrived on ``connection``.

    ``request['operation']`` selects the behavior:

    * ``show`` / ``delete`` / ``invite`` / ``join`` are only honored when no
      peer certificate was presented and ``local`` is true.
    * ``enroll`` / ``assimilate`` / ``getinfo`` / ``connect`` are the
      member-to-member operations and check the peer against ``cert``.

    Replies are written with ``tlvdata.send``; nothing is returned.  The
    module globals ``currentleader`` and ``retrythread`` may be updated.

    :param connection: socket-like object for the requesting peer
    :param cert: peer certificate object, or a falsy value if none was given;
                 when present it is converted to ASN.1 bytes before use
    :param request: dict carrying at least an ``'operation'`` key
    :param local: True when the request came from a local client; a request
                  with neither a certificate nor ``local`` is silently
                  dropped
    """
    global currentleader
    global retrythread
    operation = request['operation']
    if cert:
        cert = crypto.dump_certificate(crypto.FILETYPE_ASN1, cert)
    else:
        if not local:
            return
        if operation in ('show', 'delete'):
            if not list(cfm.list_collective()):
                tlvdata.send(
                    connection,
                    {'collective': {'error': 'Collective mode not '
                                             'enabled on this '
                                             'system'}})
                return
            if follower:
                # We are not the leader, so fetch the authoritative view
                # from the leader rather than answering from local state
                linfo = cfm.get_collective_member_by_address(currentleader)
                remote = socket.create_connection((currentleader, 13001))
                # NOTE(review): ssl.wrap_socket is absent from the stdlib ssl
                # module as of Python 3.12 - confirm which ssl is in use here
                remote = ssl.wrap_socket(
                    remote, cert_reqs=ssl.CERT_NONE,
                    keyfile='/etc/confluent/privkey.pem',
                    certfile='/etc/confluent/srvcert.pem')
                try:
                    cert = remote.getpeercert(binary_form=True)
                    if not (linfo and util.cert_matches(
                            linfo['fingerprint'], cert)):
                        tlvdata.send(
                            connection,
                            {'error': 'Invalid certificate, '
                                      'redo invitation process'})
                        connection.close()
                        return
                    tlvdata.recv(remote)  # ignore banner
                    tlvdata.recv(remote)  # ignore authpassed: 0
                    tlvdata.send(
                        remote,
                        {'collective': {'operation': 'getinfo',
                                        'name': get_myname()}})
                    collinfo = tlvdata.recv(remote)
                finally:
                    # The leader connection is only needed for this one
                    # exchange; always release it
                    remote.close()
            else:
                collinfo = {}
                populate_collinfo(collinfo)
            try:
                cfm.check_quorum()
                collinfo['quorum'] = True
            except exc.DegradedCollective:
                collinfo['quorum'] = False
            if operation == 'show':
                tlvdata.send(connection, {'collective': collinfo})
            elif operation == 'delete':
                todelete = request['member']
                if (todelete == collinfo['leader'] or
                        todelete in collinfo['active']):
                    tlvdata.send(
                        connection,
                        {'collective': {
                            'error': '{0} is still active, stop the confluent service to remove it'
                                     .format(todelete)}})
                    return
                if todelete not in collinfo['offline']:
                    tlvdata.send(
                        connection,
                        {'collective': {
                            'error': '{0} is not a recognized collective member'
                                     .format(todelete)}})
                    return
                cfm.del_collective_member(todelete)
                tlvdata.send(
                    connection,
                    {'collective': {
                        'status': 'Successfully deleted {0}'.format(
                            todelete)}})
                connection.close()
            return
        if 'invite' == operation:
            try:
                cfm.check_quorum()
            except exc.DegradedCollective:
                tlvdata.send(
                    connection,
                    {'collective': {
                        'error': 'Collective does not have quorum'}})
                return
            # TODO(jjohnson2): Cannot do the invitation if not the head node,
            # the certificate hand-carrying can't work in such a case.
            name = request['name']
            invitation = invites.create_server_invitation(name)
            tlvdata.send(connection,
                         {'collective': {'invitation': invitation}})
            connection.close()
        if 'join' == operation:
            invitation = request['invitation']
            try:
                invitation = base64.b64decode(invitation)
                name, invitation = invitation.split(b'@', 1)
                name = util.stringify(name)
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective': {'status': 'Invalid token format'}})
                connection.close()
                return
            host = request['server']
            try:
                remote = socket.create_connection((host, 13001))
                # This isn't what it looks like.  We do CERT_NONE to disable
                # openssl verification, but then use the invitation as a
                # shared secret to validate the certs as part of the join
                # operation
                remote = ssl.wrap_socket(
                    remote, cert_reqs=ssl.CERT_NONE,
                    keyfile='/etc/confluent/privkey.pem',
                    certfile='/etc/confluent/srvcert.pem')
            except Exception:
                tlvdata.send(
                    connection,
                    {'collective': {
                        'status': 'Failed to connect to {0}'.format(host)}})
                connection.close()
                return
            mycert = util.get_certificate_from_file(
                '/etc/confluent/srvcert.pem')
            cert = remote.getpeercert(binary_form=True)
            proof = base64.b64encode(
                invites.create_client_proof(invitation, mycert, cert))
            tlvdata.recv(remote)  # ignore banner
            tlvdata.recv(remote)  # ignore authpassed: 0
            tlvdata.send(
                remote,
                {'collective': {'operation': 'enroll',
                                'name': name,
                                'hmac': proof}})
            rsp = tlvdata.recv(remote)
            if 'error' in rsp:
                # Release the peer socket here as well, matching the
                # bad-token path below
                remote.close()
                tlvdata.send(connection,
                             {'collective': {'status': rsp['error']}})
                connection.close()
                return
            proof = rsp['collective']['approval']
            proof = base64.b64decode(proof)
            j = invites.check_server_proof(invitation, mycert, cert, proof)
            if not j:
                remote.close()
                tlvdata.send(connection,
                             {'collective': {'status': 'Bad server token'}})
                connection.close()
                return
            tlvdata.send(connection, {'collective': {'status': 'Success'}})
            connection.close()
            currentleader = rsp['collective']['leader']
            # Persist our collective name; the context manager guarantees the
            # handle is closed even if the write fails
            with open('/etc/confluent/cfg/myname', 'w') as f:
                f.write(name)
            log.log({'info': 'Connecting to collective due to join',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader,
                             rsp['collective']['fingerprint'], name)
    if 'enroll' == operation:
        # TODO(jjohnson2): error appropriately when asked to enroll, but the
        # master is elsewhere
        mycert = util.get_certificate_from_file('/etc/confluent/srvcert.pem')
        proof = base64.b64decode(request['hmac'])
        myrsp = invites.check_client_proof(request['name'], mycert,
                                           cert, proof)
        if not myrsp:
            tlvdata.send(connection, {'error': 'Invalid token'})
            connection.close()
            return
        myrsp = base64.b64encode(myrsp)
        fprint = util.get_fingerprint(cert)
        myfprint = util.get_fingerprint(mycert)
        cfm.add_collective_member(get_myname(),
                                  connection.getsockname()[0], myfprint)
        cfm.add_collective_member(request['name'],
                                  connection.getpeername()[0], fprint)
        myleader = get_leader(connection)
        ldrfprint = cfm.get_collective_member_by_address(
            myleader)['fingerprint']
        # Report the same leader whose fingerprint was just looked up so the
        # address and fingerprint in the reply always describe one member
        tlvdata.send(
            connection,
            {'collective': {'approval': myrsp,
                            'fingerprint': ldrfprint,
                            'leader': myleader}})
    if 'assimilate' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not droneinfo:
            tlvdata.send(
                connection,
                {'error': 'Unrecognized leader, '
                          'redo invitation process'})
            return
        if not util.cert_matches(droneinfo['fingerprint'], cert):
            tlvdata.send(
                connection,
                {'error': 'Invalid certificate, '
                          'redo invitation process'})
            return
        if request['txcount'] < cfm._txcount:
            tlvdata.send(
                connection,
                {'error': 'Refusing to be assimilated by inferior '
                          'transaction count',
                 'txcount': cfm._txcount})
            return
        if connecting.active:
            # don't try to connect while actively already trying to connect
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        if (currentleader == connection.getpeername()[0] and
                follower and not follower.dead):
            # if we are happily following this leader already, don't stir
            # the pot
            tlvdata.send(connection, {'status': 0})
            connection.close()
            return
        log.log({'info': 'Connecting in response to assimilation',
                 'subsystem': 'collective'})
        eventlet.spawn_n(connect_to_leader, None, None,
                         leader=connection.getpeername()[0])
        tlvdata.send(connection, {'status': 0})
        connection.close()
    if 'getinfo' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(
                connection,
                {'error': 'Invalid certificate, '
                          'redo invitation process'})
            connection.close()
            return
        collinfo = {}
        populate_collinfo(collinfo)
        tlvdata.send(connection, collinfo)
    if 'connect' == operation:
        drone = request['name']
        droneinfo = cfm.get_collective_member(drone)
        if not (droneinfo and util.cert_matches(droneinfo['fingerprint'],
                                                cert)):
            tlvdata.send(
                connection,
                {'error': 'Invalid certificate, '
                          'redo invitation process'})
            connection.close()
            return
        myself = connection.getsockname()[0]
        if connecting.active:
            tlvdata.send(connection, {'error': 'Connecting right now',
                                      'backoff': True})
            connection.close()
            return
        if myself != get_leader(connection):
            tlvdata.send(
                connection,
                {'error': 'Cannot assimilate, our leader is '
                          'in another castle',
                 'leader': currentleader})
            connection.close()
            return
        if request['txcount'] > cfm._txcount:
            # The peer has the more recent data, so step down and follow it
            retire_as_leader()
            tlvdata.send(
                connection,
                {'error': 'Client has higher transaction count, '
                          'should assimilate me, connecting..',
                 'txcount': cfm._txcount})
            log.log({'info': 'Connecting to leader due to superior '
                             'transaction count',
                     'subsystem': 'collective'})
            eventlet.spawn_n(connect_to_leader, None, None,
                             connection.getpeername()[0])
            connection.close()
            return
        if retrythread:
            retrythread.cancel()
            retrythread = None
        with leader_init:
            cfm.update_collective_address(request['name'],
                                          connection.getpeername()[0])
            tlvdata.send(connection, cfm._dump_keys(None, False))
            tlvdata.send(connection, cfm._cfgstore['collective'])
            tlvdata.send(connection, {})  # cfm.get_globals())
            cfgdata = cfm.ConfigManager(None)._dump_to_json()
            tlvdata.send(connection, {'txcount': cfm._txcount,
                                      'dbsize': len(cfgdata)})
            connection.sendall(cfgdata)
        # tenants are skipped for now, so far unused anyway
        if not cfm.relay_slaved_requests(drone, connection):
            if not retrythread:
                # start a recovery if everyone else seems to have
                # disappeared
                retrythread = eventlet.spawn_after(30 + random.random(),
                                                   start_collective)
def validate_cert(self, certificate):
    """Check ``certificate`` against this handler's current ``https_cert``.

    Broadly speaking this only confirms moment to moment consistency;
    should ``https_cert`` become stricter about what it yields, this check
    becomes meaningful.

    :param certificate: certificate to compare with ``self.https_cert``
    :returns: the result of ``util.cert_matches`` for the two
    """
    return util.cert_matches(
        util.get_fingerprint(self.https_cert), certificate)
def discover_node(cfg, handler, info, nodename, manual):
    """Apply the node's discovery policy to an identified device.

    The device is recorded as known for ``nodename`` and removed from the
    unknown set, then ``discovery.policy`` decides whether the handler is
    asked to configure it.

    :param cfg: configuration manager used to read and store attributes
    :param handler: device handler for the detected endpoint
    :param info: discovery info dict; 'discostatus', 'discofailure' and
                 'nodename' are updated to reflect the outcome
    :param nodename: node the device has been identified as
    :param manual: True when explicitly requested, which bypasses the policy
                   checks and re-raises configuration errors
    :returns: True when treated as discovered, False when refused or failed,
              or whatever ``do_pxe_discovery`` returns for PXE handlers
    """
    hwaddr = info['hwaddr']
    known_nodes[nodename][hwaddr] = info
    if hwaddr in unknown_info:
        del unknown_info[hwaddr]
    info['discostatus'] = 'identified'
    nodeattribs = cfg.get_node_attributes(
        [nodename],
        ('discovery.policy', 'pubkeys.tls_hardwaremanager')
    ).get(nodename, {})
    policy = nodeattribs.get('discovery.policy', {}).get('value', None)
    policies = set(('' if policy is None else policy).split(','))
    lastfp = nodeattribs.get(
        'pubkeys.tls_hardwaremanager', {}).get('value', None)
    # TODO(jjohnson2): permissive requires we guarantee storage of
    # the pubkeys, which is deferred for a little bit
    # Also, 'secure', when we have the needed infrastructure done
    # in some product or another.
    curruuid = info.get('uuid', False)

    if 'pxe' in policies and info['handler'] == pxeh:
        return do_pxe_discovery(cfg, handler, info, manual, nodename,
                                policies)

    if ('permissive' in policies and handler.https_supported and lastfp
            and not util.cert_matches(lastfp, handler.https_cert)
            and not manual):
        # With a permissive policy, do not discover a replacement device
        info['discofailure'] = 'fingerprint'
        log.log({
            'info': 'Detected replacement of {0} with existing '
                    'fingerprint and permissive discovery policy, not '
                    'doing discovery unless discovery.policy=open or '
                    'pubkeys.tls_hardwaremanager attribute is cleared '
                    'first'.format(nodename)
        })
        return False

    if not (policies & {'open', 'permissive'} or manual):
        log.log({
            'info': 'Detected {0}, but discovery.policy is not set to a '
                    'value allowing discovery (open or permissive)'.format(
                        nodename)
        })
        info['discofailure'] = 'policy'
        return False

    info['nodename'] = nodename
    if info['handler'] == pxeh:
        return do_pxe_discovery(cfg, handler, info, manual, nodename,
                                policies)
    if manual or not util.cert_matches(lastfp, handler.https_cert):
        # only 'discover' if it is not the same as last time
        try:
            handler.config(nodename)
        except Exception as e:
            info['discofailure'] = 'bug'
            if manual:
                raise
            log.log({
                'error': 'Error encountered trying to set up {0}, {1}'.format(
                    nodename, str(e))
            })
            traceback.print_exc()
            return False
        updates = {}
        for infokey, attribute in (('uuid', 'id.uuid'),
                                   ('serialnumber', 'id.serial'),
                                   ('modelnumber', 'id.model')):
            if infokey in info:
                updates[attribute] = info[infokey]
        if handler.https_cert:
            updates['pubkeys.tls_hardwaremanager'] = util.get_fingerprint(
                handler.https_cert, 'sha256')
        if updates:
            cfg.set_node_attributes({nodename: updates})
        log.log({
            'info': 'Discovered {0} ({1})'.format(nodename, handler.devname)
        })
    info['discostatus'] = 'discovered'
    # Re-examine anything that was waiting on this uuid being resolved
    for pending in pending_by_uuid.get(curruuid, []):
        eventlet.spawn_n(_recheck_single_unknown_info, cfg, pending)
    return True