def test_clear_lock_skip_after_steal(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with mock.patch.object(consistency_db, 'MAX_LOCK_WAIT_TIME', new=0):
        handler2.read_for_update()
        before = self._get_hash_from_handler_db(handler1)
        # handler1 should not clear handler2's lock
        handler1.clear_lock()
        self.assertEqual(before, self._get_hash_from_handler_db(handler1))
def test_failure_to_steal_lock(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with contextlib.nested(
        mock.patch.object(consistency_db, 'MAX_LOCK_WAIT_TIME'),
        mock.patch.object(handler2, '_optimistic_update_hash_record',
                          side_effect=[False, True])
    ) as (mlock, oplock):
        # handler2 will go through 2 iterations since the lock will fail on
        # the first attempt
        mlock.__lt__.side_effect = [False, True, False, True]
        handler2.read_for_update()
        self.assertEqual(4, mlock.__lt__.call_count)
        self.assertEqual(2, oplock.call_count)
def test_take_lock_from_other(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with mock.patch.object(consistency_db, 'MAX_LOCK_WAIT_TIME') as mlock:
        # make handler2 wait for only one iteration
        mlock.__lt__.side_effect = [False, True]
        handler2.read_for_update()
        # once MAX LOCK exceeded, comparisons should stop due to lock steal
        self.assertEqual(2, mlock.__lt__.call_count)
        dbentry = self._get_hash_from_handler_db(handler1)
        # handler2 should have the lock
        self.assertIn(handler2.lock_marker, dbentry)
        self.assertNotIn(handler1.lock_marker, dbentry)
        # lock protection only blocks read_for_update, anyone can change
        handler1.put_hash('H1')
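# The three tests above exercise the HashHandler locking protocol: the single
# consistency row holds either '<hash>' (unlocked) or '<lock_marker><hash>'
# (locked), where each handler's lock_marker embeds its own random ID.
# read_for_update() prepends the marker via an optimistic update, retrying
# until MAX_LOCK_WAIT_TIME expires and the lock is forcibly stolen;
# clear_lock() only strips a marker this handler owns, which is why
# handler1.clear_lock() in the first test cannot clobber handler2's lock.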
def initialize(self):
    LOG.debug('Initializing driver')
    # register plugin config opts
    pl_config.register_config()
    self.evpool = eventlet.GreenPool(cfg.CONF.RESTPROXY.thread_pool_size)
    LOG.debug("Force topology sync if consistency hash is empty")
    hash_handler = cdb.HashHandler()
    cur_hash = hash_handler.read_for_update()
    if not cur_hash:
        hash_handler.put_hash('initial:hash,code')
        LOG.debug("Force topology sync Done")
    # init network ctrl connections
    self.servers = servermanager.ServerPool()
    self.servers.get_topo_function = self._get_all_data_auto
    self.segmentation_types = ', '.join(cfg.CONF.ml2.type_drivers)
    # Track hosts running IVS to avoid excessive calls to the backend
    self.vswitch_host_cache = {}
    self.setup_sg_rpc_callbacks()
    self.unsupported_vnic_types = [
        portbindings.VNIC_DIRECT, portbindings.VNIC_DIRECT_PHYSICAL
    ]
    LOG.debug("Initialization done")
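# NOTE: the 'initial:hash,code' sentinel written above deliberately contains
# a comma: as the NOTE in ServerPool.rest_call below points out, a hash
# without a comma is ignored by the backend, so seeding an empty consistency
# table with this value forces a full topology sync on the first proxied
# REST call.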
def test_clear_lock(self):
    handler = consistency_db.HashHandler()
    handler.put_hash('SOMEHASH')
    handler.read_for_update()  # lock the table
    self.assertEqual(handler.lock_marker + 'SOMEHASH',
                     self._get_hash_from_handler_db(handler))
    handler.clear_lock()
    self.assertEqual('SOMEHASH', self._get_hash_from_handler_db(handler))
def test_handler_already_holding_lock(self):
    handler = consistency_db.HashHandler()
    handler.read_for_update()  # lock the table
    with mock.patch.object(handler._FACADE, 'get_engine') as ge:
        handler.read_for_update()
        # get engine should not have been called because no update
        # should have been made
        self.assertFalse(ge.called)
def test_db_duplicate_on_insert(self):
    handler = consistency_db.HashHandler()
    with mock.patch.object(handler.session, 'add',
                           side_effect=[db_exc.DBDuplicateEntry, '']
                           ) as add_mock:
        handler.read_for_update()
        # duplicate insert failure should result in retry
        self.assertEqual(2, add_mock.call_count)
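# The DBDuplicateEntry above simulates two handlers racing to insert the very
# first consistency row: the loser's INSERT collides with the winner's, and
# read_for_update() is expected to swallow the error and retry (hence the
# asserted second add() call) rather than bubble it up.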
def test_delete_failure_sets_bad_hash(self):
    pl = directory.get_plugin()
    hash_handler = consistency_db.HashHandler()
    with mock.patch(
        SERVERMANAGER + '.ServerProxy.rest_call',
        return_value=(httplib.INTERNAL_SERVER_ERROR, 0, 0, 0)
    ):
        # a failed delete call should put a bad hash in the DB
        pl.servers.rest_call('DELETE', '/', '', None, [])
        self.assertEqual('INCONSISTENT,INCONSISTENT',
                         hash_handler.read_for_update())
def test_hash_handle_lock_no_initial_record(self):
    handler = consistency_db.HashHandler()
    h1 = handler.read_for_update()
    # return to caller should be empty even with lock in DB
    self.assertFalse(h1)
    # db should have a lock marker
    self.assertEqual(handler.lock_marker,
                     self._get_hash_from_handler_db(handler))
    # an entry should clear the lock
    handler.put_hash('DIGEST')
    self.assertEqual('DIGEST', self._get_hash_from_handler_db(handler))
def test_hash_handle_lock_existing_record(self):
    handler = consistency_db.HashHandler()
    handler.put_hash('DIGEST')  # set initial hash
    h1 = handler.read_for_update()
    self.assertEqual('DIGEST', h1)
    self.assertEqual(handler.lock_marker + 'DIGEST',
                     self._get_hash_from_handler_db(handler))
    # make sure update works
    handler.put_hash('DIGEST2')
    self.assertEqual('DIGEST2', self._get_hash_from_handler_db(handler))
def keep_updating_lock(self):
    topo_index = ''.join(
        random.choice(string.ascii_uppercase + string.digits)
        for _ in range(2))
    # topology sync will lock the consistency hash table
    # the lock starts with TOPO
    prefix = "TOPO" + topo_index
    while self._topo_sync_in_progress:
        handler = cdb.HashHandler(prefix=prefix, length=4)
        new = handler.lock_marker + "initial:hash,code"
        handler.put_hash(new)
        time.sleep(2)
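# A minimal sketch of how other code can tell that this refresh loop is still
# alive (it mirrors the check in _update_tenant_cache below, which uses the
# HashHandler internals _get_current_record and _get_lock_owner):
#
#   handler = cdb.HashHandler()
#   res = handler._get_current_record()
#   if res:
#       owner = handler._get_lock_owner(res.hash)
#       if owner and "TOPO" in owner:
#           pass  # topology sync in progress; don't force another one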
def test_update_hit_no_records(self):
    handler = consistency_db.HashHandler()
    # set initial hash so update will be required
    handler.put_hash('DIGEST')
    with mock.patch.object(handler._FACADE, 'get_engine') as ge:
        conn = ge.return_value.begin.return_value.__enter__.return_value
        firstresult = mock.Mock()
        # a rowcount of 0 simulates the effect of another db client
        # updating the same record the handler was trying to update
        firstresult.rowcount = 0
        secondresult = mock.Mock()
        secondresult.rowcount = 1
        conn.execute.side_effect = [firstresult, secondresult]
        handler.read_for_update()
        # update should have been called again after the failure
        self.assertEqual(2, conn.execute.call_count)
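# For context, the rowcount check mocked above guards an optimistic
# compare-and-swap; roughly (a sketch only — the real table and column names
# live in consistency_db and are assumptions here):
#
#   UPDATE consistencyhashes SET hash = :new WHERE hash = :old
#
# A rowcount of 1 means the swap happened atomically; 0 means another client
# changed the row first, so read_for_update() re-reads and issues the UPDATE
# again.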
def _update_tenant_cache(self, reconcile=True):
    try:
        auth = v3.Password(auth_url=self.auth_url,
                           username=self.auth_user,
                           password=self.auth_password,
                           project_name=self.auth_tenant,
                           user_domain_id=self.user_domain_id,
                           project_domain_id=self.project_domain_id)
        sess = session.Session(auth=auth)
        keystone_client = ksclient.Client(session=sess)
        tenants = keystone_client.projects.list()
        new_cached_tenants = {tn.id: tn.name for tn in tenants}
        # Add SERVICE_TENANT to handle hidden network for VRRP
        new_cached_tenants[SERVICE_TENANT] = SERVICE_TENANT
        LOG.debug("New TENANTS: %s \nPrevious Tenants %s",
                  new_cached_tenants, self.keystone_tenants)
        diff = DictDiffer(new_cached_tenants, self.keystone_tenants)
        self.keystone_tenants = new_cached_tenants
        if reconcile:
            for tenant_id in diff.added():
                LOG.debug("TENANT create: id %s name %s",
                          tenant_id, self.keystone_tenants[tenant_id])
                self._rest_create_tenant(tenant_id)
            for tenant_id in diff.removed():
                LOG.debug("TENANT delete: id %s", tenant_id)
                self.rest_delete_tenant(tenant_id)
            if diff.changed():
                hash_handler = cdb.HashHandler()
                res = hash_handler._get_current_record()
                if res:
                    lock_owner = hash_handler._get_lock_owner(res.hash)
                    if lock_owner and "TOPO" in lock_owner:
                        # topology sync is still going on
                        return True
                LOG.debug("TENANT changed: force topo sync")
                hash_handler.put_hash('initial:hash,code')
        return True
    except Exception:
        LOG.exception(_LE("Encountered an error syncing with keystone."))
        return False
def rest_call(self, action, resource, data, headers, ignore_codes,
              timeout=False):
    context = self.get_context_ref()
    if context:
        # include the requesting context information if available
        cdict = context.to_dict()
        # remove the auth token so it's not present in debug logs on the
        # backend controller
        cdict.pop('auth_token', None)
        headers[REQ_CONTEXT_HEADER] = jsonutils.dumps(cdict)
    hash_handler = cdb.HashHandler()
    good_first = sorted(self.servers, key=lambda x: x.failed)
    first_response = None
    for active_server in good_first:
        LOG.debug("ServerProxy: %(action)s to servers: "
                  "%(server)r, %(resource)s",
                  {'action': action,
                   'server': (active_server.server, active_server.port),
                   'resource': resource})
        for x in range(HTTP_SERVICE_UNAVAILABLE_RETRY_COUNT + 1):
            ret = active_server.rest_call(action, resource, data, headers,
                                          timeout,
                                          reconnect=self.always_reconnect,
                                          hash_handler=hash_handler)
            if ret[0] != httplib.SERVICE_UNAVAILABLE:
                break
            time.sleep(HTTP_SERVICE_UNAVAILABLE_RETRY_INTERVAL)

        # If inconsistent, do a full synchronization
        if ret[0] == httplib.CONFLICT:
            if not self.get_topo_function:
                raise cfg.Error(_('Server requires synchronization, '
                                  'but no topology function was defined.'))
            LOG.info(_LI("ServerProxy: HashConflict detected with request "
                         "%(action)s %(resource)s Starting Topology sync"),
                     {'action': action, 'resource': resource})
            self._topo_sync_in_progress = True
            eventlet.spawn_n(self.keep_updating_lock)
            try:
                data = self.get_topo_function(
                    **self.get_topo_function_args)
                if data:
                    data = self._sanitize_data_for_topo_sync(data)
                    ret_ts = active_server.rest_call('POST', TOPOLOGY_PATH,
                                                     data, timeout=None)
                    if self.server_failure(ret_ts, ignore_codes):
                        LOG.error(_LE("ServerProxy: Topology sync failed"))
                        raise RemoteRestError(reason=ret_ts[2],
                                              status=ret_ts[0])
            finally:
                LOG.info(_LI("ServerProxy: Topology sync completed"))
                self._topo_sync_in_progress = False
            if data is None:
                return None

        # Store the first response as the error to be bubbled up to the
        # user since it was a good server. Subsequent servers will most
        # likely be cluster slaves and won't have a useful error for the
        # user (e.g. 302 redirect to master)
        if not first_response:
            first_response = ret
        if not self.server_failure(ret, ignore_codes):
            active_server.failed = False
            LOG.debug("ServerProxy: %(action)s succeeded for servers: "
                      "%(server)r Response: %(response)s",
                      {'action': action,
                       'server': (active_server.server,
                                  active_server.port),
                       'response': ret[3]})
            return ret
        else:
            LOG.warning(_LW('ServerProxy: %(action)s failure for servers:'
                            '%(server)r Response: %(response)s'),
                        {'action': action,
                         'server': (active_server.server,
                                    active_server.port),
                         'response': ret[3]})
            LOG.warning(_LW("ServerProxy: Error details: "
                            "status=%(status)d, reason=%(reason)r, "
                            "ret=%(ret)s, data=%(data)r"),
                        {'status': ret[0], 'reason': ret[1],
                         'ret': ret[2], 'data': ret[3]})
            active_server.failed = True

    # A failure on a delete means the object is gone from Neutron but not
    # from the controller. Set the consistency hash to a bad value to
    # trigger a sync on the next check.
    # NOTE: The hash must have a comma in it otherwise it will be ignored
    # by the backend.
    if action == 'DELETE':
        hash_handler.put_hash('INCONSISTENT,INCONSISTENT')

    # All servers failed, reset server list and try again next time
    LOG.error(_LE('ServerProxy: %(action)s failure for all servers: '
                  '%(server)r'),
              {'action': action,
               'server': tuple((s.server, s.port) for s in self.servers)})
    return first_response
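# A small worked example of the failover ordering used above (healthy/broken
# are hypothetical server objects): since False sorts before True in Python,
# sorting on the boolean 'failed' flag tries previously healthy servers first.
#
#   >>> healthy.failed, broken.failed = False, True
#   >>> sorted([broken, healthy], key=lambda s: s.failed)
#   [healthy, broken]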
def rest_call(self, action, resource, data='', headers=None,
              timeout=False, reconnect=False, hash_handler=None):
    uri = self.base_uri + resource
    body = jsonutils.dumps(data)
    headers = headers or {}
    headers['Content-type'] = 'application/json'
    headers['Accept'] = 'application/json'
    headers['NeutronProxy-Agent'] = self.name
    headers['Instance-ID'] = self.neutron_id
    headers['Orchestration-Service-ID'] = ORCHESTRATION_SERVICE_ID
    if hash_handler:
        # this will be excluded on calls that don't need hashes
        # (e.g. topology sync, capability checks)
        headers[HASH_MATCH_HEADER] = hash_handler.read_for_update()
    else:
        hash_handler = cdb.HashHandler()
    # TODO(kevinbenton): Re-enable keep-alive in a thread-safe fashion.
    # When multiple workers are enabled the saved connection gets mangled
    # by multiple threads so we always reconnect.
    if 'keep-alive' in self.capabilities and False:
        headers['Connection'] = 'keep-alive'
    else:
        reconnect = True
    if self.auth:
        headers['Authorization'] = self.auth
    LOG.debug("ServerProxy: server=%(server)s, port=%(port)d, "
              "ssl=%(ssl)r",
              {'server': self.server, 'port': self.port,
               'ssl': self.ssl})
    LOG.debug("ServerProxy: resource=%(resource)s, data=%(data)r, "
              "headers=%(headers)r, action=%(action)s",
              {'resource': resource, 'data': data,
               'headers': headers, 'action': action})

    # unspecified timeout is False because a timeout can be specified as
    # None to indicate no timeout.
    if timeout is False:
        timeout = self.timeout

    if timeout != self.timeout:
        # need a new connection if timeout has changed
        reconnect = True

    if not self.currentconn or reconnect:
        if self.currentconn:
            self.currentconn.close()
        if self.ssl:
            self.currentconn = HTTPSConnectionWithValidation(
                self.server, self.port, timeout=timeout)
            if self.currentconn is None:
                LOG.error(_LE('ServerProxy: Could not establish HTTPS '
                              'connection'))
                return 0, None, None, None
            self.currentconn.combined_cert = self.combined_cert
        else:
            self.currentconn = httplib.HTTPConnection(
                self.server, self.port, timeout=timeout)
            if self.currentconn is None:
                LOG.error(_LE('ServerProxy: Could not establish HTTP '
                              'connection'))
                return 0, None, None, None

    try:
        self.currentconn.request(action, uri, body, headers)
        response = self.currentconn.getresponse()
        respstr = response.read()
        respdata = respstr
        if response.status in self.success_codes:
            hash_value = response.getheader(HASH_MATCH_HEADER)
            # don't clear hash from DB if a hash header wasn't present
            if hash_value is not None:
                # BVS-6979: race-condition(#1) set sync=false so that
                # keep_updating_thread doesn't squash updated HASH
                # Delay is required in-case the loop is already executing
                if resource == TOPOLOGY_PATH:
                    self._topo_sync_in_progress = False
                    time.sleep(0.10)
                hash_handler.put_hash(hash_value)
            else:
                hash_handler.clear_lock()
            try:
                respdata = jsonutils.loads(respstr)
            except ValueError:
                # response was not JSON, ignore the exception
                pass
        else:
            # BVS-6979: race-condition(#2) on HashConflict, don't unlock
            # to ensure topo_sync is scheduled next (it force grabs lock)
            if response.status != httplib.CONFLICT:
                # release lock so others don't have to wait for timeout
                hash_handler.clear_lock()
        ret = (response.status, response.reason, respstr, respdata)
    except httplib.HTTPException:
        # If we were using a cached connection, try again with a new one.
        with excutils.save_and_reraise_exception() as ctxt:
            self.currentconn.close()
            if reconnect:
                # if reconnect is true, this was on a fresh connection so
                # reraise since this server seems to be broken
                ctxt.reraise = True
            else:
                # if reconnect is false, it was a cached connection so
                # try one more time before re-raising
                ctxt.reraise = False
                return self.rest_call(action, resource, data, headers,
                                      timeout=timeout, reconnect=True)
    except (socket.timeout, socket.error) as e:
        self.currentconn.close()
        LOG.error(_LE('ServerProxy: %(action)s failure, %(e)r'),
                  {'action': action, 'e': e})
        ret = 0, None, None, None
    LOG.debug("ServerProxy: status=%(status)d, reason=%(reason)r, "
              "ret=%(ret)s, data=%(data)r",
              {'status': ret[0], 'reason': ret[1],
               'ret': ret[2], 'data': ret[3]})
    return ret
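# For reference, rest_call above always resolves to the 4-tuple
# (status, reason, body_string, parsed_body); transport-level failures return
# (0, None, None, None), so callers such as ServerPool.rest_call can treat a
# status of 0 as "no HTTP response at all".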