def test_lock_check_ts_false_lock_clash(self):
    # 2 threads try to lock the DB at the same time when check_ts is False
    # and no thread holds the lock
    handler1 = consistency_db.HashHandler()
    h1 = handler1.lock()
    self.assertTrue(h1)
    handler1.unlock()
    self.assertEqual(handler1.lock_ts,
                     self._get_hash_from_handler_db(handler1))

    handler2 = consistency_db.HashHandler()
    with mock.patch.object(handler2._FACADE, 'get_engine') as ge, \
            mock.patch(CONSISTENCYDB + '.eventlet.sleep',
                       side_effect=[None]) as emock:
        conn = ge.return_value.begin.return_value.__enter__.return_value
        firstresult = mock.Mock()
        # a rowcount of 0 simulates the effect of another db client
        # updating the same record the handler was trying to update
        firstresult.rowcount = 0
        secondresult = mock.Mock()
        secondresult.rowcount = 1
        conn.execute.side_effect = [firstresult, secondresult]
        h2 = handler2.lock(check_ts=False)
        self.assertTrue(h2)
        # update should have been called again after the failure
        self.assertEqual(2, conn.execute.call_count)
        # sleep should have been called once, after the first failed update
        emock.assert_called_once()
def test_unlock_set_prev_ts(self):
    handler1 = consistency_db.HashHandler()
    handler1.lock()
    self.assertEqual(handler1.lock_marker,
                     self._get_hash_from_handler_db(handler1))
    handler1.unlock()

    # the first lock-unlock cycle is done. now comes a second call with
    # check_ts=False
    handler2 = consistency_db.HashHandler()
    h2 = handler2.lock(check_ts=False)
    self.assertTrue(h2)
    self.assertEqual(handler1.lock_ts, handler2.prev_lock_ts)

    # now, assuming an exception occurred during topo_sync, call
    # handler2.unlock(set_prev_ts=True)
    handler2.unlock(set_prev_ts=True)
    # the hash in consistency_db will be the previous handler's lock_ts
    self.assertEqual(handler1.lock_ts,
                     self._get_hash_from_handler_db(handler2))

    # try unlock again on the same handler2 - it should have no effect,
    # since unlock(set_prev_ts) removed the TOPO_SYNC marker. this
    # simulates unlock() being called in the finally block of
    # force_topo_sync()
    handler2.unlock()
    self.assertEqual(handler1.lock_ts,
                     self._get_hash_from_handler_db(handler2))
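# A summary sketch of the timestamp protocol the lock tests above and below
# exercise, inferred from their assertions; the exact internals live in
# consistency_db.HashHandler and may differ:
#
#   lock()                   -> writes lock_marker (a prefix plus this
#                               handler's lock id) into the hash column and
#                               remembers the previous value in prev_lock_ts
#                               ('0' when the DB was empty)
#   unlock()                 -> replaces the marker with this handler's
#                               lock_ts
#   unlock(set_prev_ts=True) -> restores prev_lock_ts instead, so a failed
#                               topo_sync does not advance the consistency
#                               timestamp
#   lock(check_ts=True)      -> returns False if the stored timestamp is
#                               younger than TOPO_SYNC_EXPIRED_SECS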
def test_clear_lock_skip_after_steal(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with mock.patch.object(consistency_db, 'MAX_LOCK_WAIT_TIME', new=0):
        handler2.read_for_update()
    before = self._get_hash_from_handler_db(handler1)
    # handler1 should not clear handler2's lock
    handler1.clear_lock()
    self.assertEqual(before, self._get_hash_from_handler_db(handler1))
def test_failure_to_steal_lock(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with mock.patch.object(consistency_db,
                           'MAX_LOCK_WAIT_TIME') as mlock, \
            mock.patch.object(handler2, '_optimistic_update_hash_record',
                              side_effect=[False, True]) as oplock:
        # handler2 will go through 2 iterations since the lock will fail
        # on the first attempt
        mlock.__lt__.side_effect = [False, True, False, True]
        handler2.read_for_update()
        self.assertEqual(4, mlock.__lt__.call_count)
        self.assertEqual(2, oplock.call_count)
def test_lock_check_ts_true_prev_lock_exists(self):
    handler1 = consistency_db.HashHandler()
    h1 = handler1.lock()
    self.assertTrue(h1)
    self.assertEqual(handler1.lock_marker,
                     self._get_hash_from_handler_db(handler1))

    # the 2nd thread comes in just 10 milliseconds after the first one,
    # which still holds the lock, so expired = False
    timestamp_2 = float(handler1.lock_ts) + 10
    handler2 = consistency_db.HashHandler(timestamp_ms=timestamp_2)
    h2 = handler2.lock()
    self.assertFalse(h2)
    self.assertEqual(handler1.lock_ts, handler2.prev_lock_ts)
def test_take_lock_from_other(self):
    handler1 = consistency_db.HashHandler()
    handler1.read_for_update()  # lock the table
    handler2 = consistency_db.HashHandler()
    with mock.patch.object(consistency_db, 'MAX_LOCK_WAIT_TIME') as mlock:
        # make handler2 wait for only one iteration
        mlock.__lt__.side_effect = [False, True]
        handler2.read_for_update()
        # once MAX_LOCK_WAIT_TIME is exceeded, comparisons should stop
        # because of the lock steal
        self.assertEqual(2, mlock.__lt__.call_count)
        dbentry = self._get_hash_from_handler_db(handler1)
        # handler2 should have the lock
        self.assertIn(handler2.lock_marker, dbentry)
        self.assertNotIn(handler1.lock_marker, dbentry)
        # lock protection only blocks read_for_update; anyone can change
        # the hash
        handler1.put_hash('H1')
def test_lock_unlock(self):
    handler = consistency_db.HashHandler()
    handler.lock()  # lock the table
    self.assertEqual(handler.lock_marker,
                     self._get_hash_from_handler_db(handler))
    handler.unlock()
    self.assertEqual(handler.lock_ts,
                     self._get_hash_from_handler_db(handler))
def test_clear_lock(self):
    handler = consistency_db.HashHandler()
    handler.put_hash('SOMEHASH')
    handler.read_for_update()  # lock the table
    self.assertEqual(handler.lock_marker + 'SOMEHASH',
                     self._get_hash_from_handler_db(handler))
    handler.clear_lock()
    self.assertEqual('SOMEHASH',
                     self._get_hash_from_handler_db(handler))
def test_handler_already_holding_lock(self):
    handler = consistency_db.HashHandler()
    handler.read_for_update()  # lock the table
    with mock.patch.object(handler._FACADE, 'get_engine') as ge:
        handler.read_for_update()
        # get_engine should not have been called because no update
        # should have been made
        self.assertFalse(ge.called)
def test_db_duplicate_on_insert(self):
    handler = consistency_db.HashHandler()
    with mock.patch.object(handler.session, 'add',
                           side_effect=[db_exc.DBDuplicateEntry,
                                        '']) as add_mock:
        handler.lock()
        # a duplicate insert failure should result in a retry
        self.assertEqual(2, add_mock.call_count)
def test_delete_failure_sets_bad_hash(self):
    pl = directory.get_plugin()
    hash_handler = consistency_db.HashHandler()
    with mock.patch(SERVERMANAGER + '.ServerProxy.rest_call',
                    return_value=(httplib.INTERNAL_SERVER_ERROR,
                                  0, 0, 0)):
        # a failed delete call should put a bad hash in the DB
        pl.servers.rest_call('DELETE', '/', '', None, [])
        self.assertEqual('INCONSISTENT,INCONSISTENT',
                         hash_handler.read_for_update())
def test_lock_check_ts_false_prev_lock_exists(self):
    handler1 = consistency_db.HashHandler()
    h1 = handler1.lock()
    self.assertTrue(h1)
    self.assertEqual(handler1.lock_marker,
                     self._get_hash_from_handler_db(handler1))
    self.assertEqual('0', handler1.prev_lock_ts)

    hh2_ts_hh1_ts_plus_1780 = float(handler1.lock_ts) + 1780
    handler2 = consistency_db.HashHandler(
        hash_id='1', timestamp_ms=hh2_ts_hh1_ts_plus_1780)
    with mock.patch(CONSISTENCYDB + '.eventlet.sleep',
                    side_effect=[Exception]) as emock:
        try:
            handler2.lock(check_ts=False)
        except Exception:
            pass
    self.assertEqual(1, emock.call_count)
    self.assertEqual(handler1.lock_ts, handler2.prev_lock_ts)
def dblock_mark_toposync_started(self, orig_hash_handler):
    """Insert a DBLock with the TopoSync start marker.

    marker = 'TOPO' + the original call's lock ID

    :return: a new HashHandler for the TopoSync
    """
    prefix = cdb.DBLOCK_PREFIX_TOPO + orig_hash_handler.random_lock_id
    handler = cdb.HashHandler(prefix=prefix)
    new = handler.lock_marker + "initial:hash,code"
    handler.put_hash(new)
    return handler
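# Illustrative sketch (not production code) of how the TopoSync marker above
# composes, assuming DBLOCK_PREFIX_TOPO is the literal 'TOPO' and a handler
# whose random_lock_id is 'abc123'; the exact framing of lock_marker lives in
# consistency_db:
#
#   prefix   = 'TOPO' + 'abc123'                  # 'TOPOabc123'
#   db_value = handler.lock_marker + 'initial:hash,code'
#
# _update_tenant_cache() later looks for DBLOCK_PREFIX_TOPO in the lock
# owner to detect that a topology sync is still in progress.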
def test_hash_handle_lock_no_initial_record(self):
    handler = consistency_db.HashHandler()
    h1 = handler.read_for_update()
    # the value returned to the caller should be empty even with the lock
    # in the DB
    self.assertFalse(h1)
    # the db should have a lock marker
    self.assertEqual(handler.lock_marker,
                     self._get_hash_from_handler_db(handler))
    # writing an entry should clear the lock
    handler.put_hash('DIGEST')
    self.assertEqual('DIGEST', self._get_hash_from_handler_db(handler))
def test_lock_check_ts_false_prev_lock_not_expired(self):
    handler1 = consistency_db.HashHandler()
    h1 = handler1.lock()
    self.assertTrue(h1)
    self.assertEqual(handler1.lock_marker,
                     self._get_hash_from_handler_db(handler1))
    handler1.unlock()
    self.assertEqual(handler1.lock_ts,
                     self._get_hash_from_handler_db(handler1))

    # thread 1 has executed the complete lock-unlock cycle.
    # thread 2 now tries to get the lock with check_ts False.
    # TOPO_SYNC_EXPIRED_SECS = 1800
    hh2_ts_under_limit = float(handler1.lock_ts) + 1000
    handler2 = consistency_db.HashHandler(hash_id=1,
                                          timestamp_ms=hh2_ts_under_limit)
    h2 = handler2.lock(check_ts=False)
    self.assertTrue(h2)
    self.assertEqual(handler1.lock_ts, handler2.prev_lock_ts)
def test_lock_check_ts_true_prev_lock_expired(self):
    handler1 = consistency_db.HashHandler()
    h1 = handler1.lock()
    self.assertTrue(h1)
    self.assertEqual(handler1.lock_marker,
                     self._get_hash_from_handler_db(handler1))
    handler1.unlock()
    self.assertEqual(handler1.lock_ts,
                     self._get_hash_from_handler_db(handler1))

    # thread 1 has executed the complete lock-unlock cycle.
    # thread 2 now tries to get the lock with check_ts True after the
    # previous lock has expired. TOPO_SYNC_EXPIRED_SECS is reduced to 1
    # second for testing only.
    time.sleep(1)
    handler2 = consistency_db.HashHandler()
    consistency_db.TOPO_SYNC_EXPIRED_SECS = 1
    h2 = handler2.lock()
    self.assertTrue(h2)
    self.assertEqual(handler1.lock_ts, handler2.prev_lock_ts)
def test_hash_handle_lock_existing_record(self):
    handler = consistency_db.HashHandler()
    handler.put_hash('DIGEST')  # set the initial hash
    h1 = handler.read_for_update()
    self.assertEqual('DIGEST', h1)
    self.assertEqual(handler.lock_marker + 'DIGEST',
                     self._get_hash_from_handler_db(handler))
    # make sure the update works
    handler.put_hash('DIGEST2')
    self.assertEqual('DIGEST2', self._get_hash_from_handler_db(handler))
def _update_tenant_cache(self, reconcile=True, ratelimit=False):
    if ratelimit is True and self._last_keystone_sync_time is not None:
        if (time.time() - self._last_keystone_sync_time <=
                KEYSTONE_SYNC_RATE_LIMIT):
            return

    try:
        auth = v3.Password(auth_url=self.auth_url,
                           username=self.auth_user,
                           password=self.auth_password,
                           project_name=self.auth_tenant,
                           user_domain_name=self.user_domain_name,
                           project_domain_name=self.project_domain_name)
        sess = session.Session(auth=auth)
        keystone_client = ksclient.Client(session=sess)
        tenants = keystone_client.projects.list()
        new_cached_tenants = {tn.id: Util.format_resource_name(tn.name)
                              for tn in tenants}
        # add SERVICE_TENANT to handle the hidden network for VRRP
        new_cached_tenants[SERVICE_TENANT] = SERVICE_TENANT

        LOG.debug("New TENANTS: %s \nPrevious Tenants %s",
                  new_cached_tenants, self.keystone_tenants)
        diff = DictDiffer(new_cached_tenants, self.keystone_tenants)
        self.keystone_tenants = new_cached_tenants

        if reconcile:
            for tenant_id in diff.added():
                LOG.debug("TENANT create: id %s name %s",
                          tenant_id, self.keystone_tenants[tenant_id])
                self._rest_create_tenant(tenant_id)
            for tenant_id in diff.removed():
                LOG.debug("TENANT delete: id %s", tenant_id)
                self.rest_delete_tenant(tenant_id)
            if diff.changed():
                hash_handler = cdb.HashHandler()
                res = hash_handler._get_current_record()
                if res:
                    lock_owner = hash_handler._get_lock_owner(res.hash)
                    if lock_owner and cdb.DBLOCK_PREFIX_TOPO in lock_owner:
                        # a topology sync is still going on
                        return True
                LOG.debug("TENANT changed: force topo sync")
                hash_handler.put_hash('initial:hash,code')
        return True
    except Exception:
        LOG.exception("Encountered an error syncing with keystone.")
        return False
    finally:
        self._last_keystone_sync_time = time.time()
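# A minimal sketch of the DictDiffer helper used above, following the common
# dict-comparison recipe. The real class is defined elsewhere in this
# codebase; the name DictDifferSketch and the details below are assumptions
# for illustration only.
class DictDifferSketch(object):
    def __init__(self, current_dict, past_dict):
        self.current_dict, self.past_dict = current_dict, past_dict
        self.current_keys = set(current_dict.keys())
        self.past_keys = set(past_dict.keys())
        self.intersect = self.current_keys & self.past_keys

    def added(self):
        # keys present now but not before, e.g. newly created tenants
        return self.current_keys - self.intersect

    def removed(self):
        # keys present before but not now, e.g. deleted tenants
        return self.past_keys - self.intersect

    def changed(self):
        # keys whose values differ, e.g. a renamed tenant
        return set(k for k in self.intersect
                   if self.past_dict[k] != self.current_dict[k])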
def test_lock_no_initial_record(self):
    handler = consistency_db.HashHandler()
    h1 = handler.lock()
    # a lock() request on an empty DB should succeed
    self.assertTrue(h1)
    # the db should have a lock marker
    self.assertEqual(handler.lock_marker,
                     self._get_hash_from_handler_db(handler))
    # prev_lock_ts must be 0 for the initial case
    self.assertEqual('0', handler.prev_lock_ts)
    # unlock() should clear the lock
    handler.unlock()
    self.assertEqual(handler.lock_ts,
                     self._get_hash_from_handler_db(handler))
def test_update_hit_no_records(self):
    handler = consistency_db.HashHandler()
    # set an initial hash so an update will be required
    handler.put_hash('DIGEST')
    with mock.patch.object(handler._FACADE, 'get_engine') as ge:
        conn = ge.return_value.begin.return_value.__enter__.return_value
        firstresult = mock.Mock()
        # a rowcount of 0 simulates the effect of another db client
        # updating the same record the handler was trying to update
        firstresult.rowcount = 0
        secondresult = mock.Mock()
        secondresult.rowcount = 1
        conn.execute.side_effect = [firstresult, secondresult]
        handler.read_for_update()
        # update should have been called again after the failure
        self.assertEqual(2, conn.execute.call_count)
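# The rowcount tests above exercise an optimistic compare-and-swap. A hedged
# sketch of the pattern, assuming a consistency-hash table with a `hash`
# column; the real _optimistic_update_hash_record in consistency_db may
# differ in detail:
#
#   with self._FACADE.get_engine().begin() as conn:
#       result = conn.execute(
#           table.update().where(
#               table.c.hash == old_record.hash).values(hash=new_value))
#   # rowcount == 0 means another client modified the row first; the caller
#   # sleeps briefly (hence the mocked eventlet.sleep) and retries with a
#   # freshly read record.
#   return result.rowcount != 0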
def initialize(self):
    LOG.debug('Initializing driver')
    # register plugin config opts
    pl_config.register_config()
    self.evpool = eventlet.GreenPool(cfg.CONF.RESTPROXY.thread_pool_size)
    hash_handler = cdb.HashHandler()
    if hash_handler.is_db_hash_empty():
        LOG.debug("Forcing topology sync as consistency hash is empty")
        hash_handler.read_for_update()
        hash_handler.put_hash('initial:hash,code')
    # init network ctrl connections
    self.servers = servermanager.ServerPool()
    self.servers.get_topo_function = self._get_all_data_auto
    self.segmentation_types = ', '.join(cfg.CONF.ml2.type_drivers)
    # track hosts running IVS to avoid excessive calls to the backend
    self.vswitch_host_cache = {}
    self.setup_sg_rpc_callbacks()
    LOG.debug("Initialization done")
def rest_call(self, action, resource, data, headers, ignore_codes,
              timeout=False):
    context = self.get_context_ref()
    if context:
        # include the requesting context information if available
        cdict = context.to_dict()
        # remove the auth token so it's not present in debug logs on the
        # backend controller
        cdict.pop('auth_token', None)
        if 'tenant_name' in cdict and cdict['tenant_name']:
            cdict['tenant_name'] = Util.format_resource_name(
                cdict['tenant_name'])
        headers[REQ_CONTEXT_HEADER] = jsonutils.dumps(cdict)
    hash_handler = cdb.HashHandler()
    good_first = sorted(self.servers, key=lambda x: x.failed)
    first_response = None
    for active_server in good_first:
        LOG.debug("ServerProxy: %(action)s to servers: "
                  "%(server)r, %(resource)s",
                  {'action': action,
                   'server': (active_server.server, active_server.port),
                   'resource': resource})
        for x in range(HTTP_SERVICE_UNAVAILABLE_RETRY_COUNT + 1):
            ret = active_server.rest_call(
                action, resource, data, headers, timeout,
                reconnect=self.always_reconnect,
                hash_handler=hash_handler)
            if ret[0] != httplib.SERVICE_UNAVAILABLE:
                break
            eventlet.sleep(HTTP_SERVICE_UNAVAILABLE_RETRY_INTERVAL)

        # If inconsistent, do a full synchronization
        if ret[0] == httplib.CONFLICT and hash_handler.is_db_lock_owner():
            if not self.get_topo_function:
                raise cfg.Error(_('Server requires synchronization, '
                                  'but no topology function was '
                                  'defined.'))
            LOG.info("ServerProxy: HashConflict detected with request "
                     "%(action)s %(resource)s. Starting topology sync",
                     {'action': action, 'resource': resource})
            topo_hh = self.dblock_mark_toposync_started(hash_handler)
            try:
                data = self.get_topo_function(
                    **self.get_topo_function_args)
                if data:
                    ret_ts = active_server.rest_call(
                        'POST', TOPOLOGY_PATH, data, timeout=None,
                        hash_handler=topo_hh)
                    if self.server_failure(ret_ts, ignore_codes):
                        LOG.error("ServerProxy: Topology sync failed")
                        raise RemoteRestError(reason=ret_ts[2],
                                              status=ret_ts[0])
            finally:
                LOG.info("ServerProxy: Topology sync completed")
                if data is None:
                    return None
        elif (ret[0] == httplib.CONFLICT and
              not hash_handler.is_db_lock_owner()):
            # DB lock ownership was lost; allow the current owner to
            # detect the hash conflict and perform the needed TopoSync
            LOG.warning("HashConflict detected but thread is no longer "
                        "DB lock owner. Skipping TopoSync call")

        # Store the first response as the error to be bubbled up to the
        # user since it was a good server. Subsequent servers will most
        # likely be cluster slaves and won't have a useful error for the
        # user (e.g. 302 redirect to master)
        if not first_response:
            first_response = ret
        if not self.server_failure(ret, ignore_codes):
            active_server.failed = False
            LOG.debug("ServerProxy: %(action)s succeeded for servers: "
                      "%(server)r Response: %(response)s",
                      {'action': action,
                       'server': (active_server.server,
                                  active_server.port),
                       'response': ret[3]})
            return ret
        else:
            LOG.warning('ServerProxy: %(action)s failure for servers: '
                        '%(server)r Response: %(response)s',
                        {'action': action,
                         'server': (active_server.server,
                                    active_server.port),
                         'response': ret[3]})
            LOG.warning("ServerProxy: Error details: "
                        "status=%(status)d, reason=%(reason)r, "
                        "ret=%(ret)s, data=%(data)r",
                        {'status': ret[0], 'reason': ret[1],
                         'ret': ret[2], 'data': ret[3]})
            active_server.failed = True

    # A failure on a delete means the object is gone from Neutron but not
    # from the controller. Set the consistency hash to a bad value to
    # trigger a sync on the next check.
    # NOTE: The hash must have a comma in it otherwise it will be ignored
    # by the backend.
    if action == 'DELETE':
        hash_handler.put_hash('INCONSISTENT,INCONSISTENT')
    # All servers failed; reset the server list and try again next time
    LOG.error('ServerProxy: %(action)s failure for all servers: '
              '%(server)r',
              {'action': action,
               'server': tuple((s.server, s.port)
                               for s in self.servers)})
    return first_response
def force_topo_sync(self, check_ts=True):
    """Execute a topology_sync between OSP and BCF.

    Topology sync collects all data from OpenStack and pushes it to BCF
    in one single REST call. This is a heavy operation and is not
    executed automatically.

    Conditions when this would be called:
    (1) during ServerPool initialization
        (a) must check the previous timestamp
    (2) if the periodic keystone tenant_cache finds a diff in the tenant
        list
        (a) should not check the previous timestamp
    (3) externally triggered by the neutron force-bcf-sync command
        (a) should not check the previous timestamp
    (4) a rest_call to BCF fails on both servers and the failure_code is
        not part of the ignore_codes list
        (a) must check the previous timestamp

    :param check_ts: boolean flag; when True, skip topo_sync if the
        previous timestamp is less than TOPO_SYNC_EXPIRED_SECS old
    :return: (sync_executed, response)
        sync_executed - Boolean - True if we were able to acquire the
            lock and perform topo_sync, False otherwise
        response - tuple of the typical HTTP response from the REST call
            (response.status, response.reason, respstr, respdata)
    """
    LOG.info(_LI('TOPO_SYNC requested with check_ts %s'), check_ts)
    if not self.get_topo_function:
        raise cfg.Error(_('Server requires synchronization, '
                          'but no topology function was defined.'))

    # get the current timestamp
    curr_ts = str(time.time())
    hash_handler = cdb.HashHandler(timestamp_ms=curr_ts)
    if not hash_handler.lock(check_ts):
        LOG.info(_LI("TOPO_SYNC: lock() returned False. Skipping."))
        return False, TOPO_RESPONSE_OK

    # else, perform topo_sync
    try:
        LOG.debug("TOPO_SYNC: requested at %(request_ts)s started at "
                  "%(start_ts)s",
                  {'request_ts': cdb.convert_ts_to_datetime(curr_ts),
                   'start_ts': cdb.convert_ts_to_datetime(time.time())})
        data = self.get_topo_function(**self.get_topo_function_args)
        if not data:
            # when the keystone sync fails, it fails silently with
            # data = None. that is wrong; we need to raise an exception
            raise Exception(_("TOPO_SYNC: failed to retrieve data."))
        LOG.debug("TOPO_SYNC: data received from OSP, sending request "
                  "to BCF.")
        errstr = _("Unable to perform forced topology_sync: %s")
        return True, self.rest_action('POST', TOPOLOGY_PATH, data,
                                      errstr)
    except Exception as e:
        # on an exception, reset to the previous timestamp
        LOG.warning(_LW("TOPO_SYNC: Exception during topology sync. "
                        "Consistency DB timestamp will not be updated."))
        hash_handler.unlock(set_prev_ts=True)
        raise e
    finally:
        hash_handler.unlock()
        diff = time.time() - float(hash_handler.lock_ts)
        LOG.info(_LI("TOPO_SYNC: took %s seconds to execute topo_sync. "
                     "consistency_db unlocked."), str(diff))
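# Hedged usage sketch for the four conditions listed in the docstring above;
# the call sites are paraphrased, not verbatim from this codebase:
#
#   pool.force_topo_sync(check_ts=True)   # (1) ServerPool initialization
#   pool.force_topo_sync(check_ts=False)  # (2) keystone tenant cache diff
#   pool.force_topo_sync(check_ts=False)  # (3) neutron force-bcf-sync cmd
#   pool.force_topo_sync(check_ts=True)   # (4) rest_call failed on all
#                                         #     servers, non-ignored code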
def rest_call(self, action, resource, data='', headers=None,
              timeout=False, reconnect=False, hash_handler=None):
    uri = self.base_uri + resource
    body = jsonutils.dumps(data)
    headers = headers or {}
    headers['Content-type'] = 'application/json'
    headers['Accept'] = 'application/json'
    headers['NeutronProxy-Agent'] = self.name
    headers['Instance-ID'] = self.neutron_id
    headers['Orchestration-Service-ID'] = ORCHESTRATION_SERVICE_ID

    lock_start_time = time.time()
    if resource == TOPOLOGY_PATH:
        # the TopoSync call provides a hash_handler but doesn't need
        # hash checks
        pass
    elif hash_handler:
        # for calls that need hash checks
        headers[HASH_MATCH_HEADER] = hash_handler.read_for_update()
    else:
        # for calls that don't need hash checks, e.g. CapabilityCheck
        hash_handler = cdb.HashHandler()
    lock_end_time = time.time()
    LOG.debug("Time waited to get DB lock %.2fsecs",
              (lock_end_time - lock_start_time))

    # TODO(kevinbenton): Re-enable keep-alive in a thread-safe fashion.
    # When multiple workers are enabled the saved connection gets mangled
    # by multiple threads so we always reconnect.
    if 'keep-alive' in self.capabilities and False:
        headers['Connection'] = 'keep-alive'
    else:
        reconnect = True
    if self.auth:
        headers['Authorization'] = self.auth
    LOG.debug("ServerProxy: server=%(server)s, port=%(port)d, "
              "ssl=%(ssl)r",
              {'server': self.server, 'port': self.port,
               'ssl': self.ssl})
    LOG.debug("ServerProxy: resource=%(resource)s, data=%(data)r, "
              "headers=%(headers)r, action=%(action)s",
              {'resource': resource, 'data': data,
               'headers': headers, 'action': action})

    # an unspecified timeout is False because a timeout can be specified
    # as None to indicate no timeout.
    if timeout is False:
        timeout = self.timeout
    if timeout != self.timeout:
        # need a new connection if the timeout has changed
        reconnect = True

    if not self.currentconn or reconnect:
        if self.currentconn:
            self.currentconn.close()
        if self.ssl:
            currentconn = HTTPSConnectionWithValidation(
                self.server, self.port, timeout=timeout)
            if currentconn is None:
                LOG.error('ServerProxy: Could not establish HTTPS '
                          'connection')
                return 0, None, None, None
            currentconn.combined_cert = self.combined_cert
        else:
            currentconn = httplib.HTTPConnection(
                self.server, self.port, timeout=timeout)
            if currentconn is None:
                LOG.error('ServerProxy: Could not establish HTTP '
                          'connection')
                return 0, None, None, None

    try:
        currentconn.request(action, uri, body, headers)
        response = currentconn.getresponse()
        respstr = response.read()
        respdata = respstr
        bcf_response_time = time.time()
        LOG.debug("Time waited to get response from BCF %.2fsecs",
                  (bcf_response_time - lock_end_time))
        if response.status in self.success_codes:
            hash_value = response.getheader(HASH_MATCH_HEADER)
            if hash_value is not None:
                # There may be (rare) cases when the response takes too
                # long and the thread loses the DB lock. In such cases,
                # updating the hash could result in chaos (e.g. the
                # current thread's lock is overwritten without mercy,
                # which would allow another thread to grab the lock).
                # The safest way forward seems to be to check lock
                # ownership prior to updating the hash; else let the
                # next REQUEST trigger a TopoSync
                if not hash_handler.put_hash_if_owner(hash_value):
                    LOG.warning("Thread is no longer DB lock owner")
            else:
                # don't clear the hash from the DB if a hash header
                # wasn't present
                hash_handler.clear_lock()
            try:
                respdata = jsonutils.loads(respstr)
            except ValueError:
                # response was not JSON, ignore the exception
                pass
        else:
            # BVS-6979: race-condition(#2) on HashConflict, don't unlock
            # to ensure topo_sync is scheduled next (it force grabs the
            # lock)
            if response.status != httplib.CONFLICT:
                # release the lock so others don't have to wait for the
                # timeout
                hash_handler.clear_lock()
        ret = (response.status, response.reason, respstr, respdata)
    except httplib.HTTPException:
        # If we were using a cached connection, try again with a new one.
        with excutils.save_and_reraise_exception() as ctxt:
            currentconn.close()
            if reconnect:
                # if reconnect is true, this was a fresh connection, so
                # reraise since this server seems to be broken
                ctxt.reraise = True
            else:
                # if reconnect is false, it was a cached connection, so
                # try one more time before re-raising
                ctxt.reraise = False
                return self.rest_call(action, resource, data, headers,
                                      timeout=timeout, reconnect=True)
    except (socket.timeout, socket.error) as e:
        currentconn.close()
        LOG.error('ServerProxy: %(action)s failure, %(e)r',
                  {'action': action, 'e': e})
        ret = 0, None, None, None
    LOG.debug("ServerProxy: status=%(status)d, reason=%(reason)r, "
              "ret=%(ret)s, data=%(data)r",
              {'status': ret[0], 'reason': ret[1], 'ret': ret[2],
               'data': ret[3]})
    return ret
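# A minimal sketch of the ownership-guarded update relied on above, assuming
# HashHandler can recognize its own lock marker in the DB record; the real
# put_hash_if_owner lives in consistency_db and may differ:
#
#   def put_hash_if_owner(self, hash_value):
#       if self.is_db_lock_owner():
#           self.put_hash(hash_value)
#           return True
#       return False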