Exemple #1
0
 def _calculate_tenants(self, aim_ctx):
     """Compute this agent's tenant assignment and store it on the agent.

     Everything runs inside a sub-transaction on ``aim_ctx.store``.

     Returns an empty list when no live, administratively-up agent is
     found, or when this agent's version compares lower than the highest
     version among those agents. Otherwise returns the value produced by
     ``_tenant_assignation_algorithm``, after saving it in the agent's
     ``hash_trees``.
     """
     with aim_ctx.store.begin(subtransactions=True):
         # Reload our own agent record before deciding anything.
         self.agent = self.manager.get(aim_ctx, self.agent)
         if self.single_aid:
             # With single_aid set, only this agent takes part.
             candidates = [self.agent]
         else:
             seconds_down = self.agent.down_time(aim_ctx)
             if max(0, seconds_down or 0) > self.max_down_time:
                 utils.perform_harakiri(LOG, "Agent has been down for %s "
                                             "seconds." % seconds_down)
             # Collect the peers that are admin-up and not down.
             alive = []
             for peer in self.manager.find(aim_ctx, resource.Agent,
                                           admin_state_up=True):
                 if not peer.is_down(aim_ctx):
                     alive.append(peer)
             if not alive:
                 return []
             # Refuse to take part when a newer agent is around.
             newest = max(alive, key=lambda a: a.version).version
             if self._major_vercompare(self.agent.version, newest) < 0:
                 LOG.error("Agent version is outdated: Current %s Required "
                           "%s" % (self.agent.version, newest))
                 return []
             # Keep only the agents matching the newest version.
             candidates = [
                 peer for peer in alive
                 if self._major_vercompare(peer.version, newest) == 0]
         assignment = self._tenant_assignation_algorithm(aim_ctx, candidates)
         # Persist the outcome on the agent record.
         self.agent.hash_trees = assignment
         self.manager.create(aim_ctx, self.agent, overwrite=True)
         return assignment
 def establish_ws_session(self, max_retries=None):
     """Open a websocket session, rotating through ``ws_urls`` on failure.

     :param max_retries: maximum number of failed logins tolerated; when
         falsy it defaults to twice the number of configured URLs.
     :returns: the logged-in session on success. When every attempt
         fails, ``utils.perform_harakiri`` is invoked.
     """
     retries = 0
     self._reload_websocket_config()
     max_retries = max_retries or 2 * len(self.ws_urls)
     while retries < max_retries:
         # Drop any previous session before creating a new one.
         if self.session and self.session.session:
             self.session.close()
         # The password is deliberately NOT logged: credentials must not
         # end up in log files.
         LOG.info('Establishing WS connection with parameters: %s',
                  [self.ws_urls[0], self.apic_username,
                   self.verify_ssl_certificate])
         self.session = acitoolkit.Session(
             self.ws_urls[0], self.apic_username, self.apic_password,
             verify_ssl=self.verify_ssl_certificate,
             cert_name=self.cert_name, key=self.private_key_file)
         resp = self.session.login()
         if not resp.ok:
             LOG.debug('Websocket connection failed: %s' % resp.text)
             # Move on to the next URL for the following attempt.
             self.ws_urls.rotate(-1)
             LOG.info('Rotating websocket URL, using: %s' % self.ws_urls[0])
             retries += 1
             continue
         LOG.info('Websocket connection succeeded.')
         self._spawn_monitors()
         return self.session
     utils.perform_harakiri(LOG, "Cannot establish WS connection after %s "
                                 "retries." % retries)
 def _thread_monitor(self, thread, name, flag):
     """Poll a thread and try to re-establish the WS session if it died.

     :param thread: the thread object to watch (polled via ``isAlive()``).
     :param name: label for the thread, used in log messages.
     :param flag: dict whose ``'monitor_runs'`` entry keeps the loop going
         while truthy; it is decremented after every sleep.
     """
     # TODO(ivar): I could have used thread.join instead of this
     retries = None
     max_retries = len(self.ws_urls)
     LOG.debug("Monitoring thread %s" % name)
     try:
         while flag['monitor_runs']:
             if not thread.isAlive():
                 if retries and retries.get() >= max_retries:
                     utils.perform_harakiri(
                         LOG, "Critical thread %s stopped working" % name)
                 else:
                     retries = utils.exponential_backoff(
                         self.monitor_max_backoff, tentative=retries)
                     try:
                         self.establish_ws_session()
                     except Exception as e:
                         # str(e) instead of e.message: the attribute is
                         # gone on Python 3 and would turn a recoverable
                         # reconnect failure into an AttributeError.
                         LOG.debug(
                             "Monitor for thread %s tried to reconnect web "
                             "socket, but something went wrong. Will retry "
                             "%s more times: %s" %
                             (name, max_retries - retries.get(), str(e)))
                         continue
             else:
                 LOG.debug("Thread %s is in good shape" % name)
                 # A healthy thread resets the retry counter.
                 retries = None
             time.sleep(self.monitor_sleep_time)
             # for testing purposes
             flag['monitor_runs'] -= 1
     except Exception as e:
         msg = ("Unknown error in thread monitor "
                "for %s: %s" % (name, str(e)))
         LOG.error(msg)
         utils.perform_harakiri(LOG, msg)
 def establish_ws_session(self, max_retries=None):
     """Log in to a websocket URL while holding the WS connection lock.

     The lock is requested without blocking; if it cannot be acquired
     (``utils.LockNotAcquired``) the method returns ``None`` at once.

     :param max_retries: number of failed logins tolerated; when falsy it
         defaults to twice the number of entries in ``ws_urls``.
     :returns: the logged-in session on success.
     """
     try:
         with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
             self._reload_websocket_config()
             attempts = 0
             limit = max_retries or 2 * len(self.ws_urls)
             while attempts < limit:
                 # Drop the previous session, if one is still open.
                 if self.session and self.session.session:
                     self.session.close()
                 target = self.ws_urls[0]
                 LOG.info('Establishing WS connection with url: %s', target)
                 self.session = acitoolkit.Session(
                     target, self.apic_username, self.apic_password,
                     verify_ssl=self.verify_ssl_certificate,
                     cert_name=self.cert_name, key=self.private_key_file)
                 login_result = self.session.login()
                 if login_result.ok:
                     LOG.info('Websocket connection succeeded.')
                     self._spawn_monitors()
                     return self.session
                 # Login failed: rotate to the next URL and try again.
                 LOG.warn(
                     'Websocket connection failed: %s' % login_result.text)
                 self.ws_urls.rotate(-1)
                 LOG.info(
                     'Rotating websocket URL, using: %s' % self.ws_urls[0])
                 attempts += 1
             failure = ("Cannot establish WS connection after %s retries."
                        % attempts)
             utils.perform_harakiri(LOG, failure)
     except utils.LockNotAcquired:
         # Another thread is already reconnecting.
         return
 def establish_ws_session(self, max_retries=None):
     """Establish the websocket session, trying each URL in rotation.

     Runs under a non-blocking ``ACI_WS_CONNECTION_LOCK``; when the lock
     is not acquired the call is a no-op returning ``None``.

     :param max_retries: failed-login budget; defaults to two times the
         length of ``ws_urls`` when falsy.
     :returns: the session object once a login succeeds.
     """
     try:
         with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
             failures = 0
             self._reload_websocket_config()
             budget = max_retries or 2 * len(self.ws_urls)
             while failures < budget:
                 if self.session and self.session.session:
                     self.session.close()
                 LOG.info('Establishing WS connection with url: %s',
                          self.ws_urls[0])
                 session_kwargs = dict(
                     verify_ssl=self.verify_ssl_certificate,
                     cert_name=self.cert_name,
                     key=self.private_key_file)
                 self.session = acitoolkit.Session(
                     self.ws_urls[0], self.apic_username,
                     self.apic_password, **session_kwargs)
                 outcome = self.session.login()
                 if outcome.ok:
                     break
                 LOG.warn('Websocket connection failed: %s' % outcome.text)
                 self.ws_urls.rotate(-1)
                 LOG.info('Rotating websocket URL, '
                          'using: %s' % self.ws_urls[0])
                 failures += 1
             else:
                 # Budget exhausted without a single successful login.
                 utils.perform_harakiri(LOG, "Cannot establish WS connection "
                                             "after %s retries." % failures)
                 return None
             LOG.info('Websocket connection succeeded.')
             self._spawn_monitors()
             return self.session
     except utils.LockNotAcquired:
         # Some other thread holds the lock and is reconnecting.
         return None
 def test_harakiri(self):
     """perform_harakiri calls os._exit(1) only if recovery_restart is on.

     The original option value is restored in a ``finally`` clause so a
     failing assertion cannot leak the override into other tests.
     """
     original = self.cfg_manager.get_option('recovery_restart', 'aim')
     try:
         self.set_override('recovery_restart', False, 'aim')
         with mock.patch.object(internal_utils.os, '_exit') as ex:
             # Restart disabled: the process must not be terminated.
             internal_utils.perform_harakiri(mock.Mock(), '')
             self.assertEqual(0, ex.call_count)
             # Restart enabled: exactly one exit with status 1.
             self.set_override('recovery_restart', True, 'aim')
             internal_utils.perform_harakiri(mock.Mock(), '')
             ex.assert_called_once_with(1)
     finally:
         self.set_override('recovery_restart', original, 'aim')
 def test_harakiri(self):
     """Check that perform_harakiri honours the recovery_restart option.

     With the option off ``os._exit`` must not be called; with it on it
     must be called once with status 1. The previous option value is put
     back even when an assertion fails.
     """
     original = self.cfg_manager.get_option('recovery_restart', 'aim')
     try:
         self.set_override('recovery_restart', False, 'aim')
         with mock.patch.object(internal_utils.os, '_exit') as ex:
             internal_utils.perform_harakiri(mock.Mock(), '')
             self.assertEqual(0, ex.call_count)
             self.set_override('recovery_restart', True, 'aim')
             internal_utils.perform_harakiri(mock.Mock(), '')
             ex.assert_called_once_with(1)
     finally:
         # Always restore, so later tests see the original setting.
         self.set_override('recovery_restart', original, 'aim')
 def _thread(self, func, name):
     """Call ``func`` forever until it raises.

     ``utils.ThreadExit`` ends the loop quietly. Any other exception is
     logged with its traceback and passed to ``utils.perform_harakiri``.

     :param func: zero-argument callable executed on every iteration.
     :param name: label used in the log messages.
     """
     LOG.info("Starting main loop of %s", name)
     try:
         while True:
             func()
     except utils.ThreadExit:
         # Cooperative shutdown requested: nothing else to do.
         pass
     except Exception as exc:
         LOG.error(traceback.format_exc())
         reason = "%s thread stopped unexpectedly: %s" % (name, str(exc))
         utils.perform_harakiri(LOG, reason)
Exemple #9
0
    def _heartbeat_loop(self):
        """Send one heartbeat, check for a stall, then wait for next cycle.

        If the time elapsed since ``self.daemon_loop_time`` exceeds
        ``DEADLOCK_TIME``, ``utils.perform_harakiri`` is invoked.
        """
        began_at = time.time()
        self._send_heartbeat(context.AimContext(store=api.get_store()))
        # REVISIT: This code should be removed once we've
        #          removed all the locking in AID.
        if began_at > self.daemon_loop_time:
            stalled_for = began_at - self.daemon_loop_time
            if stalled_for > DEADLOCK_TIME:
                reason = "Agent has been down for %s seconds." % stalled_for
                utils.perform_harakiri(LOG, reason)

        utils.wait_for_next_cycle(
            began_at, self.report_interval, LOG,
            readable_caller='AID-HB', notify_exceeding_timeout=False)
 def _thread_monitor(self, flag):
     """Poll the login and subscription threads and reconnect if one died.

     A per-thread retry handle is kept in ``name_to_retry``; once a dead
     thread's handle reports at least ``len(self.ws_urls)`` retries,
     ``utils.perform_harakiri`` is invoked.

     :param flag: dict whose ``'monitor_runs'`` entry keeps the loop going
         while truthy; it is decremented after every sleep.
     """
     login_thread_name = 'login_thread'
     subscription_thread_name = 'subscription_thread'
     name_to_retry = {
         login_thread_name: None,
         subscription_thread_name: None
     }
     max_retries = len(self.ws_urls)
     LOG.debug("Monitoring threads login and subscription")
     try:
         while flag['monitor_runs']:
             for thd, name in [(self.login_thread, login_thread_name),
                               (self.subs_thread,
                                subscription_thread_name)]:
                 if thd and not thd.isAlive():
                     if name_to_retry[name] and name_to_retry[name].get(
                     ) >= max_retries:
                         utils.perform_harakiri(
                             LOG, "Critical thread %s stopped "
                             "working" % name)
                     else:
                         name_to_retry[name] = utils.exponential_backoff(
                             self.monitor_max_backoff,
                             tentative=name_to_retry[name])
                         try:
                             self.establish_ws_session()
                         except Exception as e:
                             # str(e) instead of e.message: the attribute
                             # does not exist on Python 3 exceptions and
                             # would raise inside this handler.
                             LOG.debug(
                                 "Monitor for thread %s tried to reconnect "
                                 "web socket, but something went wrong. "
                                 "Will retry %s more times: %s" %
                                 (name, max_retries -
                                  name_to_retry[name].get(), str(e)))
                             continue
                 elif thd:
                     LOG.debug("Thread %s is in good shape" % name)
                     # A healthy thread resets its retry handle.
                     name_to_retry[name] = None
             time.sleep(self.monitor_sleep_time)
             # for testing purposes
             flag['monitor_runs'] -= 1
     except Exception as e:
         msg = ("Unknown error in thread monitor: %s" % str(e))
         LOG.error(msg)
         utils.perform_harakiri(LOG, msg)
 def _thread_monitor(self, flag):
     """Watch the login/subscription threads; reconnect the WS on failure.

     For each thread found dead, a backoff handle is obtained from
     ``utils.exponential_backoff`` and ``establish_ws_session`` is
     retried. When the handle already reports ``len(self.ws_urls)`` or
     more retries, ``utils.perform_harakiri`` is invoked instead.

     :param flag: dict; the loop runs while ``flag['monitor_runs']`` is
         truthy and decrements it once per iteration.
     """
     login_thread_name = 'login_thread'
     subscription_thread_name = 'subscription_thread'
     name_to_retry = {login_thread_name: None,
                      subscription_thread_name: None}
     max_retries = len(self.ws_urls)
     LOG.debug("Monitoring threads login and subscription")
     try:
         while flag['monitor_runs']:
             for thd, name in [(self.login_thread, login_thread_name),
                               (self.subs_thread,
                                subscription_thread_name)]:
                 if thd and not thd.isAlive():
                     if name_to_retry[name] and name_to_retry[
                             name].get() >= max_retries:
                         utils.perform_harakiri(
                             LOG, "Critical thread %s stopped "
                                  "working" % name)
                     else:
                         name_to_retry[name] = utils.exponential_backoff(
                             self.monitor_max_backoff,
                             tentative=name_to_retry[name])
                         try:
                             self.establish_ws_session()
                         except Exception as e:
                             # Use str(e): exceptions have no ``message``
                             # attribute on Python 3, so reading it here
                             # would raise and abort the monitor.
                             LOG.debug(
                                 "Monitor for thread %s tried to reconnect "
                                 "web socket, but something went wrong. "
                                 "Will retry %s more times: %s" %
                                 (name,
                                  max_retries - name_to_retry[name].get(),
                                  str(e)))
                             continue
                 elif thd:
                     LOG.debug("Thread %s is in good shape" % name)
                     name_to_retry[name] = None
             time.sleep(self.monitor_sleep_time)
             # for testing purposes
             flag['monitor_runs'] -= 1
     except Exception as e:
         msg = ("Unknown error in thread monitor: %s" % str(e))
         LOG.error(msg)
         utils.perform_harakiri(LOG, msg)
Exemple #12
0
 def _fail_agent(self, context, aim_object, operation, reason):
     """Pass ``reason`` to ``utils.perform_harakiri`` as its message.

     ``context``, ``aim_object`` and ``operation`` are accepted but not
     used by this implementation.
     """
     utils.perform_harakiri(LOG, message=reason)
    def _thread_monitor(self, flag):
        """Monitor the login/subscription threads and drive WS recovery.

        Each loop iteration does two things:

        1. For every existing thread that is no longer alive, retry
           ``establish_ws_session`` using a per-thread backoff handle, or
           call ``utils.perform_harakiri`` once that handle reports at
           least ``len(self.ws_urls)`` retries.
        2. If ``self.need_recovery`` is set, attempt a recovery-mode
           session (only with more than one URL and once the recovery
           timer has elapsed); otherwise update ``self.apic_assign_obj``
           through the manager.

        :param flag: dict; the loop runs while ``flag['monitor_runs']`` is
            truthy and decrements it once per iteration.
        """
        login_thread_name = 'login_thread'
        subscription_thread_name = 'subscription_thread'
        # Per-thread backoff handle; None means no retry in progress.
        name_to_retry = {
            login_thread_name: None,
            subscription_thread_name: None
        }
        max_retries = len(self.ws_urls)
        # Earliest time at which the next recovery attempt may run.
        recovery_timer = utils.get_time()
        # Count of consecutive failed recovery attempts.
        recovery_retry = 0
        aim_context = aim_ctx.AimContext(store=api.get_store())
        LOG.debug("Monitoring threads login and subscription")
        try:
            while flag['monitor_runs']:
                for thd, name in [(self.login_thread, 'login_thread'),
                                  (self.subs_thread, 'subscription_thread')]:
                    if thd and not thd.isAlive():
                        if name_to_retry[name] and name_to_retry[name].get(
                        ) >= max_retries:
                            utils.perform_harakiri(
                                LOG, "Critical thread %s stopped "
                                "working" % name)
                        else:
                            # NOTE(review): presumably this call also waits
                            # for the backoff interval - confirm in utils.
                            name_to_retry[name] = utils.exponential_backoff(
                                self.monitor_max_backoff,
                                tentative=name_to_retry[name])
                            try:
                                self.establish_ws_session()
                            except Exception as e:
                                LOG.debug(
                                    "Monitor for thread %s tried to reconnect "
                                    "web socket, but something went wrong. "
                                    "Will retry %s more times: %s" %
                                    (name, max_retries -
                                     name_to_retry[name].get(), str(e)))
                                # Moves on to the next thread of the for loop.
                                continue
                    elif thd:
                        LOG.debug("Thread %s is in good shape" % name)
                        # A healthy thread resets its backoff handle.
                        name_to_retry[name] = None

                if self.need_recovery:
                    # No point to do any recovery session if we
                    # only have 1 ws_url.
                    if (len(self.ws_urls) > 1
                            and utils.get_time() > recovery_timer):
                        self.establish_ws_session(recovery_mode=True)
                        # Still fail to recover
                        if self.need_recovery:
                            # Push the next attempt further out, based on
                            # the number of failed recovery attempts.
                            recovery_retry += 1
                            recovery_timer = (
                                utils.get_time() + utils.get_backoff_time(
                                    self.recovery_max_backoff, recovery_retry))
                        else:
                            recovery_retry = 0
                else:
                    # Update the last_update_timestamp
                    if self.apic_assign_obj:
                        self.apic_assign_obj = self.manager.update(
                            aim_context, self.apic_assign_obj)
                    else:
                        # This should never happen
                        LOG.error('There is no such apic_assign_obj exist '
                                  'for %s!' % self.session.ipaddr)

                time.sleep(self.monitor_sleep_time)
                # for testing purposes
                flag['monitor_runs'] -= 1
        except Exception as e:
            # Any unexpected error in the monitor is treated as fatal.
            msg = ("Unknown error in thread monitor: %s" % str(e))
            LOG.error(msg)
            utils.perform_harakiri(LOG, msg)
    def establish_ws_session(self, max_retries=None, recovery_mode=False):
        """Connect to one of the websocket URLs, honouring APIC assignments.

        Runs under a non-blocking ``ACI_WS_CONNECTION_LOCK``; when the lock
        cannot be acquired (``utils.LockNotAcquired``) the call returns
        without doing anything.

        URLs whose ``ApicAssignment`` belongs to a different agent and is
        reported as not available are set aside as backups. The remaining
        URLs are tried first; in normal mode the backups are tried
        afterwards and ``utils.perform_harakiri`` is invoked if none of
        them works. In recovery mode the backups are never tried.

        :param max_retries: overall retry budget; defaults to twice the
            number of URLs when falsy. It is split evenly across URLs.
        :param recovery_mode: when True, skip reloading the websocket
            configuration and use the recovery purpose.
        :returns: None.
        """
        try:
            with utils.get_rlock(lcon.ACI_WS_CONNECTION_LOCK, blocking=False):
                if not recovery_mode:
                    purpose = NORMAL_PURPOSE
                    self._reload_websocket_config()
                    self.need_recovery = False
                else:
                    purpose = RECOVERY_PURPOSE
                backup_urls = collections.deque()
                max_retries = max_retries or 2 * len(self.ws_urls)
                # Floor division keeps the per-URL budget an integer on
                # Python 3 too ("/" would yield a float there).
                url_max_retries = max(1, max_retries // len(self.ws_urls))
                aim_context = aim_ctx.AimContext(store=api.get_store())
                for url in self.ws_urls:
                    apic_assign = api_infra.ApicAssignment(apic_host=url)
                    apic_assign_obj = self.manager.get(aim_context,
                                                       apic_assign)
                    # Assigned to another agent and not available: keep it
                    # only as a backup candidate.
                    if (apic_assign_obj
                            and apic_assign_obj.aim_aid_id != self.agent_id
                            and not apic_assign_obj.is_available(aim_context)):
                        backup_urls.append(url)
                        continue

                    # This means the original aim-aid owner might have
                    # crashed or something. We will just take it!
                    if (recovery_mode and apic_assign_obj
                            and self.session.ipaddr in url):
                        obj = self._update_apic_assign_db(
                            aim_context, apic_assign, apic_assign_obj)
                        if obj is None:
                            continue
                        self.need_recovery = False
                        self.apic_assign_obj = obj
                        return

                    is_conn_successful = self._ws_session_login(
                        url, url_max_retries, purpose, aim_context,
                        apic_assign, apic_assign_obj)
                    if is_conn_successful:
                        return
                    else:
                        backup_urls.append(url)

                if recovery_mode:
                    return
                # Try the backup urls. Randomly rotate the list first so that
                # the extra aim-aids won't all go for the same backup url.
                backup_urls_len = len(backup_urls)
                if backup_urls_len > 1:
                    backup_urls.rotate(random.randint(1, backup_urls_len))
                for url in backup_urls:
                    is_conn_successful = self._ws_session_login(
                        url, url_max_retries, BACKUP_PURPOSE)
                    if is_conn_successful:
                        return
                utils.perform_harakiri(
                    LOG, "Cannot establish WS connection "
                    "after %s retries." % max_retries)
        except utils.LockNotAcquired:
            # Some other thread is trying to reconnect
            return