def _get_rain_addr(self, on_the_fly=False): try: rainx_instance = self.cs_client.next_instance("rainx") rainx_addr = "http://%s" % rainx_instance.get('addr') except Exception as e: self.logger.error("No rainx service found (%s)" % e.message) raise ServiceUnavailable("No rainx service found (%s)" % e.message) if on_the_fly: rainx_addr += "/on-the-fly" return rainx_addr
def assign_services(self, service_type, max_per_rdir=None, **kwargs):
    """
    Assign an rdir service to every `service_type` service that is not
    already linked to one, and return the list of services (each entry
    annotated with its linked rdir under the 'rdir' key when known).

    :param max_per_rdir: maximum number of databases an rdir should host
    :type max_per_rdir: `int`
    :raises ServiceUnavailable: when no rdir service is available
    :raises OioException: when some services could not be checked or linked
    """
    services = self.cs.all_services(service_type, **kwargs)
    rdir_services = self.cs.all_services('rdir', True, **kwargs)
    if not rdir_services:
        raise ServiceUnavailable("No rdir service found in %s" % self.ns)
    # Index the known rdir services by their full service ID.
    by_id = dict()
    for rdir in rdir_services:
        by_id[_make_id(self.ns, 'rdir', rdir['addr'])] = rdir
    failures = []
    for srv in services:
        srv_id = srv['tags'].get('tag.service_id', srv['addr'])
        try:
            listing = self.directory.list(RDIR_ACCT, srv_id,
                                          service_type='rdir', **kwargs)
            linked_host = _filter_rdir_host(listing)
            try:
                srv['rdir'] = by_id[_make_id(self.ns, 'rdir', linked_host)]
            except KeyError:
                # The link exists but points to an unregistered rdir.
                self.logger.warn("rdir %s linked to %s %s seems down",
                                 linked_host, service_type, srv_id)
        except NotFound:
            # No link yet: pick an rdir and create the association.
            try:
                new_rdir = self._smart_link_rdir(
                    srv_id, rdir_services, service_type=service_type,
                    max_per_rdir=max_per_rdir, **kwargs)
            except OioException as exc:
                self.logger.warn("Failed to link an rdir to %s %s: %s",
                                 service_type, srv_id, exc)
                failures.append((srv_id, exc))
                continue
            # Account for the new database so later picks see it.
            count = by_id[new_rdir]['tags'].get("stat.opened_db_count", 0)
            by_id[new_rdir]['tags']["stat.opened_db_count"] = count + 1
            srv['rdir'] = by_id[new_rdir]
        except OioException as exc:
            self.logger.warn(
                "Failed to check rdir linked to %s %s "
                "(thus won't try to make the link): %s",
                service_type, srv_id, exc)
            failures.append((srv_id, exc))
    if failures:
        # group_chunk_errors is flexible enough to accept service addresses
        grouped = group_chunk_errors(failures)
        if len(grouped) == 1:
            err, addrs = grouped.popitem()
            oio_reraise(type(err), err, str(addrs))
        else:
            raise OioException('Several errors encountered: %s' % grouped)
    return services
def _smart_link_rdir(self, volume_id, all_rdir, max_per_rdir=None):
    """
    Force the load balancer to avoid services that already host more
    bases than the average (or more than `max_per_rdir`)
    while selecting rdir services.

    :param volume_id: ID of the volume to link an rdir to
    :param all_rdir: list of all known rdir service descriptions
    :param max_per_rdir: maximum number of databases an rdir should host
    :type max_per_rdir: `int`
    :returns: the ID of the rdir service that was linked
    :raises ServiceUnavailable: when no rdir service with a positive
        score is available
    """
    # Use .get(..., 0) so an rdir that has not reported the
    # stat.opened_db_count tag yet does not break the selection
    # (consistent with the other version of this method).
    opened_db = [x['tags'].get('stat.opened_db_count', 0)
                 for x in all_rdir if x['score'] > 0]
    if len(opened_db) <= 0:
        raise ServiceUnavailable("No valid rdir service found in %s" %
                                 self.ns)
    if not max_per_rdir:
        upper_limit = sum(opened_db) / float(len(opened_db))
    else:
        upper_limit = max_per_rdir - 1
    # Ask the load balancer to avoid the most loaded rdir services.
    avoids = [_make_id(self.ns, "rdir", x['addr'])
              for x in all_rdir
              if x['score'] > 0 and
              x['tags'].get('stat.opened_db_count', 0) > upper_limit]
    known = [_make_id(self.ns, "rawx", volume_id)]
    try:
        polled = self._poll_rdir(avoid=avoids, known=known)
    except ClientException as exc:
        if exc.status != 481 or max_per_rdir:
            raise
        # Retry without `avoids`, hoping the next iteration will rebalance
        polled = self._poll_rdir(known=known)
    # Register the link in the directory (meta1).
    forced = {'host': polled['addr'], 'type': 'rdir',
              'seq': 1, 'args': "", 'id': polled['id']}
    self.directory.force(RDIR_ACCT, volume_id, 'rdir', forced,
                         autocreate=True)
    # Best effort: create the database on the rdir itself.
    try:
        self.rdir.create(volume_id)
    except Exception as exc:
        self.logger.warn("Failed to create database for %s on %s: %s",
                         volume_id, polled['addr'], exc)
    return polled['id']
def assign_all_rawx(self, max_per_rdir=None):
    """
    Find a rdir service for all rawx that don't have one already.

    :param max_per_rdir: maximum number of rawx services that an rdir
                         can be linked to
    :type max_per_rdir: `int`
    :returns: the list of rawx services, annotated with their rdir
    """
    cs = ConscienceClient(self.conf)
    all_rawx = cs.all_services('rawx')
    all_rdir = cs.all_services('rdir', True)
    if len(all_rdir) <= 0:
        raise ServiceUnavailable("No rdir service found in %s" % self.ns)
    by_id = {_make_id(self.ns, 'rdir', x['addr']): x for x in all_rdir}
    for rawx in all_rawx:
        try:
            # Verify that there is no rdir linked
            resp = self.directory.list(RDIR_ACCT, rawx['addr'],
                                       service_type='rdir')
            rdir_host = _filter_rdir_host(resp)
            try:
                rawx['rdir'] = by_id[_make_id(self.ns, 'rdir', rdir_host)]
            except KeyError:
                self.logger.warn("rdir %s linked to rawx %s seems down",
                                 rdir_host, rawx['addr'])
        except (NotFound, ClientException):
            if rawx['score'] <= 0:
                self.logger.warn("rawx %s has score %s, and thus cannot be"
                                 " affected a rdir (load balancer "
                                 "limitation)",
                                 rawx['addr'], rawx['score'])
                continue
            # Fixed call: the previous code passed `cs` as an extra
            # positional argument, which does not match
            # _smart_link_rdir(volume_id, all_rdir, max_per_rdir=None).
            rdir = self._smart_link_rdir(rawx['addr'], all_rdir,
                                         max_per_rdir=max_per_rdir)
            # Account for the new database so later picks see it.
            n_bases = by_id[rdir]['tags'].get("stat.opened_db_count", 0)
            by_id[rdir]['tags']["stat.opened_db_count"] = n_bases + 1
            rawx['rdir'] = by_id[rdir]
    return all_rawx
def _smart_link_rdir(self, volume_id, all_rdir, max_per_rdir=None,
                     max_attempts=7, service_type='rawx', min_dist=None,
                     **kwargs):
    """
    Force the load balancer to avoid services that already host more
    bases than the average (or more than `max_per_rdir`)
    while selecting rdir services.

    :param volume_id: ID of the volume to link an rdir to
    :param all_rdir: list of all known rdir service descriptions
    :param max_per_rdir: maximum number of databases an rdir should host
    :type max_per_rdir: `int`
    :param max_attempts: how many times to retry the directory `force`
        call on retriable errors
    :type max_attempts: `int`
    :param service_type: type of the service being linked (e.g. 'rawx')
    :param min_dist: minimum distance required between the service and
        its rdir (forwarded to the load balancer poll)
    :returns: the ID of the rdir service that was linked
    :raises ServiceUnavailable: when no rdir with a positive score exists
    """
    # Databases currently hosted by each rdir with a positive score;
    # missing tag counts as 0.
    opened_db = [x['tags'].get('stat.opened_db_count', 0) for x in all_rdir
                 if x['score'] > 0]
    if len(opened_db) <= 0:
        raise ServiceUnavailable(
            "No valid rdir service found in %s" % self.ns)
    if not max_per_rdir:
        # No explicit cap: avoid anything above the current average.
        upper_limit = sum(opened_db) / float(len(opened_db))
    else:
        upper_limit = max_per_rdir - 1
    # rdir services already above the limit, to be avoided by the poll.
    avoids = [_make_id(self.ns, "rdir", x['addr'])
              for x in all_rdir
              if x['score'] > 0 and
              x['tags'].get('stat.opened_db_count', 0) > upper_limit]
    known = [_make_id(self.ns, service_type, volume_id)]
    try:
        polled = self._poll_rdir(avoid=avoids, known=known,
                                 min_dist=min_dist, **kwargs)
    except ClientException as exc:
        # 481 presumably means "no service matched the constraints"
        # — TODO(review) confirm against the load balancer error codes.
        if exc.status != 481 or max_per_rdir:
            raise
        # Retry without `avoids`, hoping the next iteration will rebalance
        polled = self._poll_rdir(known=known, min_dist=min_dist, **kwargs)

    # Associate the rdir to the rawx
    forced = {'host': polled['addr'], 'type': 'rdir',
              'seq': 1, 'args': "", 'id': polled['id']}
    for i in range(max_attempts):
        try:
            self.directory.force(RDIR_ACCT, volume_id, 'rdir',
                                 forced, autocreate=True, **kwargs)
            break
        except ClientException as ex:
            # Already done
            done = (455,)
            if ex.status in done:
                break
            # Older meta1 versions report an already-existing link
            # through these SQLite constraint messages: treat as success.
            if ex.message.startswith(
                    'META1 error: (SQLITE_CONSTRAINT) '
                    'UNIQUE constraint failed'):
                self.logger.info(
                    "Ignored exception (already0): %s", ex)
                break
            if ex.message.startswith(
                    'META1 error: (SQLITE_CONSTRAINT) '
                    'columns cid, srvtype, seq are not unique'):
                self.logger.info(
                    "Ignored exception (already1): %s", ex)
                break
            # Manage several unretriable errors
            retry = (406, 450, 503, 504)
            if ex.status >= 400 and ex.status not in retry:
                raise
            # Monotonic backoff (retriable and net errors)
            if i < max_attempts - 1:
                sleep(i * 1.0)
                continue
            # Too many attempts
            raise

    # Do the creation in the rdir itself; best effort only, a failure
    # here is logged but does not undo the directory link.
    try:
        self.rdir.create(volume_id, service_type=service_type, **kwargs)
    except Exception as exc:
        self.logger.warn("Failed to create database for %s on %s: %s",
                         volume_id, polled['addr'], exc)
    return polled['id']
def assign_services(self, service_type, max_per_rdir=None, min_dist=None,
                    service_id=None, reassign=False, **kwargs):
    """
    Assign an rdir service to all `service_type` servers that aren't
    already assigned one.

    :param max_per_rdir: Maximum number of services an rdir can handle.
    :type max_per_rdir: `int`
    :param min_dist: Minimum required distance between any service and
        its assigned rdir service.
    :type min_dist: `int`
    :param service_id: Assign only this service ID.
    :type service_id: `str`
    :param reassign: Reassign an rdir service.
    :type reassign: `bool`
    :param dry_run: Display actions but do nothing.
    :type dry_run: `bool`
    :returns: The list of `service_type` services that were assigned
        rdir services.
    """
    all_services = self.cs.all_services(service_type, **kwargs)
    if service_id:
        # for/else: the `else` runs only when no break occurred,
        # i.e. when the requested service ID was not found.
        for provider in all_services:
            provider_id = provider['tags'].get('tag.service_id',
                                               provider['addr'])
            if service_id == provider_id:
                break
        else:
            raise ValueError('%s isn\'t a %s' % (service_id, service_type))
        # Restrict the work to the single matching service.
        all_services = [provider]
    all_rdir = self.cs.all_services('rdir', True, **kwargs)
    if len(all_rdir) <= 0:
        raise ServiceUnavailable("No rdir service found in %s" % self.ns)
    by_id = _build_dict_by_id(self.ns, all_rdir)
    errors = list()
    for provider in all_services:
        provider_id = provider['tags'].get('tag.service_id',
                                           provider['addr'])
        try:
            resp = self.directory.list(RDIR_ACCT, provider_id,
                                       service_type='rdir', **kwargs)
            rdir_host = _filter_rdir_host(resp)
            try:
                rdir = by_id[_make_id(self.ns, 'rdir', rdir_host)]
                if reassign:
                    # Give back the database to the previous rdir, then
                    # deliberately raise NotFound to fall through to the
                    # linking code below.
                    rdir['tags']['stat.opened_db_count'] = \
                        rdir['tags'].get('stat.opened_db_count', 0) - 1
                    # TODO(adu) Delete database
                    raise NotFound('Reassign an rdir services')
                provider['rdir'] = rdir
            except KeyError:
                # Link exists but the target rdir is not registered.
                self.logger.warn("rdir %s linked to %s %s seems down",
                                 rdir_host, service_type, provider_id)
                if reassign:
                    raise NotFound('Reassign an rdir services')
        except NotFound:
            # No link yet (or a reassignment was requested above).
            try:
                rdir = self._smart_link_rdir(provider_id, all_rdir,
                                             service_type=service_type,
                                             max_per_rdir=max_per_rdir,
                                             min_dist=min_dist,
                                             reassign=reassign,
                                             **kwargs)
            except OioException as exc:
                self.logger.warn("Failed to link an rdir to %s %s: %s",
                                 service_type, provider_id, exc)
                errors.append((provider_id, exc))
                continue
            # Account for the new database so later picks see it.
            n_bases = by_id[rdir]['tags'].get("stat.opened_db_count", 0)
            by_id[rdir]['tags']["stat.opened_db_count"] = n_bases + 1
            provider['rdir'] = by_id[rdir]
        except OioException as exc:
            self.logger.warn(
                "Failed to check rdir linked to %s %s "
                "(thus won't try to make the link): %s",
                service_type, provider_id, exc)
            errors.append((provider_id, exc))
    if errors:
        # group_chunk_errors is flexible enough to accept service addresses
        errors = group_chunk_errors(errors)
        if len(errors) == 1:
            err, addrs = errors.popitem()
            oio_reraise(type(err), err, str(addrs))
        else:
            raise OioException('Several errors encountered: %s' % errors)
    return all_services