def _ClassifyFree(self, free, pool_name):
    """Group the usable free machines by their machine class.

    Args:
      free: host names offered as free machines.
      pool_name: name of the pool the hosts came from; only used for logging.

    Returns:
      { class_string : [machine_name] } covering every candidate that is not
      in self._used_hosts and for which the machine manager reports hardware.
    """
    # Ensure free machines are unique, in a stable order.
    candidates = setlib.make_dict(free).keys()
    candidates.sort()

    # Machines that are already in use must never be handed out again.
    in_use = setlib.intersect(candidates, self._used_hosts)
    if in_use:
        prodlib.log('Ignoring used machines: %s' % ' '.join(in_use))
        candidates = setlib.diff(candidates, in_use)

    # Preload hardware data for all candidates in one call.
    self._mach_mgr.MachineList(candidates, load_hardware=1)

    # Bucket by defining characteristics so that later only a smaller set of
    # free machines has to be examined as replacement candidates.
    by_class = {}
    for hostname in candidates:
        machine = self._mach_mgr.Machine(hostname)
        # Keep only machines that are known and have hardware information.
        if machine and machine.hardware():
            by_class.setdefault(machine.ClassString(), []).append(
                machine.name())

    # The count logged here still includes hosts dropped by the filter above.
    prodlib.log('Found %d free machs from %s.\n' % (len(candidates), pool_name))
    return by_class
def _FreeDict(self, coloc, pool_name):
    """Get a free dictionary corresponding to the pool name and coloc.

    Results are memoized in self._free_pools under the key
    (coloc, pool_name).

    Args:
      coloc: coloc; ignored (treated as None) for the 'specified' pool.
      pool_name: pool name.

    Returns:
      { 'free_class' : ['mach'] }
    """
    user_specified = pool_name == 'specified'

    # Specified pools are not restricted by coloc.
    if user_specified:
        coloc = None

    cache_key = (coloc, pool_name)
    cached = self._free_pools.get(cache_key)
    if cached is not None:
        return cached

    # Find free machines if not specified by user.
    if user_specified:
        candidates = self._free
    else:
        prodlib.log('Finding free machs from %s:%s.' % (coloc, pool_name))
        candidates = []

    classified = self._ClassifyFree(candidates, pool_name)
    self._free_pools[cache_key] = classified
    return classified
def main(argv): import getopt global send_mail, MAILTO # so mail works regardless where we fail! send_mail = 0 # disable mail unless user wants it. It is enabled # by default only so we can catch major syntax errors. batch = 0 # assume we're running from command line try: (optlist, args) = getopt.getopt(argv, 'n', ['re=', 'delay=', 'ports=', 'loop', 'noexec', 'nolock', 'force_lock', 'start=', 'kill=', 'mailto=', 'nomail', 'fromcron', 'batch', 'mach=', 'nodataversion', 'sets=', 'sandbox=', 'setpgrp', 'kill_batch_size=', 'nocheckpoint', 'checkpoint_time=', 'validate', 'useinvalidconfig', 'nolooprestarts', 'corphack', 'corptest', 'babyalias=', 'lockdir=', 'maxiters=', 'config_dir=', 'restarts_file=', 'nobabycheck', 'ssh_user='******'nortsignals', 'regtest', ] ) except getopt.error, e: prodlib.log("getopt error: %s" % e) usage()
def NormalizeTypeLevel(typelvl):
    """Return typelvl in the canonical 'mtype:lvl' form.

    Args:
      typelvl: either 'mtype:lvl' or a bare 'mtype'.

    Returns:
      typelvl unchanged when it already contains exactly one ':', otherwise
      typelvl with ':0' appended (level 0 is assumed if unknown).

    Raises:
      RuntimeError: typelvl contains more than one ':' (i.e. 'mtype::lvl').
    """
    separators = typelvl.count(':')

    # typelvl is in correct format (i.e. 'mtype:lvl')
    if separators == 1:
        return typelvl

    # force typelvl (i.e. 'mtype')
    if separators == 0:
        return typelvl + ':0'  # assume lvl 0 if unknown

    # invalid format (i.e. 'mtype::lvl'); carry the reason in the exception
    # too so that callers that only see the traceback know what was wrong.
    message = "Invalid typelvl format: %s" % typelvl
    prodlib.log(message)
    raise RuntimeError(message)
def go(self, timeout=None):
    """Run the monitor event loop and restart servers on request.

    Each line read from self.monitor_out names a server to restart; the
    matching callable in self.restartfns is invoked and the restart is
    counted in self.restarts under the key found in self.originaldata.

    Args:
      timeout: seconds to spend in the loop, or None to run until the
        monitor reports MAXITERS followed by EOF.

    Returns:
      self.restarts, the restart count per original hostport.

    Exits the process with status 1 if the monitor output reaches EOF before
    MAXITERS was seen.
    """
    # done registering the machines
    if self.monitor_in is not None:
        self.monitor_in.close()
        self.monitor_in = None

    # if we have nothing to do just return
    if not self.restartfns:
        return self.restarts

    reached_maxiters = 0
    # Test for None first: ordering None against a number is arbitrary in
    # Python 2 and a TypeError in Python 3.
    while timeout is None or timeout > 0:
        start_time = time.time()

        # We select to wait at most the specified timeout
        (ioready, _, _) = select.select(
            [self.monitor_out],
            [],  # don't care about "write"-s
            [],  # ... or errors
            timeout)
        if self.monitor_out not in ioready:
            return self.restarts

        # Adjust the time left for us
        if timeout is not None:
            timeout = timeout - time.time() + start_time

        line = self.monitor_out.readline()  # each line now means "restart me!"
        if line == 'MAXITERS\n':
            assert self.maxiters > 0
            assert not reached_maxiters
            reached_maxiters = 1
            continue

        if not line:
            # EOF is only expected after the monitor announced MAXITERS.
            if reached_maxiters:
                break
            prodlib.log(
                "EOF from monitor subprocess! (Subprocess died?) Exiting.")
            prodlib.log("Make sure you have monitor installed " +
                        "(google/bin/monitor)")
            sys.exit(1)

        assert not reached_maxiters
        printable_hostport = line[:-1]  # line (minus \n) is key to table
        print("Restarting %s" % printable_hostport)
        sys.stdout.flush()  # mix this with monitor's messages
        self.restartfns[printable_hostport]()
        original_hostport = self.originaldata[printable_hostport]
        self.restarts[original_hostport] = self.restarts.get(
            original_hostport, 0) + 1

    return self.restarts
def RemoveSets(self, srv_mgr, srvsetnums):
    """Trim every port of the named sets down to at most num servers.

    Args:
      srv_mgr: server manager providing Set() and RemoveServer().
      srvsetnums: sequence of 'srvset:num' strings. For each port of srvset
        the servers located beyond position num are removed, last one first.
        A negative num is treated as 0.

    Returns:
      (removed, []) where removed lists the string form of every removed
      server. Sets with the 'auto_assigned' property are left untouched.
    """
    prodlib.log('Beginning RemoveSet:\n')
    removed = []
    for srvsetnum in srvsetnums:
        (srvset, num) = string.split(srvsetnum, ':')
        num = int(num)
        srv_set = srv_mgr.Set(srvset)
        if srv_set.property('auto_assigned'):
            continue

        # For each port, cut off servers that are located > num.
        for port in srv_set.Ports():
            # Work on a snapshot of the surplus servers. The previous code
            # re-read servers[-1] after each removal, which is only correct
            # if ServersForPort() hands out a live list that RemoveServer()
            # shrinks; the snapshot is correct in either case.
            surplus = list(srv_set.ServersForPort(port)[max(num, 0):])
            surplus.reverse()
            for server in surplus:
                removed.append('%s' % server)
                prodlib.log(' Removed server %s' % server)
                srv_mgr.RemoveServer(server)

    prodlib.log('\nremoved="%s"' % string.join(removed))
    prodlib.log('\nFinished RemoveSet.')
    return (removed, [])
def main(argv): import getopt global send_mail, MAILTO # so mail works regardless where we fail! send_mail = 0 # disable mail unless user wants it. It is enabled # by default only so we can catch major syntax errors. batch = 0 # assume we're running from command line try: (optlist, args) = getopt.getopt(argv, 'n', [ 're=', 'delay=', 'ports=', 'loop', 'noexec', 'nolock', 'force_lock', 'start=', 'kill=', 'mailto=', 'nomail', 'fromcron', 'batch', 'mach=', 'nodataversion', 'sets=', 'sandbox=', 'setpgrp', 'kill_batch_size=', 'nocheckpoint', 'checkpoint_time=', 'validate', 'useinvalidconfig', 'nolooprestarts', 'corphack', 'corptest', 'babyalias=', 'lockdir=', 'maxiters=', 'config_dir=', 'restarts_file=', 'nobabycheck', 'ssh_user='******'nortsignals', 'regtest', ]) except getopt.error, e: prodlib.log("getopt error: %s" % e) usage()
def RemoveNum(self, srv_mgr, srvsetnums):
    """Remove num servers from each named set, highest index first.

    Args:
      srv_mgr: server manager providing Set(), ServersForHost() and
        RemoveServer().
      srvsetnums: sequence of 'srvset:num' strings; num is the number of
        servers to remove from srvset.

    Returns:
      (removed, []) where removed lists the string form of every removed
      server. Sets with the 'auto_assigned' property are left untouched.

    Raises:
      Error: a server that is about to be removed lives on a host that
        serves more than one server.
    """
    prodlib.log('Beginning RemoveNum:\n')
    removed = []
    for srvsetnum in srvsetnums:
        (srvset, num) = string.split(srvsetnum, ':')
        num = int(num)
        srv_set = srv_mgr.Set(srvset)
        if srv_set.property('auto_assigned'):
            continue

        # Walk copies: the underlying lists change as servers are removed.
        indices = srv_set.Indices()[:]
        indices.reverse()
        for index in indices:
            servers = srv_set.ServersForIndex(index)[:]
            servers.reverse()
            # TODO: We may want to be less strict about not allowing
            # servers to be removed.
            for server in servers:
                # Check the quota first so that we never fail on a server
                # we were not going to remove anyway.
                if num <= 0:
                    break
                if len(srv_mgr.ServersForHost(server.host())) > 1:
                    raise Error('%s has multiple roles' % server)
                removed.append('%s' % server)
                srv_mgr.RemoveServer(server)
                prodlib.log(' Removed server %s' % server)
                num = num - 1
            if num <= 0:
                break

    prodlib.log('\nremoved="%s"' % string.join(removed))
    prodlib.log('\nFinished RemoveNum.')
    return (removed, [])
def RemoveNum(self, srv_mgr, srvsetnums):
    """Remove num servers from each named set, highest index first.

    Args:
      srv_mgr: server manager providing Set(), ServersForHost() and
        RemoveServer().
      srvsetnums: sequence of 'srvset:num' strings; num is the number of
        servers to remove from srvset.

    Returns:
      (removed, []) where removed lists the string form of every removed
      server. Sets with the 'auto_assigned' property are left untouched.

    Raises:
      Error: a server that is about to be removed lives on a host that
        serves more than one server.
    """
    prodlib.log('Beginning RemoveNum:\n')
    removed = []
    for srvsetnum in srvsetnums:
        (srvset, num) = string.split(srvsetnum, ':')
        num = int(num)
        srv_set = srv_mgr.Set(srvset)
        if srv_set.property('auto_assigned'):
            continue

        # Walk copies: the underlying lists change as servers are removed.
        indices = srv_set.Indices()[:]
        indices.reverse()
        for index in indices:
            servers = srv_set.ServersForIndex(index)[:]
            servers.reverse()
            # TODO: We may want to be less strict about not allowing
            # servers to be removed.
            for server in servers:
                # Check the quota first so that we never fail on a server
                # we were not going to remove anyway.
                if num <= 0:
                    break
                if len(srv_mgr.ServersForHost(server.host())) > 1:
                    raise Error('%s has multiple roles' % server)
                removed.append('%s' % server)
                srv_mgr.RemoveServer(server)
                prodlib.log(' Removed server %s' % server)
                num = num - 1
            if num <= 0:
                break

    prodlib.log('\nremoved="%s"' % string.join(removed))
    prodlib.log('\nFinished RemoveNum.')
    return (removed, [])
def GetLocalDataFileInfo(mtype):
    """Return the local data file specs configured for a machine type.

    Args:
      mtype: machine type whose 'local_data_files' property is looked up.

    Returns:
      A list of (srcpath, targetpath, files) tuples, one per entry of the
      property. Missing keys default to '', '' and ['*'] respectively. The
      list is empty (and a message is logged) when the property is unset.
    """
    entries = GetProperty(mtype, 'local_data_files')
    if not entries:
        prodlib.log('No local_data_files for %s.' % mtype)
        return []

    # Fill in the default values for anything an entry leaves out.
    return [(entry.get('srcpath', ''),
             entry.get('targetpath', ''),
             entry.get('files', ['*']))
            for entry in entries]
def GetLocalDataFileInfo(mtype):
    """Return the local data file specs configured for a machine type.

    Args:
      mtype: machine type whose 'local_data_files' property is looked up.

    Returns:
      A list of (srcpath, targetpath, files) tuples, one per entry of the
      property. Missing keys default to '', '' and ['*'] respectively. The
      list is empty (and a message is logged) when the property is unset.
    """
    entries = GetProperty(mtype, 'local_data_files')
    if not entries:
        prodlib.log('No local_data_files for %s.' % mtype)
        return []

    # Fill in the default values for anything an entry leaves out.
    return [(entry.get('srcpath', ''),
             entry.get('targetpath', ''),
             entry.get('files', ['*']))
            for entry in entries]
def handle_error(self, *info):
    """Record the current error in self.err_, log it and close the channel.

    The prototype for handle_error is different for 2.x and 1.5:
      2.x: def handle_error (self)
      1.5: def handle_error (self, *info), with exception info in info

    Args:
      *info: (exc_type, exc_value, exc_traceback) when called the 1.5 way;
        empty when called the 2.x way, in which case the details are taken
        from asyncore.compact_traceback().
    """
    if info:
        # python 1.5
        (exc_type, exc_value, exc_traceback) = info
    else:
        # python 2.x
        _, exc_type, exc_value, exc_traceback = asyncore.compact_traceback()

    self.err_ = (exc_type, exc_value)
    prodlib.log("error encountered: %s-%s" % self.err_)  # stderr logging!
    del exc_traceback
    self.close()
def go(self, timeout=None):
    """Run the monitor event loop and restart servers on request.

    Each line read from self.monitor_out names a server to restart; the
    matching callable in self.restartfns is invoked and the restart is
    counted in self.restarts under the key found in self.originaldata.

    Args:
      timeout: seconds to spend in the loop, or None to run until the
        monitor reports MAXITERS followed by EOF.

    Returns:
      self.restarts, the restart count per original hostport.

    Exits the process with status 1 if the monitor output reaches EOF before
    MAXITERS was seen.
    """
    # done registering the machines
    if self.monitor_in is not None:
        self.monitor_in.close()
        self.monitor_in = None

    # if we have nothing to do just return
    if not self.restartfns:
        return self.restarts

    reached_maxiters = 0
    # Test for None first: ordering None against a number is arbitrary in
    # Python 2 and a TypeError in Python 3.
    while timeout is None or timeout > 0:
        start_time = time.time()

        # We select to wait at most the specified timeout
        (ioready, _, _) = select.select(
            [self.monitor_out],
            [],  # don't care about "write"-s
            [],  # ... or errors
            timeout)
        if self.monitor_out not in ioready:
            return self.restarts

        # Adjust the time left for us
        if timeout is not None:
            timeout = timeout - time.time() + start_time

        line = self.monitor_out.readline()  # each line now means "restart me!"
        if line == 'MAXITERS\n':
            assert self.maxiters > 0
            assert not reached_maxiters
            reached_maxiters = 1
            continue

        if not line:
            # EOF is only expected after the monitor announced MAXITERS.
            if reached_maxiters:
                break
            prodlib.log(
                "EOF from monitor subprocess! (Subprocess died?) Exiting.")
            prodlib.log("Make sure you have monitor installed " +
                        "(google/bin/monitor)")
            sys.exit(1)

        assert not reached_maxiters
        printable_hostport = line[:-1]  # line (minus \n) is key to table
        print("Restarting %s" % printable_hostport)
        sys.stdout.flush()  # mix this with monitor's messages
        self.restartfns[printable_hostport]()
        original_hostport = self.originaldata[printable_hostport]
        self.restarts[original_hostport] = self.restarts.get(
            original_hostport, 0) + 1

    return self.restarts
def handle_error(self, *info):
    """Record the current error in self.err_, log it and close the channel.

    The prototype for handle_error is different for 2.x and 1.5:
      2.x: def handle_error (self)
      1.5: def handle_error (self, *info), with exception info in info

    Args:
      *info: (exc_type, exc_value, exc_traceback) when called the 1.5 way;
        empty when called the 2.x way, in which case the details are taken
        from asyncore.compact_traceback().
    """
    if info:
        # python 1.5
        (exc_type, exc_value, exc_traceback) = info
    else:
        # python 2.x
        _, exc_type, exc_value, exc_traceback = asyncore.compact_traceback()

    self.err_ = (exc_type, exc_value)
    prodlib.log("error encountered: %s-%s" % self.err_)  # stderr logging!
    del exc_traceback
    self.close()
def __call__(self, port, hint=None):
    """Map a port into its associated server type.

    Args:
      port: port number to classify.
      hint: optional server type to try first. It speeds up code in
        serverlib.py, because it runs through ports in sorted order, and
        some mtypes have many shards.

    Returns:
      The server type for port, or 'unknown' (after logging a warning) when
      no type claims it. Every answer found by the hint or the search is
      cached in self.port_to_type.
    """
    cached = self.port_to_type.get(port, None)
    if cached is not None:
        return cached

    # Fast path: does the port fall into the hinted type's range?
    if hint is not None:
        (lower_bound, upper_bound) = self.type_to_bounds[hint]
        if lower_bound <= port < upper_bound:
            self.port_to_type[port] = hint
            return hint

    if self.bounds_array is None:
        self.initialize_search()
    bounds = self.bounds_array

    # first, a sanity check
    if port < bounds[0]:
        prodlib.log('WARNING: returning unknown servertype for: %d' % port)
        return 'unknown'

    # Binary search for the last lower bound that is <= port. The window
    # [start, start + width) always contains that bound.
    start = 0
    width = len(bounds)
    while width > 1:
        half = width // 2
        if port < bounds[start + half]:
            width = half
        else:
            width = width - half
            start = start + half

    mtype = self.lower_bound_to_type.get(bounds[start], 'unknown')
    self.port_to_type[port] = mtype
    if mtype == 'unknown':
        prodlib.log('WARNING: returning unknown servertype for: %d' % port)
    return mtype
def AsynRequest(hostportlist, request, timeout, retrycnt=3, half_shutdown=1):
    """Send request to every hostport, retrying the ones that fail.

    Args:
      hostportlist: hostports to contact.
      request: request handed to each AsynClient.
      timeout: timeout handed to loop() for each round.
      retrycnt: maximum number of attempts per hostport.
      half_shutdown: passed through to AsynClient.

    Returns:
      clients.values(): the AsynClient of the latest attempt for every
      hostport that was contacted.
    """
    attempts = {}
    clients = {}
    pending = hostportlist
    backoff = 1
    while pending:
        # Clear the asyncore socket map before every loop. Otherwise, if we
        # have a dead server, then the socket corresponding to that server
        # will stay in the map for all the following retry attempts.
        asyncore.socket_map = {}

        # For each AsynClient, a socket is created and the map updated so
        # that the socket can be polled during the next loop() call
        for hostport in pending:
            clients[hostport] = AsynClient(
                hostport, request, half_shutdown=half_shutdown)  # connect
            attempts[hostport] = attempts.get(hostport, 0) + 1  # update counter

        # enter the select loop
        loop(timeout)

        # Collect whoever failed and still has attempts left; everybody
        # else is assumed to have replied fine.
        pending = []
        for hostport, client in clients.items():
            if not client.failed():
                continue
            if attempts[hostport] >= retrycnt:
                continue
            # Some error occured. Put it back in the list and try again
            prodlib.log("Error on %s port %s: %s %s" %
                        (hostport + client.err_))
            pending.append(hostport)

        if pending:  # any retries needed?
            time.sleep(backoff)  # ... then wait for better times
            if backoff < 20:
                backoff = backoff * 2  # exponential backoff

    return clients.values()
def AsynRequest(hostportlist, request, timeout, retrycnt=3, half_shutdown=1):
    """Send request to every hostport, retrying the ones that fail.

    Args:
      hostportlist: hostports to contact.
      request: request handed to each AsynClient.
      timeout: timeout handed to loop() for each round.
      retrycnt: maximum number of attempts per hostport.
      half_shutdown: passed through to AsynClient.

    Returns:
      clients.values(): the AsynClient of the latest attempt for every
      hostport that was contacted.
    """
    attempts = {}
    clients = {}
    pending = hostportlist
    backoff = 1
    while pending:
        # Clear the asyncore socket map before every loop. Otherwise, if we
        # have a dead server, then the socket corresponding to that server
        # will stay in the map for all the following retry attempts.
        asyncore.socket_map = {}

        # For each AsynClient, a socket is created and the map updated so
        # that the socket can be polled during the next loop() call
        for hostport in pending:
            clients[hostport] = AsynClient(
                hostport, request, half_shutdown=half_shutdown)  # connect
            attempts[hostport] = attempts.get(hostport, 0) + 1  # update counter

        # enter the select loop
        loop(timeout)

        # Collect whoever failed and still has attempts left; everybody
        # else is assumed to have replied fine.
        pending = []
        for hostport, client in clients.items():
            if not client.failed():
                continue
            if attempts[hostport] >= retrycnt:
                continue
            # Some error occured. Put it back in the list and try again
            prodlib.log("Error on %s port %s: %s %s" %
                        (hostport + client.err_))
            pending.append(hostport)

        if pending:  # any retries needed?
            time.sleep(backoff)  # ... then wait for better times
            if backoff < 20:
                backoff = backoff * 2  # exponential backoff

    return clients.values()
def Swap(self, srv_mgr, src, dst):
    """Move every server matching src onto the host dst.

    Args:
      srv_mgr: server manager providing constraint_mgr(), ServersForSpec()
        and ReplaceServer().
      src: spec selecting the servers to move.
      dst: host the servers are moved to.

    Returns:
      (added, removed, []) where removed holds the string form of each
      server before the move and added its string form after the move.
      Servers with the 'auto_assigned' property are skipped. Constraint
      violations caused by a move are logged as warnings only.
    """
    cnstr_mgr = srv_mgr.constraint_mgr()
    added = []
    removed = []
    prodlib.log('Beginning Swap:\n')

    # Iterate over a copy: the list handed out by ServersForSpec() is
    # modified internally when servers get replaced (Replace and Remove
    # copy it for the same reason).
    for server in srv_mgr.ServersForSpec(src)[:]:
        if server.property('auto_assigned'):
            continue
        prodlib.log(' Swapping %s with %s' % (server, dst))
        removed.append('%s' % server)
        srv_mgr.ReplaceServer(server, dst)
        added.append('%s' % server)
        results = cnstr_mgr.VerifyServer(srv_mgr, server, errors_only=1)
        if results:
            prodlib.log(' WARNING: %s' % results[0])

    prodlib.log('\nFinished Swap.')
    return (added, removed, [])
def Add(self, srv_mgr, names):
    """Create a server for each name and add it to the server manager.

    Args:
      srv_mgr: server manager providing constraint_mgr() and AddServer().
      names: server names understood by serverlib.Server.InitFromName().

    Returns:
      (added, []) where added lists the string form of every added server.
      Constraint violations are logged as warnings only.

    Raises:
      Error: a server turns out to be auto assigned. The check runs after
        the server has been handed to AddServer().
    """
    constraints = srv_mgr.constraint_mgr()
    added = []
    prodlib.log('Beginning Add:\n')

    for name in names:
        new_server = serverlib.Server()
        new_server.InitFromName(name)
        prodlib.log(' Adding %s' % new_server)
        srv_mgr.AddServer(new_server)
        if new_server.property('auto_assigned'):
            raise Error('Cannot add auto assigned server: %s' % new_server)
        added.append('%s' % new_server)

        problems = constraints.VerifyServer(
            srv_mgr, new_server, errors_only=1)
        if problems:
            prodlib.log(' WARNING: %s' % problems[0])

    prodlib.log('\nFinished Add.')
    return (added, [])
def Remove(self, srv_mgr, names):
    """Remove every server matching one of the given specs.

    Args:
      srv_mgr: server manager providing ServersForSpec() and RemoveServer().
      names: specs selecting the servers to remove. A spec that matches
        nothing only produces a warning.

    Returns:
      (removed, []) where removed lists the string form of every removed
      server. Servers with the 'auto_assigned' property are skipped.
    """
    removed = []
    prodlib.log('Beginning Removal:\n')

    for spec in names:
        # Make a copy since the servers for host will be internally modified.
        matches = list(srv_mgr.ServersForSpec(spec)[:])
        if not matches:
            prodlib.log('WARNING: no servers matched: %s' % spec)
        for victim in matches:
            if not victim.property('auto_assigned'):
                prodlib.log(' Removing %s' % victim)
                srv_mgr.RemoveServer(victim)
                removed.append('%s' % victim)

    prodlib.log('\nFinished Removal.')
    return (removed, [])
def _AllocateHostFromFreePool(self, srv_mgr, server, pool, free_dict,
                              force=0, exclude=None):
    """
    Allocate a machine for the passed in server from specific pool.

    Candidates are the hosts returned by the 'sharing' constraint's
    CompatibleHosts() (only consulted when self._used is set) plus one
    representative host per non-empty class in free_dict. Each candidate is
    trial-assigned with srv_mgr.ReplaceServer() and checked with
    VerifyServer(); a candidate whose last result reports an error is
    dropped, the others are scored by the sum of their result weights.

    Args:
      srv_mgr: server manager; provides constraint_mgr() and ReplaceServer().
      server: server to place; re-hosted through srv_mgr.ReplaceServer().
      pool: pool name, used only in the log message.
      free_dict: maps a machine class string to the list of free hosts of
        that class. The chosen free host is removed from it and a class
        whose list becomes empty is deleted.
      force: passed through to VerifyServer().
      exclude: optional hosts to drop from the candidates, in addition to
        self._exclude.

    Returns:
      server, placed on the best scoring candidate, or None (with the
      original host restored) if no candidate passed verification.
    """
    cnstr_mgr = srv_mgr.constraint_mgr()
    # Find currently used compatible machines.
    used_hosts = {}
    if self._used:
        used_hosts = cnstr_mgr.Constraint('sharing').CompatibleHosts(srv_mgr, server)
    # Find free machine set from the free_dict - we get one of each class
    # of machines and save the others in its class. We know that we can
    # rank members of the same class with the same score.
    free = {}
    for (machclass, hosts) in free_dict.items():
        if not hosts:
            continue
        # The first host of the class stands in for the whole class.
        free[hosts[0]] = hosts
    prodlib.log(' Allocating server for %s (used=%d, free=%d, pool=%s)' % \
                (server, len(used_hosts), len(free), pool))
    hosts = used_hosts.keys() + free.keys()
    # Exclude optional excludes.
    if self._exclude:
        hosts = setlib.diff(hosts, self._exclude)
    # Exclude locally specified excludes.
    if exclude:
        hosts = setlib.diff(hosts, exclude)
    # Randomize the order in which candidates are ranked.
    random.shuffle(hosts)
    # Save the original host.
    orig_host = server.host()
    results = []
    # NOTE(review): failed is filled below but never read in this function.
    failed = []
    # Assign weights and prune out ones that don't fit.
    for host in hosts:
        # Replace the server's host with the candidate host and verify.
        if self._verbose:
            prodlib.log(' ranking candidate: %s' % host)
        srv_mgr.ReplaceServer(server, host)
        servers = [server] + used_hosts.get(host, [])
        ver_results = cnstr_mgr.VerifyServer(srv_mgr, server,
                                             servers=servers, force=force)
        if self._verbose:
            for res in ver_results:
                if res.error():
                    status = 'fail'
                else:
                    status = 'ok'
                prodlib.log(' %s: %s' % (status, res))
        # Only the last verification result decides whether the host fits.
        if ver_results[-1].error():
            failed.append(host)
        else:
            # Compute total weight assigned to machine.
            weight = 0.0
            for res in ver_results:
                weight = weight + res.weight()
            if self._verbose:
                prodlib.log(' weight: %.2f' % weight)
            # Append results for machine. We augment the hosts with
            # free machines of the same class since these should receive
            # the same score.
            if free.has_key(host):
                for free_host in free[host]:
                    results.append((free_host, weight))
            else:
                results.append((host, weight))
    # Sort the results by highest weight to find the best candidate.
    results.sort(lambda x,y: -cmp(x[1], y[1]))
    # The body returns on its first pass, so only the best candidate is
    # ever tried.
    for (host, weight) in results:
        prodlib.log(' Trying %s (%.2f)' % (host, weight))
        # Set server to new host.
        srv_mgr.ReplaceServer(server, host)
        if not used_hosts.has_key(host):
            # Remove the machine from the free list if necessary.
            key = self._mach_mgr.Machine(host).ClassString()
            hosts = free_dict[key]
            hosts.remove(host)
            if hosts == []:
                del(free_dict[key])
        # Return the server with its newly allocated host.
        prodlib.log(' Allocated %s (%.2f)' % (server, weight))
        return server
    # Failed so replace old host.
    srv_mgr.ReplaceServer(server, orig_host)
    prodlib.log(' Unable to allocate server.')
    return None
def AddSets(self, srv_mgrs, srvsetnums, do_min=0):
    """Grow server sets shard by shard until they reach the wanted length.

    Args:
      srv_mgrs: a server manager or a list of server managers.
      srvsetnums: sequence of 'srvset', 'srvset:num' or 'srvset:min,max'
        strings. A bare 'srvset' takes min and max from its 'shardlen'
        constraint and is skipped when it has none. When empty, the sets are
        derived from the 'shardlen' constraints of the first server manager.
      do_min: when true grow each shard only up to min, otherwise up to max.

    Returns:
      (added, failed, tried, remaining_free): added lists the servers that
      got a host; failed lists servers below min that could not be placed;
      tried lists optional servers that could not be placed; remaining_free
      is the result of self._RemainingFree().

    Raises:
      Error: a balancer set ('+name') is requested but a server manager has
        no set called 'name'.
    """
    # Allow interface to take a single server manager.
    if type(srv_mgrs) != types.ListType:
        srv_mgrs = [srv_mgrs]
    # Find free dictionary for this set.
    srv_mgr = srv_mgrs[0]
    cnstr_mgr = srv_mgr.constraint_mgr()
    # Set up the sets to process from constraints if not specified.
    if not srvsetnums:
        # Used as a set: only the keys matter.
        srvsetnums = {}
        # First iterate through all existing sets and add those sets
        # that have a shard length constraint. Note that if the shardlen
        # constraint is specified in defaults this will add all currently
        # created sets.
        for set in srv_mgr.Sets():
            if set.property('auto_assigned'):
                continue
            if cnstr_mgr.Constraint('shardlen').Constraint(set.name()):
                srvsetnums[set.name()] = 1
        # There may be some sets specified that are not currently in
        # existance. Iterate through the list of explicitly specified
        # shardlen constraints and add sets for their types.
        for srvset in cnstr_mgr.Constraint('shardlen').server_sets():
            srvsetnums[srvset] = 1
        srvsetnums = srvsetnums.keys()
        srvsetnums.sort()
    # Set up the number of each type to add.
    tmp = []
    for srvsetnum in srvsetnums:
        if string.find(srvsetnum, ':') == -1:
            # TODO: Right now we're storing constraint descs in the constraint
            # manager. This will move in another checkin to the server sets
            # and will be more readable when accessing.
            vals = cnstr_mgr.Constraint('shardlen').Constraint(srvsetnum)
            # If no shardlen constrs were specified for this type then ignore.
            if not vals:
                prodlib.log(
                    'Cannot add sets for %s - no shardlen constraint' % srvsetnum)
                continue
            # Normalize to the 'srvset:min,max' form parsed below.
            srvsetnum = '%s:%s,%s' % (srvsetnum, vals[0], vals[1])
        tmp.append(srvsetnum)
    srvsetnums = tmp
    prodlib.log('Beginning AddSet:\n')
    added = []
    failed = []
    tried = []
    for srvsetnum in srvsetnums:
        (srvset, num) = string.split(srvsetnum, ':')
        num = string.split(num, ',')
        # 'min,max' gives both limits; a single number serves as both.
        if len(num) == 2:
            min = int(num[0])
            max = int(num[1])
        else:
            min = int(num[0])
            max = min
        if do_min:
            cnt = min
        else:
            cnt = max
        # For balancer sets, we add these with the same port range as the
        # balanced set if they were not present in the server manager object.
        # NOTE(review): no presence check is visible here; AddSet() is called
        # for every server manager - confirm AddSet() tolerates existing sets.
        if srvset[0] == '+':
            for srv_mgr in srv_mgrs:
                balset = srv_mgr.Set(srvset[1:])
                if not balset:
                    raise Error, 'Cannot add set %s: no balanced set' % srvset
                set = srv_mgr.AddSet(srvset, balset.level())
                # Ensure port ranges are matched to balanced set.
                for port in balset.Ports():
                    set.AddPort(port)
        # Build union of all ports for this type.
        ports = {}
        for srv_mgr in srv_mgrs:
            set = srv_mgr.Set(srvset)
            if set != None:
                cur_ports = set.Ports()
                for port in cur_ports:
                    ports[port] = 1
        ports = ports.keys()
        ports.sort()
        # Iterate up the slices so that we replace short shards first.
        for i in range(cnt + 1):
            # To check if all allocations failed on the ports.
            every_port_failed = 1
            # For each port check if this shard is short.
            for port in ports:
                for srv_mgr in srv_mgrs:
                    cnstr_mgr = srv_mgr.constraint_mgr()
                    # NOTE(review): unlike the port union above there is no
                    # None check here; verify that every server manager has
                    # this set.
                    set = srv_mgr.Set(srvset)
                    if set.property('auto_assigned'):
                        continue
                    if port not in set.Ports():
                        continue
                    # If we have enough servers just skip.
                    num = len(set.ServersForPort(port))
                    if num >= i:
                        every_port_failed = 0
                        continue
                    # Create a placeholder server object.
                    server = serverlib.Server()
                    server.InitFromName('%s%s:%s' % (srvset, i, port))
                    srv_mgr.AddServer(server)
                    # Try and allocate a host for it.
                    # Below the minimum shard length the allocation is forced.
                    force = 0
                    if num < min:
                        force = 1
                    if self._AllocateHost(srv_mgr, server, force):
                        every_port_failed = 0
                        added.append('%s' % server)
                    else:
                        # Failed so remove it from the map.
                        srv_mgr.RemoveServer(server)
                        if force:
                            failed.append('%s' % server)
                        else:
                            tried.append('%s' % server)
            # Break if we tried every port for this server manager and
            # could not allocate anything.
            if every_port_failed:
                break
    prodlib.log('\nsuccess="%s"' % string.join(added))
    prodlib.log('failed="%s"' % string.join(failed))
    prodlib.log('tried="%s"' % string.join(tried))
    if self._free:
        prodlib.log('free="%s"' % string.join(self._RemainingFree()))
    return (added, failed, tried, self._RemainingFree())
def ComputeServers(config, types, machine_re, restrictports, myhostname, excluded, do_ckpt, ckpt_time, sets, restrict_servers=None, ssh_user=None): srv_mgr = config.GetServerManager() servers = srv_mgr.Servers(wanted_sets=types, wanted_ports=restrictports, wanted_indices=sets) computed_servers = [] for server in servers: set = srv_mgr.Set(server.srvset()) host = server.host() port = server.port() if restrict_servers and not restrict_servers.get(str(server), 0): continue mtype = server.servertype() if not WantedServer(host, port, mtype, config, machine_re, myhostname, excluded): # Server has not been selected for inclusion. continue # Allow overriding ssh_user. if server.property('ssh_user'): ssh_user = server.property('ssh_user') # For AM transition, if there is no binary user set, then # do not use the ssh_user so we can still go on as root. # We will run the babysitter as root and it will ssh into # machines with binary_user set as prodsetup, and it will # ssh into machines without binary_user set as root. # TODO: Remove this when we are finished with conversion. binary_user = set.property('binary_user') if not binary_user or binary_user == 'root': ssh_user = None # Print out an informational string for the user. set_str = '' if sets: set_str = ' - set %s' % server.index() print "Checking %s:%d (%s%s)" % \ (host, servertype.GetServingPort(port), mtype, set_str) safe_start_time = server.index() * server.property('inter_set_delay') try: hostip = socket.gethostbyname(host) except socket.error, e: prodlib.log("DNS error for %s: %s. Skipping." % (host, e)) continue # Form restart closure restartfn = lambda f=server.Start, m=print_only(), u=ssh_user: \ f(m, u) # Form kill closure killfn = lambda f=server.Stop, u=ssh_user, ck=do_ckpt, ct=ckpt_time: \ f(2, u, ck, ct) server.set_property('hostip', hostip) server.set_property('safe_start_time', safe_start_time) server.set_property('restartfn', restartfn) server.set_property('killfn', killfn) computed_servers.append(server)
def _AllocateHostFromFreePool(self,
                              srv_mgr,
                              server,
                              pool,
                              free_dict,
                              force=0,
                              exclude=None):
    """
    Allocate a machine for the passed in server from specific pool.

    Candidates are the hosts returned by the 'sharing' constraint's
    CompatibleHosts() (only consulted when self._used is set) plus one
    representative host per non-empty class in free_dict. Each candidate is
    trial-assigned with srv_mgr.ReplaceServer() and checked with
    VerifyServer(); a candidate whose last result reports an error is
    dropped, the others are scored by the sum of their result weights.

    Args:
      srv_mgr: server manager; provides constraint_mgr() and ReplaceServer().
      server: server to place; re-hosted through srv_mgr.ReplaceServer().
      pool: pool name, used only in the log message.
      free_dict: maps a machine class string to the list of free hosts of
        that class. The chosen free host is removed from it and a class
        whose list becomes empty is deleted.
      force: passed through to VerifyServer().
      exclude: optional hosts to drop from the candidates, in addition to
        self._exclude.

    Returns:
      server, placed on the best scoring candidate, or None (with the
      original host restored) if no candidate passed verification.
    """
    cnstr_mgr = srv_mgr.constraint_mgr()
    # Find currently used compatible machines.
    used_hosts = {}
    if self._used:
        used_hosts = cnstr_mgr.Constraint('sharing').CompatibleHosts(
            srv_mgr, server)
    # Find free machine set from the free_dict - we get one of each class
    # of machines and save the others in its class. We know that we can
    # rank members of the same class with the same score.
    free = {}
    for (machclass, hosts) in free_dict.items():
        if not hosts:
            continue
        # The first host of the class stands in for the whole class.
        free[hosts[0]] = hosts
    prodlib.log(' Allocating server for %s (used=%d, free=%d, pool=%s)' % \
                (server, len(used_hosts), len(free), pool))
    hosts = used_hosts.keys() + free.keys()
    # Exclude optional excludes.
    if self._exclude:
        hosts = setlib.diff(hosts, self._exclude)
    # Exclude locally specified excludes.
    if exclude:
        hosts = setlib.diff(hosts, exclude)
    # Randomize the order in which candidates are ranked.
    random.shuffle(hosts)
    # Save the original host.
    orig_host = server.host()
    results = []
    # NOTE(review): failed is filled below but never read in this function.
    failed = []
    # Assign weights and prune out ones that don't fit.
    for host in hosts:
        # Replace the server's host with the candidate host and verify.
        if self._verbose:
            prodlib.log(' ranking candidate: %s' % host)
        srv_mgr.ReplaceServer(server, host)
        servers = [server] + used_hosts.get(host, [])
        ver_results = cnstr_mgr.VerifyServer(
            srv_mgr, server, servers=servers, force=force)
        if self._verbose:
            for res in ver_results:
                if res.error():
                    status = 'fail'
                else:
                    status = 'ok'
                prodlib.log(' %s: %s' % (status, res))
        # Only the last verification result decides whether the host fits.
        if ver_results[-1].error():
            failed.append(host)
        else:
            # Compute total weight assigned to machine.
            weight = 0.0
            for res in ver_results:
                weight = weight + res.weight()
            if self._verbose:
                prodlib.log(' weight: %.2f' % weight)
            # Append results for machine. We augment the hosts with
            # free machines of the same class since these should receive
            # the same score.
            if free.has_key(host):
                for free_host in free[host]:
                    results.append((free_host, weight))
            else:
                results.append((host, weight))
    # Sort the results by highest weight to find the best candidate.
    results.sort(lambda x, y: -cmp(x[1], y[1]))
    # The body returns on its first pass, so only the best candidate is
    # ever tried.
    for (host, weight) in results:
        prodlib.log(' Trying %s (%.2f)' % (host, weight))
        # Set server to new host.
        srv_mgr.ReplaceServer(server, host)
        if not used_hosts.has_key(host):
            # Remove the machine from the free list if necessary.
            key = self._mach_mgr.Machine(host).ClassString()
            hosts = free_dict[key]
            hosts.remove(host)
            if hosts == []:
                del (free_dict[key])
        # Return the server with its newly allocated host.
        prodlib.log(' Allocated %s (%.2f)' % (server, weight))
        return server
    # Failed so replace old host.
    srv_mgr.ReplaceServer(server, orig_host)
    prodlib.log(' Unable to allocate server.')
    return None
def DoBabysit(servers, config, maxiters=0, monitor_port_increment=0,
              extra_restarts=None, restart_requests=None, nolooprestarts=0,
              nortsignals=0, succinterval=None, failinterval=None):
    """Babysit the given servers and return the ones that were restarted.

    Args:
      servers: servers to register with the monitor event loop.
      config: configuration handed to BabysitterArgvChecker.
      maxiters: passed through to monitor_event_loop.
      monitor_port_increment: added to the monitor's status port base.
      extra_restarts: servers to restart up front, each no earlier than its
        'safe_start_time' property (seconds after this phase began).
      restart_requests: object whose MarkRestarted() is told about every
        extra restart; the extra restarts only run when it is given.
      nolooprestarts: when true no server is registered and a plain sleep
        command takes the monitor's place.
      nortsignals, succinterval, failinterval: appended to the monitor
        command as --nortsignals / --succinterval= / --failinterval=.

    Returns:
      List of servers restarted because their argv changed or because the
      event loop reported a restart for them.

    Exits the process when nolooprestarts is set and extra_restarts is empty.
    """
    # map from (host, port) to the corresponding server . We need this
    # because el.go returns us results in form of host ports while we
    # want servers.
    hostport_srvinfo_map = {}
    if nolooprestarts:
        if extra_restarts:
            sleep_time = max(map(lambda m: m.property('safe_start_time'),
                                 extra_restarts))
        else:
            prodlib.log('Babysitter loop has nothing to do.. exiting')
            sys.exit()
        # endif
        # NOTE(review): the trailing '# ' presumably turns the flags appended
        # below into a shell comment - confirm how the command is executed.
        monitor_command = "sleep %d # " % sleep_time
    else:
        monitor_command = "%s/google/bin/monitor --status_port=%s" % (
            sitecustomize.GOOGLEBASE,
            servertype.GetPortBase('monitor') + monitor_port_increment)
    # endif
    if nortsignals:
        monitor_command = monitor_command + " --nortsignals"
    if succinterval:
        monitor_command = monitor_command + " --succinterval=%s" % succinterval
    if failinterval:
        monitor_command = monitor_command + " --failinterval=%s" % failinterval
    el = monitor_event_loop(monitor_command, maxiters)
    for server in servers:
        # Cannot babysit virtual servers (ports >= 65536)
        if server.port() >= 65536:
            continue
        if server.property('skip_babysitting'):
            print "WARNING: babysitting disabled for %s:%s" % (server.host(),
                                                               server.port())
            continue
        if not nolooprestarts:
            datadir = server.datadir()
            if datadir is None:
                datadir = ''
            query = server.property('request_info')
            # some queries require dataversion. Right now dataversion is
            # the datadir for all servers.
            query = query % {'dataversion': datadir}
            el.register((server.host(),
                         servertype.GetServingPort(server.balport()),
                         server.port(),
                         server.property('hostip'),
                         server.property('restartfn')),
                        query,
                        server.property('response_len'),
                        server.property('test_timeouts'))
            # NOTE(review): nesting reconstructed from the '# endif' marker;
            # the map is only filled for servers registered with the loop.
            host_port = (server.host(), server.port())
            hostport_srvinfo_map[host_port] = server
        # endif
    if extra_restarts and restart_requests:
        start_time = time.time()
        for server in extra_restarts:
            # Insure that enough seconds have passed since we start
            passed_time = time.time() - start_time
            if server.property('safe_start_time') > passed_time:
                delay = server.property('safe_start_time') - passed_time
                print "Spending %s seconds monitoring." % delay
                # Keep monitoring while waiting instead of just sleeping.
                if not print_only():
                    el.go(timeout=delay)
                else:
                    print "Actually simulating a sleep of %s" % delay
            # Call restart function.
            server.property('restartfn')()
            # And mark this guy restarted
            restart_requests.MarkRestarted(str(server))
    # Start the actual babysitter.
    restarted_srv_list = []
    argv_checker = babysitter_argv_checker.BabysitterArgvChecker(config)
    logging.info("%s %s: Restarting servers whose argv has changed. " %
                 (time.ctime(), time.tzname[0]))
    hostports = argv_checker.RestartIfArgvChanged()
    # A hostport missing from the map raises KeyError here and below.
    for hostport in hostports:
        restarted_srv_list.append(hostport_srvinfo_map[hostport])
    if not print_only():
        restarts = el.go()
        restarts = restarts.keys()
        for hostport in restarts:
            restarted_srv_list.append(hostport_srvinfo_map[hostport])
    return restarted_srv_list
def Replace(self, srv_mgr, names):
    """Move every server matching one of the specs onto a new host.

    Args:
      srv_mgr: server manager providing ServersForSpec().
      names: specs selecting the servers to replace. A spec that matches
        nothing only produces a warning.

    Returns:
      (replaced, succeeded, failed, remaining_free): succeeded and failed
      hold the string form of each server before the attempt, replaced the
      string form after a successful move, and remaining_free is the result
      of self._RemainingFree(). Servers with the 'auto_assigned' property
      are skipped.
    """
    prodlib.log('Beginning Replacements:\n')
    replaced = []
    succeeded = []
    failed = []

    # Collect the hosts being vacated first so that none of them is picked
    # as a replacement candidate.
    vacated_hosts = []
    for spec in names:
        vacated_hosts.extend(
            [srv.host() for srv in srv_mgr.ServersForSpec(spec)])

    for spec in names:
        # Work on a copy since the internal array is modified when servers
        # get replaced.
        matches = srv_mgr.ServersForSpec(spec)[:]
        if not matches:
            prodlib.log('WARNING: no servers matched: %s' % spec)
        for srv in matches:
            if srv.property('auto_assigned'):
                continue
            before = '%s' % srv
            if self._AllocateHost(srv_mgr, srv, exclude=vacated_hosts):
                succeeded.append(before)
                replaced.append('%s' % srv)
            else:
                failed.append(before)

    prodlib.log('\nreplace="%s"' % ' '.join(replaced))
    prodlib.log('success="%s"' % ' '.join(succeeded))
    prodlib.log('fail="%s"' % ' '.join(failed))
    if self._free:
        prodlib.log('free="%s"' % ' '.join(self._RemainingFree()))
    return (replaced, succeeded, failed, self._RemainingFree())
def Repair(self, srv_mgr, tries=10):
    """Try to fix constraint violations by re-hosting offending servers.

    Args:
      srv_mgr: server manager providing constraint_mgr().
      tries: maximum number of repair rounds.

    Returns:
      (replaced, succeeded, failed, remaining_free): succeeded and failed
      hold the string form of each server before the attempt, replaced the
      string form after a successful move, and remaining_free is the result
      of self._RemainingFree(). Violations still present afterwards are
      logged.
    """
    constraints = srv_mgr.constraint_mgr()
    prodlib.log('Beginning Repair:\n')
    replaced = []
    succeeded = []
    failed_names = {}
    attempted = {}

    # Try to repair a fixed number of rounds.
    # We just repair one server from each problem each round.
    # This should fix most problems but for distribution/sharing
    # violations, multiple fixes for the problem may be needed.
    for _ in range(tries):
        # Find constraint violations.
        violations = constraints.VerifyServers(srv_mgr)
        if not violations:
            prodlib.log('\nNo more problems to repair.')
            break

        # Pick the first server of every violation, unless it was already
        # picked in this or an earlier round.
        fresh = []
        for violation in violations:
            culprit = violation.servers()[0]
            label = '%s' % culprit
            if label not in attempted:
                fresh.append(culprit)
                attempted[label] = 1

        # We tried all these before.
        if not fresh:
            prodlib.log('\nUnable to make further progress.')
            break

        # Fix problems by replacing the server.
        for culprit in fresh:
            if culprit.property('auto_assigned'):
                continue
            before = '%s' % culprit
            if self._AllocateHost(srv_mgr, culprit):
                succeeded.append(before)
                replaced.append('%s' % culprit)
            else:
                failed_names[before] = 1

    failed = failed_names.keys()
    prodlib.log('\nreplace="%s"' % ' '.join(replaced))
    prodlib.log('success="%s"' % ' '.join(succeeded))
    prodlib.log('fail="%s"' % ' '.join(failed))
    if self._free:
        prodlib.log('free="%s"' % ' '.join(self._RemainingFree()))

    # Report whatever is still broken after the last round.
    leftovers = constraints.VerifyServers(srv_mgr)
    if not leftovers:
        prodlib.log('Succesfully repaired')
    else:
        prodlib.log('Unable to fully repair: Still more errors in config.\n')
        for violation in leftovers:
            prodlib.log(' %s' % violation)
    return (replaced, succeeded, failed, self._RemainingFree())
def DoBabysit(servers, config, maxiters=0, monitor_port_increment=0,
              extra_restarts=None, restart_requests=None, nolooprestarts=0,
              nortsignals=0, succinterval=None, failinterval=None):
    """Register servers with the monitor event loop and run one babysit pass.

    Args:
      servers: server objects to babysit.  Servers with a port >= 65536
        or with the 'skip_babysitting' property set are ignored.
      config: passed to babysitter_argv_checker.BabysitterArgvChecker.
      maxiters: passed to monitor_event_loop.
      monitor_port_increment: added to the monitor port base to form the
        --status_port value.
      extra_restarts: servers whose 'restartfn' is called once their
        'safe_start_time' has elapsed.  Only honoured when
        restart_requests is also given.
      restart_requests: object whose MarkRestarted() is called with the
        string form of every extra-restarted server.
      nolooprestarts: if true, no server is registered and the monitor
        command is replaced by a sleep of the largest 'safe_start_time'
        among extra_restarts.  With no extra_restarts the process exits
        through sys.exit().
      nortsignals: if true, append --nortsignals to the monitor command.
      succinterval: if set, appended as --succinterval=<value>.
      failinterval: if set, appended as --failinterval=<value>.

    Returns:
      List of server objects that were restarted, either because their
      argv changed or because the event loop reported them.
    """
    # Build the command line handed to the event loop.
    if not nolooprestarts:
        monitor_command = "%s/google/bin/monitor --status_port=%s" % (
            sitecustomize.GOOGLEBASE,
            servertype.GetPortBase('monitor') + monitor_port_increment)
    else:
        if not extra_restarts:
            prodlib.log('Babysitter loop has nothing to do.. exiting')
            sys.exit()
        sleep_time = max([srv.property('safe_start_time')
                          for srv in extra_restarts])
        monitor_command = "sleep %d # " % sleep_time

    if nortsignals:
        monitor_command += " --nortsignals"
    if succinterval:
        monitor_command += " --succinterval=%s" % succinterval
    if failinterval:
        monitor_command += " --failinterval=%s" % failinterval

    el = monitor_event_loop(monitor_command, maxiters)

    # The event loop reports results keyed by (host, port) while callers
    # want server objects, so remember which server each pair stands for.
    hostport_srvinfo_map = {}
    for server in servers:
        # Cannot babysit virtual servers (ports >= 65536).
        if server.port() >= 65536:
            continue
        if server.property('skip_babysitting'):
            print("WARNING: babysitting disabled for %s:%s" %
                  (server.host(), server.port()))
            continue
        if nolooprestarts:
            continue

        datadir = server.datadir()
        if datadir is None:
            datadir = ''
        # Some queries require dataversion; right now dataversion is the
        # datadir for all servers.
        query = server.property('request_info') % {'dataversion': datadir}
        target = (server.host(),
                  servertype.GetServingPort(server.balport()),
                  server.port(),
                  server.property('hostip'),
                  server.property('restartfn'))
        el.register(target, query,
                    server.property('response_len'),
                    server.property('test_timeouts'))
        hostport_srvinfo_map[(server.host(), server.port())] = server

    if extra_restarts and restart_requests:
        began = time.time()
        for server in extra_restarts:
            # Make sure enough seconds have passed since we started.
            elapsed = time.time() - began
            if server.property('safe_start_time') > elapsed:
                delay = server.property('safe_start_time') - elapsed
                print("Spending %s seconds monitoring." % delay)
                if print_only():
                    print("Actually simulating a sleep of %s" % delay)
                else:
                    el.go(timeout=delay)
            # Restart the server and record that it has been done.
            server.property('restartfn')()
            restart_requests.MarkRestarted(str(server))

    # Start the actual babysitter.
    argv_checker = babysitter_argv_checker.BabysitterArgvChecker(config)
    logging.info("%s %s: Restarting servers whose argv has changed. " %
                 (time.ctime(), time.tzname[0]))
    # NOTE(review): a (host, port) pair that was never registered above
    # raises KeyError in the lookups below - confirm that cannot happen.
    restarted_srv_list = [hostport_srvinfo_map[hostport]
                          for hostport in argv_checker.RestartIfArgvChanged()]
    if not print_only():
        for hostport in el.go().keys():
            restarted_srv_list.append(hostport_srvinfo_map[hostport])
    return restarted_srv_list
def Assign(self, configs, operation, args):
    """Run an assignment operation over the configs and save the results.

    Args:
      configs: [googleconfig.Config, ...] - config objects to operate on.
      operation: 'op' - assignment operation to perform.
      args: [arg1, ...] - arguments for the operation.

    Returns:
      0 when at least one change was produced.
      1 when the configs disagree on their owner, when a crawl config is
        combined with other configs, or when no changes could be made.
    """
    # Check if we have multiple configs with crawl, and get common owner.
    # Unowned machines that have servers reserved on them are assigned
    # to owner.
    is_crawl_config = False
    for cfg in configs:
        cfg_owner = cfg.GetServerManager().property('owner')
        if self._owner is None:
            self._owner = cfg_owner
        elif self._owner != cfg_owner:
            prodlib.log("Can't deal with multiple owners in cfgs: %s vs. %s"
                        % (self._owner, cfg_owner))
            return 1
        if not is_crawl_config:
            is_crawl_config = cfg.var('CRAWLMASTER') is not None

    if is_crawl_config and len(configs) > 1:
        prodlib.log('More than one config specified per crawl change.'
                    ' Can\'t deal with this yet')
        return 1

    # Perform requested operation.
    config_changes = self.RunOperation(configs, operation, args)

    # Create final change list and save config.
    # NOTE(review): the per-config failure list returned by RunOperation
    # is not consulted anywhere - confirm this is intended.
    changes = []
    for (config, add, rem, _op_failures) in config_changes:
        srv_mgr = config.GetServerManager()
        if not add and not rem:
            prodlib.log('No changes for %s.' % config.GetConfigFileName())
            continue

        # Pair additions with removals.  This relies on Python 2 map()
        # padding the shorter list with None, which is why each side is
        # checked before it is resolved to a server object.
        for (add_srv, rem_srv) in map(lambda x, y: (x, y), add, rem):
            if add_srv:
                add_srv = _ServersFromSpecs([add_srv], srv_mgr)[0]
            if rem_srv:
                rem_srv = _ServersFromSpecs([rem_srv], srv_mgr)[0]
            changes.append((add_srv, rem_srv, _GetGFSCluster(config)))

        if self._save:
            out_file = os.path.basename(config.GetConfigFileName()) + '.out'
            config.SaveServers(out_file)
            prodlib.log('Saved to %s.' % out_file)

    if not changes:
        prodlib.log('No changes for any configs.')
        return 1

    # The previous tail filtered `changes` against a failure list that was
    # always empty, so it could only ever fall through to success.
    return 0
def Replace(self, srv_mgr, names):
    """Try to move every server matching the given specs to a new host.

    Args:
      srv_mgr: server manager; queried with ServersForSpec() and passed
        on to self._AllocateHost().
      names: server specs whose servers should be replaced.

    Returns:
      (replaced, succeeded, failed, free) where succeeded/failed hold the
      string form of each server taken before the allocation attempt,
      replaced holds the string form taken after a successful allocation,
      and free is the result of self._RemainingFree().
    """
    prodlib.log('Beginning Replacements:\n')

    # Hosts that are being vacated must not be handed out as candidates,
    # so gather them all before any allocation happens.
    replace_hosts = [server.host()
                     for name in names
                     for server in srv_mgr.ServersForSpec(name)]

    replaced, succeeded, failed = [], [], []
    for name in names:
        # Work on a snapshot: the manager's own array is modified while
        # servers get replaced.
        matches = srv_mgr.ServersForSpec(name)[:]
        if not matches:
            prodlib.log('WARNING: no servers matched: %s' % name)
        for server in matches:
            if server.property('auto_assigned'):
                continue
            old_name = '%s' % server
            if self._AllocateHost(srv_mgr, server, exclude=replace_hosts):
                succeeded.append(old_name)
                replaced.append('%s' % server)
            else:
                failed.append(old_name)

    prodlib.log('\nreplace="%s"' % ' '.join(replaced))
    prodlib.log('success="%s"' % ' '.join(succeeded))
    prodlib.log('fail="%s"' % ' '.join(failed))
    if self._free:
        prodlib.log('free="%s"' % ' '.join(self._RemainingFree()))
    return (replaced, succeeded, failed, self._RemainingFree())
if flag == '--mach': # list of machines specified. Compute the corresponding regexp # that will match them all (and only them): (mach1)|(mach2)|... machines = string.split(value, ',') if len(machines) == 1: machines = string.split( value) # maybe it's space-separated? regexpstr = '^((%s))$' % string.join(machines, ')|(') else: regexpstr = value # TODO: allow multiple regexps if not machine_re: machine_re = re.compile(regexpstr) else: prodlib.log("Only one of --re= or --mach= is allowed") usage() elif flag == '--delay': delay = float(value) elif flag == '--ports': restrictports = prodlib.CollectTypes(value, {}) elif flag == '--noexec' or flag == '-n': print_only(1) elif flag == '--corptest': # For testing, don't do any DNS lookups to speed up processing. # Also, sprinkled through the code are calls to machdistance.ParseMachine # which fail for corp machines - hack local hostname into something # that passes it. socket.gethostbyname = lambda x: x socket.gethostname = lambda: 'exyz1' corptest = 1
def Repair(self, srv_mgr, tries=10):
    """Replace servers that violate constraints, for a bounded number of rounds.

    Each round re-verifies the configuration and attempts to reallocate
    the first server of every reported violation that has not been
    attempted in an earlier round.

    Args:
      srv_mgr: server manager; supplies the constraint manager and is
        passed on to self._AllocateHost().
      tries: maximum number of verify/replace rounds.

    Returns:
      (replaced, succeeded, failed, free) where succeeded/failed hold the
      string form of each server taken before the allocation attempt,
      replaced holds the string form taken after a successful allocation,
      and free is the result of self._RemainingFree().
    """
    cnstr_mgr = srv_mgr.constraint_mgr()
    prodlib.log('Beginning Repair:\n')

    replaced = []
    succeeded = []
    failed_names = {}
    attempted = {}

    # Only one server per violation is handled in a round.  Distribution
    # or sharing violations may need several fixes, hence the rounds.
    for _ in range(tries):
        violations = cnstr_mgr.VerifyServers(srv_mgr)
        if not violations:
            prodlib.log('\nNo more problems to repair.')
            break

        # Collect the servers that have not been attempted yet.
        replacements = []
        for violation in violations:
            server = violation.servers()[0]
            key = '%s' % server
            if key not in attempted:
                replacements.append(server)
                attempted[key] = 1

        # Everything reported has been tried in an earlier round.
        if not replacements:
            prodlib.log('\nUnable to make further progress.')
            break

        for server in replacements:
            if server.property('auto_assigned'):
                continue
            old_name = '%s' % server
            if self._AllocateHost(srv_mgr, server):
                succeeded.append(old_name)
                replaced.append('%s' % server)
            else:
                failed_names[old_name] = 1

    failed = list(failed_names.keys())

    prodlib.log('\nreplace="%s"' % ' '.join(replaced))
    prodlib.log('success="%s"' % ' '.join(succeeded))
    prodlib.log('fail="%s"' % ' '.join(failed))
    if self._free:
        prodlib.log('free="%s"' % ' '.join(self._RemainingFree()))

    # Verify once more so that whatever is still broken gets reported.
    leftover = cnstr_mgr.VerifyServers(srv_mgr)
    if not leftover:
        prodlib.log('Succesfully repaired')
    else:
        prodlib.log('Unable to fully repair: Still more errors in config.\n')
        for violation in leftover:
            prodlib.log(' %s' % violation)
    return (replaced, succeeded, failed, self._RemainingFree())
def AddSets(self, srv_mgrs, srvsetnums, do_min=0):
    """Add servers to server sets until the requested shard length is reached.

    Args:
      srv_mgrs: a server manager, or a list of server managers.  The
        first one supplies the constraint manager used to fill in
        defaults.
      srvsetnums: set specs of the form 'set', 'set:N' or 'set:MIN,MAX'.
        A spec without counts takes them from the 'shardlen' constraint
        and is dropped (with a log line) when there is none.  If empty,
        the specs are derived from the first manager's sets that carry a
        shardlen constraint plus the sets named by that constraint.
      do_min: if true, grow each set up to MIN servers per port,
        otherwise up to MAX.

    Returns:
      (added, failed, tried, free): string forms of the servers that got
      a host, of those that failed while still below MIN, of those that
      failed above MIN, and the result of self._RemainingFree().

    Raises:
      Error: a balancer set ('+name') was requested but a server manager
        has no set called 'name'.
    """
    # Allow interface to take a single server manager.
    if not isinstance(srv_mgrs, list):
        srv_mgrs = [srv_mgrs]

    first_mgr = srv_mgrs[0]
    cnstr_mgr = first_mgr.constraint_mgr()

    # Set up the sets to process from constraints if not specified.
    if not srvsetnums:
        wanted = {}
        # Existing sets that carry a shard length constraint.  If the
        # constraint is specified in defaults this picks up every set
        # created so far.
        for srv_set in first_mgr.Sets():
            if srv_set.property('auto_assigned'):
                continue
            if cnstr_mgr.Constraint('shardlen').Constraint(srv_set.name()):
                wanted[srv_set.name()] = 1
        # Sets named explicitly by a shardlen constraint may not exist
        # yet, so add them as well.
        for set_name in cnstr_mgr.Constraint('shardlen').server_sets():
            wanted[set_name] = 1
        srvsetnums = list(wanted.keys())
        srvsetnums.sort()

    # Normalise every spec to 'set:counts', filling in the counts from
    # the shardlen constraint where the caller gave none.
    specs = []
    for spec in srvsetnums:
        if ':' not in spec:
            vals = cnstr_mgr.Constraint('shardlen').Constraint(spec)
            # If no shardlen constrs were specified for this type, ignore.
            if not vals:
                prodlib.log('Cannot add sets for %s - no shardlen constraint'
                            % spec)
                continue
            spec = '%s:%s,%s' % (spec, vals[0], vals[1])
        specs.append(spec)

    prodlib.log('Beginning AddSet:\n')
    added = []
    failed = []
    tried = []
    for spec in specs:
        (srvset, counts) = spec.split(':')
        counts = counts.split(',')
        min_cnt = int(counts[0])
        if len(counts) == 2:
            max_cnt = int(counts[1])
        else:
            max_cnt = min_cnt
        if do_min:
            cnt = min_cnt
        else:
            cnt = max_cnt

        # Balancer sets are created with the same port range as the set
        # they balance.
        # NOTE(review): AddSet() is called unconditionally here - confirm
        # it tolerates a set that already exists in the manager.
        if srvset.startswith('+'):
            for srv_mgr in srv_mgrs:
                balset = srv_mgr.Set(srvset[1:])
                if not balset:
                    raise Error('Cannot add set %s: no balanced set' % srvset)
                new_set = srv_mgr.AddSet(srvset, balset.level())
                for port in balset.Ports():
                    new_set.AddPort(port)

        # Build union of all ports for this type.
        port_seen = {}
        for srv_mgr in srv_mgrs:
            srv_set = srv_mgr.Set(srvset)
            if srv_set is not None:
                for port in srv_set.Ports():
                    port_seen[port] = 1
        ports = list(port_seen.keys())
        ports.sort()

        # Iterate up the slices so that we fill short shards first.
        for i in range(cnt + 1):
            # Stays set when no port could be served in this slice.
            every_port_failed = 1
            for port in ports:
                for srv_mgr in srv_mgrs:
                    srv_set = srv_mgr.Set(srvset)
                    # A manager may not define this set at all; the port
                    # union above already allows for that, so skip it
                    # here instead of failing on None.
                    if srv_set is None:
                        continue
                    if srv_set.property('auto_assigned'):
                        continue
                    if port not in srv_set.Ports():
                        continue

                    # If we have enough servers just skip.
                    have = len(srv_set.ServersForPort(port))
                    if have >= i:
                        every_port_failed = 0
                        continue

                    # Create a placeholder server object.
                    server = serverlib.Server()
                    server.InitFromName('%s%s:%s' % (srvset, i, port))
                    srv_mgr.AddServer(server)

                    # Below the minimum the allocation is forced.
                    force = 0
                    if have < min_cnt:
                        force = 1
                    if self._AllocateHost(srv_mgr, server, force):
                        every_port_failed = 0
                        added.append('%s' % server)
                    else:
                        # Failed, so take the placeholder out again.
                        srv_mgr.RemoveServer(server)
                        if force:
                            failed.append('%s' % server)
                        else:
                            tried.append('%s' % server)

            # Stop growing this set once a whole slice yielded nothing.
            if every_port_failed:
                break

    prodlib.log('\nsuccess="%s"' % ' '.join(added))
    prodlib.log('failed="%s"' % ' '.join(failed))
    prodlib.log('tried="%s"' % ' '.join(tried))
    if self._free:
        prodlib.log('free="%s"' % ' '.join(self._RemainingFree()))
    return (added, failed, tried, self._RemainingFree())
if flag in ['--re', '--mach']: if flag == '--mach': # list of machines specified. Compute the corresponding regexp # that will match them all (and only them): (mach1)|(mach2)|... machines = string.split(value, ',') if len(machines) == 1: machines = string.split(value) # maybe it's space-separated? regexpstr = '^((%s))$' % string.join(machines, ')|(') else: regexpstr = value # TODO: allow multiple regexps if not machine_re: machine_re = re.compile(regexpstr) else: prodlib.log("Only one of --re= or --mach= is allowed") usage() elif flag == '--delay': delay = float(value) elif flag == '--ports': restrictports = prodlib.CollectTypes(value, {}) elif flag == '--noexec' or flag == '-n': print_only(1) elif flag == '--corptest': # For testing, don't do any DNS lookups to speed up processing. # Also, sprinkled through the code are calls to machdistance.ParseMachine # which fail for corp machines - hack local hostname into something # that passes it. socket.gethostbyname = lambda x: x socket.gethostname = lambda : 'exyz1' corptest = 1
def Assign(self, configs, operation, args):
    """Run an assignment operation over the configs and save the results.

    Args:
      configs: [googleconfig.Config, ...] - config objects to operate on.
      operation: 'op' - assignment operation to perform.
      args: [arg1, ...] - arguments for the operation.

    Returns:
      0 when at least one change was produced.
      1 when the configs disagree on their owner, when a crawl config is
        combined with other configs, or when no changes could be made.
    """
    # Check if we have multiple configs with crawl, and get common owner.
    # Unowned machines that have servers reserved on them are assigned
    # to owner.
    is_crawl_config = False
    for cfg in configs:
        cfg_owner = cfg.GetServerManager().property('owner')
        if self._owner is None:
            self._owner = cfg_owner
        elif self._owner != cfg_owner:
            prodlib.log("Can't deal with multiple owners in cfgs: %s vs. %s"
                        % (self._owner, cfg_owner))
            return 1
        if not is_crawl_config:
            is_crawl_config = cfg.var('CRAWLMASTER') is not None

    if is_crawl_config and len(configs) > 1:
        prodlib.log('More than one config specified per crawl change.'
                    ' Can\'t deal with this yet')
        return 1

    # Perform requested operation.
    config_changes = self.RunOperation(configs, operation, args)

    # Create final change list and save config.
    # NOTE(review): the per-config failure list returned by RunOperation
    # is not consulted anywhere - confirm this is intended.
    changes = []
    for (config, add, rem, _op_failures) in config_changes:
        srv_mgr = config.GetServerManager()
        if not add and not rem:
            prodlib.log('No changes for %s.' % config.GetConfigFileName())
            continue

        # Pair additions with removals.  This relies on Python 2 map()
        # padding the shorter list with None, which is why each side is
        # checked before it is resolved to a server object.
        for (add_srv, rem_srv) in map(lambda x, y: (x, y), add, rem):
            if add_srv:
                add_srv = _ServersFromSpecs([add_srv], srv_mgr)[0]
            if rem_srv:
                rem_srv = _ServersFromSpecs([rem_srv], srv_mgr)[0]
            changes.append((add_srv, rem_srv, _GetGFSCluster(config)))

        if self._save:
            out_file = os.path.basename(config.GetConfigFileName()) + '.out'
            config.SaveServers(out_file)
            prodlib.log('Saved to %s.' % out_file)

    if not changes:
        prodlib.log('No changes for any configs.')
        return 1

    # The previous tail filtered `changes` against a failure list that was
    # always empty, so it could only ever fall through to success.
    return 0