def stress(self, *args):
    numInstances = 2
    if len(args) > 0:
        numInstances = int(args[0])
        if numInstances <= 0:
            utils.log("[%s] invalid number of instances to run stress tests on" % self)
            return

    test_instances = self.test_instances

    if len(test_instances) != numInstances:
        if len(test_instances) > 0:
            utils.log("[%s] removing %d stale test instances before create can occur" % (self, len(test_instances)))
            ids = set()

            # remove stale test instances
            for instance in test_instances:
                ids.add(instance.instance_id)
                instance.terminate()

            self.instances = filter(lambda instance: instance.instance_id not in ids, self.instances)

        utils.log("[%s] creating %d test instances" % (self, numInstances))

        # create new test instances
        test_instances = []
        for i in xrange(numInstances):
            config = {
                "name": "test%d" % i,
                "roles": ["test"],
                "instance_type": "m1.small",
            }

            instance = AWSInstance(self, config)
            test_instances.append(instance)
            self._pool.spawn(instance.create)

        self._pool.join()
        self.instances.extend(test_instances)

    utils.log("[%s] done creating %d test instances; initiating tests..." % (self, numInstances))

    env.user = "******"
    env.key_filename = ["keys/test-keypair"]

    # TODO: test just this portion
    for instance in test_instances:
        test_cmd = "/stamped/stamped/platform/tests/stampede/StressTests.py"
        log = "/stamped/logs/test.log"
        cmd = "sudo nohup bash -c '. /stamped/bin/activate && python %s >& %s < /dev/null' &" % (test_cmd, log)

        # retry kicking off the stress tests a few times in case of
        # transient ssh failures
        num_retries = 5
        while num_retries > 0:
            ret = utils.runbg(instance.public_dns_name, env.user, cmd)
            if 0 == ret:
                break
            num_retries -= 1
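
# NOTE: the spawn()/join() pattern used in stress() above assumes self._pool
# is a gevent-style greenlet pool. a minimal, self-contained sketch of the
# same fan-out (the pool size and worker function here are illustrative
# assumptions, not taken from this codebase):
#
#     from gevent.pool import Pool
#
#     def create_instance(i):
#         print "creating test instance %d" % i
#
#     pool = Pool(8)
#     for i in xrange(4):
#         pool.spawn(create_instance, i)  # schedule each create concurrently
#     pool.join()                         # block until all greenlets finish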
def clear_cache(self, *args):
    force = len(args) >= 1 and args[0] == "force"
    cmd = "sudo /bin/bash -c 'restart memcached'"
    pp = []

    # restart memcached across all memcached servers
    for instance in self.mem_server_instances:
        pp.append((instance, utils.runbg(instance.public_dns_name, env.user, cmd)))

    for instance, p in pp:
        ret = p.wait()
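
# NOTE: clear_cache() (and the parallel path of update() below) relies on
# utils.runbg returning a process-like handle whose wait() yields the remote
# command's exit status. the real helper lives in this project's utils
# module; the following is only a sketch of the assumed contract, built on
# subprocess + ssh:
#
#     import subprocess
#
#     def runbg_sketch(host, user, cmd):
#         # start `cmd` on user@host without blocking; the caller collects
#         # the exit status later via the returned handle's wait()
#         return subprocess.Popen(["ssh", "%s@%s" % (user, host), cmd])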
def update(self, *args, **kwargs):
    force = len(args) >= 1 and args[0] == "force"
    utils.log("[%s] updating %d instances" % (self, len(self.instances)))

    branch = kwargs.get("branch", None)
    cmd = "sudo /bin/bash -c '. /stamped/bin/activate && python /stamped/bootstrap/bin/update.py%s%s'" % (
        " --force" if force else "",
        " --branch %s" % branch if branch is not None else "",
    )
    # cmd = "sudo /bin/bash -c '. /stamped/bin/activate && python /stamped/bootstrap/bin/update.py%s%s && cd /stamped/stamped/platform/servers/web2 && bin/restart.sh'" % \
    #     (" --force" if force else "", " --branch %s" % branch if branch is not None else "")

    pp = []
    separator = "-" * 80

    if force:
        # update all instances in parallel
        for instance in self.instances:
            pp.append((instance, utils.runbg(instance.public_dns_name, env.user, cmd)))

        for instance, p in pp:
            ret = p.wait()
    else:
        # update all instances synchronously, removing them one-at-a-time from
        # their respective ELBs and re-adding them once we're sure that the
        # update was applied successfully and the resulting instance is healthy
        for instance in self.instances:
            utils.log()
            utils.log(separator)
            utils.log("[%s] UPDATING %s" % (self, instance))

            # TODO: this logic doesn't account for the case where an instance
            # may belong to multiple ELBs. NOTE that this scenario will never
            # arise in our current stack architecture, but I'm leaving this
            # note in here just in case that assumption changes in the future.
            elb = self._get_elb(instance)

            # only deregister the instance if it belongs to a non-trivial ELB
            deregister = elb is not None  # and len(elb.instances) > 1

            if deregister:
                utils.log("[%s] temporarily deregistering %s from %s" % (self, instance, elb))
                instances = elb.deregister_instances([instance.instance_id])

                # TODO: this sleep shouldn't be necessary since the instance
                # is definitely removed from the ELB at this point, but without
                # pausing, the ELB seems to skip performing a new health check
                # before successfully re-registering the instance. pausing here
                # effectively ensures that the state of the instance will be
                # set to OutOfService s.t. the health check must be passed
                # before the instance is considered InService after instance
                # re-registration.
                #
                # NOTE: an additional advantage of pausing here is that the
                # instance update script may restart certain daemons, and a
                # small pause after removing the instance from its ELB should
                # give the instance's daemons a chance to finish handling any
                # in-progress requests (e.g., gunicorn / nginx).
                time.sleep(10)

            # apply the update synchronously
            with settings(host_string=instance.public_dns_name):
                try:
                    result = run(cmd, pty=False, shell=True)
                    status = result.return_code
                except Exception:
                    # if run fails, ask the user whether or not to continue
                    # instead of aborting outright
                    status = 1

            if 0 != status:
                utils.log("[%s] warning: failure updating %s" % (self, instance))
                confirmation = utils.get_input()

                if deregister and (confirmation == "n" or confirmation == "a"):
                    utils.log("[%s] warning: not re-registering %s with %s" % (self, instance, elb))

                if confirmation == "n":
                    continue
                elif confirmation == "a":
                    return

            if deregister:
                utils.log("[%s] %s re-registering with %s" % (self, instance, elb))
                elb.register_instances([instance.instance_id])
                utils.log("[%s] %s is waiting to come back online..." % (self, instance))

                # TODO: infer max timeout from health check settings
                timeout = 600
                delay = 2

                # wait for the instance to come back online with the ELB
                while True:
                    try:
                        health = elb.get_instance_health([instance.instance_id])[0]

                        if health.state == "InService":
                            utils.log("[%s] %s is back online with elb %s..." % (self, instance, elb))
                            break
                    except Exception, e:
                        health = utils.AttributeDict(dict(state="error retrieving health", description=str(e)))

                    utils.log("[%s] %s is '%s' (%s)" % (self, instance, health.state, health.description))

                    # instance is not in service yet; sleep for a bit before retrying
                    timeout -= delay
                    if timeout <= 0:
                        utils.log("[%s] %s timed out with elb %s (state=%s, desc=%s)..." %
                                  (self, instance, elb, health.state, health.description))

                        confirmation = utils.get_input()
                        if confirmation == "n" or confirmation == "a":
                            return
                        else:
                            break

                    time.sleep(delay)

            utils.log("[%s] successfully updated %s" % (self, instance))
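
# NOTE: the rolling-update flow in update() above reduces to the following
# boto ELB deregister / re-register pattern (condensed sketch for reference;
# `lb` is a boto.ec2.elb LoadBalancer and `instance_id` a plain instance id
# string -- both assumptions here):
#
#     import time
#
#     lb.deregister_instances([instance_id])  # take the instance out of rotation
#     # ... apply the update while no traffic is routed to the instance ...
#     lb.register_instances([instance_id])    # put it back into rotation
#
#     # poll until the ELB reports the instance healthy again
#     while lb.get_instance_health([instance_id])[0].state != "InService":
#         time.sleep(2)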