def resume_vm(self, node):
    ec2_instance_id = node.deploy_data.ec2.instance_id
    log.info(" |- Resuming instance %s for %s." % (ec2_instance_id, node.id))
    started = self.conn.start_instances([ec2_instance_id])
    log.info(" |- Resumed instance %s." % ",".join([i.id for i in started]))
    return EC2VM(started[0])
def wait(self):
    if self.state in (Node.STATE_RUNNING_UNCONFIGURED, Node.STATE_RESUMED_UNCONFIGURED):
        self.deployer.wait_state(self.ec2_instance, "running")
        log.info("Instance %s is running. Hostname: %s" % (self.ec2_instance.id, self.ec2_instance.public_dns_name))
    elif self.state == Node.STATE_STOPPED:
        self.deployer.wait_state(self.ec2_instance, "stopped")
    elif self.state == Node.STATE_TERMINATED:
        self.deployer.wait_state(self.ec2_instance, "terminated")
def run2(self):
    topology = self.deployer.instance.topology
    self.node.state = Node.STATE_CONFIGURING
    topology.save()
    self.node.state = Node.STATE_RUNNING
    topology.save()
    log.info("Dummy configure done")
def configure_stop(self, ssh):
    node = self.node
    log.info("Configuring node for shutdown", node)

    # Restore the original hosts file and hostname
    ssh.run("sudo cp /etc/hosts.gp-bak /etc/hosts", expectnooutput=True)
    ssh.run("sudo cp /etc/hostname.gp-bak /etc/hostname", expectnooutput=True)
    ssh.run("sudo /etc/init.d/hostname.sh || sudo /etc/init.d/hostname restart", expectnooutput=True)

    # Reset autofs/NIS/NFS client configuration and disable the nis init script
    ssh.run("sudo bash -c \"echo +auto.master > /etc/auto.master\"", exception_on_error = False)
    ssh.run("sudo bash -c \"echo > /etc/yp.conf\"", exception_on_error = False)
    ssh.run("sudo bash -c \"echo > /etc/default/nfs-common\"", exception_on_error = False)
    ssh.run("sudo update-rc.d -f nis remove", exception_on_error = False)

    log.info("Configuration done.", node)
def __allocate_vms(self, deployer, nodes, resuming):
    # TODO: Make this an option
    sequential = False

    topology = deployer.instance.topology

    if not resuming:
        log.info("Allocating %i VMs." % len(nodes))
        next_state = Node.STATE_RUNNING_UNCONFIGURED
    else:
        log.info("Resuming %i VMs" % len(nodes))
        next_state = Node.STATE_RESUMED_UNCONFIGURED

    node_vm = {}
    for n in nodes:
        try:
            if not resuming:
                n.set_property("state", Node.STATE_STARTING)
                topology.save()
                vm = deployer.allocate_vm(n)
            else:
                n.set_property("state", Node.STATE_RESUMING)
                topology.save()
                vm = deployer.resume_vm(n)
            node_vm[n] = vm
        except Exception:
            message = self.__unexpected_exception_to_text()
            return (False, message, None)

        if sequential:
            log.debug("Waiting for instance to start.")
            wait = deployer.NodeWaitThread(None, "wait-%s" % str(vm), n, vm, deployer, state = next_state)
            wait.run2()

    if not sequential:
        log.debug("Waiting for instances to start.")
        mt_instancewait = MultiThread()
        for node, vm in node_vm.items():
            mt_instancewait.add_thread(deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = next_state))

        mt_instancewait.run()
        if not mt_instancewait.all_success():
            message = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
            return (False, message, None)

    return (True, "Success", node_vm)
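# __allocate_vms returns a (success, message, node_vm) tuple: on success, node_vm maps
# each topology Node to the VM object returned by allocate_vm()/resume_vm(); on failure,
# node_vm is None and message describes the exception that was raised.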
def instance_stop(self, inst_id):
    (success, message, inst) = self.__get_instance(inst_id)
    if not success:
        return (API.STATUS_FAIL, message)

    log.set_logging_instance(inst)

    try:
        if inst.topology.state != Topology.STATE_RUNNING:
            message = "Cannot stop an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class()
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_STOPPING
        inst.topology.save()

        nodes = inst.topology.get_nodes()
        (success, message) = self.__stop_vms(deployer, nodes)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_STOPPED
        inst.topology.save()

        log.info("Stopping Globus Online endpoints")
        try:
            eps = inst.topology.get_go_endpoints()
            self.__globusonline_stop(inst, eps)
            inst.topology.save()
        except GlobusOnlineException, goe:
            log.warning("Unable to stop GO endpoint/s: %s" % goe)
def instance_terminate(self, inst_id):
    (success, message, inst) = self.__get_instance(inst_id)
    if not success:
        return (API.STATUS_FAIL, message)

    log.set_logging_instance(inst)

    try:
        if inst.topology.state in [Topology.STATE_NEW]:
            message = "Cannot terminate an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class()
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_TERMINATING
        inst.topology.save()

        nodes = inst.topology.get_nodes()
        (success, message) = self.__terminate_vms(deployer, nodes)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        # Remove GO endpoints
        eps = inst.topology.get_go_endpoints()
        self.__globusonline_remove(inst, eps)

        inst.topology.state = Topology.STATE_TERMINATED
        inst.topology.save()

        log.info("Instances have been terminated.")
        return (API.STATUS_SUCCESS, "Success")
def pre_configure(self, ssh):
    node = self.node
    instance = self.ec2_instance

    log.info("Setting up instance %s. Hostname: %s" % (instance.id, instance.public_dns_name), node)

    try:
        ssh.run("ls -l /chef")
    except SSHCommandFailureException:
        # The image is not properly set up, so do all the pre-configuration for Globus Provision
        log.info("Image is not configured with Chef, so installing...")
        ssh.run("sudo chown -R %s /chef" % self.config.get("ec2-username"))
        ssh.scp_dir("%s" % self.chef_dir, "/chef")
        ssh.run("addgroup admin", exception_on_error = False)
        ssh.run("echo \"%s `hostname`\" | sudo tee -a /etc/hosts" % instance.private_ip_address)
        ssh.run("sudo apt-get install lsb-release wget")
        ssh.run("echo \"deb http://apt.opscode.com/ `lsb_release -cs` main\" | sudo tee /etc/apt/sources.list.d/opscode.list")
        ssh.run("wget -qO - http://apt.opscode.com/[email protected] | sudo apt-key add -")
        ssh.run("sudo apt-get update")
        ssh.run("echo 'chef chef/chef_server_url string http://127.0.0.1:4000' | sudo debconf-set-selections")
        ssh.run("sudo apt-get -q=2 install chef")
        ssh.run("echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf")
        ssh.run("echo '{ \"run_list\": \"recipe[provision::ec2]\", \"scratch_dir\": \"%s\" }' > /tmp/chef.json" % self.scratch_dir)
        ssh.run("sudo chef-solo -c /tmp/chef.conf -j /tmp/chef.json")
        ssh.run("sudo update-rc.d -f nis remove")
        ssh.run("sudo update-rc.d -f condor remove")
        ssh.run("sudo update-rc.d -f chef-client remove")

        log.debug("Removing private data...")
        ssh.run("sudo find /root/.*history /home/*/.*history -exec rm -f {} \;", exception_on_error = False)
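# For reference, once the Python and shell escaping in the echo command above are
# resolved, /tmp/chef.conf ends up containing these two lines:
#
#   cookbook_path "/chef/cookbooks"
#   role_path "/chef/roles"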
def instance_stop(self, inst_id):
    (success, message, inst) = self.__get_instance(inst_id)
    if not success:
        return (API.STATUS_FAIL, message)

    log.set_logging_instance(inst_id)

    try:
        if inst.topology.state != Topology.STATE_RUNNING:
            message = "Cannot stop an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class()
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_STOPPING
        inst.topology.save()

        nodes = inst.topology.get_nodes()
        (success, message) = self.__stop_vms(deployer, nodes)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_STOPPED
        inst.topology.save()

        log.info("Instances have been stopped.")
        return (API.STATUS_SUCCESS, "Success")
def allocate_vm(self, node):
    log.info("Allocated dummy VM.")
    return DummyVM()
def stop_vms(self, nodes):
    ec2_instance_ids = [n.deploy_data.ec2.instance_id for n in nodes]
    log.info("Stopping EC2 instances %s." % ", ".join(ec2_instance_ids))
    stopped = self.conn.stop_instances(ec2_instance_ids)
    log.info("Stopped EC2 instances %s." % ", ".join([i.id for i in stopped]))
def terminate_vms(self, nodes):
    ec2_instance_ids = [n.deploy_data.ec2.instance_id for n in nodes]
    log.info("Terminating EC2 instances %s." % ", ".join(ec2_instance_ids))
    terminated = self.conn.terminate_instances(ec2_instance_ids)
    log.info("Terminated EC2 instances %s." % ", ".join([i.id for i in terminated]))
def instance_update(self, inst_id, topology_json, extra_files, run_cmds):
    try:
        (success, message, inst) = self.__get_instance(inst_id)
        if not success:
            return (API.STATUS_FAIL, message)

        log.set_logging_instance(inst)

        if inst.topology.state == Topology.STATE_NEW:
            # If the topology is still in a New state, we simply
            # validate that the update is valid, and replace
            # the old topology. We don't need to deploy or
            # configure any hosts.
            if topology_json != None:
                (success, message, topology_changes) = inst.update_topology(topology_json)
                if not success:
                    message = "Error in topology file: %s" % message
                    return (API.STATUS_FAIL, message)
            return (API.STATUS_SUCCESS, "Success")
        elif inst.topology.state not in (Topology.STATE_RUNNING, Topology.STATE_FAILED):
            message = "Cannot update the topology of an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class(extra_files, run_cmds)
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        if topology_json != None:
            old_topology = inst.topology
            try:
                (success, message, topology_changes) = inst.update_topology(topology_json)
                if not success:
                    return (API.STATUS_FAIL, message)
            except ObjectValidationException, ove:
                message = "Error in topology file: %s" % ove
                return (API.STATUS_FAIL, message)

            create_hosts = []
            destroy_hosts = []
            create_endpoints = []
            remove_endpoints = []

            if topology_changes.changes.has_key("domains"):
                for domain in topology_changes.changes["domains"].add:
                    d = inst.topology.domains[domain]
                    create_hosts += [n.id for n in d.nodes.values()]

                for domain in topology_changes.changes["domains"].remove:
                    # Removed domains are no longer in the new topology, so look them
                    # up in the old one.
                    d = old_topology.domains[domain]
                    destroy_hosts += [n.id for n in d.nodes.values()]

                for domain in topology_changes.changes["domains"].edit:
                    if topology_changes.changes["domains"].edit[domain].changes.has_key("nodes"):
                        nodes_changes = topology_changes.changes["domains"].edit[domain].changes["nodes"]
                        create_hosts += nodes_changes.add
                        destroy_hosts += nodes_changes.remove

                    if topology_changes.changes["domains"].edit[domain].changes.has_key("go_endpoints"):
                        ep_changes = topology_changes.changes["domains"].edit[domain].changes["go_endpoints"]
                        if ep_changes.change_type == PropertyChange.ADD:
                            create_endpoints += inst.topology.domains[domain].go_endpoints
                        elif ep_changes.change_type == PropertyChange.REMOVE:
                            remove_endpoints += old_topology.domains[domain].go_endpoints
                        elif ep_changes.change_type == PropertyChange.EDIT:
                            create_endpoints += ep_changes.add
                            remove_endpoints += ep_changes.remove

            nodes = inst.topology.get_nodes()

            if len(destroy_hosts) > 0:
                old_nodes = old_topology.get_nodes()
                log.info("Terminating hosts %s" % destroy_hosts)
                old_nodes = [n for n in old_nodes if n.id in destroy_hosts]
                (success, message) = self.__terminate_vms(deployer, old_nodes)
                if not success:
                    inst.topology.state = Topology.STATE_FAILED
                    inst.topology.save()
                    return (API.STATUS_FAIL, message)
                inst.topology.save()

            if len(create_endpoints) > 0:
                try:
                    self.__globusonline_pre_start(inst, create_endpoints)
                except GlobusOnlineException, goe:
                    log.warning("Unable to create GO endpoint/s: %s" % goe)
        # (continuation of the loop that builds the ephemeral block device map;
        #  the loop header is not included in this excerpt)
        map[device_name] = device
        user_data_mounts += """- [ ephemeral%i, /ephemeral/%i, auto, "defaults,noexec" ]\n""" % (i, i)

    # The following will only work with Ubuntu AMIs (including the AMI we provide).
    # If using a different AMI, you may need to manually mount the ephemeral partitions.
    user_data = """#cloud-config
manage_etc_hosts: true
""" + user_data_mounts

    if instance_type in ("cc1.4xlarge", "cg1.4xlarge"):
        pg = self.__get_placement_group()
        placement_group = pg.name
    else:
        placement_group = None

    log.info(" |- Launching a %s instance for %s." % (instance_type, node.id))
    reservation = image.run(min_count=1,
                            max_count=1,
                            instance_type=instance_type,
                            security_groups=security_groups,
                            key_name=self.instance.config.get("ec2-keypair"),
                            user_data=user_data,
                            block_device_map=map,
                            placement_group = placement_group,
                            placement = None)
    instance = reservation.instances[0]

    return EC2VM(instance)
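# Illustrative sketch (hypothetical values, not part of the deployer): for an instance
# type with two ephemeral devices, the loop above appends these cloud-config mount
# entries to user_data_mounts, which cloud-init then mounts under /ephemeral/<i>:
#
#   - [ ephemeral0, /ephemeral/0, auto, "defaults,noexec" ]
#   - [ ephemeral1, /ephemeral/1, auto, "defaults,noexec" ]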
def instance_start(self, inst_id, extra_files, run_cmds):
    (success, message, inst) = self.__get_instance(inst_id)
    if not success:
        return (API.STATUS_FAIL, message)

    log.set_logging_instance(inst_id)

    try:
        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class(extra_files, run_cmds)
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        if inst.topology.state == Topology.STATE_NEW:
            resuming = False
        elif inst.topology.state == Topology.STATE_STOPPED:
            resuming = True
        else:
            message = "Cannot start an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        if not resuming:
            inst.topology.state = Topology.STATE_STARTING
        else:
            inst.topology.state = Topology.STATE_RESUMING
        inst.topology.save()

        nodes = inst.topology.get_nodes()
        (success, message, node_vm) = self.__allocate_vms(deployer, nodes, resuming)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_CONFIGURING
        inst.topology.save()

        log.info("Instances are running.")

        for node, vm in node_vm.items():
            deployer.post_allocate(node, vm)
        inst.topology.save()

        # Generate certificates
        if not resuming:
            inst.gen_certificates(force_hosts=False, force_users=False)
        else:
            inst.gen_certificates(force_hosts=True, force_users=False)

        inst.topology.gen_chef_ruby_file(inst.instance_dir + "/topology.rb")
        inst.topology.gen_hosts_file(inst.instance_dir + "/hosts")

        log.info("Setting up Globus Provision on instances")

        (success, message) = self.__configure_vms(deployer, node_vm)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_RUNNING
        inst.topology.save()

        return (API.STATUS_SUCCESS, "Success")
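# Minimal usage sketch (hypothetical instance id and API object; client code is not
# part of this excerpt). instance_start returns a (status, message) tuple:
#
#   api = API(...)
#   (status, message) = api.instance_start("gpi-12345678", extra_files = [], run_cmds = [])
#   if status != API.STATUS_SUCCESS:
#       print "Could not start instance: %s" % message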
def instance_update(self, inst_id, topology_json, extra_files, run_cmds):
    try:
        (success, message, inst) = self.__get_instance(inst_id)
        if not success:
            return (API.STATUS_FAIL, message)

        log.set_logging_instance(inst_id)

        if inst.topology.state != Topology.STATE_RUNNING:
            message = "Cannot update the topology of an instance that is in state '%s'" % (Topology.state_str[inst.topology.state])
            return (API.STATUS_FAIL, message)

        deployer_class = self.__get_deployer_class(inst)
        deployer = deployer_class(extra_files, run_cmds)
        try:
            deployer.set_instance(inst)
        except DeploymentException, de:
            message = "Deployer failed to initialize. %s " % de
            return (API.STATUS_FAIL, message)

        if topology_json != None:
            old_topology = inst.topology
            try:
                (success, message, create_hosts, destroy_hosts) = inst.update_topology(topology_json)
            except ObjectValidationException, ove:
                message = "Error in topology file: %s" % ove
                return (API.STATUS_FAIL, message)

            nodes = inst.topology.get_nodes()

            if len(destroy_hosts) > 0:
                old_nodes = old_topology.get_nodes()
                log.info("Terminating hosts %s" % destroy_hosts)
                old_nodes = [n for n in old_nodes if n.id in destroy_hosts]
                (success, message) = self.__terminate_vms(deployer, old_nodes)
                if not success:
                    inst.topology.state = Topology.STATE_FAILED
                    inst.topology.save()
                    return (API.STATUS_FAIL, message)
                inst.topology.save()

            if len(create_hosts) > 0:
                nodes = inst.topology.get_nodes()
                log.info("Allocating VMs for hosts %s" % create_hosts)
                new_nodes = [n for n in nodes if n.id in create_hosts]
                (success, message, node_vm) = self.__allocate_vms(deployer, new_nodes, resuming = False)
                if not success:
                    inst.topology.state = Topology.STATE_FAILED
                    inst.topology.save()
                    return (API.STATUS_FAIL, message)

                inst.topology.save()

                for node, vm in node_vm.items():
                    deployer.post_allocate(node, vm)
                inst.topology.save()

                # Generate certificates
                inst.gen_certificates()

                inst.topology.gen_chef_ruby_file(inst.instance_dir + "/topology.rb")
                inst.topology.gen_hosts_file(inst.instance_dir + "/hosts")

        log.info("Setting up Globus Provision on instances")

        # Right now we reconfigure all nodes. It shouldn't be hard to follow
        # the dependency tree to make sure only the new nodes and "ancestor"
        # nodes are updated.
        nodes = inst.topology.get_nodes()
        node_vm = deployer.get_node_vm(nodes)
        (success, message) = self.__configure_vms(deployer, node_vm)
        if not success:
            inst.topology.state = Topology.STATE_FAILED
            inst.topology.save()
            return (API.STATUS_FAIL, message)

        inst.topology.state = Topology.STATE_RUNNING
        inst.topology.save()
def resume_vm(self, node):
    log.info("Resumed dummy VM.")
    return DummyVM()
def terminate_vms(self, nodes):
    log.info("Dummy nodes terminated.")
def wait(self):
    log.info("Waiting for state %s" % Node.state_str[self.state])
def configure(self, ssh):
    domain = self.domain
    node = self.node
    instance_dir = self.deployer.instance.instance_dir

    if self.basic:
        # Upload host file and update hostname
        log.debug("Uploading host file and updating hostname", node)

        ssh.scp("%s/hosts" % instance_dir,
                "/chef/cookbooks/provision/files/default/hosts")

        ssh.run("sudo cp /chef/cookbooks/provision/files/default/hosts /etc/hosts", expectnooutput=True)
        ssh.run("sudo bash -c \"echo %s > /etc/hostname\"" % node.hostname, expectnooutput=True)
        ssh.run("sudo /etc/init.d/hostname.sh || sudo /etc/init.d/hostname restart", expectnooutput=True)

    self.check_continue()

    if self.chef:
        # Upload topology file
        log.debug("Uploading topology file", node)
        ssh.scp("%s/topology.rb" % instance_dir,
                "/chef/cookbooks/provision/attributes/topology.rb")

        # Copy certificates
        log.debug("Copying certificates", node)
        ssh.scp_dir("%s/certs" % instance_dir,
                    "/chef/cookbooks/provision/files/default/")

        # Upload extra files
        log.debug("Copying extra files", node)
        for src, dst in self.deployer.extra_files:
            ssh.scp(src, dst)

        self.check_continue()

        # Temporarily add the admin group
        log.debug("Creating admin group")
        try:
            ssh.run("addgroup admin")
        except SSHCommandFailureException:
            log.debug("Admin group already exists, skipping...")

        # Run chef
        log.debug("Running chef", node)

        ssh.run("echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf", expectnooutput=True)
        ssh.run("echo '{ \"run_list\": [ %s ], \"scratch_dir\": \"%s\", \"domain_id\": \"%s\", \"node_id\": \"%s\" }' > /tmp/chef.json" % (",".join("\"%s\"" % r for r in node.run_list), self.config.get("scratch-dir"), domain.id, node.id), expectnooutput=True)

        # Sometimes, Chef will fail because a service didn't start or restart
        # properly (NFS-related services seem to do this occasionally).
        # In most cases, the problem just "goes away" if you try to restart the
        # service again. So, if Chef fails, we don't give up and try again
        # (since the recipes are idempotent, there's no harm in running them
        # multiple times).
        chef_tries = 3
        while chef_tries > 0:
            rc = ssh.run("sudo -i chef-solo -c /tmp/chef.conf -j /tmp/chef.json", exception_on_error=False)
            if rc != 0:
                chef_tries -= 1
                log.debug("chef-solo failed. %i attempts left" % chef_tries, node)
            else:
                break

        if chef_tries == 0:
            raise DeploymentException, "Failed to configure node %s" % node.id

    self.check_continue()

    if self.basic:
        ssh.run("sudo update-rc.d nis defaults")

    for cmd in self.deployer.run_cmds:
        ssh.run(cmd)

    log.info("Configuration done.", node)
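# Illustrative sketch (hypothetical run list, scratch dir, and ids): for a node whose
# run_list is ["role[domain-nfsnis]", "recipe[provision::gridftp]"], the echo command
# above would write a /tmp/chef.json along these lines:
#
#   { "run_list": [ "role[domain-nfsnis]","recipe[provision::gridftp]" ], "scratch_dir": "/var/tmp", "domain_id": "simple", "node_id": "simple-server" }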