Example #1
0
 def abort_dependents(self, thread):
     """Recursively abort every thread whose `depends` equals `thread`.

     Each direct dependent is logged, given status 3, counted in
     ``self.done_threads``, and then has its own dependents aborted the
     same way.
     """
     # Snapshot the direct dependents before touching any of them.
     for dependent in [candidate for candidate in self.threads.values()
                       if candidate.depends == thread]:
         log.debug("%s thread is being aborted because it depends on failed %s thread." % (dependent.name, thread.name))
         dependent.status = 3
         self.done_threads += 1
         # Walk further down the dependency chain.
         self.abort_dependents(dependent)
Example #2
0
 def thread_success(self, thread):
     """Register that `thread` finished successfully.

     While holding ``self.lock``: bumps ``self.done_threads``, logs the
     names of threads whose status is still -1, starts every thread whose
     `depends` equals `thread`, and sets ``self.all_done`` once the done
     count reaches ``self.num_threads``.
     """
     with self.lock:
         self.done_threads += 1
         log.debug("%s thread has finished successfully." % thread.name)

         remaining = [t.name for t in self.threads.values() if t.status == -1]
         log.debug("%i threads are done. Remaining: %s" % (self.done_threads, ",".join(remaining)))

         # Threads that were waiting on this one can now be started.
         unblocked = [t for t in self.threads.values() if t.depends == thread]
         for waiting in unblocked:
             waiting.start()

         if self.done_threads == self.num_threads:
             self.all_done.set()
Example #3
0
    def ssh_connect(self, username, hostname, keyfile):
        """Open an SSH connection to `hostname` and return the SSH object.

        The connection is created with no default output/error files.

        Parameters:
            username: login name passed to the SSH wrapper.
            hostname: host to connect to.
            keyfile: key file passed to the SSH wrapper.

        Returns:
            The opened SSH object.

        Raises:
            Whatever ``ssh.open()`` raises; the exception is logged and
            re-raised unchanged so the multi-thread manager can handle it.
        """
        node = self.node

        log.debug("Establishing SSH connection", node)
        ssh = SSH(username, hostname, keyfile, default_outf = None, default_errf = None)
        try:
            ssh.open()
        except Exception:
            log.debug("SSH connection timed out", node)
            # Bare raise keeps the original traceback (``raise e`` would
            # reset it on Python 2) and lets the multi-thread manager
            # handle the failure.
            raise

        # Hand the open connection back; without this the caller would
        # receive None and the connection would be unusable.
        return ssh
Example #4
0
 def scp_dir(self, fromdir, todir):
     """Upload the local directory tree `fromdir` into remote `todir`.

     Walks `fromdir`; for each local directory the matching remote path
     is created via SFTP when ``stat`` on it raises IOError, then every
     file in that directory is uploaded and logged.
     """
     prefix_len = len(fromdir)
     for local_dir, _subdirs, filenames in walk(fromdir):
         # Remote directory = target root + path relative to `fromdir`.
         remote_dir = todir + "/" + local_dir[prefix_len:]
         try:
             self.sftp.stat(remote_dir)
         except IOError:
             # stat failed, so create the remote directory.
             self.sftp.mkdir(remote_dir)

         for filename in filenames:
             local_path = local_dir + "/" + filename
             remote_path = remote_dir + "/" + filename
             self.sftp.put(local_path, remote_path)
             log.debug("scp %s -> %s:%s" % (local_path, self.hostname, remote_path))
Example #5
0
    def __allocate_vms(self, deployer, nodes, resuming):
        """Allocate (or resume) one VM per node and wait for them to come up.

        Parameters:
            deployer: object providing ``allocate_vm``, ``resume_vm``,
                ``NodeWaitThread`` and ``instance.topology``.
            nodes: nodes to allocate or resume.
            resuming: truthy to resume existing VMs, falsy to allocate new ones.

        Returns:
            ``(True, "Success", node_vm)`` where `node_vm` maps node -> VM, or
            ``(False, message, None)`` if allocation or waiting failed.
        """
        # TODO: Make this an option
        sequential = False
        topology = deployer.instance.topology

        if resuming:
            log.info("Resuming %i VMs" % len(nodes))
            next_state = Node.STATE_RESUMED_UNCONFIGURED
        else:
            log.info("Allocating %i VMs." % len(nodes))
            next_state = Node.STATE_RUNNING_UNCONFIGURED

        node_vm = {}
        for n in nodes:
            try:
                # Persist the transitional state before asking for the VM.
                if resuming:
                    n.set_property("state", Node.STATE_RESUMING)
                    topology.save()
                    vm = deployer.resume_vm(n)
                else:
                    n.set_property("state", Node.STATE_STARTING)
                    topology.save()
                    vm = deployer.allocate_vm(n)
                node_vm[n] = vm
            except Exception:
                failure_text = self.__unexpected_exception_to_text()
                return (False, failure_text, None)

            if sequential:
                # Block on each VM right after requesting it.
                log.debug("Waiting for instance to start.")
                waiter = deployer.NodeWaitThread(None, "wait-%s" % str(vm), n, vm, deployer, state = next_state)
                waiter.run2()

        if not sequential:
            # Wait for all requested VMs concurrently.
            log.debug("Waiting for instances to start.")
            mt_instancewait = MultiThread()
            for node, vm in node_vm.items():
                wait_thread = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = next_state)
                mt_instancewait.add_thread(wait_thread)

            mt_instancewait.run()
            if not mt_instancewait.all_success():
                failure_text = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
                return (False, failure_text, None)

        return (True, "Success", node_vm)
Example #6
0
    def __stop_vms(self, deployer, nodes):
        """Run pre-shutdown configuration on `nodes`, stop them, and wait.

        Steps: mark every node as stopping and save the topology, run one
        configure thread per node, stop the nodes one at a time in reverse
        launch order, then wait for each VM to reach the stopped state.

        Returns:
            ``(True, "Success")`` or ``(False, message)`` if the configure
            or wait phase reported a failure.
        """
        node_vm = deployer.get_node_vm(nodes)
        topology = deployer.instance.topology
        mt_configure = MultiThread()
        order = topology.get_launch_order(nodes)

        for stopping_node in node_vm:
            stopping_node.state = Node.STATE_STOPPING
        topology.save()

        # One configure thread per node. Looking up threads[...] for each
        # dependency only works if that dependency came earlier in `order`.
        threads = {}
        for node in order:
            threads[node] = deployer.NodeConfigureThread(
                mt_configure,
                "stop-configure-%s" % node.id,
                node,
                node_vm[node],
                deployer,
                depends=[threads[dep] for dep in topology.get_depends(node)])

        for configure_thread in threads.values():
            mt_configure.add_thread(configure_thread)

        mt_configure.run()
        if not mt_configure.all_success():
            failure_text = self.__mt_exceptions_to_text(mt_configure.get_exceptions(), "Globus Provision was unable to configure the instances.")
            return (False, failure_text)

        # Stop in the opposite order from launch, one node per call.
        order.reverse()
        for node in order:
            deployer.stop_vms([node])

        log.debug("Waiting for instances to stop.")
        mt_instancewait = MultiThread()
        for node, vm in node_vm.items():
            wait_thread = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_STOPPED)
            mt_instancewait.add_thread(wait_thread)

        mt_instancewait.run()
        if not mt_instancewait.all_success():
            failure_text = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
            return (False, failure_text)

        return (True, "Success")
Example #7
0
    def __terminate_vms(self, deployer, nodes):
        """Terminate the VMs backing `nodes` and wait until they are gone.

        Returns:
            ``(True, "Success")``, or ``(False, message)`` if any wait
            thread did not succeed.
        """
        # Read but not used further in this method.
        topology = deployer.instance.topology

        deployer.terminate_vms(nodes)
        node_vm = deployer.get_node_vm(nodes)

        log.debug("Waiting for instances to terminate.")
        mt_instancewait = MultiThread()
        for node, vm in node_vm.items():
            wait_thread = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_TERMINATED)
            mt_instancewait.add_thread(wait_thread)
        mt_instancewait.run()

        if mt_instancewait.all_success():
            return (True, "Success")

        failure_text = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
        return (False, failure_text)
        
        
Example #8
0
 def __stop_vms(self, deployer, nodes):
     """Stop `nodes` in reverse launch order and wait for them to stop.

     Each entry of the reversed launch order is passed to
     ``deployer.stop_vms`` as-is; afterwards one wait thread per node/VM
     pair waits for the stopped state.

     Returns:
         ``(True, "Success")``, or ``(False, message)`` if any wait thread
         did not succeed.
     """
     topology = deployer.instance.topology
     stop_order = topology.get_launch_order(nodes)
     stop_order.reverse()

     for nodeset in stop_order:
         deployer.stop_vms(nodeset)

     node_vm = deployer.get_node_vm(nodes)

     log.debug("Waiting for instances to stop.")
     mt_instancewait = MultiThread()
     for node, vm in node_vm.items():
         wait_thread = deployer.NodeWaitThread(mt_instancewait, "wait-%s" % str(vm), node, vm, deployer, state = Node.STATE_STOPPED)
         mt_instancewait.add_thread(wait_thread)
     mt_instancewait.run()

     if mt_instancewait.all_success():
         return (True, "Success")

     failure_text = self.__mt_exceptions_to_text(mt_instancewait.get_exceptions(), "Exception raised while waiting for instances.")
     return (False, failure_text)
Example #9
0
        def pre_configure(self, ssh):
            """Install and bootstrap Chef on the instance if `/chef` is missing.

            Runs ``ls -l /chef`` over `ssh`; if that command fails, the image
            is treated as unconfigured and this method uploads the Chef files,
            installs Chef from the Opscode apt repository, runs the
            ``provision::ec2`` recipe with chef-solo, disables a few init
            scripts, and removes shell history files.

            Parameters:
                ssh: connected SSH wrapper providing ``run`` and ``scp_dir``.

            Raises:
                SSHCommandFailureException: propagated from any ``ssh.run``
                    call that is not invoked with ``exception_on_error = False``.
            """
            node = self.node
            instance = self.ec2_instance

            log.info("Setting up instance %s. Hostname: %s" % (instance.id, instance.public_dns_name), node)

            try:
                ssh.run("ls -l /chef")
            except SSHCommandFailureException:
                #The image is not properly setup, so do all pre-configuration for globus-provision
                log.info("Image is not configured with Chef, so installing...")

                # `ls` fails when /chef does not exist, in which case the
                # chown below would fail as well. `mkdir -p` is a no-op
                # when the directory is already there.
                ssh.run("sudo mkdir -p /chef")
                ssh.run("sudo chown -R %s /chef" % self.config.get("ec2-username"))
                ssh.scp_dir("%s" % self.chef_dir, "/chef")

                # Best effort: the group may already exist.
                ssh.run("addgroup admin", exception_on_error = False)
                ssh.run("echo \"%s `hostname`\" | sudo tee -a /etc/hosts" % instance.private_ip_address)

                # Register the Opscode apt repository and its signing key,
                # then install Chef non-interactively.
                ssh.run("sudo apt-get install lsb-release wget")
                ssh.run("echo \"deb http://apt.opscode.com/ `lsb_release -cs` main\" | sudo tee /etc/apt/sources.list.d/opscode.list")
                # The key file name contains an '@'; it is a URL path, not
                # an e-mail address.
                ssh.run("wget -qO - http://apt.opscode.com/packages@opscode.com.gpg.key | sudo apt-key add -")
                ssh.run("sudo apt-get update")
                ssh.run("echo 'chef chef/chef_server_url string http://127.0.0.1:4000' | sudo debconf-set-selections")
                ssh.run("sudo apt-get -q=2 install chef")

                # Write the chef-solo configuration and run the EC2 recipe.
                ssh.run("echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf")
                ssh.run("echo '{ \"run_list\": \"recipe[provision::ec2]\", \"scratch_dir\": \"%s\" }' > /tmp/chef.json" % self.scratch_dir)

                ssh.run("sudo chef-solo -c /tmp/chef.conf -j /tmp/chef.json")

                ssh.run("sudo update-rc.d -f nis remove")
                ssh.run("sudo update-rc.d -f condor remove")
                ssh.run("sudo update-rc.d -f chef-client remove")

                log.debug("Removing private data...")

                # "\\;" yields the same shell text (`\;`) as the former
                # "\;" without relying on an invalid string escape.
                ssh.run("sudo find /root/.*history /home/*/.*history -exec rm -f {} \\;", exception_on_error = False)
Example #10
0
    def __connect(self):
        """Create the EC2 connection and store it in ``self.conn``.

        Reads ``ec2-server-hostname``, ``ec2-server-port`` and
        ``ec2-server-path`` from the instance configuration. When a
        hostname is configured, it is passed (with path and port) to
        ``create_ec2_connection``; otherwise the function is called with
        no arguments.

        Raises:
            DeploymentException: if ``create_ec2_connection`` returns None
                (reported as missing AWS credential environment variables)
                or if a BotoClientError is raised while connecting.
        """
        config = self.instance.config

        try:
            log.debug("Connecting to EC2...")
            ec2_server_hostname = config.get("ec2-server-hostname")
            ec2_server_port = config.get("ec2-server-port")
            ec2_server_path = config.get("ec2-server-path")

            if ec2_server_hostname is not None:
                self.conn = create_ec2_connection(ec2_server_hostname,
                                                  ec2_server_path,
                                                  ec2_server_port)
            else:
                self.conn = create_ec2_connection()

            if self.conn is None:
                raise DeploymentException("AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment variables are not set.")

            log.debug("Connected to EC2.")
        except BotoClientError as exc:
            # Surface boto client errors as a deployment failure.
            raise DeploymentException("Could not connect to EC2. %s" % exc.reason)
Example #11
0
    def __recv(self, f, ready_func, recv_func, log_label, rem):
        """Drain currently available data from one stream of a channel.

        Reads up to 4096 bytes at a time via `recv_func` for as long as
        `ready_func()` is true. Each chunk is written to `f` (when `f` is
        not None) and complete lines are logged with `log_label` as prefix.

        Parameters:
            f: file-like object to write raw data to, or None.
            ready_func: callable returning true while data is available.
            recv_func: callable taking a byte count and returning data.
            log_label: prefix for logged lines.
            rem: text carried over from a previous call that has not yet
                been terminated by a newline.

        Returns:
            ``(nbytes, rem)``: bytes read in this call and the new
            unterminated remainder.
        """
        nbytes = 0
        while ready_func():
            chunk = recv_func(4096)
            if len(chunk) == 0:
                continue

            nbytes += len(chunk)
            if f is not None:
                f.write(chunk)

            pieces = chunk.split('\n')
            if len(pieces) > 1:
                # First piece completes the pending remainder.
                log.debug(log_label + ": %s" % (rem + pieces[0]))
                for complete_line in pieces[1:-1]:
                    log.debug(log_label + ": %s" % complete_line)
                # Last piece has no newline yet; carry it over.
                rem = pieces[-1]
            else:
                # No newline in this chunk: keep accumulating.
                rem += pieces[0]

        if f is not None:
            f.flush()

        return nbytes, rem
Example #12
0
 def run2(self):
     """Configure this thread's node according to its current state.

     For a node that is running-unconfigured, running, or
     resumed-unconfigured: sets the matching transitional state, saves
     the topology, runs connect / pre_configure / configure /
     post_configure (skipped on a dry run), then marks the node running.

     For a node that is stopping: marks it stopping-configuring, runs
     connect / configure_stop (skipped on a dry run), then marks it
     stopping-configured.

     Any other state is left untouched. ``check_continue`` is called
     after every remote step.
     """
     topology = self.deployer.instance.topology
     node = self.node
     configurable_states = (Node.STATE_RUNNING_UNCONFIGURED, Node.STATE_RUNNING, Node.STATE_RESUMED_UNCONFIGURED)

     if node.state in configurable_states:
         if node.state == Node.STATE_RUNNING_UNCONFIGURED:
             log.debug("Configuring node for the first time", node)
             node.state = Node.STATE_CONFIGURING
             next_state = Node.STATE_RUNNING
         elif node.state == Node.STATE_RUNNING:
             log.debug("Reconfiguring already-running node", node)
             node.state = Node.STATE_RECONFIGURING
             next_state = Node.STATE_RUNNING
         elif node.state == Node.STATE_RESUMED_UNCONFIGURED:
             log.debug("Reconfiguring resumed node", node)
             node.state = Node.STATE_RESUMED_RECONFIGURING
             next_state = Node.STATE_RUNNING

         topology.save()

         if not self.dryrun:
             ssh = self.connect()
             self.check_continue()
             # Run the three configuration phases in order, checking
             # after each one whether the thread should keep going.
             for phase in (self.pre_configure, self.configure, self.post_configure):
                 phase(ssh)
                 self.check_continue()

         node.state = next_state
         topology.save()
     elif node.state == Node.STATE_STOPPING:
         log.debug("Doing pre-shutdown configuration", node)
         node.state = Node.STATE_STOPPING_CONFIGURING
         topology.save()

         if not self.dryrun:
             ssh = self.connect()
             self.check_continue()
             self.configure_stop(ssh)
             self.check_continue()

         node.state = Node.STATE_STOPPING_CONFIGURED
         topology.save()
Example #13
0
 def thread_failure(self, thread):
     """Register that `thread` ended with an exception.

     While holding ``self.lock``: if the exception is a
     ThreadAbortException the thread is given status 2; for any other
     exception the failure is logged and ``self.abort`` is set. In both
     cases the thread is counted as done, its dependents are aborted,
     and ``self.all_done`` is set once every thread is accounted for.
     """
     with self.lock:
         if isinstance(thread.exception, ThreadAbortException):
             log.debug("%s thread is being aborted." % thread.name)
             thread.status = 2
         else:
             # A real failure: log it and raise the abort flag.
             log.debug("%s thread has failed: %s" % (thread.name, thread.exception))
             self.abort.set()

         self.done_threads += 1
         self.abort_dependents(thread)

         remaining = [t.name for t in self.threads.values() if t.status == -1]
         log.debug("%i threads are done. Remaining: %s" % (self.done_threads, ",".join(remaining)))

         if self.done_threads == self.num_threads:
             self.all_done.set()
Example #14
0
    def configure(self, ssh):
        """Configure the node over `ssh`: hosts/hostname, Chef run, extra commands.

        When ``self.basic`` is set, the hosts file is uploaded and the
        hostname updated. When ``self.chef`` is set, the topology file,
        certificates and extra files are uploaded, the chef-solo
        configuration is written, and chef-solo is run with up to three
        attempts. Finally (``self.basic`` only) NIS is re-enabled at boot
        and every command in ``deployer.run_cmds`` is executed.

        Parameters:
            ssh: connected SSH wrapper providing ``run``, ``scp`` and
                ``scp_dir``.

        Raises:
            DeploymentException: if chef-solo fails on all attempts.
        """
        domain = self.domain
        node = self.node
        instance_dir = self.deployer.instance.instance_dir

        if self.basic:
            # Upload host file and update hostname
            log.debug("Uploading host file and updating hostname", node)
            ssh.scp("%s/hosts" % instance_dir,
                    "/chef/cookbooks/provision/files/default/hosts")
            ssh.run(
                "sudo cp /chef/cookbooks/provision/files/default/hosts /etc/hosts",
                expectnooutput=True)

            ssh.run(
                "sudo bash -c \"echo %s > /etc/hostname\"" % node.hostname,
                expectnooutput=True)
            ssh.run(
                "sudo /etc/init.d/hostname.sh || sudo /etc/init.d/hostname restart",
                expectnooutput=True)

        self.check_continue()

        if self.chef:
            # Upload topology file
            log.debug("Uploading topology file", node)
            ssh.scp("%s/topology.rb" % instance_dir,
                    "/chef/cookbooks/provision/attributes/topology.rb")

            # Copy certificates
            log.debug("Copying certificates", node)
            ssh.scp_dir("%s/certs" % instance_dir,
                        "/chef/cookbooks/provision/files/default/")

            # Upload extra files
            log.debug("Copying extra files", node)
            for src, dst in self.deployer.extra_files:
                ssh.scp(src, dst)

            self.check_continue()

            #temporarily add admin group
            log.debug("Create new admin group")
            try:
                ssh.run("addgroup admin")
            except SSHCommandFailureException:
                log.debug("Admin group already exists, skipping..")

            # Run chef
            log.debug("Running chef", node)
            ssh.run(
                "echo -e \"cookbook_path \\\"/chef/cookbooks\\\"\\nrole_path \\\"/chef/roles\\\"\" > /tmp/chef.conf",
                expectnooutput=True)
            ssh.run(
                "echo '{ \"run_list\": [ %s ], \"scratch_dir\": \"%s\", \"domain_id\": \"%s\", \"node_id\": \"%s\"  }' > /tmp/chef.json"
                % (",".join("\"%s\"" % r for r in node.run_list),
                   self.config.get("scratch-dir"), domain.id, node.id),
                expectnooutput=True)

            # Sometimes, Chef will fail because a service didn't start or restart
            # properly (NFS-related services seem to do this occasionally).
            # In most cases, the problem just "goes away" if you try to restart the
            # service again. So, if Chef fails, we don't give up and try again
            # (since the recipes are idempotent, there's no harm to running them
            # multiple times)
            chef_tries = 3
            while chef_tries > 0:
                rc = ssh.run(
                    "sudo -i chef-solo -c /tmp/chef.conf -j /tmp/chef.json",
                    exception_on_error=False)
                if rc != 0:
                    chef_tries -= 1
                    # Fill in the remaining-attempt count; previously the
                    # "%i" placeholder was never substituted.
                    log.debug("chef-solo failed. %i attempts left" % chef_tries, node)
                else:
                    break

            if chef_tries == 0:
                raise DeploymentException("Failed to configure node %s" % node.id)

            self.check_continue()

        if self.basic:
            ssh.run("sudo update-rc.d nis defaults")

        for cmd in self.deployer.run_cmds:
            ssh.run(cmd)

        log.info("Configuration done.", node)
Example #15
0
    def run(self, command, outf=None, errf=None, exception_on_error = True, expectnooutput=False):
        """Execute `command` on the remote host and return its exit status.

        Parameters:
            command: shell command to execute on a new session channel.
            outf: path of a file to receive standard output; when None,
                ``self.default_outf`` is used.
            errf: path of a file to receive standard error; when None,
                ``self.default_errf`` is used.
            exception_on_error: accepted for callers; not acted on in this
                block. NOTE(review): other code catches
                SSHCommandFailureException around ``run`` -- confirm where
                a non-zero exit status is meant to be turned into that
                exception.
            expectnooutput: when true, the command's output is not read.

        Returns:
            The command's exit status as reported by the channel.
        """
        channel = self.client.get_transport().open_session()

        log.debug("%s - Running %s" % (self.hostname,command))

        if outf is not None:
            outf = open(outf, "w")
        else:
            outf = self.default_outf

        if errf is not None:
            errf = open(errf, "w")
        else:
            errf = self.default_errf

        try:
            channel.exec_command(command)
            if expectnooutput:
                log.debug("Ignoring output from command (not expecting any)")
            else:
                all_out_nbytes = 0
                all_err_nbytes = 0
                rem_out = ""
                rem_err = ""
                while True:
                    # Poll the channel; keep draining both streams until a
                    # readable channel yields no data on either of them.
                    rl, wl, xl = select.select([channel],[],[], 0.1)
                    if len(rl) > 0:
                        out_nbytes, rem_out = self.__recv(outf, channel.recv_ready, channel.recv, "SSH_OUT", rem_out)
                        err_nbytes, rem_err = self.__recv(errf, channel.recv_stderr_ready, channel.recv_stderr, "SSH_ERR", rem_err)

                        if out_nbytes + err_nbytes == 0:
                            break

                        all_out_nbytes += out_nbytes
                        all_err_nbytes += err_nbytes

                if all_out_nbytes == 0:
                    log.debug("Command did not write to standard output.")

                if all_err_nbytes == 0:
                    log.debug("Command did not write to standard error.")

                # Close each destination independently. Previously the
                # stderr branch closed `outf` a second time and `errf`
                # was never closed.
                if outf is not None and outf != sys.stdout:
                    outf.close()

                if errf is not None and errf != sys.stderr:
                    errf.close()

            log.debug("%s - Waiting for exit status: %s" % (self.hostname,command))
            rc = channel.recv_exit_status()
            log.debug("%s - Ran %s" % (self.hostname,command))
            channel.close()
        except Exception:
            raise # Replace by something more meaningful

        # Callers compare the result against 0, so hand the status back.
        return rc
Example #16
0
     except IOError, e:
         pdirs = get_parent_directories(tof)
         for d in pdirs:
             try:
                 self.sftp.stat(d)
             except IOError, e:
                 self.sftp.mkdir(d)        
     try:
         self.sftp.put(fromf, tof)
     except Exception, e:
         traceback.print_exc()
         try:
             self.close()
         except:
             pass
     log.debug("scp %s -> %s:%s" % (fromf, self.hostname, tof))
     
 def scp_dir(self, fromdir, todir):
     """Mirror the local tree under `fromdir` into the remote `todir`.

     For every directory yielded by walking `fromdir`, the corresponding
     remote directory is created over SFTP if ``stat`` on it raises
     IOError; each file in the directory is then uploaded and logged.
     """
     for current_dir, _dirnames, names in walk(fromdir):
         relative_part = current_dir[len(fromdir):]
         destination_dir = todir + "/" + relative_part
         try:
             self.sftp.stat(destination_dir)
         except IOError:
             # Remote directory could not be stat'ed: create it.
             self.sftp.mkdir(destination_dir)

         for name in names:
             src = current_dir + "/" + name
             dst = destination_dir + "/" + name
             self.sftp.put(src, dst)
             log.debug("scp %s -> %s:%s" % (src, self.hostname, dst))
             
 def __recv(self, f, ready_func, recv_func, log_label, rem):