def lockHost(hostname, unlock=False):
    """
    Lock or unlock the all.q queue instance of a host through ``qmod``.

    Runs ``qmod -d all.q@<hostname>`` by default, or ``qmod -e all.q@<hostname>``
    when ``unlock`` is true. A ``subprocess.CalledProcessError`` raised by the
    command is logged as an error and not re-raised; any other exception
    propagates to the caller.

    :param hostname: host whose all.q queue instance is targeted
    :param unlock: when true, pass ``-e`` (unlock) instead of ``-d`` (lock)
    """
    # Conditional expression rather than the ``cond and a or b`` idiom, which
    # silently picks the wrong branch as soon as the middle operand is falsy.
    mod = "-e" if unlock else "-d"
    command = ["qmod", mod, "all.q@%s" % hostname]
    try:
        run_sge_command(command, log)
    except subprocess.CalledProcessError:
        log.error("Error %s host %s", "unlocking" if unlock else "locking", hostname)
def _run_sge_command_for_multiple_hosts(hosts, command_template): """Sequentially run an sge command on the master node for the given hostnames.""" succeeded_hosts = [] for host in hosts: command = command_template.format(hostname=host.hostname, slots=host.slots) try: run_sge_command(command.format(hostname=host.hostname)) succeeded_hosts.append(host) except Exception as e: logging.error("Failed when executing command %s with exception %s", command, e) return succeeded_hosts
def removeHost(hostname, cluster_user, max_cluster_size):
    """
    Remove a host from the SGE configuration.

    Steps, in order: remove the host as administrative host (if configured),
    purge it from all.q, remove it from the @allhosts group, then remove it as
    execution host and as submission host (if configured). The purge and
    @allhosts steps only log a warning on ``subprocess.CalledProcessError``;
    errors from the other steps propagate.

    :param hostname: name of the host to remove
    :param cluster_user: not used by this function
    :param max_cluster_size: not used by this function
    """

    def _delete_if_configured(list_command, delete_template, not_configured_message):
        # Run the delete command only when _is_host_configured accepts the host
        # for the given listing command; otherwise just log that fact.
        if not _is_host_configured(list_command, hostname):
            log.info(not_configured_message, hostname)
            return
        run_sge_command(delete_template % hostname, log)

    def _run_or_warn(command_template, warning_message):
        # A failing command is logged as a warning and the removal continues.
        try:
            run_sge_command(command_template % hostname, log)
        except subprocess.CalledProcessError:
            log.warning(warning_message, hostname)

    log.info('Removing %s', hostname)

    # Administrative host
    _delete_if_configured("qconf -sh", "qconf -dh %s", 'Host %s is not administrative host')

    # Purge hostname from all.q
    _run_or_warn("qconf -purge queue '*' all.q@%s", "Unable to remove host %s from all.q")

    # Remove host from @allhosts group
    _run_or_warn(
        "qconf -dattr hostgroup hostlist %s @allhosts",
        "Unable to remove host %s from @allhosts group",
    )

    # Execution host
    _delete_if_configured("qconf -sel", "qconf -de %s", 'Host %s is not execution host')

    # Submission host
    _delete_if_configured("qconf -ss", "qconf -ds %s", 'Host %s is not submission host')
def unlock_host(hostname):
    """Log the request, then run ``qmod -e`` on the host's all.q queue instance."""
    logging.info("Unlocking host %s", hostname)
    queue_instance = "all.q@{0}".format(hostname)
    run_sge_command(["qmod", "-e", queue_instance])
def addHost(hostname, cluster_user, slots, max_cluster_size): log.info('Adding %s with %s slots' % (hostname, slots)) # Adding host as administrative host try: command = ("qconf -ah %s" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as administrative host", hostname) # Adding host as submit host try: command = ("qconf -as %s" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as submission host", hostname) # Setup template to add execution host qconf_Ae_template = """hostname %s load_scaling NONE complex_values NONE user_lists NONE xuser_lists NONE projects NONE xprojects NONE usage_scaling NONE report_variables NONE """ with NamedTemporaryFile() as t: temp_template = open(t.name, 'w') temp_template.write(qconf_Ae_template % hostname) temp_template.flush() os.fsync(t.fileno()) # Add host as an execution host try: command = ("qconf -Ae %s" % t.name) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as execution host", hostname) # Connect and start SGE ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) hosts_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/known_hosts' user_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/id_rsa' iter = 0 connected = False while iter < 3 and connected is False: try: log.info('Connecting to host: %s iter: %d' % (hostname, iter)) ssh.connect(hostname, username=cluster_user, key_filename=user_key_file) connected = True except socket.error, e: log.error('Socket error: %s' % e) time.sleep(10 + iter) iter = iter + 1 if iter == 3: log.critical("Unable to provision host") return
except IOError: ssh._host_keys_filename = None pass ssh.save_host_keys(hosts_key_file) command = ( "sudo sh -c \'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf\'" ).format(sge.SGE_ROOT) stdin, stdout, stderr = ssh.exec_command(command) while not stdout.channel.exit_status_ready(): time.sleep(1) ssh.close() # Add the host to the all.q try: command = ("qconf -aattr hostgroup hostlist %s @allhosts" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s to all.q", hostname) # Set the numbers of slots for the host try: command = ('qconf -aattr queue slots ["%s=%s"] all.q' % (hostname, slots)) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to set the number of slots for the host %s", hostname) def removeHost(hostname, cluster_user, max_cluster_size): log.info('Removing %s', hostname)
def addHost(hostname, cluster_user, slots, max_cluster_size): log.info('Adding %s with %s slots' % (hostname,slots)) # Adding host as administrative host try: command = ("qconf -ah %s" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as administrative host", hostname) # Adding host as submit host try: command = ("qconf -as %s" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as submission host", hostname) # Setup template to add execution host qconf_Ae_template = """hostname %s load_scaling NONE complex_values NONE user_lists NONE xuser_lists NONE projects NONE xprojects NONE usage_scaling NONE report_variables NONE """ with NamedTemporaryFile() as t: temp_template = open(t.name, 'w') temp_template.write(qconf_Ae_template % hostname) temp_template.flush() os.fsync(t.fileno()) # Add host as an execution host try: command = ("qconf -Ae %s" % t.name) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s as execution host", hostname) # Connect and start SGE ssh = paramiko.SSHClient() ssh.set_missing_host_key_policy(paramiko.AutoAddPolicy()) hosts_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/known_hosts' user_key_file = os.path.expanduser("~" + cluster_user) + '/.ssh/id_rsa' iter = 0 connected = False while iter < 3 and connected is False: try: log.info('Connecting to host: %s iter: %d' % (hostname, iter)) ssh.connect(hostname, username=cluster_user, key_filename=user_key_file) connected = True except socket.error, e: log.error('Socket error: %s' % e) time.sleep(10 + iter) iter = iter + 1 if iter == 3: log.critical("Unable to provision host") return
except IOError: ssh._host_keys_filename = None pass ssh.save_host_keys(hosts_key_file) command = ( "sudo sh -c \'cd {0} && {0}/inst_sge -noremote -x -auto /opt/parallelcluster/templates/sge/sge_inst.conf\'" ).format(sge.SGE_ROOT) stdin, stdout, stderr = ssh.exec_command(command) while not stdout.channel.exit_status_ready(): time.sleep(1) ssh.close() # Add the host to the all.q try: command = ("qconf -aattr hostgroup hostlist %s @allhosts" % hostname) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to add host %s to all.q", hostname) # Set the numbers of slots for the host try: command = ('qconf -aattr queue slots ["%s=%s"] all.q' % (hostname, slots)) run_sge_command(command, log) except subprocess.CalledProcessError: log.warning("Unable to set the number of slots for the host %s", hostname) def removeHost(hostname, cluster_user, max_cluster_size): log.info('Removing %s', hostname) # Check if host is administrative host