def run(self, nodes, master, user, user_shell, volumes):
    """
    Configure and start torque on the master and on the other nodes.

    The master executes ``master_configure_server``; every entry of
    ``nodes[1:]`` executes ``node_configure_mom`` and is then registered
    with the master. Afterwards the daemons are (re)started and some
    diagnostic commands are run on the master. ``user``, ``user_shell``
    and ``volumes`` are not used.
    """
    # NOTE(review): the first entry of nodes is skipped throughout,
    # presumably because it is the master - confirm against the caller.
    workers = nodes[1:]

    # -- torque server and scheduler configuration on the master
    log.info("Configuring torque server...")
    master.ssh.execute(master_configure_server)

    # -- torque client configuration on each worker, then registration
    #    of that worker on the master
    for worker in workers:
        log.info("Configuring torque node '%s'..." % worker.alias)
        worker.ssh.execute(node_configure_mom)
        self._add_torque_node_to_master(worker, master)

    # -- (re)start services: server first, then workers, then scheduler
    log.info("Starting torque services...")
    self._force_deamon_restart(master, 'pbs_server')
    for worker in workers:
        self._start_torque_node_daemon(worker)
    self._force_deamon_restart(master, 'pbs_sched')

    # -- diagnostics
    log.debug("Torque server information:")
    for qmgr_cmd in ("qmgr -c 'l s'", "qmgr -c 'p s'"):
        master.ssh.execute(qmgr_cmd)
    log.debug("Torque nodes information:")
    for worker in workers:
        master.ssh.execute('momctl -h %s -d 2' % worker.alias)
    master.ssh.execute("qnodes")
def _find_node_for_removal(self):
    """
    Collect the running nodes that may be removed from the cluster.

    A node is selected when all of the following hold:
        1. self.stat.is_node_working(node) is false
        2. its uptime in minutes, modulo 60, is >= self.kill_after
        3. it is not the master node, unless allow_master_kill is set

    While iterating, if self.polling_interval exceeds 300 then
    self.kill_after is reset to
    max(45, 60 - (2 * self.polling_interval / 60)).

    Returns the (possibly empty) list of selected nodes.
    """
    candidates = []
    for node in self._cluster.running_nodes:
        if not self.allow_master_kill:
            if node.id == self._cluster.master_node.id:
                log.debug("not removing master node")
                continue
        busy = self.stat.is_node_working(node)
        minutes_past_hour = self._minutes_uptime(node) % 60
        if not busy:
            log.info("Idle Node %s (%s) has been up for %d minutes "
                     "past the hour." %
                     (node.id, node.alias, minutes_past_hour))
        # kept inside the loop: kill_after is only touched when at least
        # one node is examined
        if self.polling_interval > 300:
            self.kill_after = max(
                45, 60 - (2 * self.polling_interval / 60))
        if busy:
            continue
        if minutes_past_hour >= self.kill_after:
            candidates.append(node)
    return candidates
def settingsCommand(self):
    """
    Build the shell command that runs util/create_settings.sh.

    The command changes into <root>/<cell>/common, emits whatever
    exportEnvironmentVars() returns, then invokes
    <root>/util/create_settings.sh with that directory as argument.

    Returns the command string.
    """
    common_dir = "/".join([self.root, self.cell, "common"])
    pieces = [
        'cd ', common_dir, '; ',
        self.exportEnvironmentVars(),
        self.root, '/util/create_settings.sh ', common_dir,
    ]
    command = "".join(pieces)
    log.debug("sge.CreateCell.createSettings cmd: %s", command)
    return command
def load(self):
    """
    Populate this config object from the StarCluster config

    Loads the [global] and [aws info] sections (both optional), merges
    AWS settings obtained from get_settings_from_env(), then loads the
    key, volume/vol, plugin, permission and cluster sections.

    Returns self.
    """
    log.debug('Loading config')

    # a missing [global] section is silently tolerated
    try:
        self.globals = self._load_section('global', self.global_settings)
    except exception.ConfigSectionMissing:
        pass

    # a missing [aws info] section only produces a warning
    try:
        self.aws = self._load_section('aws info', self.aws_settings)
    except exception.ConfigSectionMissing:
        log.warn("No [aws info] section found in the config!")
    # NOTE(review): relies on self.aws already existing when the section
    # is missing - confirm it is initialised elsewhere.
    env_settings = self.get_settings_from_env(self.aws_settings)
    self.aws.update(env_settings)

    self.keys = self._load_sections('key', self.key_settings)

    # both 'volume' and 'vol' section prefixes are loaded; 'vol' entries
    # are merged on top of 'volume' entries
    self.vols = self._load_sections('volume', self.volume_settings)
    short_vols = self._load_sections('vol', self.volume_settings)
    self.vols.update(short_vols)

    self.plugins = self._load_sections(
        'plugin', self.plugin_settings, filter_settings=False)
    self.permissions = self._load_sections(
        'permission', self.permission_settings)
    self.clusters = self._load_cluster_sections(
        self._get_sections('cluster'))
    return self
def addEnvarsToProfile(self, node):
    """
    Append the exported environment variables to /etc/profile on node

    The text comes from exportEnvironmentVars() and is appended with a
    single-quoted ``echo`` executed over the node's ssh connection.
    """
    exported = self.exportEnvironmentVars()
    log.debug(
        "sge.addEnvarsToProfile envars: echo '%s' >> /etc/profile",
        exported)
    append_cmd = "echo '" + exported + "' >> /etc/profile"
    node.ssh.execute(append_cmd)
def _probe_peers(self, master, nodes):
    """
    Run ``gluster peer probe`` for every node alias from the master.

    All probes are sent as one ';'-separated command; the output of that
    command and of ``gluster peer status`` is logged at debug level.
    """
    log.info("Probing %d nodes" % len(nodes))
    probe_all = "".join(
        "/usr/sbin/gluster peer probe %s;" % peer.alias for peer in nodes)
    probe_output = master.ssh.execute(probe_all)
    log.debug(probe_output)
    status_output = master.ssh.execute("/usr/sbin/gluster peer status")
    log.debug(status_output)
def run(self, nodes, master, user, user_shell, volumes):
    """
    Mount NFS shares on master and all nodes

    Steps, in order: open the NFS ports for the "default" group and for
    '@sc-<cluster>', look up the head node IP, pin the mountd port on
    every node and on the head, restart services on the head, then call
    mount() for every node. ``master``, ``user``, ``user_shell`` and
    ``volumes`` are not used.
    """
    log.info("Running plugin automount")
    log.debug("automount.NfsShares.run automount.NfsShares.run(nodes, master, user, user_shell, volumes)")

    # open NFS-related ports for this cluster
    self.openNfsPorts("default")
    cluster_group = '@sc-' + self.cluster
    self.openNfsPorts(cluster_group)

    # set head node internal ip
    self.getHeadIp()

    # fix the mountd port on the nodes first, then on the head
    mountd_port = "32767"
    for target in nodes:
        self.setMountdOnNode(target, mountd_port)
    self.setMountdOnHead(mountd_port)
    self.restartServicesOnHead()

    # mount on all nodes
    for target in nodes:
        self.mount(target)

    log.info("Completed plugin automount")
def _eval_remove_node(self):
    """
    Decide from the SGE stats whether nodes should be removed, and
    remove them.

    Nothing happens when jobs are queued, when has_cluster_stabilized()
    is false, or when the host count is already at or below min_nodes.
    Otherwise up to (hosts - min_nodes) nodes returned by
    _find_nodes_for_removal() are removed; failures for a single node
    are logged and do not stop the loop.
    """
    if len(self.stat.get_queued_jobs()) != 0 or \
            not self.has_cluster_stabilized():
        return

    host_count = len(self.stat.hosts)
    if host_count <= self.min_nodes:
        log.info("Not removing nodes: already at or below minimum (%d)"
                 % self.min_nodes)
        return

    log.info("Looking for nodes to remove...")
    victims = self._find_nodes_for_removal(
        max_remove=host_count - self.min_nodes)
    if not victims:
        log.info("No nodes can be removed at this time")

    for victim in victims:
        # refresh the node state; only running nodes are removed
        if victim.update() != "running":
            log.error("Node %s is already dead - not removing" %
                      victim.alias)
            continue
        log.warn("Removing %s: %s (%s)" %
                 (victim.alias, victim.id, victim.dns_name))
        try:
            self._cluster.remove_node(victim)
            self.__last_cluster_mod_time = datetime.datetime.utcnow()
        except Exception:
            log.error("Failed to remove node %s" % victim.alias)
            log.debug(traceback.format_exc())
def enableSchedulingInfo(self):
    """
    Enable job scheduling info output for 'qstat -j'

    Reads the scheduler configuration with ``qconf -ssconf``, replaces
    "schedd_job_info false" with "schedd_job_info true", writes the
    result to /tmp/queue-<pid>.txt and loads it with ``qconf -Msconf``.
    The temporary file is removed afterwards, also when an error occurs
    while writing or applying it.
    """
    log.info("Enabling job scheduling info")
    envars = self.exportEnvironmentVars()

    show_cmd = envars + self.rootpath + "/qconf -ssconf"
    log.debug(show_cmd)
    # communicate() drains stdout AND waits for the child, so the
    # process is reaped instead of lingering after stdout.read()
    proc = subprocess.Popen(show_cmd, stdout=subprocess.PIPE, shell=True)
    queue_template = proc.communicate()[0]
    log.debug(
        "sge.CreateCell.enableSchedulingInfo BEFORE queue_template: %s",
        queue_template)

    queue_template = queue_template.replace(
        "schedd_job_info false", "schedd_job_info true")
    log.debug(
        "sge.CreateCell.enableSchedulingInfo AFTER queue_template: %s",
        queue_template)

    filename = "/tmp/queue-" + str(os.getpid()) + ".txt"
    try:
        # 'with' guarantees the handle is closed even if write() fails
        with open(filename, 'w') as queue_file:
            queue_file.write(queue_template + "\n")
        cmd = envars + self.rootpath + "/qconf -Msconf " + filename
        log.debug(cmd)
        os.system(cmd)
    finally:
        log.debug("removing %s", filename)
        if os.path.exists(filename):
            os.remove(filename)
def get(self, remotepaths, localpath=''):
    """
    Copies one or more files from the remote host to the local host.

    remotepaths - a path or list of paths; entries containing glob
                  magic are expanded with self.glob()
    localpath   - destination; defaults to the current directory

    Raises exception.BaseException when a remote path does not exist and
    exception.SCPException when the transfer itself fails. The copy is
    recursive as soon as one remote path is a directory.
    """
    requested = self._make_list(remotepaths)
    destination = localpath or os.getcwd()

    patterns = [p for p in requested if glob.has_magic(p)]
    sources = [p for p in requested if not glob.has_magic(p)]
    # expand every pattern first, then append the matches after the
    # literal paths
    expansions = [self.glob(pattern) for pattern in patterns]
    for matches in expansions:
        sources.extend(matches)

    for source in sources:
        if not self.path_exists(source):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % source)

    needs_recursion = any(self.isdir(source) for source in sources)
    try:
        self.scp.get(sources, local_path=destination,
                     recursive=needs_recursion)
    except Exception as e:
        log.debug("get failed: remotepaths=%s, localpath=%s",
                  str(sources), destination)
        raise exception.SCPException(str(e))
def execute(self, args):
    """
    Copy remote files from a cluster node to a local path.

    args is [cluster_tag, remote_path..., local_path]. The node comes
    from self.opts.node; when that is "master" and no such node exists,
    "<cluster_tag>-master" is tried before the original error is raised.
    Non-glob remote paths must exist on the node.
    """
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or "
                          "directory, and a local destination path")
    cluster_tag, remote_paths, local_path = args[0], args[1:-1], args[-1]
    cluster = self.cm.get_cluster(cluster_tag, load_receipt=False)

    try:
        node = cluster.get_node(self.opts.node)
    except exception.InstanceDoesNotExist as ide:
        if self.opts.node != "master":
            # an explicit node name was provided
            raise
        # may have happened because the master node is named
        # clustername-master (i.e. dns_prefix = True in config)
        try:
            node = cluster.get_node(
                '%s-%s' % (cluster_tag, self.opts.node))
        except exception.InstanceDoesNotExist:
            # master is just not there, raise the original error
            log.debug("Neither master nor %s-%s exist." %
                      (cluster_tag, self.opts.node))
            raise ide

    if self.opts.user:
        node.ssh.switch_user(self.opts.user)

    for remote_path in remote_paths:
        if glob.has_magic(remote_path):
            continue
        if not node.ssh.path_exists(remote_path):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" %
                remote_path)
    node.ssh.get(remote_paths, local_path)
def enableSchedulingInfo(self):
    """
    Enable job scheduling info output for 'qstat -j'

    Fetches the scheduler configuration via ``qconf -ssconf``, flips
    "schedd_job_info false" to "schedd_job_info true", stores the text
    in /tmp/queue-<pid>.txt and applies it via ``qconf -Msconf``. The
    temporary file is always deleted before returning.
    """
    log.info("Enabling job scheduling info")
    envars = self.exportEnvironmentVars()

    fetch_cmd = envars + self.rootpath + "/qconf -ssconf"
    log.debug(fetch_cmd)
    child = subprocess.Popen(fetch_cmd, stdout=subprocess.PIPE,
                             shell=True)
    # communicate() reads all output and waits for the child to exit;
    # a bare stdout.read() never reaps the process
    queue_template = child.communicate()[0]
    log.debug(
        "sge.CreateCell.enableSchedulingInfo BEFORE queue_template: %s",
        queue_template)

    match = "schedd_job_info false"
    insert = "schedd_job_info true"
    queue_template = queue_template.replace(match, insert)
    log.debug(
        "sge.CreateCell.enableSchedulingInfo AFTER queue_template: %s",
        queue_template)

    filename = "/tmp/queue-" + str(os.getpid()) + ".txt"
    try:
        # the context manager closes the file even when write() raises
        with open(filename, 'w') as queue_file:
            queue_file.write(queue_template + "\n")
        cmd = envars + self.rootpath + "/qconf -Msconf " + filename
        log.debug(cmd)
        os.system(cmd)
    finally:
        log.debug("removing %s", filename)
        if os.path.exists(filename):
            os.remove(filename)
def scp(self):
    """Return the SCP client, creating it when missing or inactive."""
    if self._scp and self._scp.transport.is_active():
        return self._scp
    log.debug("creating scp connection")
    self._scp = scp.SCPClient(
        self.transport, progress=self._file_transfer_progress)
    return self._scp
def connect(self, host=None, username=None, password=None,
            private_key=None, private_key_pass=None, port=None,
            timeout=30, compress=None):
    """
    Open and authenticate a paramiko transport.

    Arguments that are falsy (``port``: None) fall back to the values
    stored on the instance. ``timeout`` is used as the transport's
    banner timeout.

    Raises exception.SSHConnectionError on socket errors,
    exception.SSHAuthException when authentication is rejected and
    exception.SSHError for other paramiko SSH errors.
    """
    # NOTE(review): in the code visible here the authenticated transport
    # is neither stored on self nor returned - confirm whether the
    # method continues beyond this point in the full file.
    host = host or self._host
    username = username or self._username
    password = password or self._password
    compress = compress or self._compress
    if port is None:
        port = self._port

    pkey = self._pkey
    if private_key:
        pkey = self.load_private_key(private_key, private_key_pass)

    log.debug("connecting to host %s on port %d as user %s" %
              (host, port, username))
    try:
        transport = paramiko.Transport(self._get_socket(host, port))
        transport.banner_timeout = timeout
    except socket.error:
        raise exception.SSHConnectionError(host, port)

    # enable/disable compression before authenticating
    transport.use_compression(compress)

    try:
        transport.connect(username=username, pkey=pkey,
                          password=password)
    except paramiko.AuthenticationException:
        raise exception.SSHAuthException(username, host)
    except paramiko.SSHException as e:
        raise exception.SSHError(e.args[0])
def addParallelEnvironment(self, master):
    """
    Add 'threaded' parallel environment

    Writes a parallel-environment definition to /tmp/pe.txt on the
    master (slots set to 99999), registers it with ``qconf -Ap`` and
    attaches it to all.q with ``qconf -mattr``. Command output is
    redirected to /tmp/pe.out and /tmp/pe2q.out on the master.
    """
    log.info("Adding 'threaded' parallel environment")

    # One attribute per line: the template had been flattened onto a
    # single line, which is not the layout qconf -Ap reads from a file.
    sge_pe_template = """
pe_name           threaded
slots             %s
user_lists        NONE
xuser_lists       NONE
start_proc_args   /bin/true
stop_proc_args    /bin/true
allocation_rule   $pe_slots
control_slaves    TRUE
job_is_first_task FALSE
urgency_slots     min
accounting_summary FALSE
"""
    log.debug("addParallelEnvironment sge_pe_template: %s",
              sge_pe_template)

    # print template file; close the remote handle even if the write
    # fails
    pe_file = master.ssh.remote_file("/tmp/pe.txt")
    try:
        pe_file.write(sge_pe_template % 99999 + "\n")
    finally:
        pe_file.close()

    envars = self.exportEnvironmentVars()
    rootpath = self.getRootPath(master)
    log.debug("CreateCell.addParallelEnvironment rootpath: %s", rootpath)

    # '%' binds tighter than '+': only the qconf part is formatted
    master.ssh.execute(
        envars + rootpath +
        "/qconf -Ap %s &> /tmp/pe.out" % pe_file.name)
    master.ssh.execute(
        envars + rootpath +
        '/qconf -mattr queue pe_list "threaded" all.q &> /tmp/pe2q.out')
def __init__(self, enable_hvmem="True", master_slots=0):
    """
    Store the plugin options.

    enable_hvmem - disabled only when the value reads as "false"
                   (any letter case, surrounding blanks ignored, or the
                   boolean False); every other value enables it
    master_slots - stored unchanged on self.master_slots
    """
    # Previously only the exact string "False" disabled the option, so
    # passing the boolean False or "false" silently enabled it.
    flag_text = ("%s" % (enable_hvmem,)).strip().lower()
    self.enable_hvmem = flag_text != "false"
    self.master_slots = master_slots
    log.debug("enable_hvmem = %s , master_slots = %s" %
              (self.enable_hvmem, self.master_slots))
def execute(self, args):
    """
    Stop each cluster named in args.

    With no arguments a parser error listing the known cluster tags is
    raised. For every cluster: its settings are loaded (falling back to
    a reduced load when --force is given), clusters without stoppable
    nodes are rejected, the user is asked for confirmation unless
    self.opts.confirm is set, and the cluster is stopped.
    """
    if not args:
        cls = [c.cluster_tag for c in
               self.cm.get_clusters(load_plugins=False,
                                    load_receipt=False)]
        msg = "please specify a cluster"
        if cls:
            opts = ', '.join(cls)
            msg = " ".join([msg, '(options:', opts, ')'])
        self.parser.error(msg)

    for cluster_name in args:
        try:
            cl = self.cm.get_cluster(cluster_name)
        except exception.ClusterDoesNotExist:
            raise
        except Exception as e:
            log.debug("Failed to load cluster settings!", exc_info=True)
            log.error("Failed to load cluster settings!")
            if self.opts.force:
                log.warn("Ignoring cluster settings due to --force option")
                cl = self.cm.get_cluster(cluster_name,
                                         load_receipt=False,
                                         require_keys=False)
            else:
                if not isinstance(e, exception.IncompatibleCluster):
                    log.error("Use -f to forcefully stop the cluster")
                raise

        is_stoppable = cl.is_stoppable()
        if not is_stoppable:
            has_stoppable_nodes = cl.has_stoppable_nodes()
            if not self.opts.terminate_unstoppable and has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' contains 'stoppable' and 'unstoppable' "
                    "nodes. Your options are:\n\n"
                    "1. Use the --terminate-unstoppable option to "
                    "stop all 'stoppable' nodes and terminate all "
                    "'unstoppable' nodes\n\n"
                    "2. Use the 'terminate' command to destroy the "
                    "cluster.\n\nPass --help for more info." %
                    cluster_name)
            if not has_stoppable_nodes:
                raise exception.BaseException(
                    "Cluster '%s' does not contain any 'stoppable' nodes "
                    "and can only be terminated. Please use the "
                    "'terminate' command instead to destroy the cluster."
                    "\n\nPass --help for more info" % cluster_name)

        if not self.opts.confirm:
            # the prompt literal must stay on one physical line; it was
            # split by a raw line break, leaving the string unterminated
            resp = raw_input("Stop cluster %s (y/n)? " % cluster_name)
            if resp not in ['y', 'Y', 'yes']:
                log.info("Aborting...")
                continue

        cl.stop_cluster(self.opts.terminate_unstoppable,
                        force=self.opts.force)
        log.warn("All non-spot, EBS-backed nodes are now in a "
                 "'stopped' state")
        log.warn("You can restart this cluster by passing -x "
                 "to the 'start' command")
        log.warn("Use the 'terminate' command to *completely* "
                 "terminate this cluster")
def _get_stats(self):
    """
    Run qhost, qstat and qacct on the master and parse their output.

    qhost and qstat are executed with raise_on_failure=True; qacct is
    executed with ignore_exit_status=True. The joined output of each
    command is handed to a fresh SGEStats instance, which is returned.
    """
    master = self._cluster.master_node
    now = self.get_remote_time()
    since = self.get_qatime(now)

    def fetch(command, **extra):
        # every command logs its output and sources the profile
        lines = master.ssh.execute(command, log_output=True,
                                   source_profile=True, **extra)
        return '\n'.join(lines)

    qhost_out = fetch('qhost -xml', raise_on_failure=True)
    # raw string: the backslash before '*' is sent to the remote shell
    qstat_out = fetch(r'qstat -u \* -xml', raise_on_failure=True)
    qacct_out = fetch('qacct -j -b ' + since, ignore_exit_status=True)

    parsed = SGEStats()
    parsed.parse_qhost(qhost_out)
    parsed.parse_qstat(qstat_out, queues=["all.q", ""])
    parsed.parse_qacct(qacct_out, now)
    log.debug("sizes: qhost: %d, qstat: %d, qacct: %d" %
              (len(qhost_out), len(qstat_out), len(qacct_out)))
    return parsed
def execute(self, args):
    """
    Fetch remote files from a node of the given cluster.

    args holds the cluster tag, one or more remote paths and finally the
    local destination. The node is selected by self.opts.node; for
    "master" the alias "<cluster_tag>-master" is tried as a fallback
    before the original lookup error is re-raised. Remote paths without
    glob magic must exist on the node.
    """
    if len(args) < 3:
        self.parser.error("please specify a cluster, remote file or "
                          "directory, and a local destination path")
    ctag = args[0]
    sources = args[1:-1]
    destination = args[-1]
    cluster = self.cm.get_cluster(ctag, load_receipt=False)

    try:
        node = cluster.get_node(self.opts.node)
    except exception.InstanceDoesNotExist as ide:
        if self.opts.node != "master":
            # node name was provided explicitly
            raise
        # the master may be named clustername-master
        # (dns_prefix = True in config), so check for that
        prefixed = '%s-%s' % (ctag, self.opts.node)
        try:
            node = cluster.get_node(prefixed)
        except exception.InstanceDoesNotExist:
            # master is just not there: raise the original error
            log.debug("Neither master nor %s-%s exist." %
                      (ctag, self.opts.node))
            raise ide

    if self.opts.user:
        node.ssh.switch_user(self.opts.user)

    for source in sources:
        is_literal = not glob.has_magic(source)
        if is_literal and not node.ssh.path_exists(source):
            raise exception.BaseException(
                "Remote file or directory does not exist: %s" % source)
    node.ssh.get(sources, destination)
def get_stats(self):
    """
    ssh to the SGE master and fetch load & queue stats.

    Resets self.stat to a fresh SGEStats instance, then runs qhost,
    qstat (queue all.q, all users) and qacct on the master, each
    prefixed with 'source /etc/profile'.

    Returns -1 when any of the remote calls raises.

    NOTE(review): the body visible here ends after the error handler;
    the fetched output is never passed to self.stat and nothing is
    returned on success. The original description ("it will return two
    arrays: one of hosts ... the job array contains a hash for every
    job") is not backed by the visible code - confirm against the full
    file.
    """
    log.debug("starting get_stats")
    master = self._cluster.master_node
    self.stat = SGEStats()
    # defaults, overwritten below when the remote commands succeed
    qhostXml = ""
    qstatXml = ""
    qacct = ""
    try:
        now = self.get_remote_time()
        qatime = self.get_qatime(now)
        qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
        qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
        qhostXml = '\n'.join(master.ssh.execute( \
            'source /etc/profile && qhost -xml', log_output=False))
        qstatXml = '\n'.join(master.ssh.execute(qstat_cmd, log_output=False))
        # qacct's exit status is ignored
        qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False, \
            ignore_exit_status=True))
    except Exception, e:
        log.error("Error occured getting SGE stats via ssh. "\
            "Cluster terminated?")
        log.error(e)
        return -1
def connect(self, host=None, username=None, password=None,
            private_key=None, private_key_pass=None, port=22,
            timeout=30):
    """
    Open and authenticate an ssh transport.

    Falsy host/username/password fall back to the values stored on the
    instance. ``timeout`` becomes the transport's banner timeout.

    Raises exception.SSHConnectionError on socket errors,
    exception.SSHAuthException when authentication is rejected and
    exception.SSHError for other SSH errors.
    """
    # NOTE(review): the transport created here is not stored or returned
    # in the visible code - confirm whether the method continues in the
    # full file.
    host = host or self._host
    username = username or self._username
    password = password or self._password

    pkey = self._pkey
    if private_key:
        pkey = self.load_private_key(private_key, private_key_pass)

    log.debug("connecting to host %s on port %d as user %s" %
              (host, port, username))
    try:
        transport = ssh.Transport(self._get_socket(host, port))
        transport.banner_timeout = timeout
    except socket.error:
        raise exception.SSHConnectionError(host, port)

    # authenticate the transport
    try:
        transport.connect(username=username, pkey=pkey,
                          password=password)
    except ssh.AuthenticationException:
        raise exception.SSHAuthException(username, host)
    except ssh.SSHException as e:
        raise exception.SSHError(e.args[0])
def editStartupScript(self, file, master):
    """
    Add entry in /etc/rc.local to run resetMaster.pl on boot

    file   - remote script to edit; None or "" selects /etc/rc.local
    master - node whose ssh connection is used to read and rewrite it

    Every "exit 0" and any previous copy of the command are stripped
    from the script, then the command and a final "exit 0" are appended.
    Both remote file handles are closed even when reading or writing
    fails.
    """
    log.info("Adding entry to /etc/rc.local to run masterRestart on boot")
    log.debug("startup.StartUp.editStartupScript self.installdir: %s ",
              self.installdir)
    if file is None or file == "":
        file = "/etc/rc.local"
    log.debug("startup.StartUp.editStartupScript file: %s ", file)

    # set run resetMaster.pl command
    command = self.resetdir + "/resetMaster.pl " \
        + " --cell " + self.cell \
        + " --headnodeid " + self.headnodeid \
        + " --cgiscript " + "/cgi-bin/agua/reset.cgi"
    log.debug("startup.StartUp.editStartupScript command: %s ", command)

    # read the current script; the read handle used to be left open
    infilehandle = master.ssh.remote_file(file, 'r')
    try:
        contents = infilehandle.read()
    finally:
        infilehandle.close()
    log.debug("startup.StartUp.editStartupScript contents: %s ", contents)

    # drop old terminators and any earlier copy of the command so the
    # entry is not duplicated, then re-append both
    contents = contents.replace("exit 0", "")
    contents = contents.replace(command, "")
    contents += command + "\n"
    contents += "\nexit 0\n"
    log.debug("startup.StartUp.editStartupScript printing to %s contents: %s ",
              file, contents)

    # print command to file
    outfilehandle = master.ssh.remote_file(file, 'w')
    try:
        outfilehandle.write(contents)
    finally:
        outfilehandle.close()
def alias(self):
    """
    Return the node's alias, resolving and caching it on first use.

    The 'alias' tag is preferred. Without it, the '|'-separated user
    data is indexed by ami_launch_index and the result is stored as the
    'alias' tag. A missing 'Name' tag is set to the alias as well.

    Raises exception.BaseException if no alias can be determined.
    """
    if self._alias:
        return self._alias

    resolved = self.tags.get('alias')
    if not resolved:
        launch_names = self._get_user_data(tries=5).split('|')
        position = self.ami_launch_index
        try:
            resolved = launch_names[position]
        except IndexError:
            log.debug("invalid user_data: %s (index: %d)" %
                      (launch_names, position))
            resolved = None
        if not resolved:
            raise exception.BaseException(
                "instance %s has no alias" % self.id)
        self.add_tag('alias', resolved)

    if not self.tags.get('Name'):
        self.add_tag('Name', resolved)
    self._alias = resolved
    return self._alias
def __init__(self, my_arg, my_other_arg, my_other_other_arg):
    """Store the three arguments on the instance and log them."""
    self.my_arg = my_arg
    self.my_other_arg = my_other_arg
    self.my_other_other_arg = my_other_other_arg
    template = ("setupclass3: my_arg = %s, my_other_arg = %s"
                " my_other_other_arg = %s")
    log.debug(template % (my_arg, my_other_arg, my_other_other_arg))
def export_fs_to_nodes(self, nodes, export_paths):
    """
    Export each path in export_paths to each node in nodes via NFS

    nodes - list of nodes to export each path to
    export_paths - list of paths on this remote host to export to each
                   node

    Existing exports for these nodes/paths are removed first via
    stop_exporting_fs_to_nodes(). Lines already present in /etc/exports
    are not appended again. Finishes with 'exportfs -fra'.

    Example:
    # export /home and /opt/sge6 to each node in nodes
    $ node.start_nfs_server()
    $ node.export_fs_to_nodes(nodes=[node1,node2],
                              export_paths=['/home', '/opt/sge6'])
    """
    log.debug("Cleaning up potentially stale NFS entries")
    self.stop_exporting_fs_to_nodes(nodes, paths=export_paths)
    log.info("Configuring NFS exports path(s):\n%s" %
             ' '.join(export_paths))
    options = "(async,no_root_squash,no_subtree_check,rw)"

    # snapshot of the current file, used for the duplicate check below
    reader = self.ssh.remote_file('/etc/exports', 'r')
    existing = reader.read()
    reader.close()

    writer = self.ssh.remote_file('/etc/exports', 'a')
    for client in nodes:
        client_spec = client.alias + options + '\n'
        for export_path in export_paths:
            entry = ' '.join([export_path, client_spec])
            if entry not in existing:
                writer.write(entry)
    writer.close()
    self.ssh.execute('exportfs -fra')
def on_add_node(self, node, nodes, master, user, user_shell, volumes):
    """
    Prepare a newly added node for the SGE cell.

    In order: look up the head node IP, append the environment
    variables to /etc/profile on the node, copy the cell to the node,
    restart SGE on it and add it to the @allhosts group via the master.
    ``nodes``, ``user``, ``user_shell`` and ``volumes`` are not used.
    """
    log.info("Doing 'on_add_node' for plugin: sge.CreateCell")
    log.info("Adding %s", node.alias)
    log.debug("sge.CreateCell.on_add_node CreateCell.on_add_node(self, node, nodes, master, user, user_shell, volumes)")
    log.debug("sge.CreateCell.on_add_node node.private_dns_name: %s" %
              node.private_dns_name)

    # set head node internal ip
    self.getHeadIp()

    # per-node setup: profile variables, cell directory, SGE restart
    self.addEnvarsToProfile(node)
    self.copyCell(node)
    self.restartSge(node)

    # add node to @allhosts group
    self.addToAllhosts(node, master)

    log.info("Completed 'on_add_node' for plugin: sge.CreateCell")
def alias(self):
    """
    Return the cached alias of this node, looking it up when unset.

    Lookup order: the 'alias' tag, then entry ami_launch_index of the
    '|'-separated user data (which is then saved as the 'alias' tag).
    The 'Name' tag is filled with the alias when it is empty.

    Raises exception.BaseException when no alias is found.
    """
    if self._alias:
        return self._alias

    found = self.tags.get('alias')
    if not found:
        user_data = self._get_user_data(tries=5)
        entries = user_data.split('|')
        slot = self.ami_launch_index
        try:
            found = entries[slot]
        except IndexError:
            log.debug("invalid user_data: %s (index: %d)" %
                      (entries, slot))
            found = None
        if not found:
            raise exception.BaseException(
                "instance %s has no alias" % self.id)
        self.add_tag('alias', found)

    current_name = self.tags.get('Name')
    if not current_name:
        self.add_tag('Name', found)
    self._alias = found
    return self._alias
def sftp(self):
    """Return the SFTP client, (re)creating it when missing or closed."""
    if self._sftp and not self._sftp.sock.closed:
        return self._sftp
    log.debug("creating sftp connection")
    # assigned before the timeout is applied, as callers may observe
    # self._sftp even if settimeout() fails
    self._sftp = paramiko.SFTPClient.from_transport(self.transport)
    channel = self._sftp.get_channel()
    channel.settimeout(self._timeout)
    return self._sftp
def get_stats(self):
    """
    ssh to the SGE master and fetch load & queue stats.

    self.stat is replaced by a fresh SGEStats instance; qhost, qstat
    (queue all.q, all users) and qacct are then executed on the master,
    each after sourcing /etc/profile.

    Returns -1 if any remote call raises.

    NOTE(review): the visible body stops after the error handler - the
    collected output is not parsed and there is no success return. The
    earlier claim that two arrays (hosts and jobs) are returned cannot
    be verified from this code; confirm against the full file.
    """
    log.debug("starting get_stats")
    master = self._cluster.master_node
    self.stat = SGEStats()
    # empty defaults in case the remote commands fail
    qhostXml = ""
    qstatXml = ""
    qacct = ""
    try:
        now = self.get_remote_time()
        qatime = self.get_qatime(now)
        qacct_cmd = 'source /etc/profile && qacct -j -b ' + qatime
        qstat_cmd = 'source /etc/profile && qstat -q all.q -u \"*\" -xml'
        qhostXml = '\n'.join(master.ssh.execute( \
            'source /etc/profile && qhost -xml', log_output=False))
        qstatXml = '\n'.join(
            master.ssh.execute(qstat_cmd, log_output=False))
        # a non-zero exit status from qacct is tolerated
        qacct = '\n'.join(master.ssh.execute(qacct_cmd, log_output=False, \
            ignore_exit_status=True))
    except Exception, e:
        log.error("Error occured getting SGE stats via ssh. "\
            "Cluster terminated?")
        log.error(e)
        return -1
def alias(self):
    """
    Return the node's alias, resolving and caching it on first use.

    The 'alias' tag wins. Otherwise the aliases file stored in
    user_data under static.UD_ALIASES_FNAME is read, its first two lines
    are skipped and the remaining lines are indexed by ami_launch_index;
    the result is saved as the 'alias' tag. An empty 'Name' tag is set
    to the alias.

    Raises exception.BaseException when no alias is found.
    """
    if self._alias:
        return self._alias

    resolved = self.tags.get('alias')
    if not resolved:
        aliases_text = self.user_data.get(static.UD_ALIASES_FNAME)
        # NOTE(review): the first two lines are presumably a header -
        # confirm against the code that writes this file.
        candidates = aliases_text.splitlines()[2:]
        position = self.ami_launch_index
        try:
            resolved = candidates[position]
        except IndexError:
            resolved = None
            log.debug("invalid aliases file in user_data:\n%s" %
                      aliases_text)
        if not resolved:
            raise exception.BaseException(
                "instance %s has no alias" % self.id)
        self.add_tag('alias', resolved)

    name_tag = self.tags.get('Name')
    if not name_tag:
        self.add_tag('Name', resolved)
    self._alias = resolved
    return self._alias
def mount_nfs_shares(self, server_node, remote_paths):
    """
    Mount each path in remote_paths from the remote server_node

    server_node - remote server node that is sharing the remote_paths
    remote_paths - list of remote paths to mount from server_node

    Paths whose "<server alias>:<path>" device already appears in
    get_mount_map() are skipped. For the rest, matching /etc/fstab lines
    are removed, fresh entries are appended, missing mount points are
    created and each path is mounted.
    """
    self.ssh.execute('/etc/init.d/portmap start')
    # TODO: move this fix for xterm somewhere else
    self.ssh.execute('mount -t devpts none /dev/pts',
                     ignore_exit_status=True)

    mounted = self.get_mount_map()
    pending = []
    for share in remote_paths:
        device = "%s:%s" % (server_node.alias, share)
        if device not in mounted:
            pending.append(share)
            continue
        mount_path, _fs_type, _fs_options = mounted.get(device)
        log.debug('nfs share %s already mounted to %s on '
                  'node %s, skipping...' %
                  (device, mount_path, self.alias))

    # each path is matched with one blank on either side
    fstab_regex = '|'.join(' ' + share + ' ' for share in pending)
    self.ssh.remove_lines_from_file('/etc/fstab', fstab_regex)

    fstab = self.ssh.remote_file('/etc/fstab', 'a')
    for share in pending:
        fstab.write('%s:%s %s nfs vers=3,user,rw,exec,noauto 0 0\n' %
                    (server_node.alias, share, share))
    fstab.close()

    for share in pending:
        if not self.ssh.path_exists(share):
            self.ssh.makedirs(share)
        self.ssh.execute('mount %s' % share)
def run(self, nodes, master, user, user_shell, volumes):
    """
    Mount NFS shares on master and all nodes

    Opens the NFS ports for the "default" and '@sc-<cluster>' groups,
    looks up the head node IP, fixes the mountd port on each node and on
    the head, restarts services on the head and finally mounts on every
    node. ``master``, ``user``, ``user_shell`` and ``volumes`` are not
    used.
    """
    log.info("Running plugin automount")
    log.debug("automount.NfsShares.run automount.NfsShares.run(nodes, master, user, user_shell, volumes)")

    # open NFS-related ports for this cluster
    self.openNfsPorts("default")
    self.openNfsPorts('@sc-' + self.cluster)

    # set head node internal ip
    self.getHeadIp()

    # same mountd port on the nodes and on the head
    fixed_port = "32767"
    for member in nodes:
        self.setMountdOnNode(member, fixed_port)
    self.setMountdOnHead(fixed_port)
    self.restartServicesOnHead()

    # mount on all nodes
    for member in nodes:
        self.mount(member)

    log.info("Completed plugin automount")
def setMountdOnNode(self, node, mountdport):
    """
    Run the command built by mountdCommand(mountdport) on node, so that
    mountd uses the same port number on all hosts.
    """
    log.info("Setting mountd port on %s", node.alias)
    mountd_cmd = self.mountdCommand(mountdport)
    log.debug("Doing node.ssh.execute: " + mountd_cmd)
    node.ssh.execute(mountd_cmd)
def _load_rsa_key(self, private_key, private_key_pass=None):
    """
    Load an RSA key from private_key ('~' is expanded).

    Returns the key, or None (after logging an error) when loading
    raises paramiko.SSHException or exception.SSHError.
    """
    key_path = os.path.expanduser(private_key)
    try:
        loaded = get_rsa_key(key_location=key_path,
                             passphrase=private_key_pass)
    except (paramiko.SSHException, exception.SSHError):
        log.error("invalid rsa key or passphrase specified")
        return None
    log.debug("Using private key %s (RSA)" % private_key)
    return loaded
def conn(self):
    """
    Return the cached connection, creating it on first access by
    calling connection_authenticator with the AWS credentials and the
    stored keyword arguments.
    """
    if self._conn is not None:
        return self._conn
    log.debug('creating self._conn w/ connection_authenticator '
              'kwargs = %s' % self._kwargs)
    self._conn = self.connection_authenticator(
        self.aws_access_key_id, self.aws_secret_access_key,
        **self._kwargs)
    return self._conn
def _stage_attrs(self, fileName, attrsDict):
    """
    Write the formatted attributes to <tmp dir>/<fileName> through
    self.mssh and return that remote path.

    The directory comes from _create_tmp_dir(); the lines written come
    from _format_attrs(attrsDict).
    """
    tmp_dir = self._create_tmp_dir()
    remote_path = "{0}/{1}".format(tmp_dir, fileName)
    log.debug("Checking for file %s", remote_path)
    handle = self.mssh.remote_file(remote_path, mode="w")
    handle.writelines(self._format_attrs(attrsDict))
    handle.close()
    return remote_path
def conn(self):
    """
    Return the cached connection; on first access it is created by
    connection_authenticator from the AWS credentials and the stored
    keyword arguments.
    """
    if self._conn is not None:
        return self._conn
    log.debug('creating self._conn w/ connection_authenticator kwargs'
              ' = %s' % self._kwargs)
    authenticate = self.connection_authenticator
    self._conn = authenticate(self.aws_access_key,
                              self.aws_secret_access_key,
                              **self._kwargs)
    return self._conn
def run(self, nodes, master, user, user_shell, volumes):
    """
    Start "X :0" in the background on every node.

    The command is prefixed with "sudo " unless user is 'root'.
    ``master``, ``user_shell`` and ``volumes`` are not used.
    """
    prefix = '' if user == 'root' else 'sudo '
    start_x = "%sX :0 &" % prefix
    for node in nodes:
        log.debug('run on %s: %s' % (node.alias, start_x))
        node.ssh.execute(start_x)
        log.debug('run on %s: OK' % node.alias)
def _setup_etc_hosts(self, nodes=None):
    """
    Configure /etc/hosts on all StarCluster nodes

    One add_to_etc_hosts job per node is submitted to self.pool, each
    receiving the full node list; the call returns after waiting on the
    pool. nodes defaults to self._nodes.
    """
    log.info("Configuring /etc/hosts on each node")
    targets = nodes or self._nodes
    log.debug("Launching jobs " + str(datetime.datetime.utcnow()))
    job_args = (targets, )
    for target in targets:
        self.pool.simple_job(target.add_to_etc_hosts, job_args,
                             jobid=target.alias)
    self.pool.wait(numtasks=len(targets))
def createSettings(self, node):
    """
    Generate the settings.sh file by running the command returned by
    settingsCommand() on node.
    """
    log.info("Generating settings.sh file")
    log.debug("CreateCell.createSettings CreateCell.createSettings(master)")
    settings_cmd = self.settingsCommand()
    log.debug("CreateCell.createSettings cmd: %s", settings_cmd)
    node.ssh.execute(settings_cmd)
def _addToFstab(self, node, sourcedir, sourceip, mountpoint, interval):
    """
    Append an NFS entry to /etc/fstab on node

    The entry mounts <self.head_ip>:<sourcedir> at mountpoint. The
    sourceip and interval arguments are not used.
    """
    log.info("Adding /etc/fstab entry (%s on %s)", mountpoint,
             node.alias)
    fstab_entry = "".join([
        self.head_ip, ":", sourcedir, " ", mountpoint,
        " nfs nfsvers=3,defaults 0 0",
    ])
    append_cmd = "echo '" + fstab_entry + "' >> /etc/fstab ;"
    log.debug(append_cmd)
    node.ssh.execute(append_cmd)
def _load_rsa_key(self, private_key, private_key_pass=None):
    """
    Load an RSA key from private_key ('~' is expanded) with paramiko.

    Returns the key, or None (after logging an error) when paramiko
    raises SSHException.
    """
    key_path = os.path.expanduser(private_key)
    try:
        loaded = paramiko.RSAKey.from_private_key_file(
            key_path, private_key_pass)
    except paramiko.SSHException:
        log.error('invalid rsa key or password specified')
        return None
    log.debug("Using private key %s (rsa)" % private_key)
    return loaded
def _load_rsa_key(self, private_key, private_key_pass=None):
    """
    Try to read private_key (after '~' expansion) as an RSA key.

    Returns the key on success. When get_rsa_key raises
    paramiko.SSHException or exception.SSHError an error is logged and
    None is returned.
    """
    expanded = os.path.expanduser(private_key)
    try:
        key = get_rsa_key(key_location=expanded,
                          passphrase=private_key_pass)
    except (paramiko.SSHException, exception.SSHError):
        log.error('invalid rsa key or passphrase specified')
        return None
    log.debug("Using private key %s (RSA)" % private_key)
    return key
def switch_user(self, user):
    """
    Reconnect, if necessary, to host as user

    A reconnect happens when the connection is not active, or when a
    user is given and differs from get_current_user(). Otherwise only a
    debug message is logged.
    """
    # is_active() is evaluated first; the current user is only queried
    # on an active connection with a user given
    must_reconnect = (not self.is_active() or
                      (user and self.get_current_user() != user))
    if must_reconnect:
        self.connect(username=user)
        return
    effective_user = user or self._username
    log.debug("already connected as user %s" % effective_user)
def _load_rsa_key(self, private_key, private_key_pass=None):
    """
    Try to read private_key (after '~' expansion) as an RSA key using
    paramiko.

    Returns the key on success; logs an error and returns None when
    paramiko raises SSHException.
    """
    expanded = os.path.expanduser(private_key)
    try:
        key = paramiko.RSAKey.from_private_key_file(
            expanded, private_key_pass)
    except paramiko.SSHException:
        log.error('invalid rsa key or password specified')
        return None
    log.debug("Using private key %s (rsa)" % private_key)
    return key