def modify_htcondor(self, key, value, action="a"): """ Modifying HTCondor environment for running HTCondor as desired. It will restart HTCondor after modifying the configuration. The configuration format is in the form of a key value string pair and if the action passed as "a" then it will add the new value to the old value. """ log.debug("modifying HTCondor") all_done = False try: default_val = self.find_config(key, '/etc/condor/condor_config') log.debug(default_val) val = "" if action == "a": if default_val != "": val = value + "," + default_val else: val = value else: val = value with open(paths.P_HTCONDOR_CONFIG_PATH, 'a') as f: print >> f, str(key) + "=" + str(val) misc.run(paths.P_HTCONDOR_HOME + "/condor restart") all_done = True except Exception, e: log.debug("Error while configuring HTCondor: {0}".format(e)) all_done = False self.state = service_states.ERROR
def _stop_sge(self):
    """Remove all worker nodes from SGE, then stop the SGE master."""
    log.info("Stopping SGE.")
    for worker in self.app.manager.worker_instances:
        self.remove_node(worker)
    sge_root = self.app.path_resolver.sge_root
    stop_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                '%s/bin/lx24-amd64/qconf -km' % (sge_root, sge_root))
    misc.run(stop_cmd, "Problems stopping SGE master",
             "Successfully stopped SGE master.")
def stop_sge(self):
    """Deregister every worker host from SGE and shut down the SGE master."""
    log.info("Stopping SGE.")
    for worker in self.app.manager.worker_instances:
        self.remove_sge_host(worker.get_id(), worker.get_private_ip())
    qconf_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                 '%s/bin/lx24-amd64/qconf -km'
                 % (paths.P_SGE_ROOT, paths.P_SGE_ROOT))
    misc.run(qconf_cmd, "Problems stopping SGE master",
             "Successfully stopped SGE master.")
def reload(self):
    """
    Reload the nginx process (`nginx -s reload`).
    """
    # TODO: run `nginx -t` before attempting to reload the process to make
    # sure the conf files are OK and thus reduce chances of screwing up
    reload_cmd = '{0} -c {1} -s reload'.format(self.exe, self.conf_file)
    misc.run(reload_cmd)
def _attempt_chown_galaxy(self, path):
    """
    Change ownership of ``path`` to the ``galaxy`` user and group.

    Try a direct ``os.chown`` first; fall back to the ``chown`` shell
    command (presumably run with elevated privileges by ``misc.run``)
    if that fails.
    """
    try:
        galaxy_uid = pwd.getpwnam("galaxy")[2]
        galaxy_gid = grp.getgrnam("galaxy")[2]
        os.chown(path, galaxy_uid, galaxy_gid)
    except (OSError, KeyError):
        # pwd.getpwnam/grp.getgrnam raise KeyError (not OSError) when the
        # user/group cannot be resolved; the original caught only OSError,
        # so the shell fallback never ran in that case.
        misc.run("chown galaxy:galaxy '%s'" % path)
def configure_htcondor(self):
    """
    Configure environment for running HTCondor service over a node.

    Loads the master or worker config template (depending on
    ``self.srv_type``), fills it in, appends the result to the HTCondor
    config file, and restarts HTCondor. Sets ``self.state`` to
    ``RUNNING`` on success or ``ERROR`` on exception.
    """
    # NOTE(review): ``all_done`` is computed but never returned, so callers
    # cannot observe success/failure directly; confirm intent.
    all_done = False
    try:
        htcondor_params = {}
        if self.srv_type == "master":
            condor_template = conf_manager.load_conf_template(
                conf_manager.HTCONDOR_MASTER_CONF_TEMPLATE)
            # log.debug("Condor template: {0}".format(condor_template))
            # Masters flock to the host(s) configured in ``self.flock_to``
            htcondor_params["flock_host"] = self.flock_to
        else:
            condor_template = conf_manager.load_conf_template(
                conf_manager.HTCONDOR_WOORKER_CONF_TEMPLATE)
            htcondor_params = {
                "host": self.host
            }
        log.debug("HTCondor params: {0}".format(str(htcondor_params)))
        condor_template = condor_template.substitute(htcondor_params)
        if os.path.exists(paths.P_HTCONDOR_CONFIG_PATH):
            # Append the rendered template to the existing config file
            with open(paths.P_HTCONDOR_CONFIG_PATH, 'a') as f:
                print >> f, condor_template
            # Restart HTCondor so the new configuration takes effect
            misc.run(paths.P_HTCONDOR_HOME + "/condor restart")
            all_done = True
            self.state = service_states.RUNNING
        else:
            log.error("HTCondor config file {0} not found!"
                      .format(paths.P_HTCONDOR_CONFIG_PATH))
    except Exception, e:
        log.debug("Error while configuring HTCondor: {0}".format(e))
        self.state = service_states.ERROR
        all_done = False
def configure_hadoop(self): """ Configure environment for running Hadoop on demand. """ all_done = False try: log.debug("Setting up Hadoop environment") etcFile = open("/etc/environment", "a") etcFile.write("JAVA_HOME=\"/usr\"\n") etcFile.flush() etcFile.close() log.debug("Hadoop id_rsa set from::" + self.id_rsa_path) hadoop_id_rsa = "/home/ubuntu/.ssh/id_rsa" shutil.copy(self.id_rsa_path, hadoop_id_rsa) misc.run("chown -c ubuntu {0}".format(hadoop_id_rsa)) log.debug("Hadoop authFile saved to {0}".format(hadoop_id_rsa)) authFile = open("/home/ubuntu/.ssh/authorized_keys", "a") pubKeyFile = open(self.id_rsa_pub_key_path) authFile.write(pubKeyFile.read()) authFile.flush() authFile.close() pubKeyFile.close() misc.run("chown -c ubuntu /home/ubuntu/.ssh/authorized_keys") all_done = True except Exception, e: log.debug("Error while configuring HADOOP: {0}".format(e)) all_done = False
def start(self):
    """
    Start Cloudera Manager web server.
    """
    log.debug("Starting Cloudera Manager service")
    self.state = service_states.STARTING
    # Recommended by Cloudera: minimize swapping on CM hosts
    misc.run('/sbin/sysctl vm.swappiness=0')
    # The actual startup can take a while, so do it in the background
    threading.Thread(target=self.__start).start()
def remove(self):
    """
    Shut the HTCondor service down via ``condor_off``.
    """
    log.info("Shutting down HTCondor service")
    self.state = service_states.SHUTTING_DOWN
    condor_stop_cmd = "condor_off"
    misc.run(condor_stop_cmd)
    self.state = service_states.SHUT_DOWN
def remove(self):
    """
    Stop the ProFTPd service via its init script.
    """
    log.info("Shutting down ProFTPd service")
    self.state = service_states.SHUTTING_DOWN
    stop_cmd = "/etc/init.d/proftpd stop"
    misc.run(stop_cmd)
    self.state = service_states.SHUT_DOWN
def unsuspend_queue(self, queue_name='all.q'):
    """
    Unsuspend ``queue_name`` queue so it can run jobs.
    """
    log.debug("Unsuspending SGE queue {0}".format(queue_name))
    sge_root = self.app.path_resolver.sge_root
    qmod_cmd = ('export SGE_ROOT={0}; . $SGE_ROOT/default/common/settings.sh; '
                '{1}/bin/lx24-amd64/qmod -usq {2}'
                .format(sge_root, sge_root, queue_name))
    misc.run(qmod_cmd)
def start(self):
    """ Wait until all other services are running before starting this one."""
    log.debug("Starting %s service" % self.name)
    # All other services OK, start this one now
    self.state = service_states.RUNNING
    log.debug("%s service prerequisites OK (i.e., all other services running), "
              "checking if %s was provided..." % (self.name, self.pss_filename))
    local_pss_file = os.path.join(
        self.app.ud['cloudman_home'], self.pss_filename)
    # Check user data first to allow overwriting of a potentially existing
    # script
    if self.pss_url:
        # This assumes the provided URL is readable to anyone w/o authentication
        # First check if the file actually exists
        if misc.run('wget --server-response %s' % self.pss_url):
            misc.run('wget --output-document=%s %s' % (
                local_pss_file, self.pss_url))
        else:
            log.error(
                "Specified post_start_script url (%s) does not exist" % self.pss_url)
    else:
        # No URL provided; look for a stored script in the cluster's S3 bucket
        s3_conn = self.app.cloud_interface.get_s3_connection()
        b = None
        if s3_conn and 'bucket_cluster' in self.app.ud:
            b = s3_conn.lookup(self.app.ud['bucket_cluster'])
        if b is not None:
            # Check if an existing cluster has a stored post start script
            log.debug("Cluster bucket '%s' found; looking for post start script '%s'"
                      % (b.name, self.pss_filename))
            misc.get_file_from_bucket(
                s3_conn, b.name, self.pss_filename, local_pss_file)
    if os.path.exists(local_pss_file) and os.path.getsize(local_pss_file) > 0:
        log.info("%s found and saved to '%s'; running it now (note that this may take a while)"
                 % (self.pss_filename,
                    os.path.join(self.app.ud['cloudman_home'], self.pss_filename)))
        os.chmod(local_pss_file, 0755)  # Ensure the script is executable
        misc.run('cd %s;./%s' % (self.app.ud[
            'cloudman_home'], self.pss_filename))
        # Persist the (possibly new) script back to the cluster bucket
        self.save_to_bucket()
        log.info("Done running {0}".format(self.pss_filename))
    else:
        # NOTE(review): message interpolates self.name rather than
        # self.pss_filename -- possibly intended to name the file; confirm.
        log.debug("%s does not exist or could not be downloaded; continuing without running it."
                  % self.name)
    # Prime the object with instance data (because this may take a while
    # on some clouds, do so in a separate thread)
    threading.Thread(target=self._prime_data).start()
    self.state = service_states.SHUT_DOWN
    log.debug("%s service done and marked as '%s'" % (self.name, self.state))
    if self.instance_role == 'master':
        # On master, remove the service upon completion (PSS runs only
        # once)
        self.remove()
    self.state = service_states.COMPLETED
    # Once this service is complete, it's safe to assume the cluster is
    # READY
    self.app.manager.cluster_status = cluster_status.READY
    msg = "All cluster services started; the cluster is ready for use."
    log.info(msg)
    self.app.msgs.info(msg)
def remove(self):
    """Deregister worker hosts from SGE and stop the SGE master."""
    # TODO write something to clean up SGE in the case of restarts?
    log.info("Removing SGE service")
    self.state = service_states.SHUTTING_DOWN
    for worker in self.app.manager.worker_instances:
        # Spot instances that were never filled have no SGE host entry
        if not worker.is_spot() or worker.spot_was_filled():
            self.remove_sge_host(worker.get_id(), worker.get_private_ip())
    qconf_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                 '%s/bin/lx24-amd64/qconf -km'
                 % (paths.P_SGE_ROOT, paths.P_SGE_ROOT))
    misc.run(qconf_cmd, "Problems stopping SGE master",
             "Successfully stopped SGE master")
    self.state = service_states.SHUT_DOWN
def remove(self, synchronous=False):
    """Stop the ``slurmd`` daemon, if one is running."""
    if not self._check_daemon('slurmd'):
        # Nothing to stop; keep the original (typo'd) log message verbatim
        log.debug("Tried to remove {0} service but no deamon running?"
                  .format(self.name))
        return
    log.info("Removing {0} service".format(self.name))
    super(SlurmdService, self).remove(synchronous)
    self.state = service_states.SHUTTING_DOWN
    misc.run("/sbin/start-stop-daemon --retry TERM/5/KILL/10 --stop "
             "--exec /usr/sbin/slurmd")
    self.state = service_states.SHUT_DOWN
def _install_s3fs(self):
    """
    Download and run the s3fs install script, reporting progress via
    the app's message bus.
    """
    msg = "s3fs is not installed; will install it now (this typically takes 2-5 minutes)."
    log.info(msg)
    self.app.msgs.info(msg)
    # NOTE: the install script is fetched over plain http and executed
    # without any integrity check; consider verifying a checksum.
    if misc.run("cd /tmp;wget --output-document=s3fs.sh "
                "http://s3.amazonaws.com/cloudman/pss/s3fs.sh"):
        # Only attempt the install if the download actually succeeded
        # (the original ignored the wget result and ran the script anyway)
        if misc.run("cd /tmp;bash s3fs.sh"):
            msg = "Done installing s3fs"
        else:
            msg = "Trouble installing s3fs; giving up."  # was misspelled 'sf3s'
    else:
        msg = "Could not download the s3fs install script; giving up."
    log.debug(msg)
    self.app.msgs.info(msg)
def _setup_slurm(self):
    """
    Install Slurm if needed, write ``slurm.conf``, and start ``slurmctld``.
    """
    log.debug("Setting up Slurmctld... (if stuck here for a while, check {0})"
              .format(self.slurm_lock_file))
    slurm_conf_dir = '/etc/slurm-llnl'
    if not os.path.exists(slurm_conf_dir):
        # The config dir is created by the package, so its absence means
        # the Slurm package has not been installed yet; grab it.
        misc.run("apt-get install slurm-llnl -y")
    self._setup_slurm_conf()
    self._start_slurmctld()
    log.debug("Done setting up Slurmctld")
def start_sge(self):
    """
    Install and start SGE on this node.

    :rtype: int
    :return: the ``inst_sge`` process return code (0 on success). Also
             sets ``self.sge_started`` to 1 on success or -1 on failure.
    """
    if self.app.TESTFLAG is True:
        fakeretcode = 0  # Pretend SGE started fine when running tests
        log.debug("Attempted to start SGE, but TESTFLAG is set. Returning retcode %s"
                  % fakeretcode)
        return fakeretcode
    log.info("Configuring SGE...")
    # Check if /lib64/libc.so.6 exists - it's required by SGE but on
    # Ubuntu 11.04 the location and name of the library have changed
    if not os.path.exists('/lib64/libc.so.6'):
        if os.path.exists('/lib64/x86_64-linux-gnu/libc-2.13.so'):
            os.symlink('/lib64/x86_64-linux-gnu/libc-2.13.so', '/lib64/libc.so.6')
        # Ubuntu 11.10 support
        elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.13.so"):
            os.symlink("/lib/x86_64-linux-gnu/libc-2.13.so", "/lib64/libc.so.6")
        # Kernel 3.2 support (Ubuntu 12.04)
        elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.15.so"):
            os.symlink("/lib/x86_64-linux-gnu/libc-2.15.so", "/lib64/libc.so.6")
        else:
            log.error("SGE config is likely to fail because '/lib64/libc.so.6' lib does not exists...")
    # Ensure lines starting with 127.0.1. are not included in /etc/hosts
    # because SGE fails to install if that's the case. This line is added
    # to /etc/hosts by cloud-init
    # (http://www.cs.nott.ac.uk/~aas/Software%2520Installation%2520and%2520Development%2520Problems.html)
    misc.run("sed -i.bak '/^127.0.1./s/^/# (Commented by CloudMan) /' /etc/hosts")
    log.debug("Configuring users' SGE profiles...")
    # Source SGE settings in every login shell
    f = open(paths.LOGIN_SHELL_SCRIPT, 'a')
    f.write("\nexport SGE_ROOT=%s" % paths.P_SGE_ROOT)
    f.write("\n. $SGE_ROOT/default/common/settings.sh\n")
    f.close()
    SGE_config_file = '/tmp/galaxyEC2_configuration.conf'
    f = open(SGE_config_file, 'w')
    # The install template is parameterized on the local hostname (used
    # three times in the template)
    print >> f, sge_install_template % (self.app.cloud_interface.get_local_hostname(),
                                        self.app.cloud_interface.get_local_hostname(),
                                        self.app.cloud_interface.get_local_hostname())
    f.close()
    # The SGE installer runs as sgeadmin, so it must be able to read the file
    os.chown(SGE_config_file, pwd.getpwnam("sgeadmin")[2],
             grp.getgrnam("sgeadmin")[2])
    log.info("Created SGE install template as file '%s'." % SGE_config_file)
    cmd = 'cd %s; ./inst_sge -x -noremote -auto %s' % (paths.P_SGE_ROOT, SGE_config_file)
    log.info("Setting up SGE; cmd: {0}".format(cmd))
    ret_code = subprocess.call(cmd, shell=True)
    if ret_code == 0:
        self.sge_started = 1
        log.debug("Successfully configured SGE.")
    else:
        self.sge_started = -1
        log.error("Setting up SGE did not go smoothly, process returned with code '%s'"
                  % ret_code)
    # Let the master know about this node's (changed) status
    self.console_monitor.send_node_status()
    return ret_code
def remove(self, synchronous=False):
    """Deregister worker nodes and shut the SGE master down."""
    log.info("Removing SGE service")
    super(SGEService, self).remove(synchronous)
    self.state = service_states.SHUTTING_DOWN
    for worker in self.app.manager.worker_instances:
        # Unfilled spot requests never joined SGE, so skip them
        if not worker.is_spot() or worker.spot_was_filled():
            self.remove_node(worker)
    sge_root = self.app.path_resolver.sge_root
    misc.run('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
             '%s/bin/lx24-amd64/qconf -km' % (sge_root, sge_root),
             "Problems stopping SGE master", "Successfully stopped SGE master")
    self.state = service_states.SHUT_DOWN
def status(self):
    """Check if Galaxy daemon is running and the UI is accessible."""
    old_state = self.state
    if self._check_daemon('galaxy'):
        # log.debug("Galaxy daemon running. Checking if UI is accessible.")
        if self._is_galaxy_running():
            self.state = service_states.RUNNING
        else:
            # Daemon is up but the web UI is not responding yet
            log.debug("Galaxy UI does not seem to be accessible.")
            self.state = service_states.STARTING
    elif self.state == service_states.SHUTTING_DOWN or \
            self.state == service_states.SHUT_DOWN or \
            self.state == service_states.UNSTARTED or \
            self.state == service_states.WAITING_FOR_USER_ACTION:
        # self.state==service_states.STARTING:
        # These states are expected to have no running daemon; leave as-is
        pass
    else:
        if self.state == service_states.STARTING and \
                (datetime.utcnow() - self.last_state_change_time).seconds < 60:
            # Give Galaxy a minute to start; otherwise, because
            # the monitor is running as a separate thread, it often happens
            # that the .pid file is not yet created after the Galaxy process
            # has been started so the monitor thread erroneously reports
            # as if starting the Galaxy process has failed.
            pass
        else:
            log.error("Galaxy daemon not running.")
            if self.remaining_start_attempts > 0:
                log.debug("Remaining Galaxy start attempts: {0}; setting svc state to UNSTARTED"
                          .format(self.remaining_start_attempts))
                self.state = service_states.UNSTARTED
                self.last_state_change_time = datetime.utcnow()
            else:
                log.debug("No remaining Galaxy start attempts; setting svc state to ERROR")
                self.state = service_states.ERROR
                self.last_state_change_time = datetime.utcnow()
    if old_state != self.state:
        log.info("Galaxy service state changed from '%s' to '%s'" % (
            old_state, self.state))
        self.last_state_change_time = datetime.utcnow()
        if self.state == service_states.RUNNING:
            # Once the service gets running, reset the number of start attempts
            self.remaining_start_attempts = NUM_START_ATTEMPTS
            log.debug("Granting SELECT permission to galaxyftp user on 'galaxy' database")
            misc.run('%s - postgres -c "%s/psql -p %s galaxy -c \\\"GRANT SELECT ON galaxy_user TO galaxyftp\\\" "'
                     % (paths.P_SU, self.app.path_resolver.pg_home, paths.C_PSQL_PORT),
                     "Error granting SELECT grant to 'galaxyftp' user",
                     "Successfully added SELECT grant to 'galaxyftp' user")
        # Force cluster configuration state update on status change
        self.app.manager.console_monitor.store_cluster_config()
def _handle_prestart_commands(self):
    """
    Run any user-supplied pre-start commands for this node type.

    Inspects the user data key (``master_prestart_commands`` or
    ``worker_prestart_commands``, depending on the node type) and simply
    executes each command listed there. For example::

        master_prestart_commands:
          - "mkdir -p /mnt/galaxyData/pgsql/"
          - "mkdir -p /mnt/galaxyData/tmp"
          - "chown -R galaxy:galaxy /mnt/galaxyData"
    """
    ud_key = "%s_prestart_commands" % self.node_type
    for cmd in self.app.config.get(ud_key, []):
        misc.run(cmd)
def _execute_local_script(self, script): if os.path.isdir(script): log.info("Found local directory %s'; executing all scripts therein (note that this " "may take a while)" % (script)) misc.run('cd %s; run-parts %s' % (script, script)) log.info("Done running PSS scripts in {0}".format(script)) elif os.path.isfile(script) and os.path.getsize(script) > 0: log.info("Found local file %s'; running it now (note that this " "may take a while)" % (script)) os.chmod(script, 0755) # Ensure the script is executable working_dir = os.path.dirname(script) or self.app.config['cloudman_home'] misc.run('cd %s;./%s' % (working_dir, script)) log.info("Done running PSS {0}".format(script)) else: log.debug("Specified local PSS file or directory (%s) does not exist; continuing." % script)
def mount(self): """ Mount the bucket as a local file system, making it available at ``/mnt/<bucket_name>`` """ try: if os.path.exists(self.mount_point): if len(os.listdir(self.mount_point)) != 0: log.warning("Filesystem at %s already exists and is not empty." % self.mount_point) return False else: os.mkdir(self.mount_point) mount_cmd = None mount_cmd = self._compose_mount_cmd() if mount_cmd is not None: ok = misc.run(mount_cmd) if ok is True: msg = "Done adding bucket {0} as a local file system. The bucket can now be "\ "accessed at /mnt/{0}".format(self.bucket_name) else: msg = "Seems to have run into a problem adding bucket {0} as a local file "\ "system.".format(self.bucket_name) log.debug(msg) self.app.msgs.info(msg) return ok else: log.error("Cannot compose command line for mounting bucket {0}".format(self.bucket_name)) except Exception, e: log.error("Trouble mounting bucket {0} as file system to {1}: {2}"\ .format(self.bucket_name, self.mount_point, e))
def unmount(self):
    """
    Unmount the local file system mounted from the current bucket.

    :return: the result of the ``umount`` command run via ``misc.run``.
    """
    log.debug("Unmounting bucket {0} from {1}".format(self.bucket_name,
                                                      self.mount_point))
    umount_cmd = "/bin/umount {0}".format(self.mount_point)
    return misc.run(umount_cmd)
def start_webserver(self):
    """
    Start the Cloudera Manager web server (defaults to port 7180).
    """
    def _disable_referer_check():
        # CM's referer check interferes with proxied/remote requests, so
        # turn it off; also point CM at the desired parcel repository.
        log.debug("Disabling refered check")
        config = {u'REFERER_CHECK': u'false',
                  u'REMOTE_PARCEL_REPO_URLS':
                  u'http://archive.cloudera.com/cdh5/parcels/5.4.1/'}
        done = False
        self.state = service_states.CONFIGURING
        # Keep retrying until the CM API accepts the config update; the
        # server may take a while to come up after the service starts.
        while not done:
            try:
                self.cm_manager.update_config(config)
                log.debug("Succesfully disabled referer check")
                done = True
                self.started = True
            except Exception:
                log.debug("Still have not disabled referer check... ")
                time.sleep(15)
                if self.state in [service_states.SHUTTING_DOWN,
                                  service_states.SHUT_DOWN,
                                  service_states.ERROR]:
                    # The service is going away; stop retrying
                    log.debug("Service state {0}; not configuring ClouderaManager."
                              .format(self.state))
                    done = True
    if misc.run("service cloudera-scm-server start"):
        _disable_referer_check()
def configure_sge(self):
    """
    Install and configure SGE on the master node.

    Writes the SGE install config, runs ``inst_sge``, adds parallel
    environments, creates the ``all.q`` queue, and sources SGE settings
    in users' login shells.

    :rtype: bool or None
    :return: ``True`` on success, ``False`` on install failure, ``None``
             when TESTFLAG is set.
    """
    if self.app.TESTFLAG is True:
        log.debug("Attempted to get volumes, but TESTFLAG is set.")
        return None
    log.info("Configuring SGE...")
    SGE_config_file = '%s/galaxyEC2.conf' % paths.P_SGE_ROOT
    with open(SGE_config_file, 'w') as f:
        print >> f, self._get_sge_install_conf()
    # The SGE installer runs as sgeadmin, so it must own the config file
    os.chown(SGE_config_file, pwd.getpwnam("sgeadmin")[2],
             grp.getgrnam("sgeadmin")[2])
    log.debug("Created SGE install template as file '%s'" % SGE_config_file)
    # Check if /lib64/libc.so.6 exists - it's required by SGE but on
    # Ubuntu 11.04 the location and name of the library have changed
    if not os.path.exists('/lib64/libc.so.6'):
        if os.path.exists('/lib64/x86_64-linux-gnu/libc-2.13.so'):
            os.symlink('/lib64/x86_64-linux-gnu/libc-2.13.so', '/lib64/libc.so.6')
        # Ubuntu 11.10 support
        elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.13.so"):
            os.symlink("/lib/x86_64-linux-gnu/libc-2.13.so", "/lib64/libc.so.6")
        # Kernel 3.2 support (Ubuntu 12.04)
        elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.15.so"):
            os.symlink("/lib/x86_64-linux-gnu/libc-2.15.so", "/lib64/libc.so.6")
        else:
            log.error("SGE config is likely to fail because '/lib64/libc.so.6' lib does not exists...")
    log.debug("Setting up SGE.")
    self._fix_util_arch()
    if misc.run('cd %s; ./inst_sge -m -x -auto %s' % (paths.P_SGE_ROOT, SGE_config_file),
                "Setting up SGE did not go smoothly", "Successfully set up SGE"):
        log.info("Successfully setup SGE; configuring SGE")
        log.debug("Adding parallel environments")
        pes = ['SMP_PE', 'MPI_PE']
        for pe in pes:
            pe_file_path = os.path.join('/tmp', pe)
            with open(pe_file_path, 'w') as f:
                print >> f, getattr(templates, pe)
            misc.run('cd %s; ./bin/lx24-amd64/qconf -Ap %s'
                     % (paths.P_SGE_ROOT, pe_file_path))
        log.debug("Creating queue 'all.q'")
        SGE_allq_file = '%s/all.q.conf' % paths.P_SGE_ROOT
        with open(SGE_allq_file, 'w') as f:
            print >> f, templates.ALL_Q_TEMPLATE
        os.chown(SGE_allq_file, pwd.getpwnam("sgeadmin")[2],
                 grp.getgrnam("sgeadmin")[2])
        log.debug("Created SGE all.q template as file '%s'" % SGE_allq_file)
        misc.run('cd %s; ./bin/lx24-amd64/qconf -Mq %s'
                 % (paths.P_SGE_ROOT, SGE_allq_file),
                 "Error modifying all.q", "Successfully modified all.q")
        log.debug("Configuring users' SGE profiles")
        # Source SGE settings in every login shell
        with open(paths.LOGIN_SHELL_SCRIPT, 'a') as f:
            f.write("\nexport SGE_ROOT=%s" % paths.P_SGE_ROOT)
            f.write("\n. $SGE_ROOT/default/common/settings.sh\n")
        return True
    return False
def _start_supervisord(self):
    """
    Start the supervisord process with ``self.main_conf_file``.
    """
    log.debug("Starting supervisord with {0}".format(self.main_conf_file))
    start_cmd = 'supervisord -c {0}'.format(self.main_conf_file)
    if misc.run(start_cmd):
        # Connect to supervisord's XML-RPC interface on the local host
        rpc_url = 'http://localhost:{0}/RPC2'.format(self.sv_port)
        self.server = xmlrpclib.Server(rpc_url)
def nginx_conf_dir(self):
    """
    Use the running nginx to provide the location of the current nginx
    configuration directory.

    :rtype: string
    :return: path of the nginx configuration directory (with a trailing
             slash), or ``''`` if it could not be determined.
    """
    conf_file = misc.run("{0} -t && {0} -t 2>&1 | head -n 1 | cut -d' ' -f5"
                         .format(self.nginx_executable))
    conf_path = conf_file.strip()
    if os.path.exists(conf_path):
        # The original used ``rstrip("nginx.conf\n")``, but rstrip treats
        # its argument as a *set of characters* and can strip too much;
        # remove the exact file-name suffix instead.
        if conf_path.endswith("nginx.conf"):
            return conf_path[:-len("nginx.conf")]
        return conf_path
    return ''
def _ensure_ephemeral_disk_mounted(self):
    """
    Make sure `/mnt` is a mounted device vs. just being part of `/`.
    At least some AWS instance types (e.g., r3) do not auto-mount what's
    in `/etc/fstab` so make sure the ephemeral disks are in fact mounted.
    http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html#InstanceStoreTrimSupport
    """
    if misc.run('mountpoint -q /mnt'):
        return  # /mnt is already a mount point; nothing to do
    device = '/dev/xvdb'  # Most of AWS instances have this device
    if os.path.exists(device):
        log.debug("/mnt is not a mountpoint; will try to mount it from {0}"
                  .format(device))
        misc.run('mkfs.xfs {0}'.format(device))
        misc.run('mount -o discard {0} /mnt'.format(device))
    else:
        log.warning("Mountpoint /mnt not available and no device {0}"
                    .format(device))
def remove(self, synchronous=False): """ Stop the Supervisor service. """ log.info("Stopping {0} service".format(self.name)) super(SupervisorService, self).remove(synchronous) self.state = service_states.SHUTTING_DOWN try: assert self.supervisor.shutdown() self.state = service_states.SHUT_DOWN self.server = None except: # Let's try a more direct approach log.debug("Stopping supervisord with pid from {0}".format(self.pid_file)) cmd = ('/sbin/start-stop-daemon --retry TERM/5/KILL/10 --stop ' '--pidfile {0}'.format(self.pid_file)) misc.run(cmd) self.state = service_states.SHUT_DOWN self.server = None
def _reconfigure_cluster(self):
    """
    (Re)configure the cluster (ie, job manager) to match the current set
    of resources: (re)generate ``slurm.conf`` and issue the ``scontrol
    reconfigure`` command, which updates all Slurm daemons.

    :return: the result of the ``scontrol reconfigure`` command.
    """
    log.debug("Reconfiguring Slurm cluster")
    self._setup_slurm_conf()
    reconfigure_cmd = "/usr/bin/scontrol reconfigure"
    return misc.run(reconfigure_cmd)
def _as_postgres(self, cmd, cwd=None):
    """Run ``cmd`` as the ``postgres`` system user (via ``su``)."""
    su_cmd = '%s - postgres -c "%s"' % (paths.P_SU, cmd)
    return misc.run(su_cmd, cwd=cwd)
def manage_galaxy(self, to_be_started=True):
    """
    Use this method to start and stop Galaxy application.

    :type to_be_started: bool
    :param to_be_started: If set, this method will attempt to start the
                          Galaxy application process. If not set, the method
                          will attempt to shut down the application process.
    """
    log.debug("Using Galaxy from '{0}'".format(self.galaxy_home))
    # Export the Galaxy environment both to the process env and to the
    # service's env_vars map (used when composing the run command)
    os.putenv("GALAXY_HOME", self.galaxy_home)
    os.putenv("TEMP", self.app.path_resolver.galaxy_temp)
    os.putenv("TMPDIR", self.app.path_resolver.galaxy_temp)
    self.env_vars["GALAXY_HOME"] = self.galaxy_home
    self.env_vars["TEMP"] = self.app.path_resolver.galaxy_temp
    self.env_vars["TMPDIR"] = self.app.path_resolver.galaxy_temp
    conf_dir = self.option_manager.setup()
    if conf_dir:
        self.env_vars["GALAXY_UNIVERSE_CONFIG_DIR"] = conf_dir
    if self.multiple_processes():
        self.env_vars["GALAXY_RUN_ALL"] = "TRUE"
        # HACK: Galaxy has a known problem when starting from a fresh
        # configuration in multiple process mode. Each process attempts to
        # create the same directories and one or more processes can fail to
        # start because it "failed" to create said directories (because
        # another process created them first). This hack staggers
        # the process starts in an attempt to circumvent this problem.
        patch_run_sh_command = (
            "sudo sed -i -e \"s/server.log \\$\\@$/\\0; "
            "sleep 4/\" %s/run.sh" % self.galaxy_home)
        misc.run(patch_run_sh_command)
        self.extra_daemon_args = ""
    else:
        # Instead of sticking with default paster.pid and paster.log,
        # explicitly set pid and log file to ``main.pid`` and ``main.log``
        # to bring single process case inline with defaults for for multiple
        # process case (i.e. when GALAXY_RUN_ALL is set and multiple servers
        # are defined).
        # self.extra_daemon_args = "--pid-file=main.pid --log-file=main.log"
        # No longer required
        pass
    if to_be_started and self.remaining_start_attempts > 0:
        self.status()
        if not self.configured:
            log.debug("Setting up Galaxy application")
            # Set job manager configs if necessary
            for job_manager_svc in self.app.manager.service_registry.active(
                    service_role=ServiceRole.JOB_MANAGER):
                if ServiceRole.SGE in job_manager_svc.svc_roles:
                    log.debug("Running on SGE; setting env_vars")
                    # NOTE(review): the trailing comma makes this value a
                    # one-element *tuple*, not a string -- likely a bug;
                    # confirm before relying on SGE_ROOT here.
                    self.env_vars[
                        "SGE_ROOT"] = self.app.path_resolver.sge_root,
                    self.env_vars[
                        "DRMAA_LIBRARY_PATH"] = self.app.path_resolver.drmaa_library_path
            # Make sure Galaxy home dir exists
            if not os.path.exists(self.galaxy_home):
                log.error("Galaxy application directory '%s' does not "
                          "exist! Aborting." % self.galaxy_home)
                log.debug("ls /mnt/: %s" % os.listdir('/mnt/'))
                self.state = service_states.ERROR
                self.last_state_change_time = datetime.utcnow()
                return False
            # Ensure the directories Galaxy needs exist and are owned by galaxy
            for dir_name in [
                    paths.P_GALAXY_INDICES,
                    ('%s/tmp/job_working_directory' %
                     self.app.path_resolver.galaxy_data)]:
                misc.make_dir(dir_name, 'galaxy')
            self.configured = True
        if not self._is_galaxy_running():
            log.debug("Starting Galaxy...")
            self.update_galaxy_config()
            start_command = self.galaxy_run_command(
                "%s --daemon" % self.extra_daemon_args)
            attempt_chown_galaxy(self.galaxy_home)
            if misc.run(start_command):
                self.remaining_start_attempts -= 1
            elif self.remaining_start_attempts > 0:
                log.debug(
                    "It seems Galaxy failed to start; will atempt to "
                    "auto-restart (up to {0} more time(s)).".format(
                        self.remaining_start_attempts))
                self.state = service_states.UNSTARTED
                self.last_state_change_time = datetime.utcnow()
            else:
                log.debug(
                    "It seems Galaxy failed to start; setting service "
                    "state to {0}.".format(service_states.ERROR))
                self.state = service_states.ERROR
                self.last_state_change_time = datetime.utcnow()
        else:
            log.debug("Galaxy already running.")
    else:
        log.info("Shutting down Galaxy...")
        self.state = service_states.SHUTTING_DOWN
        stop_command = self.galaxy_run_command(
            "%s --stop-daemon" % self.extra_daemon_args)
        if self._is_galaxy_running():
            misc.run(stop_command)
        if not self._is_galaxy_running():
            log.debug(
                "Galaxy not running; setting service state to SHUT_DOWN.")
            self.state = service_states.SHUT_DOWN
            self.last_state_change_time = datetime.utcnow()
            # Move all log files
            subprocess.call(
                "bash -c 'for f in $GALAXY_HOME/{main,handler,manager,web}*.log; "
                "do mv \"$f\" \"$f.%s\"; done'"
                % datetime.utcnow().strftime('%H_%M'), shell=True)
def _configure_sge(self):
    """
    Install and configure SGE using the app's path resolver.

    Writes the install config, runs ``inst_sge``, adds parallel
    environments, creates ``all.q`` (with Hadoop prolog/epilog hooks
    when Hadoop is enabled), updates login-shell profiles, and writes
    each user's ``.sge_request`` file.

    :rtype: bool
    :return: ``True`` on success, ``False`` if the SGE install failed.
    """
    log.info("Setting up SGE...")
    SGE_config_file = '%s/galaxyEC2.conf' % self.app.path_resolver.sge_root
    with open(SGE_config_file, 'w') as f:
        print >> f, _get_sge_install_conf(
            self.app, self.app.cloud_interface.get_private_ip())
    # The SGE installer runs as sgeadmin, so it must own the config file
    os.chown(SGE_config_file, pwd.getpwnam("sgeadmin")[2],
             grp.getgrnam("sgeadmin")[2])
    log.debug("Created SGE install template as file '%s'" % SGE_config_file)
    fix_libc()
    log.debug("Setting up SGE.")
    self._fix_util_arch()
    if misc.run(
            'cd %s; ./inst_sge -m -x -auto %s'
            % (self.app.path_resolver.sge_root, SGE_config_file),
            "Setting up SGE did not go smoothly", "Successfully set up SGE"):
        log.debug("Successfully setup SGE; configuring SGE")
        log.debug("Adding parallel environments")
        pes = ['SGE_SMP_PE', 'SGE_MPI_PE']
        for pe in pes:
            pe_file_path = os.path.join('/tmp', pe)
            with open(pe_file_path, 'w') as f:
                print >> f, conf_manager.load_conf_template(
                    getattr(conf_manager, pe)).safe_substitute()
            misc.run('cd %s; ./bin/lx24-amd64/qconf -Ap %s'
                     % (self.app.path_resolver.sge_root, pe_file_path))
        log.debug("Creating queue 'all.q'")
        SGE_allq_file = '%s/all.q.conf' % self.app.path_resolver.sge_root
        all_q_template = conf_manager.load_conf_template(
            conf_manager.SGE_ALL_Q_TEMPLATE)
        if self.app.config.hadoop_enabled:
            # Hook HDFS start/stop scripts into the queue's prolog/epilog
            all_q_params = {
                "slots": int(commands.getoutput("nproc")),
                "prolog_path": os.path.join(
                    paths.P_HADOOP_HOME,
                    paths.P_HADOOP_INTEGRATION_FOLDER + "/hdfsstart.sh"),
                "epilog_path": os.path.join(
                    paths.P_HADOOP_HOME,
                    paths.P_HADOOP_INTEGRATION_FOLDER + "/hdfsstop.sh")
            }
        else:
            all_q_params = {
                "slots": int(commands.getoutput("nproc")),
                "prolog_path": 'NONE',
                "epilog_path": 'NONE'
            }
        with open(SGE_allq_file, 'w') as f:
            print >> f, all_q_template.substitute(all_q_params)
        os.chown(SGE_allq_file, pwd.getpwnam("sgeadmin")[2],
                 grp.getgrnam("sgeadmin")[2])
        log.debug("Created SGE all.q template as file '%s'" % SGE_allq_file)
        misc.run(
            'cd %s; ./bin/lx24-amd64/qconf -Mq %s'
            % (self.app.path_resolver.sge_root, SGE_allq_file),
            "Error modifying all.q", "Successfully modified all.q")
        log.debug("Configuring users' SGE profiles")
        # Source SGE settings in every login shell
        misc.append_to_file(
            paths.LOGIN_SHELL_SCRIPT,
            "\nexport SGE_ROOT=%s" % self.app.path_resolver.sge_root)
        misc.append_to_file(paths.LOGIN_SHELL_SCRIPT,
                            "\n. $SGE_ROOT/default/common/settings.sh\n")
        # Write out the .sge_request file for individual users
        sge_request_template = conf_manager.load_conf_template(
            conf_manager.SGE_REQUEST_TEMPLATE)
        sge_request_params = {
            'psql_home': self.app.path_resolver.pg_home,
            'galaxy_tools_dir': self.app.path_resolver.galaxy_tools,
        }
        users = ['galaxy', 'ubuntu']
        for user in users:
            sge_request_file = os.path.join('/home', user, '.sge_request')
            with open(sge_request_file, 'w') as f:
                print >> f, sge_request_template.substitute(sge_request_params)
            os.chown(sge_request_file, pwd.getpwnam(user)[2],
                     grp.getgrnam(user)[2])
        return True
    return False
def _add_instance_as_exec_host(self, inst_alias, inst_private_ip):
    """
    Add instance with ``inst_alias`` and ``inst_private_ip`` to the SGE
    execution host list.

    ``inst_alias`` is used only in log statements while the
    ``inst_private_ip`` is the IP address (or hostname) of the given
    instance, which must be visible (i.e., accessible) to the other
    nodes in the clusters.

    :rtype: bool
    :return: ``True`` if the instance is registered with SGE (or already
             was), ``False`` if any registration step failed.
    """
    ok = True
    # Check if host is already in the exec host list
    cmd = "export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; %s/bin/lx24-amd64/qconf -sel" \
        % (self.app.path_resolver.sge_root, self.app.path_resolver.sge_root)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout, stderr = proc.communicate()
    if inst_private_ip in stdout:
        log.debug("Instance '%s' already in SGE execution host list" % inst_alias)
    else:
        log.debug("Adding instance '%s' to SGE execution host list." % inst_alias)
        # Create a dir to hold all of workers host configuration files
        host_conf_dir = "%s/host_confs" % self.app.path_resolver.sge_root
        if not os.path.exists(host_conf_dir):
            subprocess.call('mkdir -p %s' % host_conf_dir, shell=True)
            os.chown(host_conf_dir, pwd.getpwnam("sgeadmin")[2],
                     grp.getgrnam("sgeadmin")[2])
        host_conf_file = os.path.join(host_conf_dir, str(inst_alias))
        with open(host_conf_file, 'w') as f:
            print >> f, conf_manager.load_conf_template(
                conf_manager.SGE_HOST_CONF_TEMPLATE).substitute(
                    {'hostname': inst_private_ip})
        os.chown(host_conf_file, pwd.getpwnam("sgeadmin")[2],
                 grp.getgrnam("sgeadmin")[2])
        log.debug("Created SGE host configuration template as file '%s'."
                  % host_conf_file)
        # Add worker instance as execution host to SGE
        cmd = 'export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; %s/bin/lx24-amd64/qconf -Ae %s' \
            % (self.app.path_resolver.sge_root,
               self.app.path_resolver.sge_root, host_conf_file)
        log.debug("Add SGE exec host cmd: {0}".format(cmd))
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        if proc.wait() == 0:
            log.debug(
                "Successfully added instance '%s' w/ private IP '%s' as an execution host."
                % (inst_alias, inst_private_ip))
        else:
            ok = False
            log.error(
                "Process encountered problems adding instance '%s' as an SGE execution host. "
                "Process returned code %s" % (inst_alias, proc.returncode))
            stderr = stdout = None
            # Capture the failed process's output for the log
            stdout, stderr = proc.communicate()
            log.debug(
                " - adding instance '%s' SGE execution host stdout (private IP: %s): '%s'"
                % (inst_alias, inst_private_ip, stdout))
            log.debug(
                " - adding instance '%s' SGE execution host stderr (private IP: %s): '%s'"
                % (inst_alias, inst_private_ip, stderr))
    # == Add given instance's hostname to @allhosts
    # Check if instance is already in allhosts file and do not recreate the
    # file if so.
    # Additional documentation: allhosts file can be generated by CloudMan
    # each time an instance is added or removed. The file is generated based
    # on the Instance object CloudMan keeps track of and, as a result, it
    # includes all of the instances listed. So, some instances, although they
    # have yet to go through the addition process, might have had their IPs
    # already included in the allhosts file. This approach ensures consistency
    # between SGE and CloudMan and has been working much better than trying
    # to sync the two via other methods.
    proc = subprocess.Popen(
        "export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; "
        "%s/bin/lx24-amd64/qconf -shgrp @allhosts"
        % (self.app.path_resolver.sge_root, self.app.path_resolver.sge_root),
        shell=True, stdout=subprocess.PIPE)
    allhosts_out = proc.communicate()[0]
    if inst_private_ip not in allhosts_out:
        now = datetime.datetime.utcnow()
        ah_file = '/tmp/ah_add_' + now.strftime("%H_%M_%S")
        self._write_allhosts_file(filename=ah_file, to_add=inst_private_ip)
        if not misc.run(
                'export SGE_ROOT=%s;. $SGE_ROOT/default/common/settings.sh; '
                '%s/bin/lx24-amd64/qconf -Mhgrp %s'
                % (self.app.path_resolver.sge_root,
                   self.app.path_resolver.sge_root, ah_file),
                "Problems updating @allhosts aimed at adding '%s'" % inst_alias,
                "Successfully updated @allhosts to add '%s' with address '%s'"
                % (inst_alias, inst_private_ip)):
            ok = False
    else:
        log.debug("Instance '%s' IP is already in SGE's @allhosts" % inst_alias)
    # On instance reboot, SGE might have already been configured for a given
    # instance and this method will fail along the way although the instance
    # will still operate within SGE so don't explicitly state it was added.
    if ok:
        log.debug("Successfully added instance '%s' to SGE" % inst_alias)
    return ok
def remove(self, synchronous=False): """ Stop the Cloudera Manager web server. """ log.info("Stopping Cloudera Manager service") super(ClouderaManagerService, self).remove(synchronous) self.state = service_states.SHUTTING_DOWN try: if self.cm_api_resource: cluster = self.cm_api_resource.get_cluster(self.cluster_name) cluster.stop() except Exception, exc: log.error("Exception stopping cluster {0}: {1}".format( self.cluster_name, exc)) if misc.run("service cloudera-scm-server stop"): self.state = service_states.SHUT_DOWN def configure_db(self): """ Add the necessary tables to the default PostgreSQL server running on the host and prepare the necessary roles and databases. """ # Update psql settings pg_conf = paths.P_PG_CONF lif = [ "listen_addresses = '*'", "shared_buffers = 256MB", "wal_buffers = 8MB", "checkpoint_segments = 16", "checkpoint_completion_target = 0.9" ] for l in lif:
def _as_galaxy(self, cmd):
    """Run ``cmd`` as the ``galaxy`` system user (via ``su``)."""
    su_cmd = '%s - galaxy -c "%s"' % (paths.P_SU, cmd)
    return misc.run(su_cmd)