Example #1
0
    def modify_htcondor(self, key, value, action="a"):
        """
        Modifying HTCondor environment for running HTCondor as desired.
        It will restart HTCondor after modifying the configuration.
        The configuration format is in the form of a key value string pair
        and if the action passed as "a" then it will add the new value to the
        old value.
        """
        log.debug("modifying HTCondor")

        all_done = False
        try:
            default_val = self.find_config(key, '/etc/condor/condor_config')
            log.debug(default_val)
            val = ""
            if action == "a":
                if default_val != "":
                    val = value + "," + default_val
                else:
                    val = value
            else:
                val = value
            with open(paths.P_HTCONDOR_CONFIG_PATH, 'a') as f:
                print >> f, str(key) + "=" + str(val)
            misc.run(paths.P_HTCONDOR_HOME + "/condor restart")
            all_done = True
        except Exception, e:
            log.debug("Error while configuring HTCondor: {0}".format(e))
            all_done = False
            self.state = service_states.ERROR
Example #2
0
 def _stop_sge(self):
     """Remove all worker nodes from SGE and shut down the SGE qmaster."""
     log.info("Stopping SGE.")
     for inst in self.app.manager.worker_instances:
         self.remove_node(inst)
     sge_root = self.app.path_resolver.sge_root
     stop_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                 '%s/bin/lx24-amd64/qconf -km' % (sge_root, sge_root))
     misc.run(stop_cmd, "Problems stopping SGE master",
              "Successfully stopped SGE master.")
Example #3
0
 def stop_sge(self):
     """Remove all worker hosts from SGE and shut down the SGE qmaster."""
     log.info("Stopping SGE.")
     for inst in self.app.manager.worker_instances:
         self.remove_sge_host(inst.get_id(), inst.get_private_ip())
     stop_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                 '%s/bin/lx24-amd64/qconf -km'
                 % (paths.P_SGE_ROOT, paths.P_SGE_ROOT))
     misc.run(stop_cmd, "Problems stopping SGE master",
              "Successfully stopped SGE master.")
Example #4
0
 def reload(self):
     """
     Reload the nginx process (`nginx -s reload`).
     """
     # TODO: run `nginx -t` before attemping to reload the process to make
     # sure the conf files are OK and thus reduce chances of screwing up
     reload_cmd = '{0} -c {1} -s reload'.format(self.exe, self.conf_file)
     misc.run(reload_cmd)
Example #5
0
 def _attempt_chown_galaxy(self, path):
     """
     Change ownership of ``path`` to the ``galaxy`` user/group.

     Tries a direct ``os.chown`` first and falls back to the ``chown``
     shell command if that fails.
     """
     try:
         galaxy_uid = pwd.getpwnam("galaxy")[2]
         galaxy_gid = grp.getgrnam("galaxy")[2]
         os.chown(path, galaxy_uid, galaxy_gid)
     except (OSError, KeyError):
         # pwd.getpwnam/grp.getgrnam raise KeyError when the user/group
         # is not in the local passwd/group database; os.chown raises
         # OSError. Previously only OSError was caught, so a missing
         # 'galaxy' account skipped the shell fallback entirely.
         misc.run("chown galaxy:galaxy '%s'" % path)
Example #6
0
 def configure_htcondor(self):
     """
     Configure environment for running HTCondor service over a node.

     Renders the master or worker config template (chosen by
     ``self.srv_type``), appends it to the HTCondor config file, and
     restarts HTCondor. Sets ``self.state`` to RUNNING on success or
     ERROR on any exception.
     """
     all_done = False
     try:
         htcondor_params = {}
         if self.srv_type == "master":
             condor_template = conf_manager.load_conf_template(conf_manager.HTCONDOR_MASTER_CONF_TEMPLATE)
             # log.debug("Condor template: {0}".format(condor_template))
             htcondor_params["flock_host"] = self.flock_to
         else:
             # NOTE(review): 'WOORKER' is the constant's actual (misspelled)
             # name as defined in conf_manager; renaming must happen there.
             condor_template = conf_manager.load_conf_template(conf_manager.HTCONDOR_WOORKER_CONF_TEMPLATE)
             htcondor_params = {
                 "host": self.host
             }
         log.debug("HTCondor params: {0}".format(str(htcondor_params)))
         # Template-style substitution of the collected params
         condor_template = condor_template.substitute(htcondor_params)
         if os.path.exists(paths.P_HTCONDOR_CONFIG_PATH):
             # Append the rendered template to the existing config file
             with open(paths.P_HTCONDOR_CONFIG_PATH, 'a') as f:
                 print >> f, condor_template
             misc.run(paths.P_HTCONDOR_HOME + "/condor restart")
             all_done = True
             self.state = service_states.RUNNING
         else:
             log.error("HTCondor config file {0} not found!"
                       .format(paths.P_HTCONDOR_CONFIG_PATH))
     except Exception, e:
         log.debug("Error while configuring HTCondor: {0}".format(e))
         self.state = service_states.ERROR
         all_done = False
Example #7
0
 def configure_hadoop(self):
     """
     Configure environment for running Hadoop on demand.
     """
     all_done = False
     try:
         log.debug("Setting up Hadoop environment")
         etcFile = open("/etc/environment", "a")
         etcFile.write("JAVA_HOME=\"/usr\"\n")
         etcFile.flush()
         etcFile.close()
         log.debug("Hadoop id_rsa set from::" + self.id_rsa_path)
         hadoop_id_rsa = "/home/ubuntu/.ssh/id_rsa"
         shutil.copy(self.id_rsa_path, hadoop_id_rsa)
         misc.run("chown -c ubuntu {0}".format(hadoop_id_rsa))
         log.debug("Hadoop authFile saved to {0}".format(hadoop_id_rsa))
         authFile = open("/home/ubuntu/.ssh/authorized_keys", "a")
         pubKeyFile = open(self.id_rsa_pub_key_path)
         authFile.write(pubKeyFile.read())
         authFile.flush()
         authFile.close()
         pubKeyFile.close()
         misc.run("chown -c ubuntu /home/ubuntu/.ssh/authorized_keys")
         all_done = True
     except Exception, e:
         log.debug("Error while configuring HADOOP: {0}".format(e))
         all_done = False
Example #8
0
 def start(self):
     """
     Start the Cloudera Manager web server (in a background thread).
     """
     log.debug("Starting Cloudera Manager service")
     self.state = service_states.STARTING
     # Disable swapping, as recommended by Cloudera
     misc.run('/sbin/sysctl vm.swappiness=0')
     starter = threading.Thread(target=self.__start)
     starter.start()
Example #9
0
 def remove(self):
     """
     Shut down the HTCondor service on this node (via ``condor_off``).
     """
     log.info("Shutting down HTCondor service")
     self.state = service_states.SHUTTING_DOWN
     # Note: the state is flipped to SHUT_DOWN regardless of the
     # condor_off command's outcome
     misc.run("condor_off")
     self.state = service_states.SHUT_DOWN
Example #10
0
 def remove(self):
     """
     Shut down the ProFTPd service via its init script.
     """
     log.info("Shutting down ProFTPd service")
     self.state = service_states.SHUTTING_DOWN
     # Note: the state is flipped to SHUT_DOWN regardless of the stop
     # command's outcome
     misc.run("/etc/init.d/proftpd stop")
     self.state = service_states.SHUT_DOWN
Example #11
0
 def unsuspend_queue(self, queue_name='all.q'):
     """
     Unsuspend ``queue_name`` queue so it can run jobs.
     """
     log.debug("Unsuspending SGE queue {0}".format(queue_name))
     sge_root = self.app.path_resolver.sge_root
     cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
            '%s/bin/lx24-amd64/qmod -usq %s'
            % (sge_root, sge_root, queue_name))
     misc.run(cmd)
Example #12
0
 def start(self):
     """ Wait until all other services are running before starting this one."""
     log.debug("Starting %s service" % self.name)
     # All other services OK, start this one now
     self.state = service_states.RUNNING
     log.debug("%s service prerequisites OK (i.e., all other services running), "
               "checking if %s was provided..." % (self.name, self.pss_filename))
     local_pss_file = os.path.join(
         self.app.ud['cloudman_home'], self.pss_filename)
     # Check user data first to allow overwriting of a potentially existing
     # script
     if self.pss_url:
         # This assumes the provided URL is readable to anyone w/o authentication
         # First check if the file actually exists
         if misc.run('wget --server-response %s' % self.pss_url):
             misc.run('wget --output-document=%s %s' % (
                 local_pss_file, self.pss_url))
         else:
             log.error(
                 "Specified post_start_script url (%s) does not exist" % self.pss_url)
     else:
         # No URL provided; look for the script in the cluster's S3 bucket
         s3_conn = self.app.cloud_interface.get_s3_connection()
         b = None
         if s3_conn and 'bucket_cluster' in self.app.ud:
             b = s3_conn.lookup(self.app.ud['bucket_cluster'])
         if b is not None:  # Check if an existing cluster has a stored post start script
             log.debug("Cluster bucket '%s' found; looking for post start script '%s'"
                       % (b.name, self.pss_filename))
             misc.get_file_from_bucket(
                 s3_conn, b.name, self.pss_filename, local_pss_file)
     if os.path.exists(local_pss_file) and os.path.getsize(local_pss_file) > 0:
         log.info("%s found and saved to '%s'; running it now (note that this may take a while)"
                  % (self.pss_filename, os.path.join(self.app.ud['cloudman_home'], self.pss_filename)))
         os.chmod(local_pss_file, 0755)  # Ensure the script is executable
         misc.run('cd %s;./%s' % (self.app.ud[
                  'cloudman_home'], self.pss_filename))
         self.save_to_bucket()
         log.info("Done running {0}".format(self.pss_filename))
     else:
         log.debug("%s does not exist or could not be downloaded; continuing without running it."
                   % self.name)
     # Prime the object with instance data (because this may take a while
     # on some clouds, do so in a separate thread)
     threading.Thread(target=self._prime_data).start()
     # NOTE(review): state briefly goes to SHUT_DOWN here and is set to
     # COMPLETED a few lines below; presumably intentional (the service
     # runs only once) -- confirm before relying on this transition.
     self.state = service_states.SHUT_DOWN
     log.debug("%s service done and marked as '%s'" % (self.name, self.state))
     if self.instance_role == 'master':
         # On master, remove the service upon completion (PSS runs only
         # once)
         self.remove()
     self.state = service_states.COMPLETED
     # Once this service is complete, it's safe to assume the cluster is
     # READY
     self.app.manager.cluster_status = cluster_status.READY
     msg = "All cluster services started; the cluster is ready for use."
     log.info(msg)
     self.app.msgs.info(msg)
Example #13
0
 def remove(self):
     """Shut down the SGE service: deregister worker hosts, stop qmaster."""
     # TODO write something to clean up SGE in the case of restarts?
     log.info("Removing SGE service")
     self.state = service_states.SHUTTING_DOWN
     for inst in self.app.manager.worker_instances:
         # Skip spot instances whose request was never filled
         if not inst.is_spot() or inst.spot_was_filled():
             self.remove_sge_host(inst.get_id(), inst.get_private_ip())
     stop_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                 '%s/bin/lx24-amd64/qconf -km'
                 % (paths.P_SGE_ROOT, paths.P_SGE_ROOT))
     misc.run(stop_cmd, "Problems stopping SGE master",
              "Successfully stopped SGE master")
     self.state = service_states.SHUT_DOWN
Example #14
0
 def remove(self, synchronous=False):
     """Stop the ``slurmd`` daemon, if running, and mark service shut down."""
     if not self._check_daemon('slurmd'):
         log.debug("Tried to remove {0} service but no deamon running?"
                   .format(self.name))
         return
     log.info("Removing {0} service".format(self.name))
     super(SlurmdService, self).remove(synchronous)
     self.state = service_states.SHUTTING_DOWN
     # TERM first, escalate to KILL if slurmd does not exit
     misc.run("/sbin/start-stop-daemon --retry TERM/5/KILL/10 --stop "
              "--exec /usr/sbin/slurmd")
     self.state = service_states.SHUT_DOWN
Example #15
0
 def _install_s3fs(self):
     """
     Download and run the s3fs install script (takes a few minutes).
     """
     msg = "s3fs is not installed; will install it now (this typically takes 2-5 minutes)."
     log.info(msg)
     self.app.msgs.info(msg)
     misc.run("cd /tmp;wget --output-document=s3fs.sh http://s3.amazonaws.com/cloudman/pss/s3fs.sh")
     if misc.run("cd /tmp;bash s3fs.sh"):
         msg = "Done installing s3fs"
     else:
         # Fixed typo in the user-facing message ('sf3s' -> 's3fs')
         msg = "Trouble installing s3fs; giving up."
     log.debug(msg)
     self.app.msgs.info(msg)
Example #16
0
 def _setup_slurm(self):
     """
     Setup ``slurmctld`` process.
     """
     log.debug("Setting up Slurmctld... (if stuck here for a while, check {0})"
               .format(self.slurm_lock_file))
     # Install the Slurm package if its config dir is not present yet
     if not os.path.exists('/etc/slurm-llnl'):
         misc.run("apt-get install slurm-llnl -y")
     self._setup_slurm_conf()
     self._start_slurmctld()
     log.debug("Done setting up Slurmctld")
Example #17
0
 def start_sge(self ):
     """
     Install and configure SGE on this node via ``inst_sge``.

     Fixes known libc location issues on newer Ubuntu releases, writes
     the SGE auto-install configuration, runs the installer, and records
     the outcome in ``self.sge_started`` (1 on success, -1 on failure).

     :rtype: int
     :return: the ``inst_sge`` process return code (0 on success), or a
              fake 0 when ``TESTFLAG`` is set.
     """
     if self.app.TESTFLAG is True:
         fakeretcode = 0
         log.debug("Attempted to start SGE, but TESTFLAG is set.  Returning retcode %s" % fakeretcode)
         return fakeretcode
     log.info( "Configuring SGE..." )
     # Check if /lib64/libc.so.6 exists - it's required by SGE but on
     # Ubuntu 11.04 the location and name of the library have changed
     if not os.path.exists('/lib64/libc.so.6'):
         if os.path.exists('/lib64/x86_64-linux-gnu/libc-2.13.so'):
             os.symlink('/lib64/x86_64-linux-gnu/libc-2.13.so', '/lib64/libc.so.6')
         # Ubuntu 11.10 support
         elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.13.so"):
             os.symlink("/lib/x86_64-linux-gnu/libc-2.13.so", "/lib64/libc.so.6")
         # Kernel 3.2 support (Ubuntu 12.04)
         elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.15.so"):
             os.symlink("/lib/x86_64-linux-gnu/libc-2.15.so", "/lib64/libc.so.6")
         else:
             log.error("SGE config is likely to fail because '/lib64/libc.so.6' lib does not exists...")
     # Ensure lines starting with 127.0.1. are not included in /etc/hosts
     # because SGE fails to install if that's the case. This line is added
     # to /etc/hosts by cloud-init
     # (http://www.cs.nott.ac.uk/~aas/Software%2520Installation%2520and%2520Development%2520Problems.html)
     misc.run("sed -i.bak '/^127.0.1./s/^/# (Commented by CloudMan) /' /etc/hosts")
     log.debug( "Configuring users' SGE profiles..." )
     # Make SGE_ROOT and the SGE settings available in login shells
     f = open(paths.LOGIN_SHELL_SCRIPT, 'a')
     f.write( "\nexport SGE_ROOT=%s" % paths.P_SGE_ROOT )
     f.write( "\n. $SGE_ROOT/default/common/settings.sh\n" )
     f.close()

     # Write the SGE install configuration; the local hostname is
     # substituted three times into sge_install_template
     SGE_config_file = '/tmp/galaxyEC2_configuration.conf'
     f = open( SGE_config_file, 'w' )
     print >> f, sge_install_template % (self.app.cloud_interface.get_local_hostname(),
             self.app.cloud_interface.get_local_hostname(),
             self.app.cloud_interface.get_local_hostname())
     f.close()
     # Hand the config file to the sgeadmin account -- presumably the
     # installer runs as that user; TODO confirm
     os.chown( SGE_config_file, pwd.getpwnam("sgeadmin")[2], grp.getgrnam("sgeadmin")[2] )
     log.info( "Created SGE install template as file '%s'." % SGE_config_file )

     # Unattended (-auto) install driven by the generated config file
     cmd = 'cd %s; ./inst_sge -x -noremote -auto %s' % (paths.P_SGE_ROOT, SGE_config_file)
     log.info("Setting up SGE; cmd: {0}".format(cmd))
     ret_code = subprocess.call(cmd, shell=True )

     if ret_code == 0:
         self.sge_started = 1
         log.debug( "Successfully configured SGE." )
     else:
         self.sge_started = -1
         log.error( "Setting up SGE did not go smoothly, process returned with code '%s'" % ret_code )

     # Let the master know this node's (new) status
     self.console_monitor.send_node_status()
     return ret_code
Example #18
0
    def remove(self, synchronous=False):
        """Remove worker nodes from SGE, then stop the SGE qmaster."""
        log.info("Removing SGE service")
        super(SGEService, self).remove(synchronous)
        self.state = service_states.SHUTTING_DOWN
        for inst in self.app.manager.worker_instances:
            # Skip spot instances whose request was never filled
            if not inst.is_spot() or inst.spot_was_filled():
                self.remove_node(inst)
        sge_root = self.app.path_resolver.sge_root
        stop_cmd = ('export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; '
                    '%s/bin/lx24-amd64/qconf -km' % (sge_root, sge_root))
        misc.run(stop_cmd, "Problems stopping SGE master",
                 "Successfully stopped SGE master")
        self.state = service_states.SHUT_DOWN
Example #19
0
 def status(self):
     """Check if Galaxy daemon is running and the UI is accessible."""
     old_state = self.state
     if self._check_daemon('galaxy'):
         # log.debug("Galaxy daemon running. Checking if UI is accessible.")
         if self._is_galaxy_running():
             self.state = service_states.RUNNING
         else:
             # Daemon is up but the web UI is not responding yet
             log.debug("Galaxy UI does not seem to be accessible.")
             self.state = service_states.STARTING
     elif self.state == service_states.SHUTTING_DOWN or \
         self.state == service_states.SHUT_DOWN or \
         self.state == service_states.UNSTARTED or \
             self.state == service_states.WAITING_FOR_USER_ACTION:
          # self.state==service_states.STARTING:
         # Deliberate non-running states: leave the state untouched
         pass
     else:
         if self.state == service_states.STARTING and \
                 (datetime.utcnow() - self.last_state_change_time).seconds < 60:
             # Give Galaxy a minutes to start; otherwise, because
             # the monitor is running as a separate thread, it often happens
             # that the .pid file is not yet created after the Galaxy process
             # has been started so the monitor thread erroneously reports
             # as if starting the Galaxy process has failed.
             pass
         else:
             # Daemon not running and we are past the grace period:
             # schedule a restart (UNSTARTED) or give up (ERROR)
             log.error("Galaxy daemon not running.")
             if self.remaining_start_attempts > 0:
                 log.debug("Remaining Galaxy start attempts: {0}; setting svc state to UNSTARTED"
                     .format(self.remaining_start_attempts))
                 self.state = service_states.UNSTARTED
                 self.last_state_change_time = datetime.utcnow()
             else:
                 log.debug("No remaining Galaxy start attempts; setting svc state to ERROR")
                 self.state = service_states.ERROR
                 self.last_state_change_time = datetime.utcnow()
     if old_state != self.state:
         log.info("Galaxy service state changed from '%s' to '%s'" % (
             old_state, self.state))
         self.last_state_change_time = datetime.utcnow()
         if self.state == service_states.RUNNING:
             # Once the service gets running, reset the number of start attempts
             self.remaining_start_attempts = NUM_START_ATTEMPTS
             log.debug("Granting SELECT permission to galaxyftp user on 'galaxy' database")
             misc.run('%s - postgres -c "%s/psql -p %s galaxy -c \\\"GRANT SELECT ON galaxy_user TO galaxyftp\\\" "'
                      % (paths.P_SU, self.app.path_resolver.pg_home, paths.C_PSQL_PORT),
                      "Error granting SELECT grant to 'galaxyftp' user",
                      "Successfully added SELECT grant to 'galaxyftp' user")
         # Force cluster configuration state update on status change
         self.app.manager.console_monitor.store_cluster_config()
Example #20
0
    def _handle_prestart_commands(self):
        """
        Inspect the user data key (either ``master_prestart_commands`` or
        ``worker_prestart_commands`` depending on node type and simply execute
        any commands provided there.

        For example::
            master_prestart_commands:
              - "mkdir -p /mnt/galaxyData/pgsql/"
              - "mkdir -p /mnt/galaxyData/tmp"
              - "chown -R galaxy:galaxy /mnt/galaxyData"
        """
        ud_key = "%s_prestart_commands" % self.node_type
        commands = self.app.config.get(ud_key, [])
        for cmd in commands:
            misc.run(cmd)
Example #21
0
 def _execute_local_script(self, script):
     if os.path.isdir(script):
         log.info("Found local directory %s'; executing all scripts therein (note that this "
                  "may take a while)" % (script))
         misc.run('cd %s; run-parts %s' % (script, script))
         log.info("Done running PSS scripts in {0}".format(script))
     elif os.path.isfile(script) and os.path.getsize(script) > 0:
         log.info("Found local file %s'; running it now (note that this "
                  "may take a while)" % (script))
         os.chmod(script, 0755)  # Ensure the script is executable
         working_dir = os.path.dirname(script) or self.app.config['cloudman_home']
         misc.run('cd %s;./%s' % (working_dir, script))
         log.info("Done running PSS {0}".format(script))
     else:
         log.debug("Specified local PSS file or directory (%s) does not exist; continuing." % script)
Example #22
0
 def mount(self):
     """
     Mount the bucket as a local file system, making it available at
     ``/mnt/<bucket_name>``
     """
     try:
         if os.path.exists(self.mount_point):
             if len(os.listdir(self.mount_point)) != 0:
                 log.warning("Filesystem at %s already exists and is not empty." % self.mount_point)
                 return False
         else:
             os.mkdir(self.mount_point)
         mount_cmd = None
         mount_cmd = self._compose_mount_cmd()
         if mount_cmd is not None:
             ok = misc.run(mount_cmd)
             if ok is True:
                 msg = "Done adding bucket {0} as a local file system. The bucket can now be "\
                     "accessed at /mnt/{0}".format(self.bucket_name)
             else:
                 msg = "Seems to have run into a problem adding bucket {0} as a local file "\
                         "system.".format(self.bucket_name)
             log.debug(msg)
             self.app.msgs.info(msg)
             return ok
         else:
             log.error("Cannot compose command line for mounting bucket {0}".format(self.bucket_name))
     except Exception, e:
         log.error("Trouble mounting bucket {0} as file system to {1}: {2}"\
             .format(self.bucket_name, self.mount_point, e))
Example #23
0
 def unmount(self):
     """
     Unmount the local file system mounted from the current bucket.
     """
     log.debug("Unmounting bucket {0} from {1}".format(
         self.bucket_name, self.mount_point))
     umount_cmd = "/bin/umount {0}".format(self.mount_point)
     return misc.run(umount_cmd)
Example #24
0
    def start_webserver(self):
        """
        Start the Cloudera Manager web server (defaults to port 7180).
        """
        def _disable_referer_check():
            # Keep retrying until the CM config update succeeds or the
            # service is shutting down / in error.
            # Fixed typos in log messages ('refered' -> 'referer',
            # 'Succesfully' -> 'Successfully').
            log.debug("Disabling referer check")
            config = {u'REFERER_CHECK': u'false',
                      u'REMOTE_PARCEL_REPO_URLS': u'http://archive.cloudera.com/cdh5/parcels/5.4.1/'}
            done = False
            self.state = service_states.CONFIGURING
            while not done:
                try:
                    self.cm_manager.update_config(config)
                    log.debug("Successfully disabled referer check")
                    done = True
                    self.started = True
                except Exception:
                    log.debug("Still have not disabled referer check... ")
                    time.sleep(15)
                    if self.state in [service_states.SHUTTING_DOWN,
                                      service_states.SHUT_DOWN,
                                      service_states.ERROR]:
                        log.debug("Service state {0}; not configuring ClouderaManager."
                                  .format(self.state))
                        done = True

        if misc.run("service cloudera-scm-server start"):
            _disable_referer_check()
Example #25
0
 def configure_sge( self ):
     """
     Install and configure the SGE qmaster plus default queue and
     parallel environments.

     Writes the auto-install config, fixes known libc location issues on
     newer Ubuntu releases, runs ``inst_sge``, registers the SMP/MPI
     parallel environments and the ``all.q`` queue, and exports SGE
     settings in users' login shell script.

     :return: ``True`` if SGE was set up successfully, ``False`` if the
              installer failed, ``None`` when ``TESTFLAG`` is set.
     """
     if self.app.TESTFLAG is True:
         log.debug( "Attempted to get volumes, but TESTFLAG is set." )
         return None
     log.info( "Configuring SGE..." )
     SGE_config_file = '%s/galaxyEC2.conf' % paths.P_SGE_ROOT
     with open( SGE_config_file, 'w' ) as f:
         print >> f, self._get_sge_install_conf()
     # Hand the config file to the sgeadmin account -- presumably the
     # installer runs as that user; TODO confirm
     os.chown(SGE_config_file, pwd.getpwnam("sgeadmin")[2], grp.getgrnam("sgeadmin")[2])
     log.debug("Created SGE install template as file '%s'" % SGE_config_file)
     # Check if /lib64/libc.so.6 exists - it's required by SGE but on
     # Ubuntu 11.04 the location and name of the library have changed
     if not os.path.exists('/lib64/libc.so.6'):
         if os.path.exists('/lib64/x86_64-linux-gnu/libc-2.13.so'):
             os.symlink('/lib64/x86_64-linux-gnu/libc-2.13.so', '/lib64/libc.so.6')
         # Ubuntu 11.10 support
         elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.13.so"):
             os.symlink("/lib/x86_64-linux-gnu/libc-2.13.so", "/lib64/libc.so.6")
         # Kernel 3.2 support (Ubuntu 12.04)
         elif os.path.exists("/lib/x86_64-linux-gnu/libc-2.15.so"):
             os.symlink("/lib/x86_64-linux-gnu/libc-2.15.so", "/lib64/libc.so.6")
         else:
             log.error("SGE config is likely to fail because '/lib64/libc.so.6' lib does not exists...")
     log.debug("Setting up SGE.")
     self._fix_util_arch()
     # Unattended (-auto) master (-m) install driven by the config file
     if misc.run('cd %s; ./inst_sge -m -x -auto %s' % (paths.P_SGE_ROOT, SGE_config_file), "Setting up SGE did not go smoothly", "Successfully set up SGE"):
         log.info("Successfully setup SGE; configuring SGE")
         log.debug("Adding parallel environments")
         pes = ['SMP_PE', 'MPI_PE']
         for pe in pes:
             # Each PE definition comes from the project's templates module
             pe_file_path = os.path.join('/tmp', pe)
             with open(pe_file_path, 'w') as f:
                 print >> f, getattr(templates, pe)
             misc.run('cd %s; ./bin/lx24-amd64/qconf -Ap %s' % (paths.P_SGE_ROOT, pe_file_path))
         log.debug("Creating queue 'all.q'")
         SGE_allq_file = '%s/all.q.conf' % paths.P_SGE_ROOT
         with open( SGE_allq_file, 'w' ) as f:
             print >> f, templates.ALL_Q_TEMPLATE
         os.chown(SGE_allq_file, pwd.getpwnam("sgeadmin")[2], grp.getgrnam("sgeadmin")[2])
         log.debug("Created SGE all.q template as file '%s'" % SGE_allq_file)
         misc.run('cd %s; ./bin/lx24-amd64/qconf -Mq %s' % (paths.P_SGE_ROOT, SGE_allq_file), "Error modifying all.q", "Successfully modified all.q")
         log.debug("Configuring users' SGE profiles")
         # Make SGE_ROOT and the SGE settings available in login shells
         with open(paths.LOGIN_SHELL_SCRIPT, 'a') as f:
             f.write("\nexport SGE_ROOT=%s" % paths.P_SGE_ROOT)
             f.write("\n. $SGE_ROOT/default/common/settings.sh\n")
         return True
     return False
Example #26
0
 def _start_supervisord(self):
     """
     Start the supervisord process with ``self.main_conf_file``.
     """
     log.debug("Starting supervisord with {0}".format(self.main_conf_file))
     start_cmd = 'supervisord -c {0}'.format(self.main_conf_file)
     if misc.run(start_cmd):
         # Connect an XML-RPC client to the now-running supervisord
         url = 'http://localhost:{0}/RPC2'.format(self.sv_port)
         self.server = xmlrpclib.Server(url)
Example #27
0
 def nginx_conf_dir(self):
     """
     Use the running nginx to provide the location of the current nginx
     configuration directory.

     :rtype: str
     :return: the directory containing ``nginx.conf`` (trailing slash
              preserved) or ``''`` if it cannot be determined.
     """
     # NOTE(review): assumes misc.run returns the command's output as a
     # string here -- confirm against misc.run's signature
     conf_file = misc.run("{0} -t && {0} -t 2>&1 | head -n 1 | cut -d' ' -f5".format(self.nginx_executable))
     conf_path = conf_file.strip()
     if os.path.exists(conf_path):
         # The old `rstrip("nginx.conf\n")` stripped *characters* from
         # that set, not the suffix, and could mangle directory names
         # (e.g. /etc/nginxconfig/); remove the exact file name instead.
         if conf_path.endswith("nginx.conf"):
             return conf_path[:-len("nginx.conf")]
         return conf_path
     return ''
Example #28
0
    def _ensure_ephemeral_disk_mounted(self):
        """
        Make sure `/mnt` is a mounted device vs. just being part of `/`.

        At least some AWS instance types (e.g., r3) do not auto-mount what's in
        `/etc/fstab` so make sure the ephemeral disks are in fact mounted.
        http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/InstanceStorage.html#InstanceStoreTrimSupport
        """
        if misc.run('mountpoint -q /mnt'):
            return  # /mnt is already a mount point; nothing to do
        device = '/dev/xvdb'  # Most of AWS instances have this device
        if os.path.exists(device):
            log.debug("/mnt is not a mountpoint; will try to mount it from {0}"
                      .format(device))
            misc.run('mkfs.xfs {0}'.format(device))
            misc.run('mount -o discard {0} /mnt'.format(device))
        else:
            log.warning("Mountpoint /mnt not available and no device {0}"
                        .format(device))
Example #29
0
 def remove(self, synchronous=False):
     """
     Stop the Supervisor service.

     First asks supervisord to shut down over XML-RPC; if that fails,
     falls back to stopping the daemon via its pid file.
     """
     log.info("Stopping {0} service".format(self.name))
     super(SupervisorService, self).remove(synchronous)
     self.state = service_states.SHUTTING_DOWN
     try:
         # Do not use `assert` here: under `python -O` asserts are
         # stripped and shutdown() would never have been called at all.
         if not self.supervisor.shutdown():
             raise Exception("supervisord XML-RPC shutdown reported failure")
         self.state = service_states.SHUT_DOWN
         self.server = None
     except Exception:  # Let's try a more direct approach
         # Narrowed from a bare `except:` (which also swallowed
         # KeyboardInterrupt/SystemExit)
         log.debug("Stopping supervisord with pid from {0}".format(self.pid_file))
         cmd = ('/sbin/start-stop-daemon --retry TERM/5/KILL/10 --stop '
                '--pidfile {0}'.format(self.pid_file))
         misc.run(cmd)
         self.state = service_states.SHUT_DOWN
         self.server = None
Example #30
0
 def _reconfigure_cluster(self):
     """
     (Re)configure the cluster (ie, job manager) to match the current set of
     resources. The method will (re)generate ``slurm.conf`` and issue the
     ``scontrol reconfigure`` command to update all Slurm daemons.
     """
     log.debug("Reconfiguring Slurm cluster")
     self._setup_slurm_conf()
     reconf_ok = misc.run("/usr/bin/scontrol reconfigure")
     return reconf_ok
Example #31
0
 def _as_postgres(self, cmd, cwd=None):
     """Run ``cmd`` as the ``postgres`` system user (via ``su``)."""
     su_cmd = '%s - postgres -c "%s"' % (paths.P_SU, cmd)
     return misc.run(su_cmd, cwd=cwd)
Example #32
0
    def manage_galaxy(self, to_be_started=True):
        """
        Use this method to start and stop Galaxy application.

        :type to_be_started: bool
        :param to_be_started: If set, this method will attempt to start the
                              Galaxy application process. If not set, the
                              method will attempt to shut down the application
                              process.
        """
        log.debug("Using Galaxy from '{0}'".format(self.galaxy_home))
        os.putenv("GALAXY_HOME", self.galaxy_home)
        os.putenv("TEMP", self.app.path_resolver.galaxy_temp)
        os.putenv("TMPDIR", self.app.path_resolver.galaxy_temp)
        self.env_vars["GALAXY_HOME"] = self.galaxy_home
        self.env_vars["TEMP"] = self.app.path_resolver.galaxy_temp
        self.env_vars["TMPDIR"] = self.app.path_resolver.galaxy_temp
        conf_dir = self.option_manager.setup()
        if conf_dir:
            self.env_vars["GALAXY_UNIVERSE_CONFIG_DIR"] = conf_dir

        if self.multiple_processes():
            self.env_vars["GALAXY_RUN_ALL"] = "TRUE"
            # HACK: Galaxy has a known problem when starting from a fresh
            # configuration in multiple process mode. Each process attempts to
            # create the same directories and one or more processes can fail to
            # start because it "failed" to create said directories (because
            # another process created them first). This hack staggers
            # the process starts in an attempt to circumvent this problem.
            patch_run_sh_command = (
                "sudo sed -i -e \"s/server.log \\$\\@$/\\0; "
                "sleep 4/\" %s/run.sh" % self.galaxy_home)
            misc.run(patch_run_sh_command)
            self.extra_daemon_args = ""
        else:
            # Instead of sticking with default paster.pid and paster.log,
            # explicitly set pid and log file to ``main.pid`` and ``main.log``
            # to bring single process case inline with defaults for for multiple
            # process case (i.e. when GALAXY_RUN_ALL is set and multiple servers
            # are defined).
            # self.extra_daemon_args = "--pid-file=main.pid --log-file=main.log"
            # No longer required
            pass
        if to_be_started and self.remaining_start_attempts > 0:
            self.status()
            if not self.configured:
                log.debug("Setting up Galaxy application")
                # Set job manager configs if necessary
                for job_manager_svc in self.app.manager.service_registry.active(
                        service_role=ServiceRole.JOB_MANAGER):
                    if ServiceRole.SGE in job_manager_svc.svc_roles:
                        log.debug("Running on SGE; setting env_vars")
                        # NOTE: a trailing comma here previously turned this
                        # value into a 1-tuple instead of a string
                        self.env_vars["SGE_ROOT"] = \
                            self.app.path_resolver.sge_root
                        self.env_vars["DRMAA_LIBRARY_PATH"] = \
                            self.app.path_resolver.drmaa_library_path
                # Make sure Galaxy home dir exists
                if not os.path.exists(self.galaxy_home):
                    log.error("Galaxy application directory '%s' does not "
                              "exist! Aborting." % self.galaxy_home)
                    log.debug("ls /mnt/: %s" % os.listdir('/mnt/'))
                    self.state = service_states.ERROR
                    self.last_state_change_time = datetime.utcnow()
                    return False
                # Ensure the necessary directories exist
                for dir_name in [
                        paths.P_GALAXY_INDICES,
                        ('%s/tmp/job_working_directory' %
                         self.app.path_resolver.galaxy_data)]:
                    misc.make_dir(dir_name, 'galaxy')
                self.configured = True
            if not self._is_galaxy_running():
                log.debug("Starting Galaxy...")
                self.update_galaxy_config()
                start_command = self.galaxy_run_command("%s --daemon" %
                                                        self.extra_daemon_args)
                attempt_chown_galaxy(self.galaxy_home)
                if misc.run(start_command):
                    self.remaining_start_attempts -= 1
                elif self.remaining_start_attempts > 0:
                    log.debug(
                        "It seems Galaxy failed to start; will atempt to "
                        "auto-restart (up to {0} more time(s)).".format(
                            self.remaining_start_attempts))
                    self.state = service_states.UNSTARTED
                    self.last_state_change_time = datetime.utcnow()
                else:
                    log.debug(
                        "It seems Galaxy failed to start; setting service "
                        "state to {0}.".format(service_states.ERROR))
                    self.state = service_states.ERROR
                    self.last_state_change_time = datetime.utcnow()
            else:
                log.debug("Galaxy already running.")
        else:
            log.info("Shutting down Galaxy...")
            self.state = service_states.SHUTTING_DOWN
            stop_command = self.galaxy_run_command("%s --stop-daemon" %
                                                   self.extra_daemon_args)
            if self._is_galaxy_running():
                misc.run(stop_command)
            if not self._is_galaxy_running():
                log.debug(
                    "Galaxy not running; setting service state to SHUT_DOWN.")
                self.state = service_states.SHUT_DOWN
                self.last_state_change_time = datetime.utcnow()
                # Move all log files
                subprocess.call(
                    "bash -c 'for f in $GALAXY_HOME/{main,handler,manager,web}*.log; "
                    "do mv \"$f\" \"$f.%s\"; done'" %
                    datetime.utcnow().strftime('%H_%M'),
                    shell=True)
Example #33
0
    def _configure_sge(self):
        """
        Install and configure SGE (Sun Grid Engine) on this instance.

        Writes the SGE auto-install configuration file, runs ``inst_sge`` to
        install both the qmaster (``-m``) and an execution daemon (``-x``),
        then configures parallel environments, the ``all.q`` queue,
        login-shell environment variables, and per-user ``.sge_request``
        defaults for the ``galaxy`` and ``ubuntu`` users.

        Returns ``True`` if the ``inst_sge`` installer succeeded (the
        follow-up configuration steps are attempted but their individual
        failures are not reflected in the return value), ``False`` otherwise.
        """
        log.info("Setting up SGE...")
        SGE_config_file = '%s/galaxyEC2.conf' % self.app.path_resolver.sge_root
        with open(SGE_config_file, 'w') as f:
            print >> f, _get_sge_install_conf(
                self.app, self.app.cloud_interface.get_private_ip())
        # The installer runs as the sgeadmin user, so it must own the config
        os.chown(SGE_config_file,
                 pwd.getpwnam("sgeadmin")[2],
                 grp.getgrnam("sgeadmin")[2])
        log.debug("Created SGE install template as file '%s'" %
                  SGE_config_file)
        fix_libc()
        log.debug("Setting up SGE.")
        self._fix_util_arch()
        # -m: install qmaster; -x: install execd; -auto: unattended install
        if misc.run(
                'cd %s; ./inst_sge -m -x -auto %s' %
            (self.app.path_resolver.sge_root, SGE_config_file),
                "Setting up SGE did not go smoothly",
                "Successfully set up SGE"):
            log.debug("Successfully setup SGE; configuring SGE")
            log.debug("Adding parallel environments")
            pes = ['SGE_SMP_PE', 'SGE_MPI_PE']
            for pe in pes:
                pe_file_path = os.path.join('/tmp', pe)
                with open(pe_file_path, 'w') as f:
                    print >> f, conf_manager.load_conf_template(
                        getattr(conf_manager, pe)).safe_substitute()
                # qconf -Ap: add a parallel environment from a file
                misc.run('cd %s; ./bin/lx24-amd64/qconf -Ap %s' %
                         (self.app.path_resolver.sge_root, pe_file_path))
            log.debug("Creating queue 'all.q'")

            SGE_allq_file = '%s/all.q.conf' % self.app.path_resolver.sge_root
            all_q_template = conf_manager.load_conf_template(
                conf_manager.SGE_ALL_Q_TEMPLATE)
            if self.app.config.hadoop_enabled:
                # With Hadoop integration enabled, HDFS is started/stopped
                # around each job via the queue's prolog/epilog scripts
                all_q_params = {
                    "slots":
                    int(commands.getoutput("nproc")),
                    "prolog_path":
                    os.path.join(
                        paths.P_HADOOP_HOME,
                        paths.P_HADOOP_INTEGRATION_FOLDER + "/hdfsstart.sh"),
                    "epilog_path":
                    os.path.join(
                        paths.P_HADOOP_HOME,
                        paths.P_HADOOP_INTEGRATION_FOLDER + "/hdfsstop.sh")
                }
            else:
                # One slot per CPU core; no prolog/epilog hooks
                all_q_params = {
                    "slots": int(commands.getoutput("nproc")),
                    "prolog_path": 'NONE',
                    "epilog_path": 'NONE'
                }

            with open(SGE_allq_file, 'w') as f:
                print >> f, all_q_template.substitute(all_q_params)
            os.chown(SGE_allq_file,
                     pwd.getpwnam("sgeadmin")[2],
                     grp.getgrnam("sgeadmin")[2])
            log.debug("Created SGE all.q template as file '%s'" %
                      SGE_allq_file)
            # qconf -Mq: modify a queue from a file
            misc.run(
                'cd %s; ./bin/lx24-amd64/qconf -Mq %s' %
                (self.app.path_resolver.sge_root, SGE_allq_file),
                "Error modifying all.q", "Successfully modified all.q")
            log.debug("Configuring users' SGE profiles")
            # Make SGE available in every login shell
            misc.append_to_file(
                paths.LOGIN_SHELL_SCRIPT,
                "\nexport SGE_ROOT=%s" % self.app.path_resolver.sge_root)
            misc.append_to_file(paths.LOGIN_SHELL_SCRIPT,
                                "\n. $SGE_ROOT/default/common/settings.sh\n")
            # Write out the .sge_request file for individual users so their
            # jobs inherit the expected environment by default
            sge_request_template = conf_manager.load_conf_template(
                conf_manager.SGE_REQUEST_TEMPLATE)
            sge_request_params = {
                'psql_home': self.app.path_resolver.pg_home,
                'galaxy_tools_dir': self.app.path_resolver.galaxy_tools,
            }
            users = ['galaxy', 'ubuntu']
            for user in users:
                sge_request_file = os.path.join('/home', user, '.sge_request')
                with open(sge_request_file, 'w') as f:
                    print >> f, sge_request_template.substitute(
                        sge_request_params)
                os.chown(sge_request_file,
                         pwd.getpwnam(user)[2],
                         grp.getgrnam(user)[2])
            return True
        return False
Example #34
0
    def _add_instance_as_exec_host(self, inst_alias, inst_private_ip):
        """
        Add instance with ``inst_alias`` and ``inst_private_ip`` to the SGE
        execution host list.

        ``inst_alias`` is used only in log statements while the ``inst_private_ip``
        is the IP address (or hostname) of the given instance, which must be
        visible (i.e., accessible) to the other nodes in the clusters.

        Returns ``True`` if both the exec-host registration and the
        ``@allhosts`` group update succeeded (or were already in place),
        ``False`` if either step reported a failure.
        """
        ok = True
        # Check if host is already in the exec host list (qconf -sel shows
        # registered execution hosts)
        cmd = "export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; %s/bin/lx24-amd64/qconf -sel" \
            % (self.app.path_resolver.sge_root, self.app.path_resolver.sge_root)
        proc = subprocess.Popen(cmd,
                                shell=True,
                                stdout=subprocess.PIPE,
                                stderr=subprocess.PIPE)
        stdout, stderr = proc.communicate()
        if inst_private_ip in stdout:
            log.debug("Instance '%s' already in SGE execution host list" %
                      inst_alias)
        else:
            log.debug("Adding instance '%s' to SGE execution host list." %
                      inst_alias)
            # Create a dir to hold all of workers host configuration files
            host_conf_dir = "%s/host_confs" % self.app.path_resolver.sge_root
            if not os.path.exists(host_conf_dir):
                subprocess.call('mkdir -p %s' % host_conf_dir, shell=True)
                os.chown(host_conf_dir,
                         pwd.getpwnam("sgeadmin")[2],
                         grp.getgrnam("sgeadmin")[2])
            host_conf_file = os.path.join(host_conf_dir, str(inst_alias))
            with open(host_conf_file, 'w') as f:
                print >> f, conf_manager.load_conf_template(
                    conf_manager.SGE_HOST_CONF_TEMPLATE).substitute(
                        {'hostname': inst_private_ip})
            # qconf (running as sgeadmin) needs to read the file
            os.chown(host_conf_file,
                     pwd.getpwnam("sgeadmin")[2],
                     grp.getgrnam("sgeadmin")[2])
            log.debug("Created SGE host configuration template as file '%s'." %
                      host_conf_file)
            # Add worker instance as execution host to SGE (qconf -Ae adds an
            # execution host from a file)
            cmd = 'export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; %s/bin/lx24-amd64/qconf -Ae %s' \
                % (self.app.path_resolver.sge_root, self.app.path_resolver.sge_root, host_conf_file)
            log.debug("Add SGE exec host cmd: {0}".format(cmd))
            proc = subprocess.Popen(cmd,
                                    shell=True,
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE)
            # NOTE(review): wait() before communicate() with PIPEd streams can
            # deadlock if the child fills a pipe buffer; qconf output is
            # presumably small enough in practice — worth confirming.
            if proc.wait() == 0:
                log.debug(
                    "Successfully added instance '%s' w/ private IP '%s' as an execution host."
                    % (inst_alias, inst_private_ip))
            else:
                ok = False
                log.error(
                    "Process encountered problems adding instance '%s' as an SGE execution host. "
                    "Process returned code %s" % (inst_alias, proc.returncode))
                # Collect the child's output for debugging the failure
                stderr = stdout = None
                stdout, stderr = proc.communicate()
                log.debug(
                    " - adding instance '%s' SGE execution host stdout (private IP: %s): '%s'"
                    % (inst_alias, inst_private_ip, stdout))
                log.debug(
                    " - adding instance '%s' SGE execution host stderr (private IP: %s): '%s'"
                    % (inst_alias, inst_private_ip, stderr))

        # == Add given instance's hostname to @allhosts
        # Check if instance is already in allhosts file and do not recreate the
        # file if so.
        # Additional documentation: allhosts file can be generated by CloudMan
        # each time an instance is added or removed. The file is generated based
        # on the Instance object CloudMan keeps track of and, as a result, it
        # includes all of the instances listed. So, some instances, although they
        # have yet to go through the addition process, might have had their IPs
        # already included in the allhosts file. This approach ensures consistency
        # between SGE and CloudMan and has been working much better than trying
        # to sync the two via other methods.
        proc = subprocess.Popen(
            "export SGE_ROOT=%s; . $SGE_ROOT/default/common/settings.sh; "
            "%s/bin/lx24-amd64/qconf -shgrp @allhosts" %
            (self.app.path_resolver.sge_root, self.app.path_resolver.sge_root),
            shell=True,
            stdout=subprocess.PIPE)
        allhosts_out = proc.communicate()[0]
        if inst_private_ip not in allhosts_out:
            # Regenerate the host-group file with this IP added, then load it
            # via qconf -Mhgrp (modify host group from file)
            now = datetime.datetime.utcnow()
            ah_file = '/tmp/ah_add_' + now.strftime("%H_%M_%S")
            self._write_allhosts_file(filename=ah_file, to_add=inst_private_ip)
            if not misc.run(
                    'export SGE_ROOT=%s;. $SGE_ROOT/default/common/settings.sh; '
                    '%s/bin/lx24-amd64/qconf -Mhgrp %s' %
                (self.app.path_resolver.sge_root,
                 self.app.path_resolver.sge_root, ah_file),
                    "Problems updating @allhosts aimed at adding '%s'" %
                    inst_alias,
                    "Successfully updated @allhosts to add '%s' with address '%s'"
                    % (inst_alias, inst_private_ip)):
                ok = False
        else:
            log.debug("Instance '%s' IP is already in SGE's @allhosts" %
                      inst_alias)

        # On instance reboot, SGE might have already been configured for a given
        # instance and this method will fail along the way although the instance
        # will still operate within SGE so don't explicitly state it was added.
        if ok:
            log.debug("Successfully added instance '%s' to SGE" % inst_alias)
        return ok
Example #35
0
    def remove(self, synchronous=False):
        """
        Stop the Cloudera Manager web server.

        Best-effort: first ask the CM API to stop the managed cluster, then
        stop the ``cloudera-scm-server`` system service itself.
        """
        log.info("Stopping Cloudera Manager service")
        super(ClouderaManagerService, self).remove(synchronous)
        self.state = service_states.SHUTTING_DOWN
        # Try to stop the managed cluster via the CM API; log but do not
        # propagate any API failure so the server daemon still gets stopped.
        try:
            if self.cm_api_resource:
                self.cm_api_resource.get_cluster(self.cluster_name).stop()
        except Exception as exc:
            log.error("Exception stopping cluster {0}: {1}".format(
                self.cluster_name, exc))
        # Only mark the service as SHUT_DOWN if the daemon stop succeeded
        server_stopped = misc.run("service cloudera-scm-server stop")
        if server_stopped:
            self.state = service_states.SHUT_DOWN

    def configure_db(self):
        """
        Add the necessary tables to the default PostgreSQL server running on the
        host and prepare the necessary roles and databases.
        """
        # Update psql settings
        pg_conf = paths.P_PG_CONF
        lif = [
            "listen_addresses = '*'", "shared_buffers = 256MB",
            "wal_buffers = 8MB", "checkpoint_segments = 16",
            "checkpoint_completion_target = 0.9"
        ]
        for l in lif:
Example #36
0
 def _as_galaxy(self, cmd):
     """Run shell command ``cmd`` as the ``galaxy`` system user via su."""
     su_command = '%s - galaxy -c "%s"' % (paths.P_SU, cmd)
     return misc.run(su_command)