def __init__(self, application, **kwargs):
    """
    Add the --zone and --namespace options shared by all context-aware commands.

    :param application: the CLI application object the command registers with
    """
    self.default_namespace = os.environ.get('CGCLOUD_NAMESPACE', '/__me__/')
    self.default_zone = os.environ.get('CGCLOUD_ZONE', None)
    super(ContextCommand, self).__init__(application, **kwargs)
    # --zone is only mandatory when no default is supplied via CGCLOUD_ZONE
    self.option('--zone', '-z', metavar='ZONE',
                default=self.default_zone, dest='availability_zone',
                required=not bool(self.default_zone),
                help=heredoc("""
                    The name of the EC2 availability zone to operate in, e.g. us-east-1b,
                    us-west-1b or us-west-2c etc. This argument implies the AWS region to
                    run in. The value of the environment variable CGCLOUD_ZONE, if that
                    variable is present, determines the default."""))
    self.option('--namespace', '-n', metavar='PREFIX', default=self.default_namespace,
                help=heredoc("""
                    Optional prefix for naming EC2 resource like instances, images,
                    volumes, etc. Use this option to create a separate namespace in order
                    to avoid collisions, e.g. when running tests. The value of the
                    environment variable CGCLOUD_NAMESPACE, if that variable is present,
                    overrides the default. The string __me__ anywhere in the namespace
                    will be replaced by the name of the IAM user whose credentials are
                    used to issue requests to AWS. If the name of that IAM user contains
                    the @ character, anything after the first occurrance of that character
                    will be discarded before the substitution is done."""))
def __init__(self, application):
    """
    Add the --cluster-name and --ordinal options used to select a cluster.

    :param application: the CLI application object the command registers with
    """
    super(ClusterCommand, self).__init__(application)
    self.option('--cluster-name', '-c', metavar='NAME',
                help=heredoc("""
                    The name of the cluster to operate on. The default is to consider all
                    clusters of the given type regardless of their name, using --ordinal
                    to disambiguate. Note that the cluster name is not necessarily unique,
                    not even with a specific cluster type, there may be more than one
                    cluster of a particular name and type."""))
    self.option('--ordinal', '-o', default=-1, type=int,
                help=heredoc("""
                    Selects an individual cluster from the list of currently running
                    clusters of the given cluster type and name. Since there is one leader
                    per cluster, this is equal to the ordinal of the leader among all
                    leaders of clusters of the given type and name. The ordinal is a
                    zero-based index into the list of all clusters of the specified type
                    and name, sorted by creation time. This means that the ordinal of a
                    cluster is not fixed, it may change if another cluster of the same
                    type and name is terminated. If the ordinal is negative, it will be
                    converted to a positive ordinal by adding the number of clusters of
                    the specified type. Passing -1, for example, selects the most recently
                    created box."""))
def __init__(self, application):
    """
    Add the options controlling box creation (boot image, agent, imaging, upgrade).

    :param application: the CLI application object the command registers with
    """
    super(CreateCommand, self).__init__(application)
    self.option('--boot-image', '-i', metavar='AMI_ID',
                help=heredoc("""
                    The AMI ID of the image from which to create the box. This argument is
                    optional and the default is determined automatically based on the
                    role. Typically, this option does not need to be used."""))
    self.option('--no-agent', default=False, action='store_true',
                help=heredoc("""
                    Don't install the cghub-cloud-agent package on the box. One
                    note-worthy effect of using this option this is that the SSH keys will
                    be installed initially, but not maintained over time."""))
    self.option('--create-image', '-I', default=False, action='store_true',
                help='Create an image of the box as soon as setup completes.')
    # FIXME: Take a second look at this: Does it work. Is it necessary?
    self.option('--upgrade', '-U', default=False, action='store_true',
                help=heredoc("""
                    Bring the package repository as well as any installed packages up to
                    date, i.e. do what on Ubuntu is achieved by doing 'sudo apt-get update
                    ; sudo apt-get upgrade'."""))
def __to_hadoop_xml_config(properties):
    """
    Serialize a dict of configuration properties into Hadoop's XML config format.

    NOTE(review): the doctest's indentation of the <property> block was reconstructed
    from a whitespace-mangled original — verify it against the actual heredoc output.

    >>> print SparkBox._SparkBox__to_hadoop_xml_config( {'foo' : 'bar'} )
    <?xml version='1.0' encoding='utf-8'?>
    <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
    <configuration>
     <property>
      <name>foo</name>
      <value>bar</value>
     </property>
    </configuration>
    <BLANKLINE>
    """
    s = StringIO()
    s.write(heredoc("""
        <?xml version='1.0' encoding='utf-8'?>
        <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
        <configuration>"""))
    for name, value in properties.iteritems():
        # heredoc substitutes {name}/{value} from the loop variables and prepends
        # the given indent to every line of the fragment
        s.write(heredoc("""
            <property>
             <name>{name}</name>
             <value>{value}</value>
            </property>""", indent=' '))
    s.write("</configuration>\n")
    return s.getvalue()
def __init__(self, application):
    """
    Add the --ssh-opts option and the trailing rsync argument list.

    :param application: the CLI application object the command registers with
    """
    super(RsyncCommandMixin, self).__init__(application)
    self.option('--ssh-opts', '-e', metavar='OPTS', default=None,
                help=heredoc("""
                    Additional options to pass to ssh. Note that if OPTS starts with a
                    dash you must use the long option followed by an equal sign. For
                    example, to run ssh in verbose mode, use --ssh-opt=-v. If OPTS is to
                    include spaces, it must be quoted to prevent the shell from breaking
                    it up. So to run ssh in verbose mode and log to syslog, you would use
                    --ssh-opt='-v -y'."""))
    # REMAINDER swallows everything after this point, letting rsync flags pass through
    self.option('args', metavar='...', nargs=argparse.REMAINDER, default=[],
                help=heredoc("""
                    Command line options for rsync(1). The remote path argument must be
                    prefixed with a colon. For example, 'cgcloud.py rsync foo -av :bar .'
                    would copy the file 'bar' from the home directory of the admin user on
                    the box 'foo' to the current directory on the local machine."""))
def __init__(self, application, **kwargs):
    """
    Add the positional key-file argument and the --force/--keypair options.

    :param application: the CLI application object the command registers with
    """
    super(RegisterKeyCommand, self).__init__(application, **kwargs)
    self.option('ssh_public_key', metavar='KEY_FILE',
                help=heredoc("""
                    Path of file containing the SSH public key to upload to the EC2
                    keypair."""))
    self.option('--force', '-F', default=False, action='store_true',
                help='Overwrite potentially existing EC2 key pair')
    self.option('--keypair', '-k', metavar='NAME',
                dest='ec2_keypair_name', default='__me__',
                help=heredoc("""
                    The desired name of the EC2 key pair. The name should associate the
                    key with you in a way that it is obvious to other users in your
                    organization. The string __me__ anywhere in the key pair name will be
                    replaced with the name of the IAM user whose credentials are used to
                    issue requests to AWS."""))
def __init__( self, application, **kwargs ):
    """
    Add the --zone and --namespace options shared by all context-aware commands.

    :param application: the CLI application object the command registers with
    """
    self.default_namespace = os.environ.get( 'CGCLOUD_NAMESPACE', '/__me__/' )
    self.default_zone = os.environ.get( 'CGCLOUD_ZONE', None )
    super( ContextCommand, self ).__init__( application, **kwargs )
    # --zone is only mandatory when no default is supplied via CGCLOUD_ZONE
    self.option( '--zone', '-z', metavar='ZONE',
                 default=self.default_zone, dest='availability_zone',
                 required=not bool( self.default_zone ),
                 help=heredoc( """
                     The name of the EC2 availability zone to operate in, e.g. us-east-1b,
                     us-west-1b or us-west-2c etc. This argument implies the AWS region to
                     run in. The value of the environment variable CGCLOUD_ZONE, if that
                     variable is present, determines the default.""" ) )
    self.option( '--namespace', '-n', metavar='PREFIX', default=self.default_namespace,
                 help=heredoc( """
                     Optional prefix for naming EC2 resource like instances, images,
                     volumes, etc. Use this option to create a separate namespace in order
                     to avoid collisions, e.g. when running tests. A namespace begins with
                     a slash, followed by zero or more names, each name followed by a
                     slash. Note that this implies that the namespace begins and ends with
                     a slash. Each name must begin with a a digit or lowercase letter
                     followed by zero or more digits, lowercase letters, periods,
                     underscores or dashes. The value of the environment variable
                     CGCLOUD_NAMESPACE, if that variable is present, overrides the
                     default. The string __me__ anywhere in the namespace will be replaced
                     by the name of the IAM user whose credentials are used to issue
                     requests to AWS. If the name of that IAM user contains the @
                     character, anything after the first occurrance of that character will
                     be discarded before the substitution is done.""" ) )
def __init__(self, application, **kwargs):
    """
    Add the box-selection options: --cluster-name plus a mutually exclusive
    choice between --ordinal and --instance-id.

    :param application: the CLI application object the command registers with
    """
    super(InstanceCommand, self).__init__(application, **kwargs)
    self.option('--cluster-name', '-c', metavar='NAME',
                help=heredoc("""
                    This option can be used to restrict the selection to boxes that are
                    part of a cluster of the given name. Boxes that are not part of a
                    cluster use their own instance id as the cluster name."""))
    # --ordinal and --instance-id are alternative selectors, hence the mutex group
    self.begin_mutex()
    self.option('--ordinal', '-o', default=-1, type=int,
                help=heredoc("""
                    Selects an individual box from the list of boxes performing the
                    specified role in a cluster of the given name. The ordinal is a
                    zero-based index into the list of all boxes performing the specified
                    role, sorted by creation time. This means that the ordinal of a box is
                    not fixed, it may change if another box performing the specified role
                    is terminated. If the ordinal is negative, it will be converted to a
                    positive ordinal by adding the number of boxes performing the
                    specified role. Passing -1, for example, selects the most recently
                    created box."""))
    self.option('--instance-id', '-I', default=None, type=str,
                help=heredoc("""
                    Selects an individual instance. When combined with --cluster-name, the
                    specified instance needs to belong to a cluster of the specified name
                    or an error will be raised."""))
    self.end_mutex()
def __to_hadoop_xml_config( properties ):
    """
    Serialize a dict of configuration properties into Hadoop's XML config format.

    NOTE(review): the doctest's indentation of the <property> block was reconstructed
    from a whitespace-mangled original — verify it against the actual heredoc output.

    >>> print SparkBox._SparkBox__to_hadoop_xml_config( {'foo' : 'bar'} )
    <?xml version='1.0' encoding='utf-8'?>
    <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
    <configuration>
     <property>
      <name>foo</name>
      <value>bar</value>
     </property>
    </configuration>
    <BLANKLINE>
    """
    s = StringIO( )
    s.write( heredoc( """
        <?xml version='1.0' encoding='utf-8'?>
        <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
        <configuration>""" ) )
    for name, value in properties.iteritems( ):
        # heredoc substitutes {name}/{value} from the loop variables and prepends
        # the given indent to every line of the fragment
        s.write( heredoc( """
            <property>
             <name>{name}</name>
             <value>{value}</value>
            </property>""", indent=' ' ) )
    s.write( "</configuration>\n" )
    return s.getvalue( )
def __init__( self, application ):
    """
    Add the options controlling cluster creation (name, size, EBS volume,
    leader instance type, shared files and ssh options for rsync).

    :param application: the CLI application object the command registers with
    """
    super( CreateClusterCommand, self ).__init__( application )
    self.cluster = None
    self.option( '--cluster-name', '-c', metavar='NAME',
                 help=heredoc( """
                     A name for the new cluster. If absent, the instance ID of the master
                     will be used. Cluster names do not need to be unique, but they should
                     be in order to avoid user error.""" ) )
    self.option( '--num-workers', '-s', metavar='NUM', type=int, default=1,
                 help='The number of workers to launch.' )
    self.option( '--ebs-volume-size', '-e', metavar='GB',
                 help=heredoc( """
                     The size in GB of an EBS volume to be attached to each node for
                     persistent data. The volume will be mounted at /mnt/persistent.""" ) )
    self.option( '--leader-on-demand', '-D', default=False, action='store_true',
                 help=heredoc( """
                     Use this option to insure that the leader will be an on-demand
                     instance, even if --spot-bid is given.""" ) )
    self.option( '--share', '-S', metavar='PATH', default=None, dest='share_path',
                 help=heredoc( """
                     The path to a local file or directory for distribution to the
                     cluster. The given file or directory (or the contents of the given
                     directory, if the path ends in a slash) will be placed in the default
                     user's ~/shared directory on each node.""" ) )
    self.option( '--ssh-opts', metavar='OPTS', default=None,
                 help=heredoc( """
                     Additional options to pass to ssh when uploading the files shared via
                     rsync. For more detail refer to cgcloud rsync --help""" ) )
def __init__(self, application):
    """
    Add the options controlling cluster creation (name, size, EBS volume,
    leader instance type, shared files and ssh options for rsync).

    :param application: the CLI application object the command registers with
    """
    super(CreateClusterCommand, self).__init__(application)
    self.cluster = None
    self.option('--cluster-name', '-c', metavar='NAME',
                help=heredoc("""
                    A name for the new cluster. If absent, the instance ID of the master
                    will be used. Cluster names do not need to be unique, but they should
                    be in order to avoid user error."""))
    self.option('--num-workers', '-s', metavar='NUM', type=int, default=1,
                dest='num_workers',
                help='The number of workers to launch.')
    self.option('--ebs-volume-size', '-e', metavar='GB',
                help=heredoc("""
                    The size in GB of an EBS volume to be attached to each node for
                    persistent data. The volume will be mounted at /mnt/persistent."""))
    self.option('--leader-on-demand', '-D', dest='leader_on_demand',
                default=False, action='store_true',
                help=heredoc("""
                    Use this option to insure that the leader will be an on-demand
                    instance, even if --spot-bid is given."""))
    self.option('--share', '-S', metavar='PATH', default=None, dest='share_path',
                help=heredoc("""
                    The path to a local file or directory for distribution to the cluster.
                    The given file or directory (or the contents of the given directory,
                    if the path ends in a slash) will be placed in the default user's
                    ~/shared directory on each node."""))
    self.option('--ssh-opts', metavar='OPTS', default=None,
                help=heredoc("""
                    Additional options to pass to ssh when uploading the files shared via
                    rsync. For more detail refer to cgcloud rsync --help"""))
def __install_sparkbox_tools( self ):
    """
    Installs the spark-master-discovery init script and its companion spark-tools. The
    latter is a Python package distribution that's included in cgcloud-spark as a
    resource. This is in contrast to the cgcloud agent, which is a standalone
    distribution.
    """
    tools_dir = install_dir + '/tools'
    admin = self.admin_account( )
    sudo( fmt( 'mkdir -p {tools_dir} {persistent_dir} {ephemeral_dir}' ) )
    sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
    # Install a virtualenv for the tools, pinning pip for reproducibility
    run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
    run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )
    spark_tools_artifacts = ' '.join( self._project_artifacts( 'spark-tools' ) )
    with settings( forward_agent=True ):
        run( fmt( '{tools_dir}/bin/pip install {spark_tools_artifacts}' ), pty=False )
    sudo( fmt( 'chown -R root:root {tools_dir}' ) )
    # Literal Python expression embedded into the generated scripts below
    spark_tools = "SparkTools(**%r)" % dict( user=user,
                                             install_dir=install_dir,
                                             ephemeral_dir=ephemeral_dir,
                                             persistent_dir=persistent_dir,
                                             lazy_dirs=self.lazy_dirs )
    # FIX: the pre-start shell heredoc was missing its closing END terminator,
    # which would have made upstart treat the rest of the job as heredoc input.
    self._register_init_script(
        "sparkbox",
        heredoc( """
            description "Spark/HDFS master discovery"
            console log
            start on runlevel [2345]
            stop on runlevel [016]
            pre-start script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.spark_tools import SparkTools
            spark_tools = {spark_tools}
            spark_tools.start()
            END
            end script
            post-stop script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.spark_tools import SparkTools
            spark_tools = {spark_tools}
            spark_tools.stop()
            END
            end script""" ) )
    script_path = "/usr/local/bin/sparkbox-manage-slaves"
    put( remote_path=script_path, use_sudo=True, local_path=StringIO( heredoc( """
        #!{tools_dir}/bin/python2.7
        import sys
        import logging
        logging.basicConfig( level=logging.INFO )
        from cgcloud.spark_tools import SparkTools
        spark_tools = {spark_tools}
        spark_tools.manage_slaves( slaves_to_add=sys.argv[1:] )""" ) ) )
    sudo( fmt( "chown root:root {script_path} && chmod 755 {script_path}" ) )
def __init__( self, application ):
    """
    Add the mutually exclusive --keep-snapshot and --quick options.

    :param application: the CLI application object the command registers with
    """
    super( DeleteImageCommand, self ).__init__( application )
    # --quick implies --keep-snapshot, so passing both would be redundant/conflicting
    self.begin_mutex( )
    self.option( '--keep-snapshot', '-K', default=False, action='store_true',
                 help=heredoc( """
                     Do not delete the EBS volume snapshot associated with the given
                     image. This will leave an orphaned snapshot which should be removed
                     at a later time using the 'cgcloud cleanup' command.""" ) )
    self.option( '--quick', '-Q', default=False, action='store_true',
                 help=heredoc( """
                     Exit immediately after deregistration request has been made, don't
                     wait until the image is deregistered. Implies --keep-snapshot.""" ) )
    self.end_mutex( )
def __init__( self, application, **kwargs ):
    """
    Add the mutually exclusive --login and --admin options selecting the remote user.

    :param application: the CLI application object the command registers with
    """
    super( UserCommandMixin, self ).__init__( application, **kwargs )
    # -l/--login names an explicit user while -a/--admin forces the admin account,
    # so the two must not be combined
    self.begin_mutex( )
    self.option( '--login', '-l', default=None, metavar='USER', dest='user',
                 help=heredoc( """
                     Name of user to login as. The default depends on the role, for most
                     roles the default is the administrative user. Roles that define a
                     second less privileged application user will default to that user.
                     Can't be used together with -a, --admin.""" ) )
    self.option( '--admin', '-a', default=False, action='store_true',
                 help=heredoc( """
                     Force logging in as the administrative user. Can't be used together
                     with -l, --login.""" ) )
    self.end_mutex( )
def __init__( self, application, **kwargs ):
    """
    Add the positional key-file argument and the --force/--keypair options.

    :param application: the CLI application object the command registers with
    """
    super( RegisterKeyCommand, self ).__init__( application, **kwargs )
    self.option( 'ssh_public_key', metavar='KEY_FILE',
                 help=heredoc( """
                     Path of file containing the SSH public key to upload to the EC2
                     keypair.""" ) )
    self.option( '--force', '-F', default=False, action='store_true',
                 help='Overwrite potentially existing EC2 key pair' )
    self.option( '--keypair', '-k', metavar='NAME',
                 dest='ec2_keypair_name', default='__me__',
                 help=heredoc( """
                     The desired name of the EC2 key pair. The name should associate the
                     key with you in a way that it is obvious to other users in your
                     organization. The string __me__ anywhere in the key pair name will be
                     replaced with the name of the IAM user whose credentials are used to
                     issue requests to AWS.""" ) )
def __init__( self, application ):
    """
    Add the --ssh-opts option and the trailing rsync argument list.

    :param application: the CLI application object the command registers with
    """
    super( RsyncCommandMixin, self ).__init__( application )
    self.option( '--ssh-opts', '-e', metavar='OPTS', default=None,
                 help=heredoc( """
                     Additional options to pass to ssh. Note that if OPTS starts with a
                     dash you must use the long option followed by an equal sign. For
                     example, to run ssh in verbose mode, use --ssh-opt=-v. If OPTS is to
                     include spaces, it must be quoted to prevent the shell from breaking
                     it up. So to run ssh in verbose mode and log to syslog, you would use
                     --ssh-opt='-v -y'.""" ) )
    # REMAINDER swallows everything after this point, letting rsync flags pass through
    self.option( 'args', metavar='...', nargs=argparse.REMAINDER, default=[ ],
                 help=heredoc( """
                     Command line options for rsync(1). The remote path argument must be
                     prefixed with a colon. For example, 'cgcloud.py rsync foo -av :bar .'
                     would copy the file 'bar' from the home directory of the admin user
                     on the box 'foo' to the current directory on the local machine.""" ) )
def __init__( self, application, **kwargs ):
    """
    Add the box-selection options --cluster-name and --ordinal.

    :param application: the CLI application object the command registers with
    """
    super( InstanceCommand, self ).__init__( application, **kwargs )
    self.option( '--cluster-name', '-c', metavar='NAME',
                 help=heredoc( """
                     This option can be used to restrict the selection to boxes that are
                     part of a cluster of the given name. Boxes that are not part of a
                     cluster use their own instance id as the cluster name.""" ) )
    self.option( '--ordinal', '-o', default=-1, type=int,
                 help=heredoc( """
                     Selects an individual box from the list of boxes performing the
                     specified role in a cluster of the given name. The ordinal is a
                     zero-based index into the list of all boxes performing the specified
                     role, sorted by creation time. This means that the ordinal of a box
                     is not fixed, it may change if another box performing the specified
                     role is terminated. If the ordinal is negative, it will be converted
                     to a positive ordinal by adding the number of boxes performing the
                     specified role. Passing -1, for example, selects the most recently
                     created box.""" ) )
def _word_count( self ):
    """
    Run a simple PySpark word-count job on the cluster and verify its output in HDFS.

    NOTE(review): 'master' and 'num_slaves' are free names here — presumably
    module-level constants or fixtures defined elsewhere in this file; confirm.
    """
    # Clean up artifacts of a previous run, both in HDFS and the home directory
    self._ssh( master, 'hdfs dfs -rm -r -f -skipTrash /test.txt /test.txt.counts' )
    self._ssh( master, 'rm -rf test.txt test.txt.counts' )
    self._ssh( master, 'curl -o test.txt https://www.apache.org/licenses/LICENSE-2.0.txt' )
    self._ssh( master, 'hdfs dfs -put -f test.txt /' )
    script, script_path = mkstemp( )
    try:
        script = os.fdopen( script, 'w' )
        script.write( heredoc( """
            import sys
            from pyspark import SparkContext
            sc = SparkContext(appName="PythonPi")
            file = sc.textFile( "/test.txt" )
            counts = ( file
                .flatMap( lambda line: line.split( " " ) )
                .map( lambda word: (word, 1) )
                .reduceByKey( lambda a, b: a + b ) )
            counts.saveAsTextFile( "/test.txt.counts" )""" ) )
        script.close( )
        self._rsync( master, script_path, ':wordcount.py' )
    except:
        # NOTE(review): if os.fdopen itself raised, 'script' is still the raw fd
        # and .close() would fail, masking the original error — confirm intent.
        script.close( )
        raise
    finally:
        os.unlink( script_path )
    self._ssh( master, 'spark-submit --executor-memory 512m wordcount.py' )
    self._ssh( master, 'hdfs dfs -get /test.txt.counts' )
    self._ssh( master, 'test -f test.txt.counts/_SUCCESS' )
    # One output partition per slave is expected; all must be non-empty
    for i in xrange( num_slaves ):
        self._ssh( master, 'test -s test.txt.counts/part-%05d' % i )
def __init__( self, application ):
    """
    Add the --quick option to skip waiting for the recreated box.

    :param application: the CLI application object the command registers with
    """
    super( RecreateCommand, self ).__init__( application )
    self.option( '--quick', '-Q', default=False, action='store_true',
                 help=heredoc( """
                     Don't wait for the box to become running or reachable via SSH. If the
                     agent is disabled in the boot image (this is uncommon, see the
                     --no-agent option to the 'create' command), no additional SSH
                     keypairs will be deployed.""" ) )
def _setup_docker(self):
    """
    Grant docker access to the configured users and install the 'dockerbox' upstart
    job that bind-mounts /var/lib/docker onto the first available data volume.
    """
    for docker_user in set(self._docker_users()):
        sudo("usermod -aG docker " + docker_user)
    prefixes = self._docker_data_prefixes()
    if prefixes:
        prefixes = " ".join(map(quote, prefixes))
        self._run_init_script("docker", "stop")
        # Make sure Docker's aufs backend isn't mounted anymore
        sudo("umount /var/lib/docker/aufs", warn_only=True)
        # Backup initial state of data directory so we can initialize an empty ephemeral volume
        sudo("tar -czC /var/lib docker > /var/lib/docker.tar.gz")
        # Then delete it and recreate it as an empty directory to serve as the bind mount point
        sudo("rm -rf /var/lib/docker && mkdir /var/lib/docker")
        self._register_init_script(
            "dockerbox",
            heredoc(
                """
                description "Placement of /var/lib/docker"
                console log
                start on starting docker
                stop on stopped docker
                pre-start script
                    echo
                    echo "This is the dockerbox pre-start script"
                    set -ex
                    if mountpoint -q /var/lib/docker; then
                        echo "The directory '/var/lib/docker' is already mounted, exiting."
                    else
                        for prefix in {prefixes}; do
                            # Prefix must refer to a separate volume, e.g. ephemeral or EBS
                            if mountpoint -q "$prefix"; then
                                # Make sure Docker's aufs backend isn't mounted anymore
                                umount /var/lib/docker/aufs || true
                                if test -d "$prefix/var/lib/docker"; then
                                    echo "The directory '$prefix/var/lib/docker' already exists, using it."
                                else
                                    mkdir -p "$prefix/var/lib"
                                    # If /var/lib/docker contains files ...
                                    if python -c 'import os, sys; sys.exit( 0 if os.listdir( sys.argv[1] ) else 1 )' /var/lib/docker; then
                                        # ... move it to prefix ...
                                        mv /var/lib/docker "$prefix/var/lib"
                                        # ... and recreate it as an empty mount point, ...
                                        mkdir -p /var/lib/docker
                                    else
                                        # ... otherwise untar the initial backup.
                                        tar -xzC "$prefix/var/lib" < /var/lib/docker.tar.gz
                                    fi
                                fi
                                # Now bind-mount into /var/lib/docker
                                mount --bind "$prefix/var/lib/docker" /var/lib/docker
                                break
                            else
                                echo "The prefix directory '$prefix' is not a mount point, skipping."
                            fi
                        done
                    fi
                end script"""
            ),
        )
        self._run_init_script("docker", "start")
def __setup_ssh_config(self):
    """
    Append a spark-master host entry to the system-wide SSH client configuration,
    relaxing host-key checks for that alias.
    """
    with remote_open('/etc/ssh/ssh_config', use_sudo=True) as f:
        f.write(heredoc("""
            Host spark-master
                CheckHostIP no
                HashKnownHosts no"""))
def __register_systemd_jobs( self, service_map ):
    """
    Upload a start script for every service and register a systemd unit for it.

    :param service_map: mapping from node type to an iterable of service
        descriptors (each exposing init_name, command, description and user)
    """
    for node_type, services in service_map.iteritems( ):
        for service in services:
            # Wrap the service command in an executable shell script owned by root
            service_command_path = '/usr/sbin/%s-start.sh' % service.init_name
            put( local_path=StringIO( "#!/bin/sh\n" + service.command ),
                 remote_path=service_command_path,
                 use_sudo=True )
            sudo( "chown root:root '%s'" % service_command_path )
            sudo( "chmod +x '%s'" % service_command_path )
            self._register_init_script(
                service.init_name,
                heredoc( """
                    [Unit]
                    Description={service.description}
                    Before=docker.service
                    Wants=docker.service
                    Requires=mesosbox.service
                    After=mesosbox.service
                    [Service]
                    Type=simple
                    ExecStart={service_command_path}
                    User={service.user}
                    Group={service.user}
                    Environment="USER={user}"
                    LimitNOFILE=8000:8192
                    UMask=022
                    [Install]
                    WantedBy=multi-user.target
                    """ ) )
def __init__( self, application ):
    """
    Add the --parallel option for running the command on all workers at once.

    :param application: the CLI application object the command registers with
    """
    super( SshClusterCommand, self ).__init__( application )
    self.option( '--parallel', '-P', default=False, action='store_true',
                 help=heredoc( """
                     Run command on the workers in parallel. Note that this doesn't work
                     if SSH or the command itself prompts for input. This will likely be
                     the case on the first connection attempt when SSH typically prompts
                     for confirmation of the host key. An insecure work-around is to pass
                     "-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no".""" ) )
def __install_tools( self ):
    """
    Installs the mesos-master-discovery init script and its companion mesos-tools. The
    latter is a Python package distribution that's included in cgcloud-mesos as a
    resource. This is in contrast to the cgcloud agent, which is a standalone
    distribution.
    """
    tools_dir = install_dir + '/tools'
    admin = self.admin_account( )
    sudo( fmt( 'mkdir -p {tools_dir}' ) )
    sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
    # Install a virtualenv for the tools, pinning pip for reproducibility
    run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
    run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )
    with settings( forward_agent=True ):
        with self._project_artifacts( 'mesos-tools' ) as artifacts:
            pip( use_sudo=True,
                 path=tools_dir + '/bin/pip',
                 args=concat( 'install', artifacts ) )
    sudo( fmt( 'chown -R root:root {tools_dir}' ) )
    # Literal Python expression embedded into the generated upstart job below
    mesos_tools = "MesosTools(**%r)" % dict( user=user,
                                             shared_dir=self._shared_dir( ),
                                             ephemeral_dir=ephemeral_dir,
                                             persistent_dir=persistent_dir,
                                             lazy_dirs=self.lazy_dirs )
    self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it
    self._register_init_script(
        "mesosbox",
        heredoc( """
            description "Mesos master discovery"
            console log
            start on (local-filesystems and net-device-up IFACE!=lo)
            stop on runlevel [!2345]
            pre-start script
            for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.start()
            END
            then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
            end script
            post-stop script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.stop()
            END
            end script""" ) )
    # Explicitly start the mesosbox service to achieve creation of lazy directoriess right
    # now. This makes a generic mesosbox useful for adhoc tests that involve Mesos and Toil.
    self._run_init_script( 'mesosbox' )
def _setup_docker(self):
    """
    Grant docker access to the configured users and install the 'dockerbox' upstart
    job that bind-mounts /var/lib/docker onto the first available data volume.
    """
    for docker_user in set(self._docker_users()):
        sudo("usermod -aG docker " + docker_user)
    prefixes = self._docker_data_prefixes()
    if prefixes:
        prefixes = ' '.join(map(quote, prefixes))
        self._run_init_script('docker', 'stop')
        # Make sure Docker's aufs backend isn't mounted anymore
        sudo('umount /var/lib/docker/aufs', warn_only=True)
        # Backup initial state of data directory so we can initialize an empty ephemeral volume
        sudo('tar -czC /var/lib docker > /var/lib/docker.tar.gz')
        # Then delete it and recreate it as an empty directory to serve as the bind mount point
        sudo('rm -rf /var/lib/docker && mkdir /var/lib/docker')
        self._register_init_script(
            'dockerbox',
            heredoc("""
                description "Placement of /var/lib/docker"
                console log
                start on starting docker
                stop on stopped docker
                pre-start script
                    echo
                    echo "This is the dockerbox pre-start script"
                    set -ex
                    if mountpoint -q /var/lib/docker; then
                        echo "The directory '/var/lib/docker' is already mounted, exiting."
                    else
                        for prefix in {prefixes}; do
                            # Prefix must refer to a separate volume, e.g. ephemeral or EBS
                            if mountpoint -q "$prefix"; then
                                # Make sure Docker's aufs backend isn't mounted anymore
                                umount /var/lib/docker/aufs || true
                                if test -d "$prefix/var/lib/docker"; then
                                    echo "The directory '$prefix/var/lib/docker' already exists, using it."
                                else
                                    mkdir -p "$prefix/var/lib"
                                    # If /var/lib/docker contains files ...
                                    if python -c 'import os, sys; sys.exit( 0 if os.listdir( sys.argv[1] ) else 1 )' /var/lib/docker; then
                                        # ... move it to prefix ...
                                        mv /var/lib/docker "$prefix/var/lib"
                                        # ... and recreate it as an empty mount point, ...
                                        mkdir -p /var/lib/docker
                                    else
                                        # ... otherwise untar the initial backup.
                                        tar -xzC "$prefix/var/lib" < /var/lib/docker.tar.gz
                                    fi
                                fi
                                # Now bind-mount into /var/lib/docker
                                mount --bind "$prefix/var/lib/docker" /var/lib/docker
                                break
                            else
                                echo "The prefix directory '$prefix' is not a mount point, skipping."
                            fi
                        done
                    fi
                end script"""))
        self._run_init_script('docker', 'start')
def __install_tools(self):
    """
    Installs the mesos-master-discovery init script and its companion mesos-tools. The
    latter is a Python package distribution that's included in cgcloud-mesos as a
    resource. This is in contrast to the cgcloud agent, which is a standalone
    distribution.
    """
    tools_dir = install_dir + '/tools'
    admin = self.admin_account()
    sudo(fmt('mkdir -p {tools_dir}'))
    sudo(fmt('chown {admin}:{admin} {tools_dir}'))
    # Install a virtualenv for the tools, pinning pip for reproducibility
    run(fmt('virtualenv --no-pip {tools_dir}'))
    run(fmt('{tools_dir}/bin/easy_install pip==1.5.2'))
    with settings(forward_agent=True):
        with self._project_artifacts('mesos-tools') as artifacts:
            pip(use_sudo=True,
                path=tools_dir + '/bin/pip',
                args=concat('install', artifacts))
    sudo(fmt('chown -R root:root {tools_dir}'))
    # Literal Python expression embedded into the generated upstart job below
    mesos_tools = "MesosTools(**%r)" % dict(user=user,
                                            shared_dir=self._shared_dir(),
                                            ephemeral_dir=ephemeral_dir,
                                            persistent_dir=persistent_dir,
                                            lazy_dirs=self.lazy_dirs)
    self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it
    self._register_init_script(
        "mesosbox",
        heredoc("""
            description "Mesos master discovery"
            console log
            start on (local-filesystems and net-device-up IFACE!=lo)
            stop on runlevel [!2345]
            pre-start script
            for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.start()
            END
            then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
            end script
            post-stop script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.stop()
            END
            end script"""))
    # Explicitly start the mesosbox service to achieve creation of lazy directoriess right
    # now. This makes a generic mesosbox useful for adhoc tests that involve Mesos and Toil.
    self._run_init_script('mesosbox')
def __init__(self, application, **kwargs):
    """
    Add the positional 'role' argument with shell completion support.

    :param application: the CLI application object the command registers with
    """
    super(RoleCommand, self).__init__(application, **kwargs)
    self.option('role', metavar='ROLE', completer=self.completer,
                help=heredoc("""
                    The name of the role. Use the list-roles command to show all available
                    roles."""))
def __init__(self, application, **kwargs):
    """
    Add the --quick option to skip waiting for termination to complete.

    :param application: the CLI application object the command registers with
    """
    super(TerminateCommand, self).__init__(application, **kwargs)
    self.option('--quick', '-Q', default=False, action='store_true',
                help=heredoc("""
                    Exit immediately after termination request has been made, don't wait
                    until the box is terminated."""))
def __init__( self, application ):
    """
    Add the options controlling box creation (boot image, agent, imaging, upgrade).

    :param application: the CLI application object the command registers with
    """
    super( CreateCommand, self ).__init__( application )
    self.option( '--boot-image', '-i', metavar='AMI_ID',
                 help=heredoc( """
                     The AMI ID of the image from which to create the box. This argument
                     is optional and the default is determined automatically based on the
                     role. Typically, this option does not need to be used.""" ) )
    self.option( '--no-agent', default=False, action='store_true',
                 help=heredoc( """
                     Don't install the cghub-cloud-agent package on the box. One
                     note-worthy effect of using this option this is that the SSH keys
                     will be installed initially, but not maintained over time.""" ) )
    self.option( '--create-image', '-I', default=False, action='store_true',
                 help='Create an image of the box as soon as setup completes.' )
    # FIXME: Take a second look at this: Does it work. Is it necessary?
    self.option( '--upgrade', '-U', default=False, action='store_true',
                 help=heredoc( """
                     Bring the package repository as well as any installed packages up to
                     date, i.e. do what on Ubuntu is achieved by doing 'sudo apt-get
                     update ; sudo apt-get upgrade'.""" ) )
def __init__( self, application ):
    """
    Add the image-selection option (ordinal or AMI ID), using the option names
    and value parser supplied by the subclass.

    :param application: the CLI application object the command registers with
    """
    super( ImageReferenceCommand, self ).__init__( application )
    self.option( self.long_image_option, self.short_image_option, metavar='IMAGE',
                 type=self.ordinal_or_ami_id, default=-1,  # default to the last one
                 help=heredoc( """
                     An image ordinal, i.e. the index of an image in the list of images
                     for the given role, sorted by creation time. Use the list-images
                     command to print a list of images for a given role. If the ordinal is
                     negative, it will be converted to a positive ordinal by adding the
                     total number of images for this role. Passing -1, for example,
                     selects the most recently created image. Alternatively, an AMI ID,
                     e.g. 'ami-4dcced7d' can be passed in as well.""" ) )
def __add_per_boot_script( self ): """ Ensure that the cloud-init.done file is always created, even on 2nd boot and there-after. On the first boot of an instance, the .done file creation is preformed by the runcmd stanza in cloud-config. On subsequent boots this per-boot script takes over (runcmd is skipped on those boots). """ put( remote_path=self._cloudinit_boot_script( 'done' ), mode=0755, use_sudo=True, local_path=StringIO( heredoc( """ #!/bin/sh touch /tmp/cloud-init.done""" ) ) )
def __init__(self, application):
    """Register the options shared by all commands that operate on a cluster."""
    super(ClusterCommand, self).__init__(application)
    name_help = heredoc(
        """The name of the cluster to operate on. The default is to consider all clusters of
        the given type regardless of their name, using --ordinal to disambiguate. Note that
        the cluster name is not necessarily unique, not even with a specific cluster type,
        there may be more than one cluster of a particular name and type.""")
    self.option('--cluster-name', '-c', metavar='NAME', help=name_help)
    ordinal_help = heredoc(
        """Selects an individual cluster from the list of currently running clusters of the
        given cluster type and name. Since there is one leader per cluster, this is equal to
        the ordinal of the leader among all leaders of clusters of the given type and name.
        The ordinal is a zero-based index into the list of all clusters of the specified type
        and name, sorted by creation time. This means that the ordinal of a cluster is not
        fixed, it may change if another cluster of the same type and name is terminated. If
        the ordinal is negative, it will be converted to a positive ordinal by adding the
        number of clusters of the specified type. Passing -1, for example, selects the most
        recently created box.""")
    self.option('--ordinal', '-o', default=-1, type=int, help=ordinal_help)
def __init__(self, application):
    """Add the positional argument that collects everything to be forwarded to ssh."""
    super(SshCommandMixin, self).__init__(application)
    command_help = heredoc(
        """Additional arguments to pass to ssh. This can be anything that one would normally
        pass to the ssh program excluding user name and host but including, for example, the
        remote command to execute.""")
    # REMAINDER swallows all remaining arguments, including ones that look like options.
    self.option('command', metavar='...', nargs=argparse.REMAINDER, default=[],
                help=command_help)
def __init__(self, application):
    """Register the options understood by the 'recreate' command."""
    super(RecreateCommand, self).__init__(application)
    quick_help = heredoc(
        """Don't wait for the box to become running or reachable via SSH. If the agent is
        disabled in the boot image (this is uncommon, see the --no-agent option to the
        'create' command), no additional SSH keypairs will be deployed.""")
    self.option('--quick', '-Q', default=False, action='store_true', help=quick_help)
def _docker_patch_heredoc( self ): return heredoc( """ --- docker.service.orig 2017-04-12 20:45:15.899906518 +0000 +++ docker.service 2017-04-12 20:42:57.186495824 +0000 @@ -3,6 +3,8 @@ Documentation=https://docs.docker.com After=network-online.target docker.socket firewalld.service Wants=network-online.target +After=mesosbox.service +Requires=mesosbox.service Requires=docker.socket [Service]""" )
def __init__(self, application):
    """Register the options of the 'delete-image' command."""
    super(DeleteImageCommand, self).__init__(application)
    # The two options below form a mutually exclusive group.
    self.begin_mutex()
    keep_snapshot_help = heredoc(
        """Do not delete the EBS volume snapshot associated with the given image. This will
        leave an orphaned snapshot which should be removed at a later time using the
        'cgcloud cleanup' command.""")
    self.option('--keep-snapshot', '-K', default=False, action='store_true',
                help=keep_snapshot_help)
    quick_help = heredoc(
        """Exit immediately after deregistration request has been made, don't wait until the
        image is deregistered. Implies --keep-snapshot.""")
    self.option('--quick', '-Q', default=False, action='store_true', help=quick_help)
    self.end_mutex()
def __add_per_boot_script(self): """ Ensure that the cloud-init.done file is always created, even on 2nd boot and thereafter. On the first boot of an instance, the .done file creation is preformed by the runcmd stanza in cloud-config. On subsequent boots this per-boot script takes over (runcmd is skipped on those boots). """ put(remote_path=self._cloudinit_boot_script('done'), mode=0755, use_sudo=True, local_path=StringIO( heredoc(""" #!/bin/sh touch /tmp/cloud-init.done""")))
def __init__(self, application):
    """Register the options of the cluster-wide ssh command."""
    super(SshClusterCommand, self).__init__(application)
    parallel_help = heredoc(
        """Run command on the workers in parallel. Note that this doesn't work if SSH or the
        command itself prompts for input. This will likely be the case on the first
        connection attempt when SSH typically prompts for confirmation of the host key. An
        insecure work-around is to pass "-o UserKnownHostsFile=/dev/null -o
        StrictHostKeyChecking=no".""")
    self.option('--parallel', '-P', default=False, action='store_true', help=parallel_help)
def __init__(self, application, **kwargs):
    """Register the user-selection options shared by commands that log into a box."""
    super(UserCommandMixin, self).__init__(application, **kwargs)
    # --login and --admin are mutually exclusive.
    self.begin_mutex()
    login_help = heredoc(
        """Name of user to login as. The default depends on the role, for most roles the
        default is the administrative user. Roles that define a second less privileged
        application user will default to that user. Can't be used together with -a,
        --admin.""")
    self.option('--login', '-l', default=None, metavar='USER', dest='user', help=login_help)
    admin_help = heredoc(
        """Force logging in as the administrative user. Can't be used together with -l,
        --login.""")
    self.option('--admin', '-a', default=False, action='store_true', help=admin_help)
    self.end_mutex()
def __init__( self, application ):
    """Register the options common to all commands that operate on a cluster type."""
    super( ClusterTypeCommand, self ).__init__( application )
    self.option( '--num-threads', metavar='NUM', type=int, default=100,
                 help='The maximum number of tasks to be performed concurrently.' )
    cluster_type_help = heredoc(
        """The type of the cluster to be used. The cluster type is covariant with the role
        of the leader node. For example, a box performing the 'foo-leader' role will be
        part of a cluster of type 'foo'.""" )
    self.option( 'cluster_type', metavar='TYPE', completer=self.completer,
                 help=cluster_type_help )
def __init__(self, application):
    """Add the image-selection option (an ordinal or an AMI ID) to this command."""
    super(ImageReferenceCommand, self).__init__(application)
    image_help = heredoc(
        """An image ordinal, i.e. the index of an image in the list of images for the given
        role, sorted by creation time. Use the list-images command to print a list of images
        for a given role. If the ordinal is negative, it will be converted to a positive
        ordinal by adding the total number of images for this role. Passing -1, for example,
        selects the most recently created image. Alternatively, an AMI ID, e.g. 'ami-4dcced7d'
        can be passed in as well.""")
    # A default of -1 selects the most recently created image.
    self.option(self.long_image_option, self.short_image_option, metavar='IMAGE',
                type=self.ordinal_or_ami_id, default=-1, help=image_help)
def _install_mesosbox_tools( self ):
    """
    Installs the mesos-master-discovery init script and its companion mesos-tools. The
    latter is a Python package distribution that's included in cgcloud-mesos as a
    resource. This is in contrast to the cgcloud agent, which is a standalone
    distribution.
    """
    tools_dir = install_dir + '/tools'
    sudo( fmt( 'mkdir -p {tools_dir}') )
    # NOTE(review): the next two calls mix fmt() with '%'-interpolation — fmt() is applied
    # to an already-interpolated string, which would misbehave if shared_dir ever contained
    # curly braces; confirm whether fmt('mkdir -p {shared_dir}') was intended.
    sudo( fmt('mkdir -p %s' % shared_dir) )
    sudo( fmt('chmod 777 %s' % shared_dir) )
    # Dedicated virtualenv for the tools; pip is pinned to a known-good version.
    sudo( fmt( 'virtualenv --no-pip {tools_dir}' ) )
    sudo( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )
    mesos_tools_artifacts = ' '.join( self._project_artifacts( 'mesos-tools' ) )
    # forward_agent lets pip reach private git repos via the developer's SSH agent.
    with settings( forward_agent=True ):
        sudo( fmt( '{tools_dir}/bin/pip install {mesos_tools_artifacts}' ), pty=False )
    # Rendered into the upstart job below as the constructor call for MesosTools.
    mesos_tools = "MesosTools(**%r)" % dict( user=user,
                                             ephemeral_dir=ephemeral_dir,
                                             persistent_dir=persistent_dir,
                                             lazy_dirs=self.lazy_dirs)
    self._register_init_script(
        "mesosbox",
        heredoc( """
            description "Mesos master discovery"
            console log
            start on runlevel [2345]
            stop on runlevel [016]
            pre-start script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.start()
            END
            end script
            post-stop script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.mesos_tools import MesosTools
            mesos_tools = {mesos_tools}
            mesos_tools.stop()
            END
            end script""" ) )
def _setup_docker( self ): super( ToilBoxSupport, self )._setup_docker( ) # The docker and dockerbox init jobs depend on /mnt/persistent which is set up by the # mesosbox job. Adding a dependency of the docker job on mesosbox should satsify that # dependency. with remote_sudo_popen( 'patch -d /etc/init' ) as patch: patch.write( heredoc( """ --- docker.conf.orig 2015-12-18 23:28:48.693072560 +0000 +++ docker.conf 2015-12-18 23:40:30.553072560 +0000 @@ -1,6 +1,6 @@ description "Docker daemon" -start on (local-filesystems and net-device-up IFACE!=lo) +start on (local-filesystems and net-device-up IFACE!=lo and started mesosbox) stop on runlevel [!2345] limit nofile 524288 1048576 limit nproc 524288 1048576""" ) )
def __register_upstart_jobs(self, service_map):
    """
    Register one upstart job per service. Within each node type the services are
    chained: the first starts on the node-type start event, every following service
    starts once the previous one has started.
    """
    for node_type, services in service_map.iteritems():
        # Initial trigger for the first service of this node type.
        start_on = "sparkbox-start-" + node_type
        for service in services:
            self._register_init_script(
                service.init_name,
                heredoc("""
                    description "{service.description}"
                    console log
                    start on {start_on}
                    stop on runlevel [016]
                    setuid {user}
                    setgid {user}
                    env USER={user}
                    pre-start exec {service.start_script}
                    post-stop exec {service.stop_script}"""))
            # Chain the next service onto this one.
            start_on = "started " + service.init_name
def __patch_asynchat(self):
    """
    This bites us in pyftpdlib during S3AM unit tests:
    http://jenkins.cgcloud.info/job/s3am/13/testReport/junit/src.s3am.test.s3am_tests/CoreTests/test_copy/

    The patch is from
    https://hg.python.org/cpython/rev/d422062d7d36
    http://bugs.python.org/issue16133

    Fixed in 2.7.9: https://hg.python.org/cpython/raw-file/v2.7.9/Misc/NEWS
    """
    # Only pre-2.7.9 remote Pythons need the fix; -p2 strips the a/ and b/ prefixes.
    if self._remote_python_version() < (2, 7, 9):
        with remote_sudo_popen('patch -d /usr/lib/python2.7 -p2') as patch:
            patch.write(
                heredoc('''
                    diff --git a/Lib/asynchat.py b/Lib/asynchat.py
                    --- a/Lib/asynchat.py
                    +++ b/Lib/asynchat.py
                    @@ -46,12 +46,17 @@ method) up to the terminator, and then c
                     you - by calling your self.found_terminator() method.
                     """
                     
                    +import asyncore
                    +import errno
                     import socket
                    -import asyncore
                     from collections import deque
                     from sys import py3kwarning
                     from warnings import filterwarnings, catch_warnings
                     
                    +_BLOCKING_IO_ERRORS = (errno.EAGAIN, errno.EALREADY, errno.EINPROGRESS,
                    +                       errno.EWOULDBLOCK)
                    +
                    +
                     class async_chat (asyncore.dispatcher):
                         """This is an abstract class.  You must derive from this class, and add
                         the two methods collect_incoming_data() and found_terminator()"""
                    @@ -109,6 +114,8 @@ class async_chat (asyncore.dispatcher):
                             try:
                                 data = self.recv (self.ac_in_buffer_size)
                             except socket.error, why:
                    +            if why.args[0] in _BLOCKING_IO_ERRORS:
                    +                return
                                 self.handle_error()
                                 return'''))
def __register_upstart_jobs( self, service_map ):
    """
    Register one upstart job per service. Within each node type the services are
    chained: the first starts on the node-type start event, every following service
    starts once the previous one has started.
    """
    for node_type, services in service_map.iteritems( ):
        # Initial trigger for the first service of this node type.
        start_on = "sparkbox-start-" + node_type
        for service in services:
            self._register_init_script(
                service.init_name,
                heredoc( """
                    description "{service.description}"
                    console log
                    start on {start_on}
                    stop on runlevel [016]
                    setuid {user}
                    setgid {user}
                    env USER={user}
                    pre-start exec {service.start_script}
                    post-stop exec {service.stop_script}""" ) )
            # Chain the next service onto this one.
            start_on = "started " + service.init_name
def _sync_package_repos( self ):
    """
    Bring the remote APT package index up to date, working around a known repository
    race condition.

    First caps apt's network timeouts so a wedged mirror fails fast, then runs
    'apt-get update', retrying up to five times when the transient 'Hash Sum mismatch'
    error occurs (see the linked Launchpad and Debian threads).

    :raises RuntimeError: if the update fails for any other reason, or if the race
                          condition persists after all retries
    """
    put( remote_path='/etc/apt/apt.conf.d/99timeout',
         use_sudo=True,
         local_path=StringIO( heredoc( """
            Acquire::http::Timeout "10";
            Acquire::ftp::Timeout "10";
            """ ) ) )
    for i in range( 5 ):
        cmd = self.apt_get + ' update'
        result = sudo( cmd, warn_only=True )
        if result.succeeded:
            return
        # https://bugs.launchpad.net/ubuntu/+source/apt/+bug/972077
        # https://lists.debian.org/debian-dak/2012/05/threads.html#00006
        if 'Hash Sum mismatch' in result:
            # Bug fix: the original message left its '%s' placeholder unfilled
            # (and read "during in").
            log.warn( "Detected race condition during '%s'", cmd )
        else:
            raise RuntimeError( "Command '%s' failed" % cmd )
    # Bug fix: interpolate the failing command; previously the literal '%s' was raised.
    raise RuntimeError(
        "Command '%s' repeatedly failed with race condition. Giving up." % cmd )
def __patch_asynchat( self ):
    """
    This bites us in pyftpdlib during S3AM unit tests:
    http://jenkins.cgcloud.info/job/s3am/13/testReport/junit/src.s3am.test.s3am_tests/CoreTests/test_copy/

    The patch is from
    https://hg.python.org/cpython/rev/d422062d7d36
    http://bugs.python.org/issue16133

    Fixed in 2.7.9: https://hg.python.org/cpython/raw-file/v2.7.9/Misc/NEWS
    """
    # Only pre-2.7.9 remote Pythons need the fix; -p2 strips the a/ and b/ prefixes.
    if self._remote_python_version() < (2,7,9):
        with remote_sudo_popen( 'patch -d /usr/lib/python2.7 -p2' ) as patch:
            patch.write( heredoc( '''
                diff --git a/Lib/asynchat.py b/Lib/asynchat.py
                --- a/Lib/asynchat.py
                +++ b/Lib/asynchat.py
                @@ -46,12 +46,17 @@ method) up to the terminator, and then c
                 you - by calling your self.found_terminator() method.
                 """
                 
                +import asyncore
                +import errno
                 import socket
                -import asyncore
                 from collections import deque
                 from sys import py3kwarning
                 from warnings import filterwarnings, catch_warnings
                 
                +_BLOCKING_IO_ERRORS = (errno.EAGAIN, errno.EALREADY, errno.EINPROGRESS,
                +                       errno.EWOULDBLOCK)
                +
                +
                 class async_chat (asyncore.dispatcher):
                     """This is an abstract class.  You must derive from this class, and add
                     the two methods collect_incoming_data() and found_terminator()"""
                @@ -109,6 +114,8 @@ class async_chat (asyncore.dispatcher):
                         try:
                             data = self.recv (self.ac_in_buffer_size)
                         except socket.error, why:
                +            if why.args[0] in _BLOCKING_IO_ERRORS:
                +                return
                             self.handle_error()
                             return''' ) )
def _setup_docker(self): super(ToilBox, self)._setup_docker() # The docker and dockerbox init jobs depend on /mnt/persistent which is set up by the # mesosbox job. Adding a dependency of the docker job on mesosbox should satsify that # dependency. with remote_sudo_popen('patch -d /etc/init') as patch: patch.write( heredoc(""" --- docker.conf.orig 2015-12-18 23:28:48.693072560 +0000 +++ docker.conf 2015-12-18 23:40:30.553072560 +0000 @@ -1,6 +1,6 @@ description "Docker daemon" -start on (local-filesystems and net-device-up IFACE!=lo) +start on (local-filesystems and net-device-up IFACE!=lo and started mesosbox) stop on runlevel [!2345] limit nofile 524288 1048576 limit nproc 524288 1048576"""))
def __register_upstart_jobs( self, service_map ):
    """
    Register one upstart job per service. Within each node type the services are
    chained: the first starts on the node-type start event, every following service
    starts once the previous one has started.
    """
    for node_type, services in service_map.iteritems( ):
        # Initial trigger for the first service of this node type.
        start_on = "mesosbox-start-" + node_type
        for service in services:
            self._register_init_script(
                service.init_name,
                heredoc( """
                    description "{service.description}"
                    console log
                    start on {start_on}
                    stop on runlevel [016]
                    respawn
                    umask 022
                    limit nofile 8000 8192
                    setuid {user}
                    setgid {user}
                    env USER={user}
                    exec {service.command}""" ) )
            # Chain the next service onto this one.
            start_on = "started " + service.init_name
def __patch_distutils(self):
    """
    https://hg.python.org/cpython/rev/cf70f030a744/
    https://bitbucket.org/pypa/setuptools/issues/248/exit-code-is-zero-when-upload-fails

    Fixed in 2.7.8: https://hg.python.org/cpython/raw-file/v2.7.8/Misc/NEWS
    """
    # Only pre-2.7.8 remote Pythons need the fix; -p2 strips the a/ and b/ prefixes.
    if self._remote_python_version() < (2, 7, 8):
        with remote_sudo_popen("patch -d /usr/lib/python2.7 -p2") as patch:
            patch.write(
                heredoc(
                    """
                    --- a/Lib/distutils/command/upload.py
                    +++ b/Lib/distutils/command/upload.py
                    @@ -10,7 +10,7 @@ import urlparse
                     import cStringIO as StringIO
                     from hashlib import md5
                     
                    -from distutils.errors import DistutilsOptionError
                    +from distutils.errors import DistutilsError, DistutilsOptionError
                     from distutils.core import PyPIRCCommand
                     from distutils.spawn import spawn
                     from distutils import log
                    @@ -181,7 +181,7 @@ class upload(PyPIRCCommand):
                                 self.announce(msg, log.INFO)
                             except socket.error, e:
                                 self.announce(str(e), log.ERROR)
                    -            return
                    +            raise
                             except HTTPError, e:
                                 status = e.code
                                 reason = e.msg
                    @@ -190,5 +190,6 @@ class upload(PyPIRCCommand):
                                 self.announce('Server response (%s): %s' % (status, reason),
                                               log.INFO)
                             else:
                    -            self.announce('Upload failed (%s): %s' % (status, reason),
                    -                          log.ERROR)
                    +            msg = 'Upload failed (%s): %s' % (status, reason)
                    +            self.announce(msg, log.ERROR)
                    +            raise DistutilsError(msg)""" ) )
def _register_upstart_jobs( self, service_map ):
    """
    Register one upstart job per service. Within each node type the services are
    chained: the first starts on the node-type start event, every following service
    starts once the previous one has started.
    """
    for node_type, services in service_map.iteritems( ):
        # Initial trigger for the first service of this node type.
        start_on = "mesosbox-start-" + node_type
        for service in services:
            # FIXME: include chdir to logging directory in this script
            self._register_init_script(
                service.init_name,
                heredoc( """
                    description "{service.description}"
                    console log
                    respawn
                    umask 022
                    limit nofile 8000 8192
                    setuid {user}
                    setgid {user}
                    env USER={user}
                    env PYTHONPATH=/home/ubuntu/
                    start on {start_on}
                    stop on runlevel [016]
                    exec {service.action}""" ) )
            # Chain the next service onto this one.
            start_on = "started " + service.init_name
def __setup_application_user( self ):
    """
    Create the application user account with a home directory and bash shell, and
    grant it password-less sudo via a drop-in file in /etc/sudoers.d.
    """
    sudo( fmt( 'useradd '
               '--home /home/{user} '
               '--create-home '
               '--user-group '
               '--shell /bin/bash {user}' ) )
    # NOTE(review): the rule stanza below appears twice, verbatim. A single copy would
    # suffice for sudoers (the duplicate is harmless); confirm before deduplicating since
    # it changes the bytes written to the remote file.
    sudoer_file = heredoc( """
        # CGcloud - MesosBox
        # User rules for ubuntu
        mesosbox ALL=(ALL) NOPASSWD:ALL
        # User rules for ubuntu
        mesosbox ALL=(ALL) NOPASSWD:ALL
        """ )
    sudoer_file_path = '/etc/sudoers.d/89-mesosbox-user'
    # sudoers drop-ins must be mode 0440 and owned by root or sudo ignores them.
    put( local_path=StringIO( sudoer_file ),
         remote_path=sudoer_file_path,
         use_sudo=True,
         mode=0440 )
    sudo( "chown root:root '%s'" % sudoer_file_path )
def _sync_package_repos(self):
    """
    Bring the remote APT package index up to date, working around a known repository
    race condition.

    First caps apt's network timeouts so a wedged mirror fails fast, then runs
    'apt-get update', retrying up to five times when the transient 'Hash Sum mismatch'
    error occurs (see the linked Launchpad and Debian threads).

    :raises RuntimeError: if the update fails for any other reason, or if the race
                          condition persists after all retries
    """
    put(remote_path='/etc/apt/apt.conf.d/99timeout',
        use_sudo=True,
        local_path=StringIO(
            heredoc("""
                Acquire::http::Timeout "10";
                Acquire::ftp::Timeout "10";
                """)))
    for i in range(5):
        cmd = self.apt_get + ' update'
        result = sudo(cmd, warn_only=True)
        if result.succeeded:
            return
        # https://bugs.launchpad.net/ubuntu/+source/apt/+bug/972077
        # https://lists.debian.org/debian-dak/2012/05/threads.html#00006
        if 'Hash Sum mismatch' in result:
            # Bug fix: the original message left its '%s' placeholder unfilled
            # (and read "during in").
            log.warn("Detected race condition during '%s'", cmd)
        else:
            raise RuntimeError("Command '%s' failed" % cmd)
    # Bug fix: interpolate the failing command; previously the literal '%s' was raised.
    raise RuntimeError(
        "Command '%s' repeatedly failed with race condition. Giving up." % cmd)
def __patch_distutils(self):
    """
    https://hg.python.org/cpython/rev/cf70f030a744/
    https://bitbucket.org/pypa/setuptools/issues/248/exit-code-is-zero-when-upload-fails

    Fixed in 2.7.8: https://hg.python.org/cpython/raw-file/v2.7.8/Misc/NEWS
    """
    # Only pre-2.7.8 remote Pythons need the fix; -p2 strips the a/ and b/ prefixes.
    if self._remote_python_version() < (2, 7, 8):
        with remote_sudo_popen('patch -d /usr/lib/python2.7 -p2') as patch:
            patch.write(
                heredoc("""
                    --- a/Lib/distutils/command/upload.py
                    +++ b/Lib/distutils/command/upload.py
                    @@ -10,7 +10,7 @@ import urlparse
                     import cStringIO as StringIO
                     from hashlib import md5
                     
                    -from distutils.errors import DistutilsOptionError
                    +from distutils.errors import DistutilsError, DistutilsOptionError
                     from distutils.core import PyPIRCCommand
                     from distutils.spawn import spawn
                     from distutils import log
                    @@ -181,7 +181,7 @@ class upload(PyPIRCCommand):
                                 self.announce(msg, log.INFO)
                             except socket.error, e:
                                 self.announce(str(e), log.ERROR)
                    -            return
                    +            raise
                             except HTTPError, e:
                                 status = e.code
                                 reason = e.msg
                    @@ -190,5 +190,6 @@ class upload(PyPIRCCommand):
                                 self.announce('Server response (%s): %s' % (status, reason),
                                               log.INFO)
                             else:
                    -            self.announce('Upload failed (%s): %s' % (status, reason),
                    -                          log.ERROR)
                    +            msg = 'Upload failed (%s): %s' % (status, reason)
                    +            self.announce(msg, log.ERROR)
                    +            raise DistutilsError(msg)"""))
def __init__(self, application):
    """Register the options common to all commands that operate on a cluster type."""
    super(ClusterTypeCommand, self).__init__(application)
    self.option('--num-threads', metavar='NUM', type=int, default=100, dest='num_threads',
                help='The maximum number of tasks to be performed concurrently.')
    cluster_type_help = heredoc(
        """The type of the cluster to be used. The cluster type is covariant with the role
        of the leader node. For example, a box performing the 'foo-leader' role will be
        part of a cluster of type 'foo'.""")
    self.option('cluster_type', metavar='TYPE', completer=self.completer,
                help=cluster_type_help)
def __install_tools( self ):
    """
    Installs the spark-master-discovery init script and its companion spark-tools. The
    latter is a Python package distribution that's included in cgcloud-spark as a
    resource. This is in contrast to the cgcloud agent, which is a standalone
    distribution.
    """
    tools_dir = install_dir + '/tools'
    admin = self.admin_account( )
    sudo( fmt( 'mkdir -p {tools_dir}' ) )
    # Temporarily owned by the admin account so pip/virtualenv can run unprivileged.
    sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
    # Dedicated virtualenv for the tools; pip is pinned to a known-good version.
    run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
    run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )
    # forward_agent lets pip reach private git repos via the developer's SSH agent.
    with settings( forward_agent=True ):
        with self._project_artifacts( 'spark-tools' ) as artifacts:
            pip( use_sudo=True,
                 path=tools_dir + '/bin/pip',
                 args=concat( 'install', artifacts ) )
    # Hand ownership back to root once installation is complete.
    sudo( fmt( 'chown -R root:root {tools_dir}' ) )
    # Rendered into the scripts below as the constructor call for SparkTools.
    spark_tools = "SparkTools(**%r)" % dict( user=user,
                                             shared_dir=self._shared_dir( ),
                                             install_dir=install_dir,
                                             ephemeral_dir=ephemeral_dir,
                                             persistent_dir=persistent_dir,
                                             lazy_dirs=self.lazy_dirs )
    self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it
    # The pre-start stanza retries SparkTools.start() up to three times, a minute apart.
    self._register_init_script(
        "sparkbox",
        heredoc( """
            description "Spark/HDFS master discovery"
            console log
            start on (local-filesystems and net-device-up IFACE!=lo)
            stop on runlevel [!2345]
            pre-start script
            for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.spark_tools import SparkTools
            spark_tools = {spark_tools}
            spark_tools.start()
            END
            then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
            end script
            post-stop script
            {tools_dir}/bin/python2.7 - <<END
            import logging
            logging.basicConfig( level=logging.INFO )
            from cgcloud.spark_tools import SparkTools
            spark_tools = {spark_tools}
            spark_tools.stop()
            END
            end script""" ) )
    # Companion helper invoked remotely (via ssh) by the slaves.
    script_path = "/usr/local/bin/sparkbox-manage-slaves"
    put( remote_path=script_path, use_sudo=True, local_path=StringIO( heredoc( """
        #!{tools_dir}/bin/python2.7
        import sys
        import logging
        # Prefix each log line to make it more obvious that it's the master logging when the
        # slave calls this script via ssh.
        logging.basicConfig( level=logging.INFO,
                             format="manage_slaves: " + logging.BASIC_FORMAT )
        from cgcloud.spark_tools import SparkTools
        spark_tools = {spark_tools}
        spark_tools.manage_slaves( slaves_to_add=sys.argv[1:] )""" ) ) )
    sudo( fmt( "chown root:root {script_path} && chmod 755 {script_path}" ) )
def __setup_ssh_config( self ):
    """
    Append a host stanza to the system-wide ssh_config for the 'spark-master' alias.
    Disabling CheckHostIP and HashKnownHosts keeps host-key handling stable even though
    the IP behind the alias can change.
    """
    with remote_open( '/etc/ssh/ssh_config', use_sudo=True ) as f:
        f.write( heredoc( """
            Host spark-master
                CheckHostIP no
                HashKnownHosts no""" ) )
def __configure_gridengine( self ):
    """
    Configure the GridEngine daemons (master and exec) and create a default queue. Ensure
    that the queue is updated to reflect the number of cores actually available.
    """
    # Parsers for qconf's "key<whitespace>value" single-line output format.
    ws = re.compile( r'\s+' )
    nl = re.compile( r'[\r\n]+' )

    def qconf( opt, **kwargs ):
        # Convenience wrapper: pass settings as keyword arguments.
        return qconf_dict( opt, kwargs )

    def qconf_dict( opt, d=None, file_name='qconf.tmp' ):
        # With a dict: upload it as a temp file and apply it via qconf (write mode).
        # Without one: run qconf and parse its output into a dict (read mode).
        if d:
            # qconf can't read from stdin for some reason, neither -, /dev/stdin or
            # /dev/fd/0 works
            s = '\n'.join( ' '.join( i ) for i in d.iteritems( ) ) + '\n'
            put( remote_path=file_name, local_path=StringIO( s ) )
            sudo( ' '.join( [ 'qconf', opt, file_name ] ) )
            run( ' '.join( [ 'rm', file_name ] ) )
        else:
            return dict( tuple( ws.split( l, 1 ) )
                         for l in nl.split( run( 'SGE_SINGLE_LINE=1 qconf ' + opt ) )
                         if l and not l.startswith( '#' ) )

    # Add the user defined in fname to the Sun Grid Engine cluster.
    qconf( '-Auser',
           name=Jenkins.user, oticket='0', fshare='0', delete_time='0',
           default_project='NONE' )
    # Adds users to Sun Grid Engine user access lists (ACLs).
    sudo( 'qconf -au %s arusers' % Jenkins.user )
    # Add hosts hostname to the list of hosts allowed to submit Sun Grid Engine jobs and
    # control their behavior only.
    sudo( 'qconf -as localhost' )
    # Remove all currently defined execution hosts
    run( 'for i in `qconf -sel`; do sudo qconf -de $i ; done' )
    # Add an execution host
    qconf( '-Ae',
           hostname='localhost', load_scaling='NONE', complex_values='NONE',
           user_lists='arusers', xuser_lists='NONE', projects='NONE', xprojects='NONE',
           usage_scaling='NONE', report_variables='NONE' )
    # Add a parallel environment
    qconf( '-Ap',
           pe_name='smp', slots='999', user_lists='NONE', xuser_lists='NONE',
           start_proc_args='/bin/true', stop_proc_args='/bin/true',
           allocation_rule='$pe_slots', control_slaves='FALSE', job_is_first_task='TRUE',
           urgency_slots='min', accounting_summary='FALSE' )
    # Add a queue, the slots and processors will be adjusted dynamically, by an init script
    qconf( '-Aq',
           qname='all.q', processors='1', slots='1', hostlist='localhost', seq_no='0',
           load_thresholds='np_load_avg=1.75', suspend_thresholds='NONE', nsuspend='1',
           suspend_interval='00:05:00', priority='0', min_cpu_interval='00:05:00',
           qtype='BATCH INTERACTIVE', ckpt_list='NONE', pe_list='make smp', rerun='FALSE',
           tmpdir='/tmp', shell='/bin/bash', prolog='NONE', epilog='NONE',
           shell_start_mode='posix_compliant', starter_method='NONE', suspend_method='NONE',
           resume_method='NONE', terminate_method='NONE', notify='00:00:60',
           owner_list='NONE', user_lists='arusers', xuser_lists='NONE',
           subordinate_list='NONE', complex_values='NONE', projects='NONE',
           xprojects='NONE', calendar='NONE', initial_state='default', s_rt='INFINITY',
           h_rt='INFINITY', s_cpu='INFINITY', h_cpu='INFINITY', s_fsize='INFINITY',
           h_fsize='INFINITY', s_data='INFINITY', h_data='INFINITY', s_stack='INFINITY',
           h_stack='INFINITY', s_core='INFINITY', h_core='INFINITY', s_rss='INFINITY',
           h_rss='INFINITY', s_vmem='INFINITY', h_vmem='INFINITY' )
    # Enable on-demand scheduling. This will eliminate the long time that jobs spend
    # waiting in the qw state. There is no -Asconf so we have to fake it using -ssconf
    # and -Msconf.
    sconf = qconf( '-ssconf' )
    sconf.update( dict( flush_submit_sec='1',
                        flush_finish_sec='1',
                        schedule_interval='0:0:1' ) )
    qconf_dict( '-Msconf', sconf )
    # Enable immediate flushing of the accounting file. The SGE batch system in Toil uses
    # the qacct program to determine the exit code of a finished job. The qacct program
    # reads the accounting file. By default, this file is written to every 15 seconds
    # which means that it may take up to 15 seconds before a finished job is seen by Toil.
    # An accounting_flush_time value of 00:00:00 causes the accounting file to be flushed
    # immediately, allowing qacct to report the status of finished jobs immediately.
    # Again, there is no -Aconf, so we fake it with -sconf and -Mconf. Also, the file name
    # has to be 'global'.
    conf = qconf( '-sconf' )
    params = dict( tuple( e.split( '=' ) ) for e in conf[ 'reporting_params' ].split( ' ' ) )
    params[ 'accounting_flush_time' ] = '00:00:00'
    conf[ 'reporting_params' ] = ' '.join( '='.join( e ) for e in params.iteritems( ) )
    qconf_dict( '-Mconf', conf, file_name='global' )
    # Register an init-script that ensures GridEngine uses localhost instead of hostname
    path = '/var/lib/gridengine/default/common/'
    self._register_init_script( 'gridengine-pre', heredoc( """
        description "GridEngine pre-start configuration"
        console log
        start on filesystem
        pre-start script
        echo localhost > {path}/act_qmaster ; chown sgeadmin:sgeadmin {path}/act_qmaster
        echo localhost `hostname -f` > {path}/host_aliases
        end script""" ) )
    # Register an init-script that adjust the queue config to reflect the number of cores
    self._register_init_script( 'gridengine-post', heredoc( """
        description "GridEngine post-start configuration"
        console log
        # I would rather depend on the gridengine daemons but don't know how as they are
        # started by SysV init scripts. Supposedly the 'rc' job is run last.
        start on started rc
        pre-start script
        cores=$(grep -c '^processor' /proc/cpuinfo)
        qconf -mattr queue processors $cores `qselect`
        qconf -mattr queue slots $cores `qselect`
        end script""" ) )
    # Run pre-start script
    for daemon in ('exec', 'master'):
        sudo( '/etc/init.d/gridengine-%s stop' % daemon )
    sudo( "killall -9 -r 'sge_.*'" )  # the exec daemon likes to hang
    self._run_init_script( 'gridengine-pre' )
    for daemon in ('master', 'exec'):
        sudo( '/etc/init.d/gridengine-%s start' % daemon )
    # Run post-start script
    self._run_init_script( 'gridengine-post' )
    # Poll until the exec daemon has registered with the master.
    while 'execd is in unknown state' in run( 'qstat -f -q all.q -explain a',
                                              warn_only=True ):
        time.sleep( 1 )