Beispiel #1
0
    def __init__(self, application, **kwargs):
        """
        Register the --zone and --namespace options shared by all
        context-aware commands. Defaults are taken from the CGCLOUD_ZONE and
        CGCLOUD_NAMESPACE environment variables when those are set.
        """
        self.default_namespace = os.environ.get('CGCLOUD_NAMESPACE',
                                                '/__me__/')
        self.default_zone = os.environ.get('CGCLOUD_ZONE', None)
        super(ContextCommand, self).__init__(application, **kwargs)

        # --zone is only mandatory when no default could be gleaned from the
        # environment.
        self.option('--zone',
                    '-z',
                    metavar='ZONE',
                    default=self.default_zone,
                    dest='availability_zone',
                    required=not bool(self.default_zone),
                    help=heredoc(
                        """The name of the EC2 availability zone to operate in,
                     e.g. us-east-1b, us-west-1b or us-west-2c etc. This argument implies the AWS
                     region to run in. The value of the environment variable CGCLOUD_ZONE,
                     if that variable is present, determines the default."""))

        self.option(
            '--namespace',
            '-n',
            metavar='PREFIX',
            default=self.default_namespace,
            help=heredoc(
                """Optional prefix for naming EC2 resources like instances,
                     images, volumes, etc. Use this option to create a separate namespace in
                     order to avoid collisions, e.g. when running tests. The value of the
                     environment variable CGCLOUD_NAMESPACE, if that variable is present,
                     overrides the default. The string __me__ anywhere in the namespace will be
                     replaced by the name of the IAM user whose credentials are used to issue
                     requests to AWS. If the name of that IAM user contains the @ character,
                     anything after the first occurrence of that character will be discarded
                     before the substitution is done."""))
Beispiel #2
0
    def __init__(self, application):
        """Register the command line options common to all cluster commands."""
        super(ClusterCommand, self).__init__(application)

        # A cluster is addressed by an optional name plus an ordinal.
        self.option('--cluster-name', '-c', metavar='NAME',
                    help=heredoc(
                        """The name of the cluster to operate on. The default is to
                     consider all clusters of the given type regardless of their name,
                     using --ordinal to disambiguate. Note that the cluster name is not
                     necessarily unique, not even with a specific cluster type, there may be more
                     than one cluster of a particular name and type."""))

        self.option('--ordinal', '-o', default=-1, type=int,
                    help=heredoc(
                        """Selects an individual cluster from the list of currently
                     running clusters of the given cluster type and name. Since there is one
                     leader per cluster, this is equal to the ordinal of the leader among all
                     leaders of clusters of the given type and name. The ordinal is a zero-based
                     index into the list of all clusters of the specified type and name,
                     sorted by creation time. This means that the ordinal of a cluster is not
                     fixed, it may change if another cluster of the same type and name is
                     terminated. If the ordinal is negative, it will be converted to a positive
                     ordinal by adding the number of clusters of the specified type. Passing -1,
                     for example, selects the most recently created box."""))
Beispiel #3
0
 def __init__(self, application):
     """Register the command line options accepted by the box creation command."""
     super(CreateCommand, self).__init__(application)
     self.option('--boot-image', '-i', metavar='AMI_ID',
                 help=heredoc(
                     """The AMI ID of the image from which to create the box. This
                  argument is optional and the default is determined automatically based on
                  the role. Typically, this option does not need to be used."""))
     self.option('--no-agent', default=False, action='store_true',
                 help=heredoc(
                     """Don't install the cghub-cloud-agent package on the box. One
                  note-worthy effect of using this option this is that the SSH keys will be
                  installed initially, but not maintained over time."""))
     self.option('--create-image', '-I', default=False, action='store_true',
                 help='Create an image of the box as soon as setup completes.')
     # FIXME: revisit --upgrade — does it work, and is it still necessary?
     self.option('--upgrade', '-U', default=False, action='store_true',
                 help=heredoc(
                     """Bring the package repository as well as any installed
                  packages up to date, i.e. do what on Ubuntu is achieved by doing 'sudo
                  apt-get update ; sudo apt-get upgrade'."""))
Beispiel #4
0
 def __to_hadoop_xml_config(properties):
     """
     Render a dict mapping property names to values as a Hadoop-style XML
     configuration document (e.g. core-site.xml).

     >>> print SparkBox._SparkBox__to_hadoop_xml_config( {'foo' : 'bar'} )
     <?xml version='1.0' encoding='utf-8'?>
     <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
     <configuration>
         <property>
             <name>foo</name>
             <value>bar</value>
         </property>
     </configuration>
     <BLANKLINE>
     """
     s = StringIO()
     # Document prolog and opening tag.
     s.write(
         heredoc("""
         <?xml version='1.0' encoding='utf-8'?>
         <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
         <configuration>"""))
     # heredoc interpolates {name} and {value} from the loop variables.
     # NOTE(review): values are not XML-escaped — presumably callers only
     # pass XML-safe strings; confirm before reusing with arbitrary input.
     for name, value in properties.iteritems():
         s.write(
             heredoc("""
             <property>
                 <name>{name}</name>
                 <value>{value}</value>
             </property>""",
                     indent='    '))
     s.write("</configuration>\n")
     return s.getvalue()
Beispiel #5
0
 def __init__(self, application):
     """Register the rsync-specific command line options."""
     super(RsyncCommandMixin, self).__init__(application)
     self.option('--ssh-opts', '-e', metavar='OPTS', default=None,
                 help=heredoc(
                     """Additional options to pass to ssh. Note that if OPTS starts
                  with a dash you must use the long option followed by an equal sign. For
                  example, to run ssh in verbose mode, use --ssh-opt=-v. If OPTS is to include
                  spaces, it must be quoted to prevent the shell from breaking it up. So to
                  run ssh in verbose mode and log to syslog, you would use --ssh-opt='-v
                  -y'."""))
     # Everything after the recognized options is collected unparsed
     # (argparse.REMAINDER) and handed to rsync verbatim.
     self.option('args', metavar='...', nargs=argparse.REMAINDER, default=[],
                 help=heredoc(
                     """Command line options for rsync(1). The remote path argument
                  must be prefixed with a colon. For example, 'cgcloud.py rsync foo -av :bar
                  .' would copy the file 'bar' from the home directory of the admin user on
                  the box 'foo' to the current directory on the local machine."""))
Beispiel #6
0
 def __init__(self, application, **kwargs):
     """Register the options for uploading an SSH public key as an EC2 key pair."""
     super(RegisterKeyCommand, self).__init__(application, **kwargs)
     # Positional argument: path of the public key file to upload.
     self.option('ssh_public_key', metavar='KEY_FILE',
                 help=heredoc(
                     """Path of file containing the SSH public key to upload to the
                  EC2 keypair."""))
     self.option('--force', '-F', default=False, action='store_true',
                 help='Overwrite potentially existing EC2 key pair')
     # The literal __me__ in the key pair name is substituted at runtime.
     self.option('--keypair', '-k', metavar='NAME',
                 dest='ec2_keypair_name', default='__me__',
                 help=heredoc(
                     """The desired name of the EC2 key pair. The name should
                  associate the key with you in a way that it is obvious to other users in
                  your organization.  The string __me__ anywhere in the key pair name will be
                  replaced with the name of the IAM user whose credentials are used to issue
                  requests to AWS."""))
Beispiel #7
0
    def __init__( self, application, **kwargs ):
        """
        Register the --zone and --namespace options shared by all
        context-aware commands. Defaults are taken from the CGCLOUD_ZONE and
        CGCLOUD_NAMESPACE environment variables when those are set.
        """
        self.default_namespace = os.environ.get( 'CGCLOUD_NAMESPACE', '/__me__/' )
        self.default_zone = os.environ.get( 'CGCLOUD_ZONE', None )
        super( ContextCommand, self ).__init__( application, **kwargs )

        # --zone is only required when no default is available from the environment.
        self.option( '--zone', '-z', metavar='ZONE',
                     default=self.default_zone, dest='availability_zone',
                     required=not bool( self.default_zone ),
                     help=heredoc( """The name of the EC2 availability zone to operate in,
                     e.g. us-east-1b, us-west-1b or us-west-2c etc. This argument implies the AWS
                     region to run in. The value of the environment variable CGCLOUD_ZONE,
                     if that variable is present, determines the default.""" ) )

        self.option( '--namespace', '-n', metavar='PREFIX', default=self.default_namespace,
                     help=heredoc( """Optional prefix for naming EC2 resources like instances,
                     images, volumes, etc. Use this option to create a separate namespace in
                     order to avoid collisions, e.g. when running tests. A namespace begins with
                     a slash, followed by zero or more names, each name followed by a slash. Note
                     that this implies that the namespace begins and ends with a slash. Each name
                     must begin with a digit or lowercase letter followed by zero or more
                     digits, lowercase letters, periods, underscores or dashes. The value of the
                     environment variable CGCLOUD_NAMESPACE, if that variable is present,
                     overrides the default. The string __me__ anywhere in the namespace will be
                     replaced by the name of the IAM user whose credentials are used to issue
                     requests to AWS. If the name of that IAM user contains the @ character,
                     anything after the first occurrence of that character will be discarded
                     before the substitution is done.""" ) )
Beispiel #8
0
 def __init__(self, application, **kwargs):
     """Register the options used to select an individual box/instance."""
     super(InstanceCommand, self).__init__(application, **kwargs)
     self.option('--cluster-name', '-c', metavar='NAME',
                 help=heredoc(
                     """This option can be used to restrict the selection to boxes
                  that are part of a cluster of the given name. Boxes that are not part of a
                  cluster use their own instance id as the cluster name."""))
     # --ordinal and --instance-id are mutually exclusive ways to pick a box.
     self.begin_mutex()
     self.option('--ordinal', '-o', default=-1, type=int,
                 help=heredoc(
                     """Selects an individual box from the list of boxes performing
                  the specified role in a cluster of the given name. The ordinal is a
                  zero-based index into the list of all boxes performing the specified role,
                  sorted by creation time. This means that the ordinal of a box is not fixed,
                  it may change if another box performing the specified role is terminated. If
                  the ordinal is negative, it will be converted to a positive ordinal by
                  adding the number of boxes performing the specified role. Passing -1,
                  for example, selects the most recently created box."""))
     self.option('--instance-id', '-I', default=None, type=str,
                 help=heredoc("""Selects an individual instance. When combined with
                  --cluster-name, the specified instance needs to belong to a cluster of the
                  specified name or an error will be raised."""))
     self.end_mutex()
Beispiel #9
0
 def __to_hadoop_xml_config( properties ):
     """
     Render a dict mapping property names to values as a Hadoop-style XML
     configuration document (e.g. core-site.xml).

     >>> print SparkBox._SparkBox__to_hadoop_xml_config( {'foo' : 'bar'} )
     <?xml version='1.0' encoding='utf-8'?>
     <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
     <configuration>
         <property>
             <name>foo</name>
             <value>bar</value>
         </property>
     </configuration>
     <BLANKLINE>
     """
     s = StringIO( )
     # Document prolog and opening tag.
     s.write( heredoc( """
         <?xml version='1.0' encoding='utf-8'?>
         <?xml-stylesheet type='text/xsl' href='configuration.xsl'?>
         <configuration>""" ) )
     # heredoc interpolates {name} and {value} from the loop variables.
     # NOTE(review): values are not XML-escaped — presumably callers only
     # pass XML-safe strings; confirm before reusing with arbitrary input.
     for name, value in properties.iteritems( ):
         s.write( heredoc( """
             <property>
                 <name>{name}</name>
                 <value>{value}</value>
             </property>""", indent='    ' ) )
     s.write( "</configuration>\n" )
     return s.getvalue( )
Beispiel #10
0
    def __init__( self, application ):
        """
        Register the options for creating a cluster consisting of one leader
        and a number of workers.
        """
        super( CreateClusterCommand, self ).__init__( application )
        self.cluster = None

        self.option( '--cluster-name', '-c', metavar='NAME',
                     help=heredoc( """A name for the new cluster. If absent, the instance ID of
                     the master will be used. Cluster names do not need to be unique, but they
                     should be in order to avoid user error.""" ) )

        self.option( '--num-workers', '-s', metavar='NUM',
                     type=int, default=1,
                     help='The number of workers to launch.' )

        self.option( '--ebs-volume-size', '-e', metavar='GB',
                     help=heredoc( """The size in GB of an EBS volume to be attached to each node
                     for persistent data. The volume will be mounted at /mnt/persistent.""" ) )

        self.option( '--leader-on-demand', '-D',
                     default=False, action='store_true',
                     help=heredoc( """Use this option to ensure that the leader will be an
                     on-demand instance, even if --spot-bid is given.""" ) )

        self.option( '--share', '-S', metavar='PATH',
                     default=None, dest='share_path',
                     help=heredoc( """The path to a local file or directory for distribution to
                     the cluster. The given file or directory (or the contents of the given
                     directory, if the path ends in a slash) will be placed in the default user's
                     ~/shared directory on each node.""" ) )

        self.option( '--ssh-opts', metavar='OPTS', default=None,
                     help=heredoc( """Additional options to pass to ssh when uploading the files
                     shared via rsync. For more detail refer to cgcloud rsync --help""" ) )
Beispiel #11
0
    def __init__(self, application):
        """
        Register the options for creating a cluster consisting of one leader
        and a number of workers.
        """
        super(CreateClusterCommand, self).__init__(application)
        self.cluster = None

        self.option(
            '--cluster-name',
            '-c',
            metavar='NAME',
            help=heredoc(
                """A name for the new cluster. If absent, the instance ID of
                     the master will be used. Cluster names do not need to be unique, but they
                     should be in order to avoid user error."""))

        self.option('--num-workers',
                    '-s',
                    metavar='NUM',
                    type=int,
                    default=1,
                    dest='num_workers',
                    help='The number of workers to launch.')

        self.option(
            '--ebs-volume-size',
            '-e',
            metavar='GB',
            help=heredoc(
                """The size in GB of an EBS volume to be attached to each node
                     for persistent data. The volume will be mounted at /mnt/persistent."""
            ))

        self.option('--leader-on-demand',
                    '-D',
                    dest='leader_on_demand',
                    default=False,
                    action='store_true',
                    help=heredoc(
                        """Use this option to ensure that the leader will be an
                     on-demand instance, even if --spot-bid is given."""))

        self.option(
            '--share',
            '-S',
            metavar='PATH',
            default=None,
            dest='share_path',
            help=heredoc(
                """The path to a local file or directory for distribution to
                     the cluster. The given file or directory (or the contents of the given
                     directory, if the path ends in a slash) will be placed in the default user's
                     ~/shared directory on each node."""))

        self.option(
            '--ssh-opts',
            metavar='OPTS',
            default=None,
            help=heredoc(
                """Additional options to pass to ssh when uploading the files
                     shared via rsync. For more detail refer to cgcloud rsync --help"""
            ))
Beispiel #12
0
 def __install_sparkbox_tools( self ):
     """
     Installs the spark-master-discovery init script and its companion spark-tools. The latter
     is a Python package distribution that's included in cgcloud-spark as a resource. This is
     in contrast to the cgcloud agent, which is a standalone distribution.

     NOTE(review): relies on install_dir, persistent_dir, ephemeral_dir and user being
     defined at module level — confirm against the full module.
     """
     tools_dir = install_dir + '/tools'
     admin = self.admin_account( )
     sudo( fmt( 'mkdir -p {tools_dir} {persistent_dir} {ephemeral_dir}' ) )
     sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
     # Dedicated virtualenv for the tools; pin pip for easy_install compatibility.
     run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
     run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )
     spark_tools_artifacts = ' '.join( self._project_artifacts( 'spark-tools' ) )
     with settings( forward_agent=True ):
         run( fmt( '{tools_dir}/bin/pip install {spark_tools_artifacts}' ), pty=False )
     sudo( fmt( 'chown -R root:root {tools_dir}' ) )
     spark_tools = "SparkTools(**%r)" % dict( user=user,
                                              install_dir=install_dir,
                                              ephemeral_dir=ephemeral_dir,
                                              persistent_dir=persistent_dir,
                                              lazy_dirs=self.lazy_dirs )
     # Both the pre-start and post-stop stanzas feed an inline Python script to
     # the interpreter via a shell here-document. Each here-document must be
     # closed with an END line; the pre-start stanza was missing its END
     # terminator, which would make the shell treat the rest of the job body
     # as script input.
     self._register_init_script(
         "sparkbox",
         heredoc( """
             description "Spark/HDFS master discovery"
             console log
             start on runlevel [2345]
             stop on runlevel [016]
             pre-start script
             {tools_dir}/bin/python2.7 - <<END
             import logging
             logging.basicConfig( level=logging.INFO )
             from cgcloud.spark_tools import SparkTools
             spark_tools = {spark_tools}
             spark_tools.start()
             END
             end script
             post-stop script
             {tools_dir}/bin/python2.7 - <<END
             import logging
             logging.basicConfig( level=logging.INFO )
             from cgcloud.spark_tools import SparkTools
             spark_tools = {spark_tools}
             spark_tools.stop()
             END
             end script""" ) )
     script_path = "/usr/local/bin/sparkbox-manage-slaves"
     # Helper script used to add slaves to the running cluster.
     put( remote_path=script_path, use_sudo=True, local_path=StringIO( heredoc( """
         #!{tools_dir}/bin/python2.7
         import sys
         import logging
         logging.basicConfig( level=logging.INFO )
         from cgcloud.spark_tools import SparkTools
         spark_tools = {spark_tools}
         spark_tools.manage_slaves( slaves_to_add=sys.argv[1:] )""" ) ) )
     sudo( fmt( "chown root:root {script_path} && chmod 755 {script_path}" ) )
Beispiel #13
0
 def __init__( self, application ):
     """Register the options of the image deletion command."""
     super( DeleteImageCommand, self ).__init__( application )
     # The two flags below are mutually exclusive.
     self.begin_mutex( )
     self.option(
         '--keep-snapshot',
         '-K',
         default=False,
         action='store_true',
         help=heredoc( """Do not delete the EBS volume snapshot associated with the
                  given image. This will leave an orphaned snapshot which should be removed at
                  a later time using the 'cgcloud cleanup' command.""" ) )
     self.option(
         '--quick',
         '-Q',
         default=False,
         action='store_true',
         help=heredoc( """Exit immediately after deregistration request has been made,
                  don't wait until the image is deregistered. Implies --keep-snapshot.""" ) )
     self.end_mutex( )
Beispiel #14
0
 def __init__( self, application, **kwargs ):
     """Register the mutually exclusive user-selection options."""
     super( UserCommandMixin, self ).__init__( application, **kwargs )
     self.begin_mutex( )
     self.option(
         '--login',
         '-l',
         default=None,
         metavar='USER',
         dest='user',
         help=heredoc( """Name of user to login as. The default depends on the role,
                  for most roles the default is the administrative user. Roles that define a
                  second less privileged application user will default to that user. Can't be
                  used together with -a, --admin.""" ) )
     self.option(
         '--admin',
         '-a',
         default=False,
         action='store_true',
         help=heredoc( """Force logging in as the administrative user. Can't be used
                  together with -l, --login.""" ) )
     self.end_mutex( )
Beispiel #15
0
 def __init__( self, application, **kwargs ):
     """
     Register the options for uploading an SSH public key as an EC2 key pair.
     """
     super( RegisterKeyCommand, self ).__init__( application, **kwargs )
     # Positional argument: path of the public key file to upload.
     self.option( 'ssh_public_key', metavar='KEY_FILE',
                  help=heredoc( """Path of file containing the SSH public key to upload to the
                  EC2 keypair.""" ) )
     self.option( '--force', '-F', default=False, action='store_true',
                  help='Overwrite potentially existing EC2 key pair' )
     # The literal __me__ in the key pair name is substituted at runtime with
     # the IAM user name.
     self.option( '--keypair', '-k', metavar='NAME',
                  dest='ec2_keypair_name', default='__me__',
                  help=heredoc( """The desired name of the EC2 key pair. The name should
                  associate the key with you in a way that it is obvious to other users in
                  your organization.  The string __me__ anywhere in the key pair name will be
                  replaced with the name of the IAM user whose credentials are used to issue
                  requests to AWS.""" ) )
Beispiel #16
0
 def __init__( self, application ):
     """
     Register the rsync pass-through options: extra SSH options and the
     verbatim rsync(1) argument list.
     """
     super( RsyncCommandMixin, self ).__init__( application )
     self.option( '--ssh-opts', '-e', metavar='OPTS', default=None,
                  help=heredoc( """Additional options to pass to ssh. Note that if OPTS starts
                  with a dash you must use the long option followed by an equal sign. For
                  example, to run ssh in verbose mode, use --ssh-opt=-v. If OPTS is to include
                  spaces, it must be quoted to prevent the shell from breaking it up. So to
                  run ssh in verbose mode and log to syslog, you would use --ssh-opt='-v
                  -y'.""" ) )
     # argparse.REMAINDER: everything after the recognized options is
     # collected unparsed and later handed to rsync verbatim.
     self.option( 'args', metavar='...', nargs=argparse.REMAINDER, default=[ ],
                  help=heredoc( """Command line options for rsync(1). The remote path argument
                  must be prefixed with a colon. For example, 'cgcloud.py rsync foo -av :bar
                  .' would copy the file 'bar' from the home directory of the admin user on
                  the box 'foo' to the current directory on the local machine.""" ) )
Beispiel #17
0
 def __init__( self, application, **kwargs ):
     """
     Register the options used to select an individual box/instance.
     """
     super( InstanceCommand, self ).__init__( application, **kwargs )
     self.option( '--cluster-name', '-c', metavar='NAME',
                  help=heredoc( """This option can be used to restrict the selection to boxes
                  that are part of a cluster of the given name. Boxes that are not part of a
                  cluster use their own instance id as the cluster name.""" ) )
     # A negative ordinal counts from the end of the creation-time-sorted list.
     self.option( '--ordinal', '-o', default=-1, type=int,
                  help=heredoc( """Selects an individual box from the list of boxes performing
                  the specified role in a cluster of the given name. The ordinal is a
                  zero-based index into the list of all boxes performing the specified role,
                  sorted by creation time. This means that the ordinal of a box is not fixed,
                  it may change if another box performing the specified role is terminated. If
                  the ordinal is negative, it will be converted to a positive ordinal by
                  adding the number of boxes performing the specified role. Passing -1,
                  for example, selects the most recently created box.""" ) )
Beispiel #18
0
 def _word_count( self ):
     """
     Run a simple PySpark word-count job on the cluster as a smoke test.

     Downloads the Apache license text, puts it into HDFS, submits a
     word-count job via spark-submit and verifies that every expected output
     partition exists and is non-empty.

     NOTE(review): relies on `master` and `num_slaves` being defined in an
     enclosing scope — confirm against the full test module.
     """
     # Clean up any leftovers from a previous run, both in HDFS and locally.
     self._ssh( master, 'hdfs dfs -rm -r -f -skipTrash /test.txt /test.txt.counts' )
     self._ssh( master, 'rm -rf test.txt test.txt.counts' )
     self._ssh( master, 'curl -o test.txt https://www.apache.org/licenses/LICENSE-2.0.txt' )
     self._ssh( master, 'hdfs dfs -put -f test.txt /' )
     # mkstemp() returns (fd, path); the fd is wrapped in a file object below.
     script, script_path = mkstemp( )
     try:
         script = os.fdopen( script, 'w' )
         script.write( heredoc( """
             import sys
             from pyspark import SparkContext
             sc = SparkContext(appName="PythonPi")
             file = sc.textFile( "/test.txt" )
             counts = ( file
                 .flatMap( lambda line: line.split( " " ) )
                 .map( lambda word: (word, 1) )
                 .reduceByKey( lambda a, b: a + b ) )
             counts.saveAsTextFile( "/test.txt.counts" )""" ) )
         script.close( )
         self._rsync( master, script_path, ':wordcount.py' )
     except:
         # Close the file object on failure; the bare except re-raises, so no
         # exception is swallowed here.
         script.close( )
         raise
     finally:
         os.unlink( script_path )
     self._ssh( master, 'spark-submit --executor-memory 512m wordcount.py' )
     self._ssh( master, 'hdfs dfs -get /test.txt.counts' )
     self._ssh( master, 'test -f test.txt.counts/_SUCCESS' )
     # One non-empty output partition per slave is expected.
     for i in xrange( num_slaves ):
         self._ssh( master, 'test -s test.txt.counts/part-%05d' % i )
Beispiel #19
0
 def __init__( self, application ):
     """Register the options of the 'recreate' command."""
     super( RecreateCommand, self ).__init__( application )
     self.option(
         '--quick',
         '-Q',
         default=False,
         action='store_true',
         help=heredoc( """Don't wait for the box to become running or reachable via
                  SSH. If the agent is disabled in the boot image (this is uncommon,
                  see the --no-agent option to the 'create' command), no additional SSH
                  keypairs will be deployed.""" ) )
Beispiel #20
0
 def _setup_docker(self):
     """
     Configure Docker on the box: grant the configured users access to the
     Docker daemon and, if data-directory prefixes are configured, install
     an Upstart job ('dockerbox') that bind-mounts /var/lib/docker onto the
     first prefix that is a mount point (e.g. an ephemeral or EBS volume).
     """
     for docker_user in set(self._docker_users()):
         sudo("usermod -aG docker " + docker_user)
     prefixes = self._docker_data_prefixes()
     if prefixes:
         # Shell-quote each prefix; the joined string is spliced into the
         # generated job below via heredoc's {prefixes} placeholder.
         prefixes = " ".join(map(quote, prefixes))
         self._run_init_script("docker", "stop")
         # Make sure Docker's aufs backend isn't mounted anymore
         sudo("umount /var/lib/docker/aufs", warn_only=True)
         # Backup initial state of data directory so we can initialize an empty ephemeral volume
         sudo("tar -czC /var/lib docker > /var/lib/docker.tar.gz")
         # Then delete it and recreate it as an empty directory to serve as the bind mount point
         sudo("rm -rf /var/lib/docker && mkdir /var/lib/docker")
         # The job runs before the docker service starts and is torn down when
         # docker stops.
         self._register_init_script(
             "dockerbox",
             heredoc(
                 """
                 description "Placement of /var/lib/docker"
                 console log
                 start on starting docker
                 stop on stopped docker
                 pre-start script
                     echo
                     echo "This is the dockerbox pre-start script"
                     set -ex
                     if mountpoint -q /var/lib/docker; then
                         echo "The directory '/var/lib/docker' is already mounted, exiting."
                     else
                         for prefix in {prefixes}; do
                             # Prefix must refer to a separate volume, e.g. ephemeral or EBS
                             if mountpoint -q "$prefix"; then
                                 # Make sure Docker's aufs backend isn't mounted anymore
                                 umount /var/lib/docker/aufs || true
                                 if test -d "$prefix/var/lib/docker"; then
                                     echo "The directory '$prefix/var/lib/docker' already exists, using it."
                                 else
                                     mkdir -p "$prefix/var/lib"
                                     # If /var/lib/docker contains files ...
                                     if python -c 'import os, sys; sys.exit( 0 if os.listdir( sys.argv[1] ) else 1 )' /var/lib/docker; then
                                         # ... move it to prefix ...
                                         mv /var/lib/docker "$prefix/var/lib"
                                         # ... and recreate it as an empty mount point, ...
                                         mkdir -p /var/lib/docker
                                     else
                                         # ... otherwise untar the initial backup.
                                         tar -xzC "$prefix/var/lib" < /var/lib/docker.tar.gz
                                     fi
                                 fi
                                 # Now bind-mount into /var/lib/docker
                                 mount --bind "$prefix/var/lib/docker" /var/lib/docker
                                 break
                             else
                                 echo "The prefix directory '$prefix' is not a mount point, skipping."
                             fi
                         done
                     fi
                 end script"""
             ),
         )
         self._run_init_script("docker", "start")
Beispiel #21
0
 def __setup_ssh_config(self):
     """
     Append a spark-master host entry to the system-wide SSH client config,
     disabling host-key IP checking and known_hosts hashing for that alias.
     """
     config_stanza = heredoc("""
             Host spark-master
                 CheckHostIP no
                 HashKnownHosts no""")
     with remote_open('/etc/ssh/ssh_config', use_sudo=True) as ssh_config:
         ssh_config.write(config_stanza)
Beispiel #22
0
    def __register_systemd_jobs( self, service_map ):
        """
        Register a systemd unit for every service in the given map of node
        types to services.

        For each service, its command is uploaded as a /usr/sbin/*-start.sh
        shell script and a unit file is generated; heredoc interpolates
        {service.description}, {service_command_path}, {service.user} and
        {user} from the enclosing scope.
        """
        for node_type, services in service_map.iteritems( ):
            for service in services:
                service_command_path = '/usr/sbin/%s-start.sh' % service.init_name

                # Wrap the service command in a minimal shell script, upload it
                # and make it executable by root.
                put( local_path=StringIO( "#!/bin/sh\n" + service.command ), remote_path=service_command_path, use_sudo=True )
                sudo( "chown root:root '%s'" % service_command_path )
                sudo( "chmod +x '%s'" % service_command_path )

                self._register_init_script(
                    service.init_name,
                    heredoc( """
                        [Unit]
                        Description={service.description}
                        Before=docker.service
                        Wants=docker.service
                        Requires=mesosbox.service
                        After=mesosbox.service

                        [Service]
                        Type=simple
                        ExecStart={service_command_path}
                        User={service.user}
                        Group={service.user}
                        Environment="USER={user}"
                        LimitNOFILE=8000:8192
                        UMask=022

                        [Install]
                        WantedBy=multi-user.target
                        """ ) )
Beispiel #23
0
 def __init__( self, application ):
     """Register the options of the cluster-wide ssh command."""
     super( SshClusterCommand, self ).__init__( application )
     self.option(
         '--parallel',
         '-P',
         default=False,
         action='store_true',
         help=heredoc( """Run command on the workers in parallel. Note that this
                  doesn't work if SSH or the command itself prompts for input. This will
                  likely be the case on the first connection attempt when SSH typically
                  prompts for confirmation of the host key. An insecure work-around is to pass
                  "-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no".""" ) )
Beispiel #24
0
    def __install_tools( self ):
        """
        Installs the mesos-master-discovery init script and its companion mesos-tools. The latter
        is a Python package distribution that's included in cgcloud-mesos as a resource. This is
        in contrast to the cgcloud agent, which is a standalone distribution.

        Relies on the module-level globals install_dir, user, ephemeral_dir and
        persistent_dir (not visible in this block — confirm against the module).
        """
        tools_dir = install_dir + '/tools'
        admin = self.admin_account( )
        sudo( fmt( 'mkdir -p {tools_dir}' ) )
        # Temporarily owned by the admin account so the unprivileged run() calls below work
        sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
        run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
        # Pin pip; a specific version is bootstrapped via easy_install on purpose
        run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )

        with settings( forward_agent=True ):
            with self._project_artifacts( 'mesos-tools' ) as artifacts:
                pip( use_sudo=True,
                     path=tools_dir + '/bin/pip',
                     args=concat( 'install', artifacts ) )
        # Hand ownership back to root once installation is complete
        sudo( fmt( 'chown -R root:root {tools_dir}' ) )

        # Rendered into the init script below via the {mesos_tools} placeholder
        mesos_tools = "MesosTools(**%r)" % dict( user=user,
                                                 shared_dir=self._shared_dir( ),
                                                 ephemeral_dir=ephemeral_dir,
                                                 persistent_dir=persistent_dir,
                                                 lazy_dirs=self.lazy_dirs )

        self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it

        self._register_init_script(
            "mesosbox",
            heredoc( """
                description "Mesos master discovery"
                console log
                start on (local-filesystems and net-device-up IFACE!=lo)
                stop on runlevel [!2345]
                pre-start script
                for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.start()
                END
                then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
                end script
                post-stop script
                {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.stop()
                END
                end script""" ) )
        # Explicitly start the mesosbox service to achieve creation of lazy directories right
        # now. This makes a generic mesosbox useful for adhoc tests that involve Mesos and Toil.
        self._run_init_script( 'mesosbox' )
Beispiel #25
0
 def _setup_docker(self):
     """
     Grant Docker access to the configured users and, if data prefixes are configured,
     relocate /var/lib/docker onto one of those prefixes via the 'dockerbox' init job.
     """
     for docker_user in set(self._docker_users()):
         sudo("usermod -aG docker " + docker_user)
     prefixes = self._docker_data_prefixes()
     if prefixes:
         # Shell-quote each prefix; the joined string is substituted into the
         # init script below via the {prefixes} placeholder (frame-based heredoc()).
         prefixes = ' '.join(map(quote, prefixes))
         self._run_init_script('docker', 'stop')
         # Make sure Docker's aufs backend isn't mounted anymore
         sudo('umount /var/lib/docker/aufs', warn_only=True)
         # Backup initial state of data directory so we can initialize an empty ephemeral volume
         sudo('tar -czC /var/lib docker > /var/lib/docker.tar.gz')
         # Then delete it and recreate it as an empty directory to serve as the bind mount point
         sudo('rm -rf /var/lib/docker && mkdir /var/lib/docker')
         self._register_init_script(
             'dockerbox',
             heredoc("""
                 description "Placement of /var/lib/docker"
                 console log
                 start on starting docker
                 stop on stopped docker
                 pre-start script
                     echo
                     echo "This is the dockerbox pre-start script"
                     set -ex
                     if mountpoint -q /var/lib/docker; then
                         echo "The directory '/var/lib/docker' is already mounted, exiting."
                     else
                         for prefix in {prefixes}; do
                             # Prefix must refer to a separate volume, e.g. ephemeral or EBS
                             if mountpoint -q "$prefix"; then
                                 # Make sure Docker's aufs backend isn't mounted anymore
                                 umount /var/lib/docker/aufs || true
                                 if test -d "$prefix/var/lib/docker"; then
                                     echo "The directory '$prefix/var/lib/docker' already exists, using it."
                                 else
                                     mkdir -p "$prefix/var/lib"
                                     # If /var/lib/docker contains files ...
                                     if python -c 'import os, sys; sys.exit( 0 if os.listdir( sys.argv[1] ) else 1 )' /var/lib/docker; then
                                         # ... move it to prefix ...
                                         mv /var/lib/docker "$prefix/var/lib"
                                         # ... and recreate it as an empty mount point, ...
                                         mkdir -p /var/lib/docker
                                     else
                                         # ... otherwise untar the initial backup.
                                         tar -xzC "$prefix/var/lib" < /var/lib/docker.tar.gz
                                     fi
                                 fi
                                 # Now bind-mount into /var/lib/docker
                                 mount --bind "$prefix/var/lib/docker" /var/lib/docker
                                 break
                             else
                                 echo "The prefix directory '$prefix' is not a mount point, skipping."
                             fi
                         done
                     fi
                 end script"""))
         self._run_init_script('docker', 'start')
Beispiel #26
0
    def __install_tools(self):
        """
        Installs the mesos-master-discovery init script and its companion mesos-tools. The latter
        is a Python package distribution that's included in cgcloud-mesos as a resource. This is
        in contrast to the cgcloud agent, which is a standalone distribution.

        Relies on the module-level globals install_dir, user, ephemeral_dir and
        persistent_dir (not visible in this block — confirm against the module).
        """
        tools_dir = install_dir + '/tools'
        admin = self.admin_account()
        sudo(fmt('mkdir -p {tools_dir}'))
        # Temporarily owned by the admin account so the unprivileged run() calls below work
        sudo(fmt('chown {admin}:{admin} {tools_dir}'))
        run(fmt('virtualenv --no-pip {tools_dir}'))
        # Pin pip; a specific version is bootstrapped via easy_install on purpose
        run(fmt('{tools_dir}/bin/easy_install pip==1.5.2'))

        with settings(forward_agent=True):
            with self._project_artifacts('mesos-tools') as artifacts:
                pip(use_sudo=True,
                    path=tools_dir + '/bin/pip',
                    args=concat('install', artifacts))
        # Hand ownership back to root once installation is complete
        sudo(fmt('chown -R root:root {tools_dir}'))

        # Rendered into the init script below via the {mesos_tools} placeholder
        mesos_tools = "MesosTools(**%r)" % dict(user=user,
                                                shared_dir=self._shared_dir(),
                                                ephemeral_dir=ephemeral_dir,
                                                persistent_dir=persistent_dir,
                                                lazy_dirs=self.lazy_dirs)

        self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it

        self._register_init_script(
            "mesosbox",
            heredoc("""
                description "Mesos master discovery"
                console log
                start on (local-filesystems and net-device-up IFACE!=lo)
                stop on runlevel [!2345]
                pre-start script
                for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.start()
                END
                then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
                end script
                post-stop script
                {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.stop()
                END
                end script"""))
        # Explicitly start the mesosbox service to achieve creation of lazy directories right
        # now. This makes a generic mesosbox useful for adhoc tests that involve Mesos and Toil.
        self._run_init_script('mesosbox')
Beispiel #27
0
 def __init__(self, application, **kwargs):
     """Add the positional ROLE argument shared by all role-based commands."""
     super(RoleCommand, self).__init__(application, **kwargs)
     role_help = heredoc(
         """The name of the role. Use the list-roles command to show
                  all available roles.""")
     self.option('role', metavar='ROLE', completer=self.completer, help=role_help)
Beispiel #28
0
 def __init__(self, application, **kwargs):
     """Register the --quick flag controlling whether termination is awaited."""
     super(TerminateCommand, self).__init__(application, **kwargs)
     quick_help = heredoc(
         """Exit immediately after termination request has been made,
                  don't wait until the box is terminated.""")
     self.option('--quick', '-Q', action='store_true', default=False, help=quick_help)
Beispiel #29
0
 def __init__( self, application ):
     """Register the options that control how a new box is created and set up."""
     super( CreateCommand, self ).__init__( application )
     self.option(
         '--boot-image', '-i',
         metavar='AMI_ID',
         help=heredoc( """The AMI ID of the image from which to create the box. This
                  argument is optional and the default is determined automatically based on
                  the role. Typically, this option does not need to be used.""" ) )
     self.option(
         '--no-agent',
         action='store_true',
         default=False,
         help=heredoc( """Don't install the cghub-cloud-agent package on the box. One
                  note-worthy effect of using this option this is that the SSH keys will be
                  installed initially, but not maintained over time.""" ) )
     self.option(
         '--create-image', '-I',
         action='store_true',
         default=False,
         help='Create an image of the box as soon as setup completes.' )
     # FIXME: Take a second look at this: Does it work. Is it necessary?
     self.option(
         '--upgrade', '-U',
         action='store_true',
         default=False,
         help=heredoc( """Bring the package repository as well as any installed
                  packages up to date, i.e. do what on Ubuntu is achieved by doing 'sudo
                  apt-get update ; sudo apt-get upgrade'.""" ) )
Beispiel #30
0
 def __init__( self, application ):
     """Register the image-selection option (ordinal or AMI ID)."""
     super( ImageReferenceCommand, self ).__init__( application )
     image_help = heredoc( """An image ordinal, i.e. the index of an image in the list of
                  images for the given role, sorted by creation time. Use the list-images
                  command to print a list of images for a given role. If the ordinal is
                  negative, it will be converted to a positive ordinal by adding the total
                  number of images for this role. Passing -1, for example, selects the most
                  recently created image. Alternatively, an AMI ID, e.g. 'ami-4dcced7d' can be
                  passed in as well.""" )
     self.option( self.long_image_option,
                  self.short_image_option,
                  metavar='IMAGE',
                  type=self.ordinal_or_ami_id,
                  default=-1,  # the most recently created image
                  help=image_help )
Beispiel #31
0
 def __add_per_boot_script( self ):
     """
     Ensure that the cloud-init.done file is always created, even on 2nd boot and there-after.
     On the first boot of an instance, the .done file creation is performed by the runcmd
     stanza in cloud-config. On subsequent boots this per-boot script takes over (runcmd is
     skipped on those boots).
     """
     # mode=0755 is a Python 2 octal literal; the script must be executable for cloud-init
     put( remote_path=self._cloudinit_boot_script( 'done' ), mode=0755, use_sudo=True,
          local_path=StringIO( heredoc( """
                 #!/bin/sh
                 touch /tmp/cloud-init.done""" ) ) )
Beispiel #32
0
    def __init__( self, application ):
        """Register --cluster-name and --ordinal for selecting a specific cluster."""
        super( ClusterCommand, self ).__init__( application )

        name_help = heredoc( """The name of the cluster to operate on. The default is to
                     consider all clusters of the given type regardless of their name,
                     using --ordinal to disambiguate. Note that the cluster name is not
                     necessarily unique, not even with a specific cluster type, there may be more
                     than one cluster of a particular name and type.""" )
        self.option( '--cluster-name', '-c', metavar='NAME', help=name_help )

        ordinal_help = heredoc( """Selects an individual cluster from the list of currently
                     running clusters of the given cluster type and name. Since there is one
                     leader per cluster, this is equal to the ordinal of the leader among all
                     leaders of clusters of the given type and name. The ordinal is a zero-based
                     index into the list of all clusters of the specified type and name,
                     sorted by creation time. This means that the ordinal of a cluster is not
                     fixed, it may change if another cluster of the same type and name is
                     terminated. If the ordinal is negative, it will be converted to a positive
                     ordinal by adding the number of clusters of the specified type. Passing -1,
                     for example, selects the most recently created box.""" )
        self.option( '--ordinal', '-o', type=int, default=-1, help=ordinal_help )
Beispiel #33
0
 def __init__(self, application):
     """Capture every trailing argument so it can be handed to ssh verbatim."""
     super(SshCommandMixin, self).__init__(application)
     command_help = heredoc(
         """Additional arguments to pass to ssh. This can be anything
                  that one would normally pass to the ssh program excluding user name and host
                  but including, for example, the remote command to execute.""")
     # REMAINDER swallows everything after the positionals, including dashes
     self.option('command', metavar='...', nargs=argparse.REMAINDER, default=[],
                 help=command_help)
Beispiel #34
0
 def __init__(self, application):
     """Register the --quick flag for skipping the post-creation wait."""
     super(RecreateCommand, self).__init__(application)
     quick_help = heredoc(
         """Don't wait for the box to become running or reachable via
                  SSH. If the agent is disabled in the boot image (this is uncommon,
                  see the --no-agent option to the 'create' command), no additional SSH
                  keypairs will be deployed.""")
     self.option('--quick', '-Q', action='store_true', default=False, help=quick_help)
Beispiel #35
0
    def _docker_patch_heredoc( self ):
        """
        Return a unified diff (as a string) that makes the docker systemd unit depend on
        the mesosbox service, for application with patch(1) by the caller.
        """
        return heredoc( """
            --- docker.service.orig	2017-04-12 20:45:15.899906518 +0000
            +++ docker.service	2017-04-12 20:42:57.186495824 +0000
            @@ -3,6 +3,8 @@
             Documentation=https://docs.docker.com
             After=network-online.target docker.socket firewalld.service
             Wants=network-online.target
            +After=mesosbox.service
            +Requires=mesosbox.service
             Requires=docker.socket

             [Service]""" )
Beispiel #36
0
 def __init__(self, application):
     """Register the mutually exclusive --keep-snapshot and --quick flags."""
     super(DeleteImageCommand, self).__init__(application)
     keep_snapshot_help = heredoc(
         """Do not delete the EBS volume snapshot associated with the
                  given image. This will leave an orphaned snapshot which should be removed at
                  a later time using the 'cgcloud cleanup' command.""")
     quick_help = heredoc(
         """Exit immediately after deregistration request has been made,
                  don't wait until the image is deregistered. Implies --keep-snapshot.""")
     self.begin_mutex()
     self.option('--keep-snapshot', '-K', action='store_true', default=False,
                 help=keep_snapshot_help)
     self.option('--quick', '-Q', action='store_true', default=False,
                 help=quick_help)
     self.end_mutex()
Beispiel #37
0
 def __add_per_boot_script(self):
     """
     Ensure that the cloud-init.done file is always created, even on 2nd boot and thereafter.
     On the first boot of an instance, the .done file creation is performed by the runcmd
     stanza in cloud-config. On subsequent boots this per-boot script takes over (runcmd is
     skipped on those boots).
     """
     put(remote_path=self._cloudinit_boot_script('done'),
         mode=0755,  # Python 2 octal literal; script must be executable for cloud-init
         use_sudo=True,
         local_path=StringIO(
             heredoc("""
                 #!/bin/sh
                 touch /tmp/cloud-init.done""")))
Beispiel #38
0
 def __init__(self, application):
     """Add the --parallel flag to the clustered SSH command."""
     super(SshClusterCommand, self).__init__(application)
     # Off by default because parallel mode cannot service interactive prompts.
     parallel_help = heredoc(
         """Run command on the workers in parallel. Note that this
                  doesn't work if SSH or the command itself prompts for input. This will
                  likely be the case on the first connection attempt when SSH typically
                  prompts for confirmation of the host key. An insecure work-around is to pass
                  "-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no".""")
     self.option('--parallel', '-P', action='store_true', default=False,
                 help=parallel_help)
Beispiel #39
0
 def __init__(self, application, **kwargs):
     """Register the mutually exclusive --login and --admin user-selection options."""
     super(UserCommandMixin, self).__init__(application, **kwargs)
     login_help = heredoc(
         """Name of user to login as. The default depends on the role,
                  for most roles the default is the administrative user. Roles that define a
                  second less privileged application user will default to that user. Can't be
                  used together with -a, --admin.""")
     admin_help = heredoc(
         """Force logging in as the administrative user. Can't be used
                  together with -l, --login.""")
     self.begin_mutex()
     self.option('--login', '-l', metavar='USER', dest='user', default=None,
                 help=login_help)
     self.option('--admin', '-a', action='store_true', default=False,
                 help=admin_help)
     self.end_mutex()
Beispiel #40
0
    def __init__( self, application ):
        """Register the concurrency limit and the positional cluster TYPE argument."""
        super( ClusterTypeCommand, self ).__init__( application )
        self.option( '--num-threads', metavar='NUM', type=int, default=100,
                     help='The maximum number of tasks to be performed concurrently.' )

        type_help = heredoc( """The type of the cluster to be used. The cluster type is
                     covariant with the role of the leader node. For example, a box performing
                     the 'foo-leader' role will be part of a cluster of type 'foo'.""" )
        self.option( 'cluster_type', metavar='TYPE',
                     completer=self.completer,
                     help=type_help )
Beispiel #41
0
 def __init__(self, application):
     """Register the image-selection option (ordinal or AMI ID)."""
     super(ImageReferenceCommand, self).__init__(application)
     image_help = heredoc(
         """An image ordinal, i.e. the index of an image in the list of
                  images for the given role, sorted by creation time. Use the list-images
                  command to print a list of images for a given role. If the ordinal is
                  negative, it will be converted to a positive ordinal by adding the total
                  number of images for this role. Passing -1, for example, selects the most
                  recently created image. Alternatively, an AMI ID, e.g. 'ami-4dcced7d' can be
                  passed in as well.""")
     self.option(self.long_image_option, self.short_image_option,
                 metavar='IMAGE',
                 type=self.ordinal_or_ami_id,
                 default=-1,  # the most recently created image
                 help=image_help)
Beispiel #42
0
    def _install_mesosbox_tools( self ):
        """
        Installs the mesos-master-discovery init script and its companion mesos-tools. The latter
        is a Python package distribution that's included in cgcloud-mesos as a resource. This is
        in contrast to the cgcloud agent, which is a standalone distribution.

        Relies on the module-level globals install_dir, shared_dir, user, ephemeral_dir and
        persistent_dir (not visible in this block — confirm against the module).
        """
        tools_dir = install_dir + '/tools'
        sudo( fmt( 'mkdir -p {tools_dir}') )
        sudo( fmt('mkdir -p %s' % shared_dir) )
        # World-writable so unprivileged services can use the shared directory
        sudo( fmt('chmod 777 %s' % shared_dir) )

        sudo( fmt( 'virtualenv --no-pip {tools_dir}' ) )
        # Pin pip; a specific version is bootstrapped via easy_install on purpose
        sudo( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )

        mesos_tools_artifacts = ' '.join( self._project_artifacts( 'mesos-tools' ) )
        with settings( forward_agent=True ):
            # pty=False so pip's output is not mangled by a pseudo-terminal
            sudo( fmt( '{tools_dir}/bin/pip install {mesos_tools_artifacts}' ), pty=False )

        # Rendered into the init script below via the {mesos_tools} placeholder
        mesos_tools = "MesosTools(**%r)" % dict( user=user,
                                                 ephemeral_dir=ephemeral_dir,
                                                 persistent_dir=persistent_dir,
                                                 lazy_dirs=self.lazy_dirs)
        self._register_init_script(
            "mesosbox",
            heredoc( """
                description "Mesos master discovery"
                console log
                start on runlevel [2345]
                stop on runlevel [016]
                pre-start script
                {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.start()
                END
                end script
                post-stop script
                {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.mesos_tools import MesosTools
                mesos_tools = {mesos_tools}
                mesos_tools.stop()
                END
                end script""" ) )
Beispiel #43
0
    def _setup_docker( self ):
        """
        Extend the base Docker setup by patching the docker upstart job to start only
        after the mesosbox job has started.
        """
        super( ToilBoxSupport, self )._setup_docker( )
        # The docker and dockerbox init jobs depend on /mnt/persistent which is set up by the
        # mesosbox job. Adding a dependency of the docker job on mesosbox should satisfy that
        # dependency.
        with remote_sudo_popen( 'patch -d /etc/init' ) as patch:
            patch.write( heredoc( """
                --- docker.conf.orig	2015-12-18 23:28:48.693072560 +0000
                +++ docker.conf	2015-12-18 23:40:30.553072560 +0000
                @@ -1,6 +1,6 @@
                 description "Docker daemon"

                -start on (local-filesystems and net-device-up IFACE!=lo)
                +start on (local-filesystems and net-device-up IFACE!=lo and started mesosbox)
                 stop on runlevel [!2345]
                 limit nofile 524288 1048576
                 limit nproc 524288 1048576""" ) )
Beispiel #44
0
 def __register_upstart_jobs(self, service_map):
     """
     Install an upstart job per service, chained so each service starts after the
     previous one; the first starts on the per-node-type sparkbox-start event.

     :param service_map: dict mapping a node type to a list of service descriptors
            (init_name, description, start_script, stop_script attributes).
     """
     for node_type, services in service_map.iteritems():
         start_on = "sparkbox-start-" + node_type
         for service in services:
             # NOTE(review): heredoc() appears to substitute {service...}, {start_on}
             # and {user} from this frame's locals/globals — do not rename them.
             self._register_init_script(
                 service.init_name,
                 heredoc("""
                     description "{service.description}"
                     console log
                     start on {start_on}
                     stop on runlevel [016]
                     setuid {user}
                     setgid {user}
                     env USER={user}
                     pre-start exec {service.start_script}
                     post-stop exec {service.stop_script}"""))
             # Chain the next service onto this one
             start_on = "started " + service.init_name
Beispiel #45
0
    def __patch_asynchat(self):
        """
        This bites us in pyftpdlib during S3AM unit tests:

        http://jenkins.cgcloud.info/job/s3am/13/testReport/junit/src.s3am.test.s3am_tests/CoreTests/test_copy/

        The patch is from

        https://hg.python.org/cpython/rev/d422062d7d36
        http://bugs.python.org/issue16133
        Fixed in 2.7.9: https://hg.python.org/cpython/raw-file/v2.7.9/Misc/NEWS
        """
        # Only patch interpreters that predate the upstream fix
        if self._remote_python_version() < (2, 7, 9):
            with remote_sudo_popen('patch -d /usr/lib/python2.7 -p2') as patch:
                patch.write(
                    heredoc('''
                    diff --git a/Lib/asynchat.py b/Lib/asynchat.py
                    --- a/Lib/asynchat.py
                    +++ b/Lib/asynchat.py
                    @@ -46,12 +46,17 @@ method) up to the terminator, and then c
                     you - by calling your self.found_terminator() method.
                     """

                    +import asyncore
                    +import errno
                     import socket
                    -import asyncore
                     from collections import deque
                     from sys import py3kwarning
                     from warnings import filterwarnings, catch_warnings

                    +_BLOCKING_IO_ERRORS = (errno.EAGAIN, errno.EALREADY, errno.EINPROGRESS,
                    +                       errno.EWOULDBLOCK)
                    +
                    +
                     class async_chat (asyncore.dispatcher):
                         """This is an abstract class.  You must derive from this class, and add
                         the two methods collect_incoming_data() and found_terminator()"""
                    @@ -109,6 +114,8 @@ class async_chat (asyncore.dispatcher):
                            try:
                                data = self.recv (self.ac_in_buffer_size)
                            except socket.error, why:
                    +            if why.args[0] in _BLOCKING_IO_ERRORS:
                    +                return
                                self.handle_error()
                                return'''))
Beispiel #46
0
 def __register_upstart_jobs( self, service_map ):
     """
     Install an upstart job per service, chained so each service starts after the
     previous one; the first starts on the per-node-type sparkbox-start event.

     :param service_map: dict mapping a node type to a list of service descriptors
            (init_name, description, start_script, stop_script attributes).
     """
     for node_type, services in service_map.iteritems( ):
         start_on = "sparkbox-start-" + node_type
         for service in services:
             # NOTE(review): heredoc() appears to substitute {service...}, {start_on}
             # and {user} from this frame's locals/globals — do not rename them.
             self._register_init_script(
                 service.init_name,
                 heredoc( """
                     description "{service.description}"
                     console log
                     start on {start_on}
                     stop on runlevel [016]
                     setuid {user}
                     setgid {user}
                     env USER={user}
                     pre-start exec {service.start_script}
                     post-stop exec {service.stop_script}""" ) )
             # Chain the next service onto this one
             start_on = "started " + service.init_name
Beispiel #47
0
 def _sync_package_repos( self ):
     """
     Run 'apt-get update' on the box, retrying up to five times on the well-known
     'Hash Sum mismatch' race condition in apt's repository metadata handling.

     :raises RuntimeError: if the update fails for any other reason, or if the race
             condition persists after all retries.
     """
     # Cap apt's network timeouts so a dead mirror can't hang the update indefinitely
     put( remote_path='/etc/apt/apt.conf.d/99timeout',
          use_sudo=True,
          local_path=StringIO( heredoc( """
             Acquire::http::Timeout "10";
             Acquire::ftp::Timeout "10"; """ ) ), )
     # Loop-invariant; hoisted so both error paths below can report the exact command
     cmd = self.apt_get + ' update'
     for i in range( 5 ):
         result = sudo( cmd, warn_only=True )
         if result.succeeded: return
         # https://bugs.launchpad.net/ubuntu/+source/apt/+bug/972077
         # https://lists.debian.org/debian-dak/2012/05/threads.html#00006
         if 'Hash Sum mismatch' in result:
             # Bug fix: the original logged a bare "... during in '%s'" with the %s
             # placeholder never filled in. Pass cmd as a lazy %-style logging argument.
             log.warn( "Detected race condition during '%s'", cmd )
         else:
             raise RuntimeError( "Command '%s' failed" % cmd )
     # Bug fix: the original raised with a literal, unfilled '%s' in the message
     raise RuntimeError(
         "Command '%s' repeatedly failed with race condition. Giving up." % cmd )
Beispiel #48
0
    def __patch_asynchat( self ):
        """
        This bites us in pyftpdlib during S3AM unit tests:

        http://jenkins.cgcloud.info/job/s3am/13/testReport/junit/src.s3am.test.s3am_tests/CoreTests/test_copy/

        The patch is from

        https://hg.python.org/cpython/rev/d422062d7d36
        http://bugs.python.org/issue16133
        Fixed in 2.7.9: https://hg.python.org/cpython/raw-file/v2.7.9/Misc/NEWS
        """
        # Only patch interpreters that predate the upstream fix
        if self._remote_python_version() < (2,7,9):
            with remote_sudo_popen( 'patch -d /usr/lib/python2.7 -p2' ) as patch:
                patch.write( heredoc( '''
                    diff --git a/Lib/asynchat.py b/Lib/asynchat.py
                    --- a/Lib/asynchat.py
                    +++ b/Lib/asynchat.py
                    @@ -46,12 +46,17 @@ method) up to the terminator, and then c
                     you - by calling your self.found_terminator() method.
                     """

                    +import asyncore
                    +import errno
                     import socket
                    -import asyncore
                     from collections import deque
                     from sys import py3kwarning
                     from warnings import filterwarnings, catch_warnings

                    +_BLOCKING_IO_ERRORS = (errno.EAGAIN, errno.EALREADY, errno.EINPROGRESS,
                    +                       errno.EWOULDBLOCK)
                    +
                    +
                     class async_chat (asyncore.dispatcher):
                         """This is an abstract class.  You must derive from this class, and add
                         the two methods collect_incoming_data() and found_terminator()"""
                    @@ -109,6 +114,8 @@ class async_chat (asyncore.dispatcher):
                            try:
                                data = self.recv (self.ac_in_buffer_size)
                            except socket.error, why:
                    +            if why.args[0] in _BLOCKING_IO_ERRORS:
                    +                return
                                self.handle_error()
                                return''' ) )
Beispiel #49
0
    def _setup_docker(self):
        """
        Extend the base Docker setup by patching the docker upstart job to start only
        after the mesosbox job has started.
        """
        super(ToilBox, self)._setup_docker()
        # The docker and dockerbox init jobs depend on /mnt/persistent which is set up by the
        # mesosbox job. Adding a dependency of the docker job on mesosbox should satisfy that
        # dependency.
        with remote_sudo_popen('patch -d /etc/init') as patch:
            patch.write(
                heredoc("""
                --- docker.conf.orig	2015-12-18 23:28:48.693072560 +0000
                +++ docker.conf	2015-12-18 23:40:30.553072560 +0000
                @@ -1,6 +1,6 @@
                 description "Docker daemon"

                -start on (local-filesystems and net-device-up IFACE!=lo)
                +start on (local-filesystems and net-device-up IFACE!=lo and started mesosbox)
                 stop on runlevel [!2345]
                 limit nofile 524288 1048576
                 limit nproc 524288 1048576"""))
Beispiel #50
0
 def __register_upstart_jobs( self, service_map ):
     """
     Install an upstart job per service, chained so each service starts after the
     previous one; the first starts on the per-node-type mesosbox-start event.

     :param service_map: dict mapping a node type to a list of service descriptors
            (init_name, description, command attributes).
     """
     for node_type, services in service_map.iteritems( ):
         start_on = "mesosbox-start-" + node_type
         for service in services:
             # NOTE(review): heredoc() appears to substitute {service...}, {start_on}
             # and {user} from this frame's locals/globals — do not rename them.
             self._register_init_script(
                 service.init_name,
                 heredoc( """
                     description "{service.description}"
                     console log
                     start on {start_on}
                     stop on runlevel [016]
                     respawn
                     umask 022
                     limit nofile 8000 8192
                     setuid {user}
                     setgid {user}
                     env USER={user}
                     exec {service.command}""" ) )
             # Chain the next service onto this one
             start_on = "started " + service.init_name
Beispiel #51
0
 def __register_upstart_jobs( self, service_map ):
     """
     Register one Upstart init script per service, chaining start-up order within each
     node type: the first service starts on the 'mesosbox-start-<node_type>' event, each
     subsequent one starts once the previous service has started.

     :param service_map: mapping from node type (role name) to the list of service
            descriptors to run on nodes of that type.

     NOTE(review): heredoc() appears to interpolate the {...} placeholders from the
     caller's locals (start_on, service) and enclosing scope (user) -- confirm before
     renaming any of these.
     """
     for node_type, services in service_map.iteritems( ):
         start_on = "mesosbox-start-" + node_type
         for service in services:
             self._register_init_script(
                 service.init_name,
                 heredoc( """
                     description "{service.description}"
                     console log
                     start on {start_on}
                     stop on runlevel [016]
                     respawn
                     umask 022
                     limit nofile 8000 8192
                     setuid {user}
                     setgid {user}
                     env USER={user}
                     exec {service.command}""" ) )
             # Chain: the next service in the list starts after this one.
             start_on = "started " + service.init_name
Beispiel #52
0
    def __patch_distutils(self):
        """
        Patch the remote distutils 'upload' command so that upload failures raise (and thus
        yield a non-zero exit code) instead of being silently swallowed. Only applied to
        remote Pythons older than 2.7.8, where the bug was fixed upstream.

        https://hg.python.org/cpython/rev/cf70f030a744/
        https://bitbucket.org/pypa/setuptools/issues/248/exit-code-is-zero-when-upload-fails
        Fixed in 2.7.8: https://hg.python.org/cpython/raw-file/v2.7.8/Misc/NEWS
        """
        if self._remote_python_version() < (2, 7, 8):
            # NOTE: the heredoc is a unified diff applied verbatim by patch(1); do not
            # reformat its contents.
            with remote_sudo_popen("patch -d /usr/lib/python2.7 -p2") as patch:
                patch.write(
                    heredoc(
                        """
                    --- a/Lib/distutils/command/upload.py
                    +++ b/Lib/distutils/command/upload.py
                    @@ -10,7 +10,7 @@ import urlparse
                     import cStringIO as StringIO
                     from hashlib import md5

                    -from distutils.errors import DistutilsOptionError
                    +from distutils.errors import DistutilsError, DistutilsOptionError
                     from distutils.core import PyPIRCCommand
                     from distutils.spawn import spawn
                     from distutils import log
                    @@ -181,7 +181,7 @@ class upload(PyPIRCCommand):
                                    self.announce(msg, log.INFO)
                            except socket.error, e:
                                self.announce(str(e), log.ERROR)
                    -            return
                    +            raise
                            except HTTPError, e:
                                status = e.code
                                reason = e.msg
                    @@ -190,5 +190,6 @@ class upload(PyPIRCCommand):
                                self.announce('Server response (%s): %s' % (status, reason),
                                              log.INFO)
                            else:
                    -            self.announce('Upload failed (%s): %s' % (status, reason),
                    -                          log.ERROR)
                    +            msg = 'Upload failed (%s): %s' % (status, reason)
                    +            self.announce(msg, log.ERROR)
                    +            raise DistutilsError(msg)"""
                    )
                )
Beispiel #53
0
 def _register_upstart_jobs( self, service_map ):
     """
     Register one Upstart init script per service, chaining start-up order within each
     node type: the first service starts on the 'mesosbox-start-<node_type>' event, each
     subsequent one starts once the previous service has started.

     :param service_map: mapping from node type (role name) to the list of service
            descriptors to run on nodes of that type.

     NOTE(review): heredoc() appears to interpolate the {...} placeholders from the
     caller's locals (start_on, service) and enclosing scope (user) -- confirm before
     renaming any of these.
     """
     for node_type, services in service_map.iteritems( ):
         start_on = "mesosbox-start-" + node_type
         for service in services: # FIXME: include chdir to logging directory in this script
             self._register_init_script(
                 service.init_name,
                 heredoc( """
                     description "{service.description}"
                     console log
                     respawn
                     umask 022
                     limit nofile 8000 8192
                     setuid {user}
                     setgid {user}
                     env USER={user}
                     env PYTHONPATH=/home/ubuntu/
                     start on {start_on}
                     stop on runlevel [016]
                     exec {service.action}""" ) )
             # Chain: the next service in the list starts after this one.
             start_on = "started " + service.init_name
Beispiel #54
0
    def __setup_application_user( self ):
        sudo( fmt( 'useradd '
                   '--home /home/{user} '
                   '--create-home '
                   '--user-group '
                   '--shell /bin/bash {user}' ) )

        sudoer_file = heredoc( """
            # CGcloud - MesosBox

            # User rules for ubuntu
            mesosbox ALL=(ALL) NOPASSWD:ALL

            # User rules for ubuntu
            mesosbox ALL=(ALL) NOPASSWD:ALL
            """ )

        sudoer_file_path = '/etc/sudoers.d/89-mesosbox-user'
        put( local_path=StringIO( sudoer_file ), remote_path=sudoer_file_path, use_sudo=True, mode=0440 )
        sudo( "chown root:root '%s'" % sudoer_file_path )
Beispiel #55
0
 def _sync_package_repos(self):
     """
     Refresh the remote apt package index, retrying up to five times to work around a
     known race condition in apt's mirror handling ('Hash Sum mismatch').

     :raises RuntimeError: if 'apt-get update' fails for a reason other than the race
             condition, or keeps hitting the race condition after five attempts.
     """
     # Keep apt from hanging indefinitely on an unresponsive mirror.
     put(
         remote_path='/etc/apt/apt.conf.d/99timeout',
         use_sudo=True,
         local_path=StringIO(
             heredoc("""
             Acquire::http::Timeout "10";
             Acquire::ftp::Timeout "10"; """)),
     )
     # Hoisted out of the loop: the command is loop-invariant and is also needed for the
     # final error message below.
     cmd = self.apt_get + ' update'
     for i in range(5):
         result = sudo(cmd, warn_only=True)
         if result.succeeded: return
         # https://bugs.launchpad.net/ubuntu/+source/apt/+bug/972077
         # https://lists.debian.org/debian-dak/2012/05/threads.html#00006
         if 'Hash Sum mismatch' in result:
             # Bug fix: the original message left the '%s' placeholder unfilled and read
             # "during in '%s'".
             log.warn("Detected race condition during '%s', retrying", cmd)
         else:
             raise RuntimeError("Command '%s' failed" % cmd)
     # Bug fix: the original message left the '%s' placeholder unfilled.
     raise RuntimeError(
         "Command '%s' repeatedly failed with race condition. Giving up." % cmd)
Beispiel #56
0
    def __patch_distutils(self):
        """
        Patch the remote distutils 'upload' command so that upload failures raise (and thus
        yield a non-zero exit code) instead of being silently swallowed. Only applied to
        remote Pythons older than 2.7.8, where the bug was fixed upstream.

        https://hg.python.org/cpython/rev/cf70f030a744/
        https://bitbucket.org/pypa/setuptools/issues/248/exit-code-is-zero-when-upload-fails
        Fixed in 2.7.8: https://hg.python.org/cpython/raw-file/v2.7.8/Misc/NEWS
        """
        if self._remote_python_version() < (2, 7, 8):
            # NOTE: the heredoc is a unified diff applied verbatim by patch(1); do not
            # reformat its contents.
            with remote_sudo_popen('patch -d /usr/lib/python2.7 -p2') as patch:
                patch.write(
                    heredoc("""
                    --- a/Lib/distutils/command/upload.py
                    +++ b/Lib/distutils/command/upload.py
                    @@ -10,7 +10,7 @@ import urlparse
                     import cStringIO as StringIO
                     from hashlib import md5

                    -from distutils.errors import DistutilsOptionError
                    +from distutils.errors import DistutilsError, DistutilsOptionError
                     from distutils.core import PyPIRCCommand
                     from distutils.spawn import spawn
                     from distutils import log
                    @@ -181,7 +181,7 @@ class upload(PyPIRCCommand):
                                    self.announce(msg, log.INFO)
                            except socket.error, e:
                                self.announce(str(e), log.ERROR)
                    -            return
                    +            raise
                            except HTTPError, e:
                                status = e.code
                                reason = e.msg
                    @@ -190,5 +190,6 @@ class upload(PyPIRCCommand):
                                self.announce('Server response (%s): %s' % (status, reason),
                                              log.INFO)
                            else:
                    -            self.announce('Upload failed (%s): %s' % (status, reason),
                    -                          log.ERROR)
                    +            msg = 'Upload failed (%s): %s' % (status, reason)
                    +            self.announce(msg, log.ERROR)
                    +            raise DistutilsError(msg)"""))
Beispiel #57
0
    def __init__(self, application):
        """
        Initialize this command and register its command-line options: --num-threads (the
        maximum number of concurrently performed tasks, default 100) and the positional
        cluster_type argument.

        NOTE(review): the original docstring here ("Set later, once we have a context.
        :type: Cluster") looked like a misplaced attribute docstring and was replaced.
        """
        super(ClusterTypeCommand, self).__init__(application)
        self.option(
            '--num-threads',
            metavar='NUM',
            type=int,
            default=100,
            dest='num_threads',
            help='The maximum number of tasks to be performed concurrently.')

        self.option(
            'cluster_type',
            metavar='TYPE',
            completer=self.completer,
            help=heredoc(
                """The type of the cluster to be used. The cluster type is
                     covariant with the role of the leader node. For example, a box performing
                     the 'foo-leader' role will be part of a cluster of type 'foo'."""
            ))
Beispiel #58
0
    def __install_tools( self ):
        """
        Installs the spark-master-discovery init script and its companion spark-tools. The latter
        is a Python package distribution that's included in cgcloud-spark as a resource. This is
        in contrast to the cgcloud agent, which is a standalone distribution.

        NOTE(review): fmt() and heredoc() appear to interpolate {...} placeholders from the
        caller's locals (tools_dir, spark_tools, script_path) -- confirm before renaming them.
        """
        tools_dir = install_dir + '/tools'
        admin = self.admin_account( )
        sudo( fmt( 'mkdir -p {tools_dir}' ) )
        # Temporarily hand ownership to the admin account so the unprivileged steps below
        # can populate the virtualenv; reverted to root once installation is done.
        sudo( fmt( 'chown {admin}:{admin} {tools_dir}' ) )
        # Pin pip via easy_install rather than letting virtualenv install its bundled pip.
        run( fmt( 'virtualenv --no-pip {tools_dir}' ) )
        run( fmt( '{tools_dir}/bin/easy_install pip==1.5.2' ) )

        with settings( forward_agent=True ):
            with self._project_artifacts( 'spark-tools' ) as artifacts:
                pip( use_sudo=True,
                     path=tools_dir + '/bin/pip',
                     args=concat( 'install', artifacts ) )
        sudo( fmt( 'chown -R root:root {tools_dir}' ) )

        # A Python expression (as a string) that reconstructs a SparkTools instance; it is
        # embedded into the scripts below and evaluated on the remote box.
        spark_tools = "SparkTools(**%r)" % dict( user=user,
                                                 shared_dir=self._shared_dir( ),
                                                 install_dir=install_dir,
                                                 ephemeral_dir=ephemeral_dir,
                                                 persistent_dir=persistent_dir,
                                                 lazy_dirs=self.lazy_dirs )

        self.lazy_dirs = None  # make sure it can't be used anymore once we are done with it

        # The pre-start script retries SparkTools.start() up to three times, one minute
        # apart, before giving up.
        self._register_init_script(
            "sparkbox",
            heredoc( """
                description "Spark/HDFS master discovery"
                console log
                start on (local-filesystems and net-device-up IFACE!=lo)
                stop on runlevel [!2345]
                pre-start script
                for i in 1 2 3; do if {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.spark_tools import SparkTools
                spark_tools = {spark_tools}
                spark_tools.start()
                END
                then exit 0; fi; echo Retrying in 60s; sleep 60; done; exit 1
                end script
                post-stop script
                {tools_dir}/bin/python2.7 - <<END
                import logging
                logging.basicConfig( level=logging.INFO )
                from cgcloud.spark_tools import SparkTools
                spark_tools = {spark_tools}
                spark_tools.stop()
                END
                end script""" ) )

        script_path = "/usr/local/bin/sparkbox-manage-slaves"
        put( remote_path=script_path, use_sudo=True, local_path=StringIO( heredoc( """
            #!{tools_dir}/bin/python2.7
            import sys
            import logging
            # Prefix each log line to make it more obvious that it's the master logging when the
            # slave calls this script via ssh.
            logging.basicConfig( level=logging.INFO,
                                 format="manage_slaves: " + logging.BASIC_FORMAT )
            from cgcloud.spark_tools import SparkTools
            spark_tools = {spark_tools}
            spark_tools.manage_slaves( slaves_to_add=sys.argv[1:] )""" ) ) )
        sudo( fmt( "chown root:root {script_path} && chmod 755 {script_path}" ) )
Beispiel #59
0
 def __setup_ssh_config( self ):
     """
     Append a client-side SSH configuration stanza for the 'spark-master' host that
     disables host-IP checking and known-hosts hashing.
     """
     stanza = heredoc( """
         Host spark-master
             CheckHostIP no
             HashKnownHosts no""" )
     with remote_open( '/etc/ssh/ssh_config', use_sudo=True ) as ssh_config:
         ssh_config.write( stanza )
    def __configure_gridengine( self ):
        """
        Configure the GridEngine daemons (master and exec) and create a default queue. Ensure
        that the queue is updated to reflect the number of cores actually available.
        """

        # Patterns for parsing qconf's key/whitespace/value output, see qconf_dict below.
        ws = re.compile( r'\s+' )
        nl = re.compile( r'[\r\n]+' )

        def qconf( opt, **kwargs ):
            # Convenience wrapper: pass qconf attributes as keyword arguments.
            return qconf_dict( opt, kwargs )

        def qconf_dict( opt, d=None, file_name='qconf.tmp' ):
            # With a dict: upload it as a temp file and feed it to qconf (write mode).
            # Without one: run qconf and parse its output into a dict (read mode).
            if d:
                # qconf can't read from stdin for some reason, neither -, /dev/stdin or /dev/fd/0 works
                s = '\n'.join( ' '.join( i ) for i in d.iteritems( ) ) + '\n'
                put( remote_path=file_name, local_path=StringIO( s ) )
                sudo( ' '.join( [ 'qconf', opt, file_name ] ) )
                run( ' '.join( [ 'rm', file_name ] ) )
            else:
                return dict( tuple( ws.split( l, 1 ) )
                                 for l in nl.split( run( 'SGE_SINGLE_LINE=1 qconf ' + opt ) )
                                 if l and not l.startswith( '#' ) )

        # Add the user defined in fname to the Sun Grid Engine cluster.
        qconf( '-Auser', name=Jenkins.user, oticket='0', fshare='0', delete_time='0',
               default_project='NONE' )

        # Adds users to Sun Grid Engine user access lists (ACLs).
        sudo( 'qconf -au %s arusers' % Jenkins.user )

        # Add hosts hostname to the list of hosts allowed to submit Sun Grid Engine jobs and
        # control their behavior only.
        sudo( 'qconf -as localhost' )

        # Remove all currently defined execution hosts
        run( 'for i in `qconf -sel`; do sudo qconf -de $i ; done' )

        # Add an execution host
        qconf( '-Ae', hostname='localhost', load_scaling='NONE', complex_values='NONE',
               user_lists='arusers', xuser_lists='NONE', projects='NONE', xprojects='NONE',
               usage_scaling='NONE', report_variables='NONE' )

        # Add a parallel environment
        qconf( '-Ap', pe_name='smp', slots='999', user_lists='NONE', xuser_lists='NONE',
               start_proc_args='/bin/true', stop_proc_args='/bin/true', allocation_rule='$pe_slots',
               control_slaves='FALSE', job_is_first_task='TRUE', urgency_slots='min',
               accounting_summary='FALSE' )

        # Add a queue, the slots and processors will be adjusted dynamically, by an init script
        qconf( '-Aq', qname='all.q', processors='1', slots='1', hostlist='localhost', seq_no='0',
               load_thresholds='np_load_avg=1.75', suspend_thresholds='NONE', nsuspend='1',
               suspend_interval='00:05:00', priority='0', min_cpu_interval='00:05:00',
               qtype='BATCH INTERACTIVE', ckpt_list='NONE', pe_list='make smp', rerun='FALSE',
               tmpdir='/tmp', shell='/bin/bash', prolog='NONE', epilog='NONE',
               shell_start_mode='posix_compliant', starter_method='NONE', suspend_method='NONE',
               resume_method='NONE', terminate_method='NONE', notify='00:00:60', owner_list='NONE',
               user_lists='arusers', xuser_lists='NONE', subordinate_list='NONE',
               complex_values='NONE', projects='NONE', xprojects='NONE', calendar='NONE',
               initial_state='default', s_rt='INFINITY', h_rt='INFINITY', s_cpu='INFINITY',
               h_cpu='INFINITY', s_fsize='INFINITY', h_fsize='INFINITY', s_data='INFINITY',
               h_data='INFINITY', s_stack='INFINITY', h_stack='INFINITY', s_core='INFINITY',
               h_core='INFINITY', s_rss='INFINITY', h_rss='INFINITY', s_vmem='INFINITY',
               h_vmem='INFINITY' )

        # Enable on-demand scheduling. This will eliminate the long time that jobs spend waiting
        # in the qw state. There is no -Asconf so we have to fake it using -ssconf and -Msconf.
        sconf = qconf( '-ssconf' )
        sconf.update( dict( flush_submit_sec='1', flush_finish_sec='1',
                            schedule_interval='0:0:1' ) )
        qconf_dict( '-Msconf', sconf )

        # Enable immediate flushing of the accounting file. The SGE batch system in Toil uses the
        #  qacct program to determine the exit code of a finished job. The qacct program reads
        # the accounting file. By default, this file is written to every 15 seconds which means
        # that it may take up to 15 seconds before a finished job is seen by Toil. An
        # accounting_flush_time value of 00:00:00 causes the accounting file to be flushed
        # immediately, allowing qacct to report the status of finished jobs immediately. Again,
        # there is no -Aconf, so we fake it with -sconf and -Mconf. Also, the file name has to be
        # 'global'.
        conf = qconf( '-sconf' )
        params = dict( tuple( e.split( '=' ) ) for e in conf[ 'reporting_params' ].split( ' ' ) )
        params[ 'accounting_flush_time' ] = '00:00:00'
        conf[ 'reporting_params' ] = ' '.join( '='.join( e ) for e in params.iteritems( ) )
        qconf_dict( '-Mconf', conf, file_name='global' )

        # Register an init-script that ensures GridEngine uses localhost instead of hostname
        # NOTE(review): heredoc() appears to interpolate {path} from this local -- confirm
        # before renaming it.
        path = '/var/lib/gridengine/default/common/'
        self._register_init_script( 'gridengine-pre', heredoc( """
            description "GridEngine pre-start configuration"
            console log
            start on filesystem
            pre-start script
                echo localhost > {path}/act_qmaster ; chown sgeadmin:sgeadmin {path}/act_qmaster
                echo localhost `hostname -f` > {path}/host_aliases
            end script""" ) )

        # Register an init-script that adjust the queue config to reflect the number of cores
        self._register_init_script( 'gridengine-post', heredoc( """
            description "GridEngine post-start configuration"
            console log
            # I would rather depend on the gridengine daemons but don't know how as they are
            # started by SysV init scripts. Supposedly the 'rc' job is run last.
            start on started rc
            pre-start script
                cores=$(grep -c '^processor' /proc/cpuinfo)
                qconf -mattr queue processors $cores `qselect`
                qconf -mattr queue slots $cores `qselect`
            end script""" ) )

        # Run pre-start script
        for daemon in ('exec', 'master'):
            sudo( '/etc/init.d/gridengine-%s stop' % daemon )
        sudo( "killall -9 -r 'sge_.*'" )  # the exec daemon likes to hang
        self._run_init_script( 'gridengine-pre' )
        for daemon in ('master', 'exec'):
            sudo( '/etc/init.d/gridengine-%s start' % daemon )

        # Run post-start script
        self._run_init_script( 'gridengine-post' )
        # Poll until the exec daemon has registered with the master and the queue is usable.
        while 'execd is in unknown state' in run( 'qstat -f -q all.q -explain a', warn_only=True ):
            time.sleep( 1 )