Example #1
def wait_nodes_ready(*args, **kwargs):
    '''ec2 nodes wait ready name_or_config [--timeout=300 --pem=/path/to/key.pem]

    Wait for nodes to finish booting and become fully ready, i.e. for
    all packages that need to be installed to finish installing.
    Normally this will be invoked during boot or import, but it can be
    useful if those run into a problem and you want to make sure all
    nodes have gotten back to a good state.
    '''

    name_or_config = arguments.parse_or_die(wait_nodes_ready, [object], *args)
    timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_PING_WAIT_TIMEOUT', default=300))
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)

    print "Waiting for nodes to become pingable..."
    pingable = wait_pingable(cc, timeout=timeout)
    if pingable != 0: return pingable
    # Give a bit more time for the nodes to become ready, pinging
    # may happen before all services are finished starting
    print "Sleeping to allow nodes to finish booting"
    time.sleep(15)
    print "Waiting for initial services and Sirikata binaries to install"
    ready = wait_ready(cc, '/home/ubuntu/ready/sirikata', timeout=timeout, pem=pemfile)
    return ready
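
A minimal usage sketch (the cluster name and key path are hypothetical, not from the source):

# Equivalent to: sirikata-cluster.py ec2 nodes wait ready mycluster --timeout=300 --pem=~/.ssh/mykey.pem
retcode = wait_nodes_ready('mycluster', timeout='300', pem='~/.ssh/mykey.pem')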
Example #2
def create(*args, **kwargs):
    """ec2 create name size puppet_master keypair [--instance-type=t1.micro] [--group=security_group] [--ami=i-x7395]

    Create a new cluster. This just creates a record of the cluster
    and saves its properties; it doesn't actually allocate any nodes.
    """

    name, size, puppet_master, keypair = arguments.parse_or_die(create, [str, int, str, str], *args)

    instance_type = config.kwarg_or_get('instance-type', kwargs, 'INSTANCE_TYPE')
    group = config.kwarg_or_get('group', kwargs, 'SECURITY_GROUP')
    ami = config.kwarg_or_get('ami', kwargs, 'BASE_AMI')

    cc = EC2GroupConfig(name,
                           size=size, keypair=keypair,
                           instance_type=instance_type,
                           group=group, ami=ami,
                           puppet_master=puppet_master)
    cc.save()

    # Make sure we have a nodes config for puppet. Not strictly needed
    # here, but this is a convenient place to ensure it's done, since
    # nothing else can happen with the cluster until create has been run
    puppet.generate_default_node_config()

    return 0
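
A usage sketch; all values are hypothetical, and group/ami can instead come from the SECURITY_GROUP and BASE_AMI settings:

# Equivalent to: sirikata-cluster.py ec2 create mycluster 4 puppet.example.com mykeypair --instance-type=t1.micro --group=sirikata --ami=ami-x7395
retcode = create('mycluster', 4, 'puppet.example.com', 'mykeypair',
                 **{'instance-type': 't1.micro', 'group': 'sirikata', 'ami': 'ami-x7395'})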
Example #3
def import_nodes(*args, **kwargs):
    """ec2 nodes import name_or_config instance1_id instance2_id ... [--wait-timeout=300 --pem=/path/to/key.pem]

    Import instances from a spot reservation and then perform the boot
    sequence on them. The command will block for wait-timeout seconds,
    or until all nodes reach a ready state (currently defined as being
    pingable and containing files indicating readiness). A
    wait-timeout of 0 disables this. A pem file, either passed on the
    command line or through the environment, is required for the
    timeout to work properly. Note that with timeouts enabled, this
    will check that the nodes reach a ready state.
    """

    name_or_config, instances_to_add = arguments.parse_or_die(import_nodes, [object], rest=True, *args)
    timeout = int(config.kwarg_or_default('wait-timeout', kwargs, default=600))
    # Note pemfile is different from other places since it's only required with wait-timeout.
    pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None)
    name, cc = name_and_config(name_or_config)

    if 'spot' not in cc.state:
        print "It looks like this cluster hasn't made a spot reservation..."
        return 1

    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)

    instances_to_add = list(instances_to_add)
    if len(instances_to_add) == 0:
        print "No instances specified, trying to use full list of account instances..."
        reservations = conn.get_all_instances()
        for res in reservations:
            instances_to_add += [inst.id for inst in res.instances if inst.state == 'running']
    if len(instances_to_add) != cc.size:
        print "Number of instances doesn't match the cluster size. Make sure you explicitly specify %d instances" % (cc.size)
        return 1

    cc.state['instances'] = instances_to_add

    # Verify the instances are valid, just checking that we get valid
    # objects back when we look them up with AWS
    print "Verifying instances are valid..."
    instances = get_all_instances(cc, conn)
    if len(instances) != len(instances_to_add):
        print "Only got %d instances back, you'll need to manually clean things up..." % len(instances)
        return 1

    # Cache some information about the instances which shouldn't change
    cc.state['instance_props'] = dict(
        [
            (instid, {
                    'id' : instances[instid].id,
                    'ip' : instances[instid].ip_address,
                    'hostname' : instances[instid].dns_name,
                    'private_ip' : instances[instid].private_ip_address,
                    'private_hostname' : instances[instid].private_dns_name,
                    }) for instid in instances_to_add])
    cc.save()

    return name_and_boot_nodes(cc, conn, pemfile, timeout)
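
For reference, the cached record for a single imported instance has this shape (all values hypothetical):

example_instance_props = {
    'i-0123abcd': {  # hypothetical instance id
        'id': 'i-0123abcd',
        'ip': '203.0.113.10',
        'hostname': 'ec2-203-0-113-10.compute-1.amazonaws.com',
        'private_ip': '10.0.0.5',
        'private_hostname': 'ip-10-0-0-5.ec2.internal',
    },
}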
Example #4
def sync_sirikata(*args, **kwargs):
    """ec2 sync sirikata /path/to/installed/sirikata [--puppet-path=/etc/puppet] [--notify-puppets=cluster_name_or_config]

    Package a version of Sirikata installed in the given path and set
    it up with Puppet for distribution to puppet agent nodes.

    If you already have puppets running, add
    --notify-puppets=cluster_name to trigger a puppet update (runs the
    equivalent of sirikata-cluster.py puppet slaves restart cluster_name).
    """

    installed_path = arguments.parse_or_die(sync_sirikata, [str], *args)
    puppet_base_path = config.kwarg_or_get('puppet-path', kwargs, 'PUPPET_PATH', default='/etc/puppet')
    notify_puppets = config.kwarg_or_default('notify-puppets', kwargs)
    # Note pemfile is different from other places since it's only required with notify-puppets.
    pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None)

    # Generate the archive if given a directory
    gen_file = installed_path
    if os.path.isdir(installed_path):
        retcode = util_sirikata.package(installed_path)
        if retcode != 0: return retcode
        gen_file = os.path.join(installed_path, 'sirikata.tar.bz2')

    # Make sure we have a place to put the file
    dest_dir = os.path.join(puppet_base_path, 'modules', 'sirikata', 'files', 'home', 'ubuntu')
    if not os.path.exists(dest_dir):
        # Need root for this, so we have to do it through subprocess
        subprocess.call(['sudo', 'mkdir', '-p', dest_dir])

    # And copy it into place
    print "Copying archive into puppet"
    dest_file = os.path.join(dest_dir, 'sirikata.tar.bz2')
    subprocess.call(['sudo', 'cp', gen_file, dest_file])

    if notify_puppets:
        print "Notifying puppets"
        slaves_restart_kwargs = {}
        if pemfile is not None: slaves_restart_kwargs['pem'] = pemfile
        # notify_puppets == cluster name
        # Nuke old tar.bz2's so new ones will be downloaded
        nodes.ssh(notify_puppets, 'rm', '-f', 'sirikata.tar.bz2', **slaves_restart_kwargs)
        puppet.slaves_restart(notify_puppets, **slaves_restart_kwargs)
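
A usage sketch with hypothetical paths and cluster name; pem is only needed because notify-puppets triggers SSH to the slaves:

# Equivalent to: sirikata-cluster.py ec2 sync sirikata /path/to/sirikata --notify-puppets=mycluster --pem=/home/me/.ssh/mykey.pem
sync_sirikata('/path/to/sirikata',
              **{'notify-puppets': 'mycluster', 'pem': '/home/me/.ssh/mykey.pem'})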
Example #5
def update(*args, **kwargs):
    """puppet update cluster_name_or_config [--pem=/path/to/key.pem]

    Performs both master configuration and slave restart.
    """

    name_or_config = arguments.parse_or_die(update, [object], *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    master_config(yes=True)  # a positional '--yes' would be ignored; master_config only reads kwargs
    return slaves_restart(name_or_config, pem=pemfile)
Example #6
def slaves_restart(*args, **kwargs):
    """puppet slaves restart cluster_name [--pem=/path/to/key.pem]

    Restart puppet on the cluster's slave nodes. This is useful to
    force them to reconfigure themselves, since Puppet doesn't seem to
    have a good way of kicking all slaves to reconfigure and re-run
    their settings.
    """

    name_or_config = arguments.parse_or_die(slaves_restart, [object], *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    return nodes.ssh(name_or_config, 'sudo', 'service', 'puppet', 'restart', pem=pemfile)
Example #7
def wait_ready(*args, **kwargs):
    '''Wait for nodes to become ready, with an optional timeout. Ready
    means that puppet has finished configuring packages and left
    indicators that initial puppet configuration has completed. You
    should make sure all nodes are pingable before running this.'''

    name_or_config, files_to_check = arguments.parse_or_die(wait_ready, [object], rest=True, *args)
    timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_READY_WAIT_TIMEOUT', default=0))
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)

    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)

    instances_ips = get_all_ips(cc, conn)
    not_ready = set(instances_ips.keys())

    # Just loop, waiting on one node at a time (whichever is first in the set)
    waited = 0
    while not_ready and (timeout == 0 or waited < timeout):
        node_id = next(iter(not_ready))
        ip = instances_ips[node_id]
        print 'Waiting on %s (%s)' % (node_id, str(ip))
        node_idx = cc.state['instances'].index(node_id)
        remote_cmd = []
        for file_to_check in files_to_check:
            if remote_cmd: remote_cmd.append('&&')
            remote_cmd += ['test', '-f', file_to_check]
        retcode = node_ssh(cc, node_idx, *remote_cmd, pem=pemfile)
        if retcode == 0: # command success
            not_ready.remove(node_id)
            continue
        time.sleep(5)
        waited += 5

    if not_ready:
        print "Failed to find readiness indicators for %s" % (next(iter(not_ready)))
        exit(1)
    print "Success"
    return 0
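
To make the remote check concrete, this is how the command is assembled for two readiness files (the second path is hypothetical):

files_to_check = ['/home/ubuntu/ready/sirikata', '/home/ubuntu/ready/puppet']
remote_cmd = []
for file_to_check in files_to_check:
    if remote_cmd: remote_cmd.append('&&')
    remote_cmd += ['test', '-f', file_to_check]
print ' '.join(remote_cmd)
# test -f /home/ubuntu/ready/sirikata && test -f /home/ubuntu/ready/puppet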
Example #8
def set_node_type(*args, **kwargs):
    """ec2 node set type cluster_name_or_config node nodetype [--pem=/path/to/pem.key]

    Set the given node (by index, hostname, IP, etc) to be of the
    specified node type in Puppet, e.g. setting sirikata_redis to make
    it a Redis server. Setting to 'default' reverts to the original config.
    """

    name_or_config, nodeid, nodetype = arguments.parse_or_die(set_node_type, [object, str, str], *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    # Explicit check for known types so we don't get our config into a bad state
    if nodetype not in ['default', 'sirikata_redis']:
        print "The specified node type (%s) isn't known." % (nodetype)
        return 1

    name, cc = name_and_config(name_or_config)
    if 'instances' not in cc.state:
        print "No active instances were found, are you sure this cluster is currently running?"
        exit(1)

    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)

    # Update entry in local storage so we can update later
    if 'node-types' not in cc.state: cc.state['node-types'] = {}
    if 'capabilities' not in cc.state: cc.state['capabilities'] = {}
    inst = get_node(cc, conn, nodeid)
    if nodetype == 'default':
        if pacemaker_id(inst) in cc.state['node-types']:
            del cc.state['node-types'][pacemaker_id(inst)]
        if inst.id in cc.state['capabilities']:
            del cc.state['capabilities'][inst.id]
    else:
        cc.state['node-types'][pacemaker_id(inst)] = nodetype
        # Note: currently only one capability per node; the puppet setup doesn't really support composing them right now anyway...
        cc.state['capabilities'][inst.id] = 'redis'
    cc.save()

    # Generate config
    node_config = ''.join(["node '%s' inherits %s {}" % (pacemakerid,nt) for pacemakerid,nt in cc.state['node-types'].iteritems()])
    data.save(node_config, 'puppet', 'manifests', 'nodes.pp')

    pem_kwargs = {}
    if pemfile is not None: pem_kwargs['pem'] = pemfile
    return puppet.update(cc, **pem_kwargs)
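
The generated manifest is tiny; with a single redis node the logic above produces output like this (the pacemaker id is hypothetical):

node_types = {'node-0': 'sirikata_redis'}  # hypothetical pacemaker id -> node type
print ''.join(["node '%s' inherits %s {}" % (pid, nt) for pid, nt in node_types.iteritems()])
# node 'node-0' inherits sirikata_redis {}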
Example #9
def ssh(*args, **kwargs):
    """ec2 ssh cluster_name_or_config [--pem=/path/to/key.pem] [required additional arguments give command just like with real ssh]

    Run an SSH command on every node in the cluster. Note that this
    currently doesn't parallelize at all, so it can be a bit
    slow. This won't do ssh sessions -- you *must* provide a command
    to execute.
    """

    name_or_config, remote_cmd = arguments.parse_or_die(ssh, [object], rest=True, *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))
    if not remote_cmd:
        print "You need to add a command to execute across all the nodes."
        exit(1)

    name, cc = name_and_config(name_or_config)
    for inst_idx in range(len(cc.state['instances'])):
        node_ssh(cc, inst_idx, *remote_cmd, pem=pemfile)
Example #10
def wait_pingable(*args, **kwargs):
    '''Wait for nodes to become pingable, with an optional timeout.'''

    name_or_config = arguments.parse_or_die(wait_pingable, [object], *args)
    timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_PING_WAIT_TIMEOUT', default=0))

    name, cc = name_and_config(name_or_config)

    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)
    # We need to loop until we can get IPs for all nodes
    waited = 0
    while (timeout == 0 or waited < timeout):
        instances_ips = get_all_ips(cc, conn)
        not_pinged = set(instances_ips.keys())

        # If no nodes are missing IPs, we can move on to pinging
        if not any([ip is None for ip in instances_ips.values()]):
            break
        # Otherwise sleep awhile and then try again
        time.sleep(10)
        waited += 10

    # Just loop, waiting on one node at a time; the timeout clock restarts for the ping phase
    waited = 0
    while not_pinged and (timeout == 0 or waited < timeout):
        node_id = next(iter(not_pinged))
        ip = instances_ips[node_id]
        print 'Waiting on %s (%s)' % (node_id, str(ip))
        # One of those rare instances we just want to dump the output
        retcode = 0
        with open('/dev/null', 'w') as devnull:
            retcode = subprocess.call(['ping', '-c', '2', str(ip)], stdout=devnull, stderr=devnull)
        if retcode == 0: # ping success
            not_pinged.remove(node_id)
            continue
        time.sleep(5)
        waited += 5

    if not_pinged:
        print "Failed to ping %s" % (next(iter(not_pinged)))
        exit(1)
    print "Success"
    return 0
Example #11
def sync_files(*args, **kwargs):
    """ec2 sync files cluster_name_or_config idx_or_name_or_node target local_or_remote:/path local_or_remote:/path [--pem=/path/to/key.pem]

    Synchronize files or directories between the local host and a cluster node.
    """

    name_or_config, idx_or_name_or_node, src_path, dest_path = arguments.parse_or_die(sync_files, [object, object, str, str], *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)

    if 'instances' not in cc.state:
        print "It doesn't look like you've booted the cluster yet..."
        exit(1)

    # Get remote info
    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)
    if idx_or_name_or_node == 'all':
        instances_info = [ inst_props for instid, inst_props in cc.state['instance_props'].iteritems() ]
    else:
        instances_info = [ cc.state['instance_props'][cc.get_node_name(idx_or_name_or_node)] ]

    results = []
    for instance_info in instances_info:
        node_address = cc.user() + "@" + instance_info['hostname'] + ':'

        # Get correct values out for names
        paths = [src_path, dest_path]
        paths = [p.replace('local:', '').replace('remote:', node_address) for p in paths]
        src_path_final, dest_path_final = tuple(paths)

        # Perform the copy for this node
        results.append( subprocess.call(["rsync", "-e", "ssh -i " + pemfile, src_path_final, dest_path_final]) )
        #results.append( subprocess.call(["scp", "-i", pemfile, src_path_final, dest_path_final]) )

    # Just pick one non-zero return value if any failed
    failed = [res for res in results if res != 0]
    if failed:
        return failed[0]
    return 0
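
The local:/remote: prefixes are handled by plain string substitution; a standalone sketch with hypothetical values:

node_address = 'ubuntu@ec2-203-0-113-10.compute-1.amazonaws.com:'  # hypothetical user@hostname:
paths = ['local:/tmp/scene.db', 'remote:/home/ubuntu/scene.db']    # hypothetical paths
paths = [p.replace('local:', '').replace('remote:', node_address) for p in paths]
print paths
# ['/tmp/scene.db', 'ubuntu@ec2-203-0-113-10.compute-1.amazonaws.com:/home/ubuntu/scene.db']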
Example #12
def node_ssh(*args, **kwargs):
    """ec2 node ssh cluster_name_or_config idx_or_name_or_node [--pem=/path/to/key.pem] [optional additional arguments give command just like with real ssh]

    Spawn an SSH process that SSHs into the given node and, optionally, runs a command there.
    """

    name_or_config, idx_or_name_or_node, remote_cmd = arguments.parse_or_die(node_ssh, [object, object], rest=True, *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)

    if 'instances' not in cc.state:
        print "It doesn't look like you've booted the cluster yet..."
        exit(1)

    inst_info = cc.state['instance_props'][cc.get_node_name(idx_or_name_or_node)]

    # StrictHostKeyChecking no -- causes the "authenticity of host can't be
    # established" messages to not show up, and therefore not require prompting
    # the user. Not entirely safe, but much less annoying than having each node
    # require user interaction during boot phase
    cmd = ["ssh", "-o", "StrictHostKeyChecking no", "-i", pemfile, cc.user() + "@" + inst_info['hostname']] + [ssh_escape(x) for x in remote_cmd]
    return subprocess.call(cmd)
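
For illustration, the final argv for a simple remote command would look like this (host, key path, and command are hypothetical):

example_cmd = ['ssh', '-o', 'StrictHostKeyChecking no',
               '-i', '/home/me/.ssh/mykey.pem',                    # hypothetical key path
               'ubuntu@ec2-203-0-113-10.compute-1.amazonaws.com',  # hypothetical node hostname
               'uptime']                                           # hypothetical remote command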
Example #13
def boot(*args, **kwargs):
    """ec2 nodes boot name_or_config [--wait-timeout=300 --pem=/path/to/key.pem]

    Boot a cluster's nodes. The command will block for wait-timeout
    seconds, or until all nodes reach a ready state (currently defined
    as being pingable and containing files indicating readiness). A
    wait-timeout of 0 disables this. A pem file, either passed on the
    command line or through the environment, is required for the
    timeout to work properly. Note that with timeouts enabled, this
    will check that the nodes reach a ready state.
    """

    name_or_config = arguments.parse_or_die(boot, [object], *args)
    timeout = int(config.kwarg_or_default('wait-timeout', kwargs, default=600))
    # Note pemfile is different from other places since it's only required with wait-timeout.
    pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None)
    name, cc = name_and_config(name_or_config)

    if 'reservation' in cc.state or 'spot' in cc.state or 'instances' in cc.state:
        print "It looks like you already have active nodes for this cluster..."
        exit(1)

    if timeout > 0 and not pemfile:
        print "You need to specify a pem file to use timeouts."
        exit(1)

    # Load the setup script template, replace puppet master info
    user_data = data.load('ec2-user-data', 'node-setup.sh')
    user_data = user_data.replace('{{{PUPPET_MASTER}}}', cc.puppet_master)

    # Unlike spot instances, where we can easily request that any
    # availability zone be used as long as all instances land in the
    # same AZ, here we have to specify an AZ directly. We just choose
    # one randomly for now...
    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)
    zones = conn.get_all_zones()
    zone = random.choice(zones).name

    # Now create the nodes
    reservation = conn.run_instances(cc.ami,
                                     placement=zone,
                                     min_count=cc.size, max_count=cc.size,
                                     key_name=cc.keypair,
                                     instance_type=cc.instance_type,
                                     security_groups=[cc.group],
                                     user_data=user_data
                                     )

    # Save reservation, instance info
    cc.state['reservation'] = reservation.id
    cc.state['instances'] = [inst.id for inst in reservation.instances]
    cc.save()
    # Cache some information about the instances which shouldn't
    # change. However, this can take some time to come up properly, so
    # we may need to poll a few times before we get the right info
    print "Collecting node information..."
    while True:
        new_instances = get_all_instances(cc, conn)
        if any([inst.ip_address is None or inst.dns_name is None or inst.private_ip_address is None or inst.private_dns_name is None for inst in new_instances.values()]):
            time.sleep(5)
            continue
        cc.state['instance_props'] = dict(
            [
                (inst.id, {
                        'id' : inst.id,
                        'ip' : inst.ip_address,
                        'hostname' : inst.dns_name,
                        'private_ip' : inst.private_ip_address,
                        'private_hostname' : inst.private_dns_name,
                        }) for inst in new_instances.values()])
        break
    cc.save()
    return name_and_boot_nodes(cc, conn, pemfile, timeout)
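
A usage sketch (cluster name and key path hypothetical); the pem file is required here since a timeout is set:

# Equivalent to: sirikata-cluster.py ec2 nodes boot mycluster --wait-timeout=300 --pem=/home/me/.ssh/mykey.pem
retcode = boot('mycluster', **{'wait-timeout': '300', 'pem': '/home/me/.ssh/mykey.pem'})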
Example #14
def master_config(*args, **kwargs):
    """puppet master config [--path=/etc/puppet/] [--yes]

    Configure (or reconfigure) a local puppet master based on the data
    generated so far. This can be used for the initial setup of the
    Puppet master or to update it based on changes from the cluster.

    The commands this executes will require sudo, but it will be
    invoked for you -- don't run this command under sudo.

    With --yes, 'yes' is assumed for all questions. Note that this
    still isn't fully automated in some cases since some subcommands
    still require user input, e.g. generating an initial auth key.
    """

    puppet_base_path = config.kwarg_or_get('path', kwargs, 'PUPPET_PATH', default='/etc/puppet')
    always_yes = bool(config.kwarg_or_get('yes', kwargs, default=False))

    # Some useful paths starting from the base path
    autosign_path = os.path.join(puppet_base_path, 'autosign.conf')
    fileserver_conf_path = os.path.join(puppet_base_path, 'fileserver.conf')
    manifests_path = os.path.join(puppet_base_path, 'manifests')
    templates_path = os.path.join(puppet_base_path, 'templates')
    files_path = os.path.join(puppet_base_path, 'files')

    # You need some mechanism for signing new puppets. You can set
    # something up yourself, or you can take the very easy path of
    # just autosigning everything, regardless of where it's coming
    # from
    if not os.path.exists(autosign_path):
        if always_yes or config.ask_user_bool('Autosigning config not found. Create (very unsafe) autosign.conf?'):
            subprocess.call(['sudo', '/bin/bash', '-c', 'echo "*" > %s' % (autosign_path)])

    # There needs to be some access to files. The sanity check is that we
    # have at least one uncommented allow line
    existing_fileserver_conf = ''
    existing_fileserver_conf_lines = []
    if os.path.exists(fileserver_conf_path):
        with open(fileserver_conf_path, 'r') as fp: existing_fileserver_conf = fp.read()
        existing_fileserver_conf_lines = existing_fileserver_conf.split('\n')
    has_allow_line = any([line.strip().startswith('allow') for line in existing_fileserver_conf_lines])
    if not has_allow_line and \
            (always_yes or config.ask_user_bool("You haven't allowed any access to files, should I enable access to the default location, %s?" % (files_path))):
        # If we already have a [files] section, we just want to add the line
        try:
            files_idx = [line.strip().startswith('[files]') for line in existing_fileserver_conf_lines].index(True)
            existing_fileserver_conf_lines.insert(files_idx+1, '  allow 0.0.0.0/0')
        except ValueError:  # .index raises ValueError when there's no [files] section
            # Otherwise we need to create the file from scratch -- just append [files] and the allow line
            existing_fileserver_conf_lines += [ '[files]', '  allow 0.0.0.0/0']
        # We need root to write the file...
        subprocess.call(['sudo', '/bin/bash', '-c', 'echo "%s" > %s' % ('\n'.join(existing_fileserver_conf_lines), fileserver_conf_path)])

    # Make sure we have a nodes configuration
    generate_default_node_config()

    # We need to add/replace data. Here we don't ask the user, we just copy all the data into place
    puppet_local_data_path = data.path('puppet')
    print "Copying data %s -> %s" % (puppet_local_data_path, puppet_base_path)
    subprocess.call(['sudo', '/bin/bash', '-c', 'cp -r %s/* %s/' % (puppet_local_data_path, puppet_base_path)])

    # And restart the puppet master
    print "Restarting puppetmaster"
    subprocess.call(['sudo', 'service', 'puppetmaster', 'restart'])
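
The allow-line insertion is simple list manipulation; a standalone sketch with hypothetical existing contents:

existing_fileserver_conf_lines = ['# fileserver.conf', '[files]', '  path /etc/puppet/files']  # hypothetical
try:
    files_idx = [line.strip().startswith('[files]') for line in existing_fileserver_conf_lines].index(True)
    existing_fileserver_conf_lines.insert(files_idx+1, '  allow 0.0.0.0/0')
except ValueError:
    existing_fileserver_conf_lines += ['[files]', '  allow 0.0.0.0/0']
print '\n'.join(existing_fileserver_conf_lines)
# the allow line lands right after the [files] header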