def wait_nodes_ready(*args, **kwargs):
    '''ec2 nodes wait ready name_or_config [--timeout=300 --pem=/path/to/key.pem]

    Wait for nodes to finish booting and become fully ready, i.e. all
    packages to be installed have finished installing. Normally this
    will be invoked during boot or import, but can be useful if those
    run into a problem and you want to make sure all nodes have gotten
    back to a good state.
    '''
    name_or_config = arguments.parse_or_die(wait_nodes_ready, [object], *args)
    # NOTE(review): unlike boot/import_nodes (which read 'wait-timeout'),
    # this command reads the 'timeout' kwarg / SIRIKATA_PING_WAIT_TIMEOUT
    # env var -- confirm which flag spelling the CLI layer documents.
    timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_PING_WAIT_TIMEOUT', default=300))
    # The pem key is required here: readiness is verified by ssh'ing
    # into each node (see wait_ready).
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)

    # Phase 1: nodes answer ping. Non-zero return means the wait failed.
    print "Waiting for nodes to become pingable..."
    pingable = wait_pingable(cc, timeout=timeout)
    if pingable != 0:
        return pingable

    # Give a bit more time for the nodes to become ready, pinging
    # may happen before all services are finished starting
    print "Sleeping to allow nodes to finish booting"
    time.sleep(15)

    # Phase 2: the puppet bootstrap drops a marker file once the
    # Sirikata install finishes; wait until every node has it.
    print "Waiting for initial services and Sirikata binaries to install"
    ready = wait_ready(cc, '/home/ubuntu/ready/sirikata', timeout=timeout, pem=pemfile)
    return ready
def create(*args, **kwargs):
    """ec2 create name size puppet_master keypair [--instance-type=t1.micro] [--group=security_group] [--ami=i-x7395]

    Create a new cluster. This just creates a record of the cluster
    and saves its properties, it doesn't actually allocate any nodes.
    """
    name, size, puppet_master, keypair = arguments.parse_or_die(create, [str, int, str, str], *args)

    # Persist the cluster description; node allocation happens later.
    cluster = EC2GroupConfig(
        name, size=size, keypair=keypair,
        instance_type=config.kwarg_or_get('instance-type', kwargs, 'INSTANCE_TYPE'),
        group=config.kwarg_or_get('group', kwargs, 'SECURITY_GROUP'),
        ami=config.kwarg_or_get('ami', kwargs, 'BASE_AMI'),
        puppet_master=puppet_master)
    cluster.save()

    # Make sure we have a nodes config for puppet. Not needed here, but
    # it's a convenient place to make sure we have it done since nothing
    # else with the cluster can happen until this is called.
    puppet.generate_default_node_config()
    return 0
def import_nodes(*args, **kwargs): """ec2 nodes import name_or_config instance1_id instance2_id ... [--wait-timeout=300 --pem=/path/to/key.pem] Import instances from a spot reservation and then perform the boot sequence on them. The command will block for wait-timeout seconds, or until all nodes reach a ready state (currently defined as being pingable and containing files indicating readiness. A wait-timeout of 0 disables this. A pem file, either passed on the command line or through the environment is required for the timeout to work properly. Note that with timeouts enabled, this will check that the nodes reach a ready state. """ name_or_config, instances_to_add = arguments.parse_or_die(import_nodes, [object], rest=True, *args) timeout = config.kwarg_or_default('wait-timeout', kwargs, default=600) # Note pemfile is different from other places since it's only required with wait-timeout. pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None) name, cc = name_and_config(name_or_config) if 'spot' not in cc.state: print "It looks like this cluster hasn't made a spot reservation..." return 1 conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY) instances_to_add = list(instances_to_add) if len(instances_to_add) == 0: print "No instances specified, trying to use full list of account instances..." reservations = conn.get_all_instances() for res in reservations: instances_to_add += [inst.id for inst in res.instances if inst.state == 'running'] if len(instances_to_add) != cc.size: print "Number of instances doesn't match the cluster size. Make sure you explicitly specify %d instances" % (cc.size) return 1 cc.state['instances'] = instances_to_add # Verify the instances are valid, just checking that we get valid # objects back when we look them up with AWS print "Verifying instances are valid..." 
instances = get_all_instances(cc, conn); if len(instances) != len(instances_to_add): print "Only got %d instances back, you'll need to manually clean things up..." % len(instances) return 1 # Cache some information about the instances which shouldn't change cc.state['instance_props'] = dict( [ (instid, { 'id' : instances[instid].id, 'ip' : instances[instid].ip_address, 'hostname' : instances[instid].dns_name, 'private_ip' : instances[instid].private_ip_address, 'private_hostname' : instances[instid].private_dns_name, }) for instid in instances_to_add]) cc.save() return name_and_boot_nodes(cc, conn, pemfile, timeout)
def sync_sirikata(*args, **kwargs): """ec2 sync sirikata /path/to/installed/sirikata [--puppet-path=/etc/puppet] [--notify-puppets=cluster_name_or_config] Package a version of Sirikata installed in the given path and set it up with Puppet for distribution to puppet agent nodes. If you already have puppets running, add --notify-puppets=cluster_name to trigger a puppet update (runs the equivalent of sirikata-cluster.py puppet slaves restart cluster_name) """ installed_path = arguments.parse_or_die(sync_sirikata, [str], *args) puppet_base_path = config.kwarg_or_get('puppet-path', kwargs, 'PUPPET_PATH', default='/etc/puppet') notify_puppets = config.kwarg_or_default('notify-puppets', kwargs) # Note pemfile is different from other places since it's only required with notify-puppets. pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None) # Generate the archive if given a directory) gen_file = installed_path if os.path.isdir(installed_path): retcode = util_sirikata.package(installed_path) if retcode != 0: return retcode gen_file = os.path.join(installed_path, 'sirikata.tar.bz2') # Make sure we have a place to put the file dest_dir = os.path.join(puppet_base_path, 'modules', 'sirikata', 'files', 'home', 'ubuntu') if not os.path.exists(dest_dir): # Need root for this, so we have to do it through subprocess subprocess.call(['sudo', 'mkdir', '-p', dest_dir]) # And copy it into place print "Copying archive into puppet" dest_file = os.path.join(dest_dir, 'sirikata.tar.bz2') subprocess.call(['sudo', 'cp', gen_file, dest_file]) if notify_puppets: print "Notifying puppets" slaves_restart_kwargs = {} if pemfile is not None: slaves_restart_kwargs['pem'] = pemfile # notify_puppets == cluster name # Nuke old tar.bz2's so new ones will be downloaded nodes.ssh(notify_puppets, 'rm', '-f', 'sirikata.tar.bz2', **slaves_restart_kwargs) puppet.slaves_restart(notify_puppets, **slaves_restart_kwargs)
def update(*args, **kwargs):
    """puppet update cluster_name_or_config [--pem=/path/to/key.pem]

    Performs both master configuration and slave restart.
    """
    name_or_config = arguments.parse_or_die(update, [object], *args)
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    # master_config never parses positional args, so the previous call
    # master_config('--yes') silently dropped the flag and the command
    # could still prompt the user; pass the kwarg it actually reads.
    master_config(yes=True)
    slaves_restart(name_or_config, pem=pemfile)
def slaves_restart(*args, **kwargs):
    """puppet slaves restart cluster_name [--pem=/path/to/key.pem]

    Restart puppet on a slave node. This is useful to force it to
    reconfigure itself, since Puppet doesn't seem to have a good way of
    kicking all slaves to reconfigure and re-run their settings.
    """
    cluster = arguments.parse_or_die(slaves_restart, [object], *args)
    key_path = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))
    # Restarting the agent service forces a fresh catalog run on each node.
    nodes.ssh(cluster, 'sudo', 'service', 'puppet', 'restart', pem=key_path)
def wait_ready(*args, **kwargs):
    '''Wait for nodes to become ready, with an optional timeout. Ready
    means that puppet has finished configuring packages and left
    indicators that initial puppet configuration has completed. You
    should make sure all nodes are pingable before running this.'''
    # files_to_check: remote paths whose existence marks a node as ready.
    name_or_config, files_to_check = arguments.parse_or_die(wait_ready, [object], rest=True, *args)
    # timeout of 0 means wait forever.
    timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_READY_WAIT_TIMEOUT', default=0))
    pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE'))

    name, cc = name_and_config(name_or_config)
    conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY)
    instances_ips = get_all_ips(cc, conn)
    not_ready = set(instances_ips.keys())
    # Just loop, waiting on any (i.e. the first) node in the set, reset our timeout
    # NOTE(review): despite the comment above, `waited` is never reset
    # when a node succeeds, so the timeout bounds the whole wait rather
    # than the per-node wait -- confirm which is intended.
    waited = 0
    while not_ready and (timeout == 0 or waited < timeout):
        node_id = next(iter(not_ready))
        ip = instances_ips[node_id]
        print 'Waiting on %s (%s)' % (node_id, str(ip))
        node_idx = cc.state['instances'].index(node_id)
        # Build "test -f a && test -f b && ..." so the ssh exit code is
        # 0 only when every marker file exists on the node.
        remote_cmd = []
        for file_to_check in files_to_check:
            if remote_cmd:
                remote_cmd.append('&&')
            remote_cmd += ['test', '-f', file_to_check]
        retcode = node_ssh(cc, node_idx, *remote_cmd, pem=pemfile)
        if retcode == 0:
            # command success
            not_ready.remove(node_id)
            continue
        time.sleep(5)
        waited += 5
    if not_ready:
        print "Failed to find readiness indicators for %s" % (next(iter(not_ready)))
        exit(1)
    print "Success"
    return 0
def set_node_type(*args, **kwargs): """ec2 node set type cluster_name_or_config node nodetype [--pem=/path/to/pem.key] Set the given node (by index, hostname, IP, etc) to be of the specified node type in Puppet, e.g. setting sirikata_redis to make it a Redis server. Setting to 'default' reverts to the original config. """ name_or_config, nodeid, nodetype = arguments.parse_or_die(set_node_type, [object, str, str], *args) pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE')) # Explicit check for known types so we don't get our config into a bad state if nodetype not in ['default', 'sirikata_redis']: print "The specified node type (%s) isn't known." % (nodetype) return 1 name, cc = name_and_config(name_or_config) if 'instances' not in cc.state: print "No active instances were found, are you sure this cluster is currently running?" exit(1) conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY) # Update entry in local storage so we can update later if 'node-types' not in cc.state: cc.state['node-types'] = {} if 'capabilities' not in cc.state: cc.state['capabilities'] = {} inst = get_node(cc, conn, nodeid) if nodetype == 'default': if pacemaker_id(inst) in cc.state['node-types']: del cc.state['node-types'][pacemaker_id(inst)] if inst.id in cc.state['capabilities']: del cc.state['capabilities'][inst.id] else: cc.state['node-types'][pacemaker_id(inst)] = nodetype # Note currently only 1, the puppet setup doesn't really have composability right now anyway... cc.state['capabilities'][inst.id] = 'redis' cc.save() # Generate config node_config = ''.join(["node '%s' inherits %s {}" % (pacemakerid,nt) for pacemakerid,nt in cc.state['node-types'].iteritems()]) data.save(node_config, 'puppet', 'manifests', 'nodes.pp') pem_kwargs = {} if pemfile is not None: pem_kwargs['pem'] = pemfile return puppet.update(cc, **pem_kwargs)
def ssh(*args, **kwargs): """ec2 ssh cluster_name_or_config [--pem=/path/to/key.pem] [required additional arguments give command just like with real ssh] Run an SSH command on every node in the cluster. Note that this currently doesn't parallelize at all, so it can be a bit slow. This won't do ssh sessions -- you *must* provide a command to execute. """ name_or_config, remote_cmd = arguments.parse_or_die(ssh, [object], rest=True, *args) pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE')) if not remote_cmd: print "You need to add a command to execute across all the nodes." exit(1) name, cc = name_and_config(name_or_config) for inst_idx in range(len(cc.state['instances'])): node_ssh(cc, inst_idx, *remote_cmd, pem=pemfile)
def wait_pingable(*args, **kwargs): '''Wait for nodes to become pingable, with an optional timeout.''' name_or_config = arguments.parse_or_die(wait_pingable, [object], *args) timeout = int(config.kwarg_or_get('timeout', kwargs, 'SIRIKATA_PING_WAIT_TIMEOUT', default=0)) name, cc = name_and_config(name_or_config) conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY) # We need to loop until we can get IPs for all nodes waited = 0 while (timeout == 0 or waited < timeout): instances_ips = get_all_ips(cc, conn) not_pinged = set(instances_ips.keys()) # If none are missing IPs, we can exit if not any([ip is None for ip in instances_ips.values()]): break # Otherwise sleep awhile and then try again time.sleep(10) # Just loop, waiting on any (i.e. the first) node in the set, reset our timeout waited = 0 while not_pinged and (timeout == 0 or waited < timeout): node_id = next(iter(not_pinged)) ip = instances_ips[node_id] print 'Waiting on %s (%s)' % (node_id, str(ip)) # One of those rare instances we just want to dump the output retcode = 0 with open('/dev/null', 'w') as devnull: retcode = subprocess.call(['ping', '-c', '2', str(ip)], stdout=devnull, stderr=devnull) if retcode == 0: # ping success not_pinged.remove(node_id) continue time.sleep(5) waited += 5 if not_pinged: print "Failed to ping %s" % (next(iter(not_pinged))) exit(1) print "Success" return 0
def sync_files(*args, **kwargs): """ec2 sync files cluster_name_or_config idx_or_name_or_node target local_or_remote:/path local_or_remote:/path [--pem=/path/to/key.pem] Synchronize files or directories between a the local host and a cluster node. """ name_or_config, idx_or_name_or_node, src_path, dest_path = arguments.parse_or_die(sync_files, [object, object, str, str], *args) pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE')) name, cc = name_and_config(name_or_config) if 'instances' not in cc.state: print "It doesn't look like you've booted the cluster yet..." exit(1) # Get remote info conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY) if idx_or_name_or_node == 'all': instances_info = [ inst_props for instid, inst_props in cc.state['instance_props'].iteritems() ] else: instances_info = [ cc.state['instance_props'][cc.get_node_name(idx_or_name_or_node)] ] results = [] for instance_info in instances_info: node_address = cc.user() + "@" + instance_info['hostname'] + ':' # Get correct values out for names paths = [src_path, dest_path] paths = [p.replace('local:', '').replace('remote:', node_address) for p in paths] src_path_final, dest_path_final = tuple(paths) # Make a single copy onto one of the nodes results.append( subprocess.call(["rsync", "-e", "ssh -i " + pemfile, src_path_final, dest_path_final]) ) #results.append( subprocess.call(["scp", "-i", pemfile, src_path_final, dest_path_final]) ) # Just pick one non-zero return value if any failed failed = [res for res in results if res != 0] if failed: return failed[0] return 0
def node_ssh(*args, **kwargs): """ec2 node ssh cluster_name_or_config idx_or_name_or_node [--pem=/path/to/key.pem] [optional additional arguments give command just like with real ssh] Spawn an SSH process that SSHs into the node """ name_or_config, idx_or_name_or_node, remote_cmd = arguments.parse_or_die(node_ssh, [object, object], rest=True, *args) pemfile = os.path.expanduser(config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE')) name, cc = name_and_config(name_or_config) if 'instances' not in cc.state: print "It doesn't look like you've booted the cluster yet..." exit(1) inst_info = cc.state['instance_props'][cc.get_node_name(idx_or_name_or_node)] # StrictHostKeyChecking no -- causes the "authenticity of host can't be # established" messages to not show up, and therefore not require prompting # the user. Not entirely safe, but much less annoying than having each node # require user interaction during boot phase cmd = ["ssh", "-o", "StrictHostKeyChecking no", "-i", pemfile, cc.user() + "@" + inst_info['hostname']] + [ssh_escape(x) for x in remote_cmd] return subprocess.call(cmd)
def boot(*args, **kwargs): """ec2 nodes boot name_or_config [--wait-timeout=300 --pem=/path/to/key.pem] Boot a cluster's nodes. The command will block for wait-timeout seconds, or until all nodes reach a ready state (currently defined as being pingable and containing files indicating readiness. A wait-timeout of 0 disables this. A pem file, either passed on the command line or through the environment is required for the timeout to work properly. Note that with timeouts enabled, this will check that the nodes reach a ready state. """ name_or_config = arguments.parse_or_die(boot, [object], *args) timeout = config.kwarg_or_default('wait-timeout', kwargs, default=600) # Note pemfile is different from other places since it's only required with wait-timeout. pemfile = config.kwarg_or_get('pem', kwargs, 'SIRIKATA_CLUSTER_PEMFILE', default=None) name, cc = name_and_config(name_or_config) if 'reservation' in cc.state or 'spot' in cc.state or 'instances' in cc.state: print "It looks like you already have active nodes for this cluster..." exit(1) if timeout > 0 and not pemfile: print "You need to specify a pem file to use timeouts." exit(1) # Load the setup script template, replace puppet master info user_data = data.load('ec2-user-data', 'node-setup.sh') user_data = user_data.replace('{{{PUPPET_MASTER}}}', cc.puppet_master) # Unlike spot instances, where we can easily request that any # availability zone be used by that all be in the same AZ, here we # have to specify an AZ directly. We just choose one randomly for now... 
conn = EC2Connection(config.AWS_ACCESS_KEY_ID, config.AWS_SECRET_ACCESS_KEY) zones = conn.get_all_zones() zone = random.choice(zones).name # Now create the nodes reservation = conn.run_instances(cc.ami, placement=zone, min_count=cc.size, max_count=cc.size, key_name=cc.keypair, instance_type=cc.instance_type, security_groups=[cc.group], user_data=user_data ) # Save reservation, instance info cc.state['reservation'] = reservation.id cc.state['instances'] = [inst.id for inst in reservation.instances] cc.save() # Cache some information about the instances which shouldn't # change. However, this can take some time to come up properly, so # we may need to poll a few times before we get the right info print "Collecting node information..." while True: new_instances = get_all_instances(cc, conn) if any([inst.ip_address is None or inst.dns_name is None or inst.private_ip_address is None or inst.private_dns_name is None for inst in new_instances.values()]): time.sleep(5) continue cc.state['instance_props'] = dict( [ (inst.id, { 'id' : inst.id, 'ip' : inst.ip_address, 'hostname' : inst.dns_name, 'private_ip' : inst.private_ip_address, 'private_hostname' : inst.private_dns_name, }) for inst in new_instances.values()]) break cc.save() return name_and_boot_nodes(cc, conn, pemfile, timeout)
def master_config(*args, **kwargs): """puppet master config [--path=/etc/puppet/] [--yes] Configure (or reconfigure) a local puppet master based on the data generated so far. This can be used for the initial setup of the Puppet master or to updated it based on changes from the cluster. The commands this executes will require sudo, but it will be invoked for you -- don't run this command under sudo. With --yes, 'yes' is assumed for all questions. Note that this still isn't fully automated in some cases since some subcommands still require user input, e.g. generating an initial auth key. """ puppet_base_path = config.kwarg_or_get('path', kwargs, 'PUPPET_PATH', default='/etc/puppet') always_yes = bool(config.kwarg_or_get('yes', kwargs, default=False)) # Some useful paths starting from the base path autosign_path = os.path.join(puppet_base_path, 'autosign.conf') fileserver_conf_path = os.path.join(puppet_base_path, 'fileserver.conf') manifests_path = os.path.join(puppet_base_path, 'manifests') templates_path = os.path.join(puppet_base_path, 'templates') files_path = os.path.join(puppet_base_path, 'files') # You need some mechanism of for signing new puppets. You can set # something up yourself, or you can take the very easy path of # just autosigning everything, regardless of where it's coming # from if not os.path.exists(autosign_path): if always_yes or config.ask_user_bool('Autosigning config not found. Create (very unsafe) autosign.conf?'): subprocess.call(['sudo', '/bin/bash', '-c', 'echo "*" > %s' % (autosign_path)]) # There needs to be some access to files. 
Sanity check is that we # have at least one uncommented allow line existing_fileserver_conf = '' existing_fileserver_conf_lines = [] if os.path.exists(fileserver_conf_path): with open(fileserver_conf_path, 'r') as fp: existing_fileserver_conf = fp.read() existing_fileserver_conf_lines = existing_fileserver_conf.split('\n') has_allow_line = any([line.strip().startswith('allow') for line in existing_fileserver_conf_lines]) if not has_allow_line and \ (always_yes or config.ask_user_bool("You haven't allowed any access to files, should I enable access to the default location, %s?" % (files_path))): # If we already have a [files] section, we just want to add the line try: files_idx = [line.strip().startswith('[files]') for line in existing_fileserver_conf_lines].index(True) existing_fileserver_conf_lines.insert(files_idx+1, ' allow 0.0.0.0/0') except: # Otherwise we need to create the file from scratch -- just append [files] and the allow line existing_fileserver_conf_lines += [ '[files]', ' allow 0.0.0.0/0'] # We need root to write the file... subprocess.call(['sudo', '/bin/bash', '-c', 'echo "%s" > %s' % ('\n'.join(existing_fileserver_conf_lines), fileserver_conf_path)]) # Make sure we have a nodes configuration generate_default_node_config() # We need to add/replace data. Here we don't ask the user, we just copy all the data into place puppet_local_data_path = data.path('puppet') print "Copying data %s -> %s" % (puppet_local_data_path, puppet_base_path) subprocess.call(['sudo', '/bin/bash', '-c', 'cp -r %s/* %s/' % (puppet_local_data_path, puppet_base_path)]) # And restart the puppet master print "Restarting puppetmaster" subprocess.call(['sudo', 'service', 'puppetmaster', 'restart'])