def apt_installs(manager=False, single_server_ami=False):
    """Run apt-get update/install on the remote host for the appropriate server role.

    Retries because freshly booted servers frequently have not finished their
    own background package operations yet, which makes apt fail transiently.
    """
    if manager:
        packages = APT_MANAGER_INSTALLS
    elif single_server_ami:
        packages = APT_SINGLE_SERVER_AMI_INSTALLS
    else:
        packages = APT_WORKER_INSTALLS
    package_args = " ".join(packages)

    # Sometimes (usually on slower servers) the remote server isn't done with initial
    # setup when we get to this step, so retry up to 10 times, 5 seconds apart.
    succeeded = False
    for _ in range(10):
        try:
            sudo('apt-get -y update >> {log}'.format(log=LOG_FILE))
            sudo('apt-get -y install {installs} >> {log}'.format(
                installs=package_args, log=LOG_FILE))
            succeeded = True
            break
        except FabricExecutionError:
            log.warning(
                "WARNING: encountered problems when trying to run apt installs.\n"
                "Usually this means the server is running a software upgrade in the background.\n"
                "Will try 10 times, waiting 5 seconds each time.")
            sleep(5)

    # we run supervisor manually at the end
    sudo("service supervisor stop")
    if not succeeded:
        raise Exception("Could not install software on remote machine.")
def get_manager_instance_by_eb_environment_name(eb_environment_name):
    """ Get a manager dictionary of the currently running manager server. """
    # Manager instances are looked up by their name for this environment.
    candidates = get_instances_by_name(PROCESSING_MANAGER_NAME % eb_environment_name)

    # More than one manager is a broken, unsupported configuration.
    if len(candidates) > 1:
        msg = "Discovered multiple manager servers. This configuration is not supported and should be corrected."
        log.error(msg)
        raise Exception(msg)

    if not candidates:
        log.warning("No manager found.")
        return None
    return candidates[0]
def setup_rabbitmq(eb_environment_name):
    """Install the rabbitmq configuration and beiwe credentials on the remote host."""
    create_rabbit_mq_password_file(eb_environment_name)

    # Push the configuration file so that rabbitmq listens on the configured port.
    put(LOCAL_RABBIT_MQ_CONFIG_FILE_PATH, REMOTE_RABBIT_MQ_CONFIG_FILE_PATH)
    sudo(
        f"cp {REMOTE_RABBIT_MQ_CONFIG_FILE_PATH} {REMOTE_RABBIT_MQ_FINAL_CONFIG_FILE_PATH}"
    )

    # Create the beiwe user with a new password and full permissions on the default vhost.
    password = get_rabbit_mq_password(eb_environment_name)
    sudo(f"rabbitmqctl add_user beiwe {password}")
    sudo('rabbitmqctl set_permissions -p / beiwe ".*" ".*" ".*"')

    log.warning("This next command can take quite a while to run.")
    # I tried backgrounding it, doing so breaks celery. o_O
    sudo("service rabbitmq-server restart")
def do_create_worker():
    """Provision, configure, and start a new worker server for an existing cluster."""
    name = prompt_for_extant_eb_environment_name()
    do_fail_if_environment_does_not_exist(name)

    # A worker is useless without a manager to receive work from.
    if get_manager_instance_by_eb_environment_name(name) is None:
        log.error(
            "There is no manager server for the %s cluster, cannot deploy a worker until there is." % name)
        EXIT(1)

    try:
        settings = get_server_configuration_file(name)
    except Exception as e:
        log.error("could not read settings file")
        log.error(e)
        settings = None  # ide warnings...
        EXIT(1)

    log.info("creating worker server for %s..." % name)
    try:
        instance = create_processing_server(name, settings[WORKER_SERVER_INSTANCE_TYPE])
    except Exception as e:
        log.error(e)
        instance = None  # ide warnings...
        EXIT(1)

    public_ip = instance['NetworkInterfaces'][0]['PrivateIpAddresses'][0]['Association']['PublicIp']
    configure_fabric(name, public_ip)

    # Remote provisioning steps, in dependency order.
    create_swap()
    push_home_directory_files()
    apt_installs()
    load_git_repo()
    setup_python()
    push_beiwe_configuration(name)
    push_manager_private_ip_and_password(name)
    setup_worker_cron()
    setup_celery_worker()  # run setup worker last.

    log.warning(
        "Server is almost up. Waiting 20 seconds to avoid a race condition..."
    )
    sleep(20)
    run("supervisord")
def manager_fix():
    """Reboot the remote server and re-enable swap to clear stuck celery/rabbitmq state."""
    # It is unclear what causes this. The notifications task create zombie processes that on at
    # least one occasion did not respond to kill -9 commands even when run as the superuser. This
    # occurs on both workers and managers, a 20 second sleep operation fixes it, 10 seconds does not.
    # Tested on the slowest server, t3a.nano' with swap that is required to run the celery tasks.)
    # Update: it turns out there is an alternate failure mode if you try to do the 20 second
    # wait (which works for workers), which is that all calls to the celery Inspect object
    # block for exceptionally long periods, even when a timeout value is provided. (This behavior
    # has other triggers too, this is just a reliable way to trigger it.)
    try_sudo("shutdown -r now")
    log.warning("rebooting server to fix rabbitmq bugs...")
    sleep(5)
    # retry() keeps re-attempting this no-op remote command until ssh connectivity returns.
    retry(run, "# waiting for server to reboot, this might take a while.")
    # we need to re-enable the swap after the reboot, then we can finally start supervisor without
    # creating zombie celery threads.
    sudo("swapon /swapfile")
    sudo("swapon -s")  # prints swap status, confirming swap came back in the log
#################################################################################################### ##################################### Argument Parsing ############################################# #################################################################################################### if __name__ == "__main__": # validate the global configuration file if not all( (are_aws_credentials_present(), is_global_configuration_valid())): EXIT(1) # get CLI arguments, see function for details arguments = cli_args_validation() if arguments.prod: log.warning("RUNNING IN PROD MODE") PROD_MODE.set(True) if arguments.dev: if PROD_MODE: log.error("You cannot provide -prod and -dev at the same time.") EXIT(1) DEV_MODE.set(True) log.warning("RUNNING IN DEV MODE") if arguments.help_setup_new_environment: do_help_setup_new_environment() EXIT(0) if arguments.create_environment: do_create_environment()