def __prepare_queue_head(self, queue_head_machine, parameters):
    """Check SSH access to the queue head, prepare it, and create its job table.

    Args:
        queue_head_machine: dict providing the 'keyfile', 'ip' and 'username'
            entries used to reach the machine.
        parameters: dict providing 'credentials', 'user_id' and the entries
            keyed by self.PARAM_FLEX_DB_PASSWORD and self.PARAM_FLEX_QUEUE_HEAD.

    Returns:
        True when the instance was prepared and the table was created; False
        when the keyfile does not exist, the SSH wait fails, or any step of
        the preparation raises an Exception.
    """
    banner = '*' * 80
    footer = '-' * 80

    logging.debug(banner)
    logging.debug(banner)
    # NOTE(review): this debug line dumps `parameters`, which carries the
    # credentials and the flex DB password read below -- consider redacting.
    logging.debug(
        '__prepare_queue_head(queue_head_machine={0}, parameters={1})'.format(
            queue_head_machine, parameters))

    keyfile = queue_head_machine['keyfile']
    if not os.path.exists(keyfile):
        logging.error(
            'Queue head keyfile: {0} does not exist!'.format(keyfile))
        return False

    # Truthiness test, matching how prepare_vms consumes the same helper.
    reachable = helper.wait_for_ssh_connection(
        key_file=keyfile,
        ip=queue_head_machine['ip'],
        username=queue_head_machine['username'])
    if not reachable:
        logging.error('Queue Head ssh failed!')
        logging.debug(footer)
        logging.debug(footer)
        return False

    logging.info(
        'Queue Head with ip {0} is successfully ssh-able'.format(
            queue_head_machine['ip']))

    try:
        queue_head_prepare_params = {
            'infrastructure': AgentTypes.FLEX,
            self.PARAM_FLEX_CLOUD_MACHINE_INFO: [queue_head_machine],
            'credentials': parameters['credentials'],
            'user_id': parameters['user_id'],
            self.PARAM_FLEX_DB_PASSWORD:
                parameters[self.PARAM_FLEX_DB_PASSWORD],
            self.PARAM_FLEX_QUEUE_HEAD:
                parameters[self.PARAM_FLEX_QUEUE_HEAD],
        }
        self.agent.prepare_instances(queue_head_prepare_params)

        # Create stochss table in flex db in queue head
        database = FlexDB(password=parameters[self.PARAM_FLEX_DB_PASSWORD],
                          ip=queue_head_machine['ip'])
        database.createtable(JobDatabaseConfig.TABLE_NAME)
        prepared = True
    except Exception as e:
        # logging.exception keeps the same ERROR-level message but also
        # records the traceback, which logging.error(e) discarded.
        logging.exception(e)
        prepared = False

    logging.debug(footer)
    logging.debug(footer)
    return prepared
def __verify_ec2_instances_via_ssh(self, instance_ids, parameters, public_ips):
    """Keep the instances that accept SSH; deregister and flag the others.

    Args:
        instance_ids: instance ids, paired positionally with public_ips.
        parameters: dict providing 'keyname' (used to build the keyfile path
            '<this file's dir>/../<keyname>.key'); it is also forwarded to the
            agent and to VMStateModel.
        public_ips: public ips to try, paired positionally with instance_ids.

    Returns:
        Tuple (connected_public_ips, connected_instance_ids).

    Raises:
        Exception: if the keyfile does not exist, or if deregistering or
            updating the state of the unreachable instances fails.
    """
    keyfile = os.path.join(os.path.dirname(__file__), '..',
                           '{0}.key'.format(parameters['keyname']))
    logging.info('keyfile = {0}'.format(keyfile))

    if not os.path.exists(keyfile):
        raise Exception("ssh keyfile file not found: {0}".format(keyfile))

    connected_public_ips = []
    connected_instance_ids = []

    for pub_ip, ins_id in zip(public_ips, instance_ids):
        logging.info('connecting to {0}...'.format(pub_ip))
        if helper.wait_for_ssh_connection(key_file=keyfile, ip=pub_ip):
            logging.info('{0} is successfully added'.format(pub_ip))
            connected_public_ips.append(pub_ip)
            connected_instance_ids.append(ins_id)

    # if there are some vms not able to be connected via ssh,
    # just shut them down explicitly
    if len(public_ips) != len(connected_public_ips):
        logging.info(
            'Time out on ssh to {0} instances. They will be terminated.'.
            format(len(public_ips) - len(connected_public_ips)))

        try:
            terminate_ins_ids = [ins_id for ins_id in instance_ids
                                 if ins_id not in connected_instance_ids]
            self.agent.deregister_some_instances(parameters,
                                                 terminate_ins_ids)
            # update db with failed vms
            VMStateModel.set_state(parameters, terminate_ins_ids,
                                   VMStateModel.STATE_FAILED,
                                   VMStateModel.DESCRI_TIMEOUT_ON_SSH)
        except Exception:
            # Narrowed from a bare except so SystemExit/KeyboardInterrupt
            # propagate; the underlying failure is logged with its traceback
            # before the caller-facing exception is raised.
            logging.exception(
                'Failed to terminate instances that timed out on ssh')
            raise Exception(
                "Errors in terminating instances that cannot be connected via ssh."
            )

    return connected_public_ips, connected_instance_ids
def __prepare_queue_head(self, queue_head_machine, parameters):
    """Verify the queue head is reachable over SSH, then prepare it.

    Preparation means calling the agent's prepare_instances for this single
    machine and creating the job table in the queue head's flex database.

    Args:
        queue_head_machine: dict with the "keyfile", "ip" and "username"
            entries used for the SSH check.
        parameters: dict with "credentials", "user_id" and the entries keyed
            by self.PARAM_FLEX_DB_PASSWORD and self.PARAM_FLEX_QUEUE_HEAD.

    Returns:
        True on success. False if the keyfile is missing on disk, the SSH
        wait fails, or an Exception is raised while preparing.
    """
    star_line = "*" * 80
    dash_line = "-" * 80

    logging.debug(star_line)
    logging.debug(star_line)
    # NOTE(review): `parameters` includes the credentials and the flex DB
    # password read further down; logging it verbatim may leak secrets.
    logging.debug(
        "__prepare_queue_head(queue_head_machine={0}, parameters={1})".format(queue_head_machine, parameters)
    )

    keyfile = queue_head_machine["keyfile"]
    if not os.path.exists(keyfile):
        logging.error("Queue head keyfile: {0} does not exist!".format(keyfile))
        return False

    # Plain truthiness, consistent with how prepare_vms treats this helper.
    if not helper.wait_for_ssh_connection(
        key_file=keyfile, ip=queue_head_machine["ip"], username=queue_head_machine["username"]
    ):
        logging.error("Queue Head ssh failed!")
        logging.debug(dash_line)
        logging.debug(dash_line)
        return False

    logging.info("Queue Head with ip {0} is successfully ssh-able".format(queue_head_machine["ip"]))

    succeeded = False
    try:
        queue_head_prepare_params = {
            "infrastructure": AgentTypes.FLEX,
            self.PARAM_FLEX_CLOUD_MACHINE_INFO: [queue_head_machine],
            "credentials": parameters["credentials"],
            "user_id": parameters["user_id"],
            self.PARAM_FLEX_DB_PASSWORD: parameters[self.PARAM_FLEX_DB_PASSWORD],
            self.PARAM_FLEX_QUEUE_HEAD: parameters[self.PARAM_FLEX_QUEUE_HEAD],
        }
        self.agent.prepare_instances(queue_head_prepare_params)

        # Create stochss table in flex db in queue head
        database = FlexDB(password=parameters[self.PARAM_FLEX_DB_PASSWORD], ip=queue_head_machine["ip"])
        database.createtable(JobDatabaseConfig.TABLE_NAME)
        succeeded = True
    except Exception as e:
        # Same ERROR message as before, now with the traceback attached.
        logging.exception(e)

    logging.debug(dash_line)
    logging.debug(dash_line)
    return succeeded
def __verify_ec2_instances_via_ssh(self, instance_ids, parameters, public_ips):
    """Return the instances reachable over SSH and terminate the rest.

    Each (public ip, instance id) pair is tried with the key file
    "<directory of this file>/../<keyname>.key". When fewer ips connect than
    were given, every instance id that did not connect is deregistered through
    the agent and marked STATE_FAILED / DESCRI_TIMEOUT_ON_SSH in VMStateModel.

    Args:
        instance_ids: instance ids, matched by position with public_ips.
        parameters: dict with "keyname"; also passed through to the agent and
            to VMStateModel.set_state.
        public_ips: public ips, matched by position with instance_ids.

    Returns:
        Tuple (connected_public_ips, connected_instance_ids).

    Raises:
        Exception: when the key file is missing, or when terminating or
            recording the unreachable instances fails.
    """
    keyfile = os.path.join(os.path.dirname(__file__), "..", "{0}.key".format(parameters["keyname"]))
    logging.info("keyfile = {0}".format(keyfile))

    if not os.path.exists(keyfile):
        raise Exception("ssh keyfile file not found: {0}".format(keyfile))

    connected_public_ips = []
    connected_instance_ids = []

    for pub_ip, ins_id in zip(public_ips, instance_ids):
        logging.info("connecting to {0}...".format(pub_ip))
        if not helper.wait_for_ssh_connection(key_file=keyfile, ip=pub_ip):
            continue
        logging.info("{0} is successfully added".format(pub_ip))
        connected_public_ips.append(pub_ip)
        connected_instance_ids.append(ins_id)

    # if there are some vms not able to be connected via ssh,
    # just shut them down explicitly
    unreachable_count = len(public_ips) - len(connected_public_ips)
    if unreachable_count != 0:
        logging.info("Time out on ssh to {0} instances. They will be terminated.".format(unreachable_count))

        try:
            terminate_ins_ids = [ins_id for ins_id in instance_ids if ins_id not in connected_instance_ids]
            self.agent.deregister_some_instances(parameters, terminate_ins_ids)
            # update db with failed vms
            VMStateModel.set_state(
                parameters, terminate_ins_ids, VMStateModel.STATE_FAILED, VMStateModel.DESCRI_TIMEOUT_ON_SSH
            )
        except Exception:
            # A bare except would also trap SystemExit/KeyboardInterrupt and
            # hide the real cause; log the traceback before re-raising.
            logging.exception("Failed to terminate instances that timed out on ssh")
            raise Exception("Errors in terminating instances that cannot be connected via ssh.")

    return connected_public_ips, connected_instance_ids
def prepare_vms(self, parameters):
    """Register, check and prepare the flex cloud machines.

    Progress and failures are written to the user's data record
    (flex_cloud_status / flex_cloud_info_msg, followed by put()); every
    failure ends the call early. Steps, in order:

    1. Require a machine list under self.PARAM_FLEX_CLOUD_MACHINE_INFO.
    2. Store one VMStateModel per machine, with the state taken from a check
       of ports 22 and 443; stop if any machine is inaccessible.
    3. Require a queue head that accepts SSH, then prepare it.
    4. Require SSH access to every machine not flagged as queue head and
       prepare those workers through the agent.
    5. Update the celery configuration, report success and synchronize
       VMStateModel.

    Args:
        parameters: dict with 'user_id', 'credentials', the entries keyed by
            self.PARAM_FLEX_QUEUE_HEAD and self.PARAM_FLEX_CLOUD_MACHINE_INFO,
            and (when workers exist) 'reservation_id'.

    Returns:
        None.
    """
    # NOTE(review): `parameters` carries credentials; this debug line logs
    # them verbatim -- consider redacting.
    logging.debug('prepare_vms(): parameters={0}'.format(parameters))

    queue_head_machine = parameters[self.PARAM_FLEX_QUEUE_HEAD]
    user_data = self.__get_user_data(parameters['user_id'])

    def report(status, message):
        # Persist the current deployment status for the user.
        user_data.flex_cloud_status = status
        user_data.flex_cloud_info_msg = message
        user_data.put()

    flex_cloud_machine_info = parameters.get(
        self.PARAM_FLEX_CLOUD_MACHINE_INFO)
    if flex_cloud_machine_info is None or flex_cloud_machine_info == []:
        logging.error('Error: No {0} param!'.format(
            self.PARAM_FLEX_CLOUD_MACHINE_INFO))
        report(False, 'Invalid Parameters')
        return

    # Set the user message to "configuring..."
    report(True, 'Flex Cloud configured. Waiting for workers to become available...')

    # Initialize the VMstateModel db
    all_accessible = True
    for machine in flex_cloud_machine_info:
        if self.agent.check_network_ports(machine['ip'], [22, 443]):
            state = VMStateModel.STATE_ACCESSIBLE
        else:
            state = VMStateModel.STATE_INACCESSIBLE
            all_accessible = False
        vm_state = VMStateModel(
            state=state,
            infra=self.agent_type,
            ins_type=FlexConfig.INSTANCE_TYPE,
            pri_ip=machine['ip'],
            pub_ip=machine['ip'],
            username=machine['username'],
            keyfile=machine['keyfile'],
            ins_id=self.agent.get_flex_instance_id(machine['ip']),
            user_id=parameters['user_id'],
            res_id=self.reservation_id)
        vm_state.put()

    if not all_accessible:
        report(False, 'Error: not all workers are accessible')
        return

    # A missing queue head is handled on its own: the SSH failure message
    # below needs queue_head_machine['ip'], which would raise TypeError on
    # None instead of reporting the failure.
    if queue_head_machine is None:
        logging.error('Found no viable ssh-able/running queue head machine!')
        report(False, 'Error: No queue head machine was specified')
        return

    if not helper.wait_for_ssh_connection(
            queue_head_machine['keyfile'],
            queue_head_machine['ip'],
            username=queue_head_machine['username']):
        logging.error('Found no viable ssh-able/running queue head machine!')
        report(False,
               'Error: Can not connect {0} (queue head) via SSH'.format(
                   queue_head_machine['ip']))
        return

    if not self.__prepare_queue_head(queue_head_machine, parameters):
        logging.error('Error: could not prepare queue head!')
        report(False, 'Error preparing the queue head')
        return

    flex_cloud_workers = []
    for machine in flex_cloud_machine_info:
        if machine[self.PARAM_IS_QUEUE_HEAD] == True:
            continue
        if not helper.wait_for_ssh_connection(
                machine['keyfile'], machine['ip'],
                username=machine['username']):
            report(False,
                   'Error: Can not connect to {0} via SSH'.format(
                       machine['ip']))
            return
        flex_cloud_workers.append(machine)

    if flex_cloud_workers:
        logging.debug('Preparing workers: {0}'.format(flex_cloud_workers))
        params = {
            'infrastructure': AgentTypes.FLEX,
            self.PARAM_FLEX_CLOUD_MACHINE_INFO: flex_cloud_workers,
            'credentials': parameters['credentials'],
            'user_id': parameters['user_id'],
            self.PARAM_FLEX_QUEUE_HEAD:
                parameters[self.PARAM_FLEX_QUEUE_HEAD],
            'reservation_id': parameters['reservation_id'],
        }
        self.agent.prepare_instances(params)

    helper.update_celery_config_with_queue_head_ip(
        queue_head_ip=queue_head_machine['ip'], agent_type=self.agent_type)

    self.__configure_celery(params=parameters)

    # Report Success
    logging.debug('Flex Cloud Deployed')
    report(True, 'Flex Cloud Deployed')

    # Force the update of the instance status
    VMStateModel.synchronize(agent=self.agent, parameters=parameters)
def prepare_vms(self, parameters):
    """Validate, record and prepare the machines of a flex cloud deployment.

    The outcome is communicated through the user's data record: each stage
    sets flex_cloud_status and flex_cloud_info_msg and calls put(). Any
    failure is reported that way and ends the call. Stages, in order:

    1. A machine list must be present under
       self.PARAM_FLEX_CLOUD_MACHINE_INFO.
    2. A VMStateModel is stored for every machine; its state comes from a
       check of ports 22 and 443. Any inaccessible machine aborts.
    3. The queue head must exist, accept SSH, and be prepared successfully.
    4. Every machine not flagged as queue head must accept SSH; those workers
       are then prepared through the agent.
    5. The celery configuration is updated, success is reported and
       VMStateModel is synchronized.

    Args:
        parameters: dict with "user_id", "credentials", the entries keyed by
            self.PARAM_FLEX_QUEUE_HEAD and self.PARAM_FLEX_CLOUD_MACHINE_INFO,
            and (when workers exist) "reservation_id".

    Returns:
        None.
    """
    # NOTE(review): this logs `parameters` verbatim, credentials included.
    logging.debug("prepare_vms(): parameters={0}".format(parameters))

    queue_head_machine = parameters[self.PARAM_FLEX_QUEUE_HEAD]
    user_data = self.__get_user_data(parameters["user_id"])

    def report(status, message):
        # Write the deployment status to the user's record and save it.
        user_data.flex_cloud_status = status
        user_data.flex_cloud_info_msg = message
        user_data.put()

    flex_cloud_machine_info = parameters.get(self.PARAM_FLEX_CLOUD_MACHINE_INFO)
    if flex_cloud_machine_info is None or flex_cloud_machine_info == []:
        logging.error("Error: No {0} param!".format(self.PARAM_FLEX_CLOUD_MACHINE_INFO))
        report(False, "Invalid Parameters")
        return

    # Set the user message to "configuring..."
    report(True, "Flex Cloud configured. Waiting for workers to become available...")

    # Initialize the VMstateModel db
    all_accessible = True
    for machine in flex_cloud_machine_info:
        if self.agent.check_network_ports(machine["ip"], [22, 443]):
            state = VMStateModel.STATE_ACCESSIBLE
        else:
            state = VMStateModel.STATE_INACCESSIBLE
            all_accessible = False
        vm_state = VMStateModel(
            state=state,
            infra=self.agent_type,
            ins_type=FlexConfig.INSTANCE_TYPE,
            pri_ip=machine["ip"],
            pub_ip=machine["ip"],
            username=machine["username"],
            keyfile=machine["keyfile"],
            ins_id=self.agent.get_flex_instance_id(machine["ip"]),
            user_id=parameters["user_id"],
            res_id=self.reservation_id,
        )
        vm_state.put()

    if not all_accessible:
        report(False, "Error: not all workers are accessible")
        return

    # Handle a missing queue head separately: formatting the SSH failure
    # message indexes queue_head_machine["ip"], which raises TypeError on
    # None and would leave the failure unreported.
    if queue_head_machine is None:
        logging.error("Found no viable ssh-able/running queue head machine!")
        report(False, "Error: No queue head machine was specified")
        return

    queue_head_reachable = helper.wait_for_ssh_connection(
        queue_head_machine["keyfile"], queue_head_machine["ip"], username=queue_head_machine["username"]
    )
    if not queue_head_reachable:
        logging.error("Found no viable ssh-able/running queue head machine!")
        report(False, "Error: Can not connect {0} (queue head) via SSH".format(queue_head_machine["ip"]))
        return

    if not self.__prepare_queue_head(queue_head_machine, parameters):
        logging.error("Error: could not prepare queue head!")
        report(False, "Error preparing the queue head")
        return

    flex_cloud_workers = []
    for machine in flex_cloud_machine_info:
        if machine[self.PARAM_IS_QUEUE_HEAD] == True:
            continue
        if not helper.wait_for_ssh_connection(machine["keyfile"], machine["ip"], username=machine["username"]):
            report(False, "Error: Can not connect to {0} via SSH".format(machine["ip"]))
            return
        flex_cloud_workers.append(machine)

    if flex_cloud_workers:
        logging.debug("Preparing workers: {0}".format(flex_cloud_workers))
        params = {
            "infrastructure": AgentTypes.FLEX,
            self.PARAM_FLEX_CLOUD_MACHINE_INFO: flex_cloud_workers,
            "credentials": parameters["credentials"],
            "user_id": parameters["user_id"],
            self.PARAM_FLEX_QUEUE_HEAD: parameters[self.PARAM_FLEX_QUEUE_HEAD],
            "reservation_id": parameters["reservation_id"],
        }
        self.agent.prepare_instances(params)

    helper.update_celery_config_with_queue_head_ip(
        queue_head_ip=queue_head_machine["ip"], agent_type=self.agent_type
    )

    self.__configure_celery(params=parameters)

    # Report Success
    logging.debug("Flex Cloud Deployed")
    report(True, "Flex Cloud Deployed")

    # Force the update of the instance status
    VMStateModel.synchronize(agent=self.agent, parameters=parameters)