def main(options):
    """Delete Grid'5000 OAR jobs.

    With ``-j/--job_ids`` the given jobs are deleted; otherwise every running
    job of the current user (on all G5k sites) is listed, and the user is asked
    for confirmation before deletion.

    NOTE(review): the ``options`` parameter is unused — arguments are re-parsed
    from sys.argv here; confirm whether callers rely on passing options in.
    """
    parser = ArgumentParser(prog='delete_jobs_G5k')
    parser.add_argument('-j', '--job_ids',
                        dest='job_ids',
                        type=str,
                        help='Grid5000 job IDs')
    args = parser.parse_args()
    if args.job_ids:
        # Explicit job list given on the command line.
        # Assumes parse_job_ids() yields (job_id, site) pairs — TODO confirm.
        job_ids = g5k_experimenter.parse_job_ids(args.job_ids)
        print('Jobs will be deleted:')
    else:
        # No list given: collect all of the user's running jobs on every site.
        sites = get_g5k_sites()
        job_ids = get_current_oar_jobs(frontends=sites)
        print('All your running jobs:')
    # job_ids is a list of (job_id, site) tuples; shown as "site:job_id".
    print(''.join(['%s:%s\n' % (site, job_id) for job_id, site in job_ids]))
    decision = input('Do you want to delete those jobs [y/n]? ')
    if decision.lower().strip() == 'y':
        oardel(job_ids)
        print('Delete jobs successfully!')
    else:
        print('Bye bye!')
def provisioning(self):
    """Provision nodes on Grid5000 based on client's requirements.

    Makes the OAR reservation, waits for the resources (unless the
    reservation is for the future), and optionally deploys the OS with
    kadeploy, retrying up to ``self.max_deploy`` times on partial failure.
    """
    self.make_reservation()
    # skip waiting for resources to be up in case of making reservations for the future
    if not self.is_reservation:
        self.get_resources()
    else:
        # Future reservation: nothing more to do now, stop the program.
        exit()
    if not self.no_deploy_os:
        # Total host count across all sites, for logging only.
        n_nodes = sum([len(resource['hosts'])
                       for site, resource in self.resources.items()])
        logger.info('Starting setup on %s hosts' % n_nodes)
        deployed_hosts, undeployed_hosts = self._launch_kadeploy()
        # self._configure_ssh()
        # Retry provisioning again if all reserved hosts are not deployed successfully
        if len(undeployed_hosts) > 0:
            if self.max_deploy > 0:
                self.max_deploy -= 1
                # NOTE(review): this deletes the reservation only when
                # oar_job_ids is None — presumably the case where this run
                # created the reservation itself; confirm against callers.
                if self.oar_job_ids is None:
                    logger.info('Deleting the current reservation')
                    oardel(self.oar_result)
                    # Give OAR time to release the resources before retrying.
                    time.sleep(60)
                    self.oar_result = list()
                logger.info('---> Retrying provisioning nodes: attempt #%s' %
                            (MAX_RETRY_DEPLOY - self.max_deploy))
                # Recursive retry; bounded by max_deploy reaching 0.
                self.provisioning()
            else:
                raise Exception(
                    'Failed to deploy all reserved nodes. Terminate the program.'
                )
    logger.info("Finish provisioning nodes\n")
def run(self):
    """Drive the experiment workflow.

    Builds the combination queue from the config file, then repeatedly
    provisions an environment (when none is alive), runs the workflow on the
    next combination, and deletes the OAR reservation once it dies.
    """
    logger.debug('Parse and convert configs for G5K provisioner')
    self.configs = parse_config_file(self.args.config_file_path)

    logger.debug('Creating the combination list')
    sweeper = create_combs_queue(
        result_dir=self.configs['exp_env']['results_dir'],
        parameters=self.configs['parameters'])

    oar_job_ids = None
    logger.info('Running the experiment workflow')
    while sweeper.get_remaining():
        # Provision a fresh environment when we have no live reservation.
        if oar_job_ids is None:
            logger.info('Setting the experiment environment')
            oar_job_ids = self.setup_env()

        next_comb = sweeper.get_next()
        sweeper = self.run_workflow(comb=next_comb, sweeper=sweeper)

        # Reservation expired or was killed: release it and re-provision
        # on the next iteration.
        if not is_job_alive(oar_job_ids):
            oardel(oar_job_ids)
            oar_job_ids = None
    logger.info('Finish the experiment!!!')
def run(self):
    """Execute a test suite.

    The execution workflow is as follows:
    1. Parse command-line arguments.
    2. Define the parameters of the tests from the specified configuration
       file. Generate all the combinations to test from the given parameters.
    3. Consume the combinations.
       3.1. Setup the cluster if it has not been done (first time or after a
            reservation ends).
       3.2. Load the dataset into the Hadoop cluster.
       3.3. Perform the experiments corresponding to the combinations linked
            to the loaded dataset.
    4. Clean all resources.
    """
    # Get parameters
    self.cluster = self.args[0]
    self.n_nodes = int(self.args[1])
    self.config_file = self.args[2]
    self.site = get_cluster_site(self.cluster)
    if not os.path.exists(self.config_file):
        logger.error("Params file " + self.config_file + " does not exist")
        sys.exit(1)

    # Set oar job id (reuse an existing reservation when one is given)
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    # Main
    try:
        # Creation of the main iterator used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            # SETUP
            # If no job, we make a reservation and prepare the hosts for the
            # experiments
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
                success = self.setup()
                if not success:
                    break
            else:
                self.hosts = get_oar_job_nodes(self.oar_job_id,
                                               self.frontend)
                if not self.hc:
                    self.hc = HadoopCluster(self.hosts)
            # SETUP FINISHED

            # Getting the next combination (which requires a ds deployment)
            comb = self.sweeper.get_next()
            self.raw_comb = comb.copy()
            self.comb = comb
            self.prepare_dataset(comb)
            self.xp_wrapper(comb)

            # Subloop over the combinations that use the same dataset
            while True:
                newcomb = self.sweeper.get_next(
                    lambda r: filter(self._uses_same_ds, r))
                if newcomb:
                    self.raw_comb = newcomb.copy()
                    try:
                        self.xp_wrapper(newcomb)
                    # FIX: was a bare `except:`, which also swallowed
                    # SystemExit/KeyboardInterrupt; narrowed to Exception.
                    except Exception:
                        break
                else:
                    break

            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                # FIX: removed a dead `pass` statement that preceded the
                # deletion logic.
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')

        # Clean cluster
        if self.hc:
            if self.hc.initialized:
                self.hc.clean()

        # Close summary files
        if self.summary_file:
            self.summary_file.close()
        if self.ds_summary_file:
            self.ds_summary_file.close()
def run(self):
    """Inherited method, put here the code for running the engine.

    Reserves hosts (or reuses a given OAR job), then runs one TestThread per
    host to consume the combinations, until none remain. Always cleans up the
    reservation (unless --keep-alive) and the stats manager.
    """
    # Get parameters
    self.cluster = self.args[0]
    self.n_nodes = int(self.args[1])
    self.config_file = self.args[2]
    self.site = get_cluster_site(self.cluster)
    if not os.path.exists(self.config_file):
        logger.error("Params file " + self.config_file + " does not exist")
        sys.exit(1)

    # Set oar job id (reuse an existing reservation when one is given)
    if self.options.oar_job_id:
        self.oar_job_id = self.options.oar_job_id
    else:
        self.oar_job_id = None

    # Main
    try:
        # Creation of the main iterator used for the first control loop.
        self.define_parameters()
        job_is_dead = False
        # While there are combinations to treat
        while len(self.sweeper.get_remaining()) > 0:
            ## SETUP
            # If no job, we make a reservation and prepare the hosts for the
            # experiments
            if job_is_dead or self.oar_job_id is None:
                self.make_reservation()
                success = self.setup()
                if not success:
                    break
            else:
                self.hosts = get_oar_job_nodes(self.oar_job_id,
                                               self.frontend)
            ## SETUP FINISHED

            logger.info("Setup finished in hosts " + str(self.hosts))

            # One worker thread per host; threads pull combinations from the
            # shared comb_manager and report to the shared stats_manager.
            test_threads = []
            for h in self.hosts:
                t = TestThread(h, self.comb_manager, self.stats_manager)
                test_threads.append(t)
                t.name = "th_" + str(h.address).split(".")[0]
                t.start()

            for t in test_threads:
                t.join()

            if get_oar_job_info(self.oar_job_id,
                                self.frontend)['state'] == 'Error':
                job_is_dead = True
    finally:
        if self.oar_job_id is not None:
            if not self.options.keep_alive:
                # FIX: removed a dead `pass` statement that preceded the
                # deletion logic.
                logger.info('Deleting job')
                oardel([(self.oar_job_id, self.frontend)])
            else:
                logger.info('Keeping job alive for debugging')

        # Close stats
        self.stats_manager.close()
        # NOTE(review): this first chunk is the tail of a method whose `def`
        # is not visible here (it mirrors the workflow loop of the provisioner
        # engine's run()); indentation reconstructed — confirm against the
        # enclosing definition.
        sweeper = self.run_workflow(comb=comb, sweeper=sweeper)
        # Reservation expired: release it so the next iteration re-provisions.
        if not is_job_alive(oar_job_ids):
            oardel(oar_job_ids)
            oar_job_ids = None
    logger.info('Finish the experiment!!!')


if __name__ == "__main__":
    # Script entry point: build the engine, run it, and always decide the
    # fate of the reservation afterwards.
    logger.info("Init engine in %s" % __file__)
    engine = performing_action_template()
    try:
        logger.info("Start engine in %s" % __file__)
        engine.start()
    except Exception as e:
        logger.error('Program is terminated by the following exception: %s'
                     % e, exc_info=True)
        traceback.print_exc()
    except KeyboardInterrupt:
        logger.info('Program is terminated by keyboard interrupt.')
    # Cleanup runs whether the engine finished, crashed, or was interrupted.
    if not engine.args.keep_alive:
        logger.info('Deleting reservation')
        oardel(engine.oar_result)
        logger.info('Reservation deleted')
    else:
        logger.info('Reserved nodes are kept alive for inspection purpose.')