Example #1
from argparse import ArgumentParser

from execo_g5k import get_current_oar_jobs, get_g5k_sites, oardel

import g5k_experimenter  # project-local module providing parse_job_ids()


def main(options):
    parser = ArgumentParser(prog='delete_jobs_G5k')

    parser.add_argument('-j',
                        '--job_ids',
                        dest='job_ids',
                        type=str,
                        help='Grid5000 job IDs')

    args = parser.parse_args()
    if args.job_ids:
        job_ids = g5k_experimenter.parse_job_ids(args.job_ids)
        print('The following jobs will be deleted:')
    else:
        sites = get_g5k_sites()
        job_ids = get_current_oar_jobs(frontends=sites)
        print('All your running jobs:')

    # job_ids is a list of (job_id, site) tuples, the shape oardel() expects
    print(''.join(['%s:%s\n' % (site, job_id) for job_id, site in job_ids]))

    decision = input('Do you want to delete those jobs [y/n]? ')
    if decision.lower().strip() == 'y':
        oardel(job_ids)
        print('Jobs deleted successfully!')
    else:
        print('Bye bye!')
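
A note on the helper used above: `g5k_experimenter.parse_job_ids` is project-specific and not shown. A minimal sketch of what it might look like, assuming job IDs arrive as a comma-separated list of `site:job_id` pairs (the input format and the function body are assumptions inferred from the print loop above):

def parse_job_ids(job_ids_str):
    """Hypothetical helper: turn 'nancy:1234,lyon:5678' into
    [(1234, 'nancy'), (5678, 'lyon')], the (job_id, site) shape
    that oardel() accepts."""
    job_ids = []
    for token in job_ids_str.split(','):
        site, job_id = token.strip().split(':')
        job_ids.append((int(job_id), site))
    return job_ids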
Example #2
    def provisioning(self):
        """Provision nodes on Grid5000 based on client's requirements
        """
        self.make_reservation()

        # Skip waiting for resources to come up when the reservation is for a future date
        if not self.is_reservation:
            self.get_resources()
        else:
            exit()

        if not self.no_deploy_os:
            n_nodes = sum(len(resource['hosts'])
                          for resource in self.resources.values())
            logger.info('Starting setup on %s hosts' % n_nodes)
            deployed_hosts, undeployed_hosts = self._launch_kadeploy()
            # self._configure_ssh()

            # Retry provisioning if not all reserved hosts were deployed successfully
            if len(undeployed_hosts) > 0:
                if self.max_deploy > 0:
                    self.max_deploy -= 1
                    if self.oar_job_ids is None:
                        logger.info('Deleting the current reservation')
                        oardel(self.oar_result)
                        time.sleep(60)
                        self.oar_result = list()
                    logger.info(
                        '---> Retrying provisioning nodes: attempt #%s' %
                        (MAX_RETRY_DEPLOY - self.max_deploy))
                    self.provisioning()
                else:
                    raise Exception(
                        'Failed to deploy all reserved nodes. Terminating the program.'
                    )
        logger.info("Finish provisioning nodes\n")
Example #3
    def run(self):
        logger.debug('Parse and convert configs for G5K provisioner')
        self.configs = parse_config_file(self.args.config_file_path)

        logger.debug('Creating the combination list')
        sweeper = create_combs_queue(
            result_dir=self.configs['exp_env']['results_dir'],
            parameters=self.configs['parameters'])

        oar_job_ids = None
        logger.info('Running the experiment workflow')
        while len(sweeper.get_remaining()) > 0:
            if oar_job_ids is None:
                logger.info('Setting the experiment environment')
                oar_job_ids = self.setup_env()

            comb = sweeper.get_next()
            sweeper = self.run_workflow(comb=comb, sweeper=sweeper)

            # Renew the reservation when the current OAR job has expired
            if not is_job_alive(oar_job_ids):
                oardel(oar_job_ids)
                oar_job_ids = None

        logger.info('Finished the experiment!')
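
`is_job_alive` is not defined in these examples. A plausible implementation on top of `execo_g5k.get_oar_job_info`, assuming `oar_job_ids` is the usual list of `(job_id, site)` tuples and treating 'Waiting' and 'Running' as the live states (that state set is an assumption):

from execo_g5k import get_oar_job_info


def is_job_alive(oar_job_ids):
    """Hypothetical helper: True while every reserved OAR job is
    still waiting or running on its site."""
    for job_id, site in oar_job_ids:
        state = get_oar_job_info(job_id, site).get('state')
        if state not in ('Waiting', 'Running'):
            return False
    return True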
Example #4
    def run(self):
        """Execute a test suite. The execution workflow is as follows:

        1. Parse command-line arguments.

        2. Define the parameters of the tests from the specified configuration
        file. Generate all the combinations to test from the given parameters.

        3. Consume the combinations.

          3.1. Set up the cluster if it has not been done yet (first time or
          after a reservation ends).

          3.2. Load the dataset into the Hadoop cluster.

          3.3. Perform the experiments corresponding to the combinations linked
          to the loaded dataset.

        4. Clean all resources.
        """

        # Get parameters
        self.cluster = self.args[0]
        self.n_nodes = int(self.args[1])
        self.config_file = self.args[2]
        self.site = get_cluster_site(self.cluster)

        if not os.path.exists(self.config_file):
            logger.error("Params file " + self.config_file + " does not exist")
            sys.exit(1)

        # Set oar job id
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        # Main
        try:
            # Creation of the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:

                # SETUP
                # If no job, we make a reservation and prepare the hosts for the
                # experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    success = self.setup()
                    if not success:
                        break
                else:
                    self.hosts = get_oar_job_nodes(self.oar_job_id,
                                                   self.frontend)
                if not self.hc:
                    self.hc = HadoopCluster(self.hosts)
                # SETUP FINISHED

                # Getting the next combination (which requires a ds deployment)
                comb = self.sweeper.get_next()
                self.raw_comb = comb.copy()
                self.comb = comb
                self.prepare_dataset(comb)
                self.xp_wrapper(comb)

                # subloop over the combinations that use the same dataset
                while True:
                    newcomb = self.sweeper.get_next(
                        lambda r: list(filter(self._uses_same_ds, r)))
                    if newcomb:
                        self.raw_comb = newcomb.copy()
                        try:
                            self.xp_wrapper(newcomb)
                        except Exception:
                            break
                    else:
                        break

                # The OAR job expired: make a new reservation on the next pass
                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

            # Clean cluster
            if self.hc:
                if self.hc.initialized:
                    self.hc.clean()

            # Close summary files
            if self.summary_file:
                self.summary_file.close()
            if self.ds_summary_file:
                self.ds_summary_file.close()
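
The inner loop keeps consuming combinations that reuse the dataset already loaded, filtered through `self._uses_same_ds`, which is not shown. A minimal sketch of such a predicate, assuming combinations are dicts whose dataset parameters share a 'ds.' key prefix (the prefix convention is an assumption):

    def _uses_same_ds(self, candidate):
        """Hypothetical predicate: the candidate combination can reuse
        the currently deployed dataset if every 'ds.'-prefixed
        parameter matches the combination the dataset was loaded for."""
        return all(candidate[k] == self.raw_comb[k]
                   for k in self.raw_comb
                   if k.startswith('ds.'))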
Example #6
    def run(self):
        """Inherited method, put here the code for running the engine."""

        # Get parameters
        self.cluster = self.args[0]
        self.n_nodes = int(self.args[1])
        self.config_file = self.args[2]
        self.site = get_cluster_site(self.cluster)

        if not os.path.exists(self.config_file):
            logger.error("Params file " + self.config_file + " does not exist")
            sys.exit(1)

        # Set oar job id
        if self.options.oar_job_id:
            self.oar_job_id = self.options.oar_job_id
        else:
            self.oar_job_id = None

        # Main
        try:
            # Creation of the main iterator used for the first control loop.
            self.define_parameters()

            job_is_dead = False
            # While there are combinations to treat
            while len(self.sweeper.get_remaining()) > 0:

                ## SETUP
                # If no job, we make a reservation and prepare the hosts for the
                # experiments
                if job_is_dead or self.oar_job_id is None:
                    self.make_reservation()
                    success = self.setup()
                    if not success:
                        break
                else:
                    self.hosts = get_oar_job_nodes(self.oar_job_id,
                                                   self.frontend)
                ## SETUP FINISHED

                logger.info("Setup finished in hosts " + str(self.hosts))

                test_threads = []
                for h in self.hosts:
                    t = TestThread(h, self.comb_manager, self.stats_manager)
                    test_threads.append(t)
                    t.name = "th_" + str(h.address).split(".")[0]
                    t.start()

                for t in test_threads:
                    t.join()

                if get_oar_job_info(self.oar_job_id,
                                    self.frontend)['state'] == 'Error':
                    job_is_dead = True

        finally:
            if self.oar_job_id is not None:
                if not self.options.keep_alive:
                    logger.info('Deleting job')
                    oardel([(self.oar_job_id, self.frontend)])
                else:
                    logger.info('Keeping job alive for debugging')

            # Close stats
            self.stats_manager.close()
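
`TestThread` is defined elsewhere in this engine. A minimal sketch of its shape, assuming `comb_manager.get_next()` returns a falsy value when no combinations remain and `stats_manager.record()` stores one result (both interfaces and the placeholder test body are assumptions):

import threading


class TestThread(threading.Thread):
    """Hypothetical worker: consumes combinations for one host until
    the manager runs dry, recording one result per combination."""

    def __init__(self, host, comb_manager, stats_manager):
        super().__init__()
        self.host = host
        self.comb_manager = comb_manager
        self.stats_manager = stats_manager

    def run(self):
        comb = self.comb_manager.get_next()
        while comb:
            # Placeholder for the real per-combination test body.
            result = {'host': self.host, 'comb': comb}
            self.stats_manager.record(comb, result)
            comb = self.comb_manager.get_next()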


if __name__ == "__main__":
    logger.info("Init engine in %s" % __file__)
    engine = performing_action_template()

    try:
        logger.info("Start engine in %s" % __file__)
        engine.start()
    except Exception as e:
        logger.error('Program is terminated by the following exception: %s'
                     % e, exc_info=True)
        traceback.print_exc()
    except KeyboardInterrupt:
        logger.info('Program is terminated by keyboard interrupt.')

    if not engine.args.keep_alive:
        logger.info('Deleting reservation')
        oardel(engine.oar_result)
        logger.info('Reservation deleted')
    else:
        logger.info('Reserved nodes are kept alive for inspection purposes.')
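
One caveat in the driver above: if `engine.start()` fails before any reservation is made, `engine.oar_result` may be empty when `oardel` is called. A defensive variant of the cleanup, assuming `oar_result` is a list of `(job_id, site)` tuples (the helper and its name are assumptions):

import logging

from execo_g5k import oardel

logger = logging.getLogger(__name__)


def cleanup_reservation(oar_result, keep_alive=False):
    """Sketch: delete the OAR reservation unless the nodes should be
    kept alive for inspection; tolerate an empty result list."""
    if keep_alive:
        logger.info('Reserved nodes are kept alive for inspection purposes.')
        return
    if oar_result:
        logger.info('Deleting reservation')
        oardel(oar_result)
        logger.info('Reservation deleted')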