Code example #1
def run(api_name, server_name, job_id, project_file, anal_file, storage,
        client_login, client_ip, api_version):
    """
    Upload the project file, analyse it and save the analysis

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (e.g. apidev.zephycloud.com)
    :type server_name:              str
    :param job_id:                  The id of the job to run
    :type job_id:                   int
    :param project_file:            The raw project file to save
    :type project_file:             str
    :param anal_file:               The analysed project file
    :type anal_file:                str
    :param storage:                 The name of the storage where the project will be located
    :type storage:                  str
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client IP address of the HTTP request that started this job
    :type client_ip:                str
    :param api_version:             The version of the HTTP API used to launch this job
    :type api_version:              str
    """
    try:
        # Configure better logging name
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Loading required information from database
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
        project = models.projects.get_project(job['user_id'],
                                              job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " +
                                          str(job['project_uid']))

        try:
            models.projects.set_project_status(
                project["user_id"], job["project_uid"],
                models.projects.PROJECT_STATUS_ANALYSING)

            link(job, project, project_file, anal_file)
            models.projects.set_project_status(
                project["user_id"], job["project_uid"],
                models.projects.PROJECT_STATUS_ANALYSED)
        except error_util.all_errors:
            with error_util.before_raising():
                models.projects.set_project_status(
                    project["user_id"], job["project_uid"],
                    models.projects.PROJECT_STATUS_RAW)
    finally:
        if os.path.exists(project_file):
            os.remove(project_file)
        if os.path.exists(anal_file):
            os.remove(anal_file)
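Every example on this page pairs except error_util.all_errors: with error_util.before_raising() to run cleanup code while an exception is propagating. The library itself is not shown on this page; a minimal sketch of a context manager with that behaviour (an assumption about error_util, not its actual code) could look like this:

import contextlib
import sys

@contextlib.contextmanager
def before_raising():
    # Capture the exception the caller is currently handling.
    exc_type, exc_value, exc_tb = sys.exc_info()
    try:
        yield  # the cleanup code of the with-block runs here
    except BaseException:
        # A failure during cleanup must not mask the original error;
        # a real implementation would probably log it instead.
        if exc_value is None:
            raise
    finally:
        if exc_value is not None:
            raise exc_value.with_traceback(exc_tb)

Under that assumption, the with error_util.before_raising(): blocks below run their cleanup and then automatically re-raise the exception caught by the enclosing except, which is why none of the examples contain an explicit raise.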
Code example #2
    def __enter__(self):
        proc = None
        try:
            log_info_file = util.path_join(api_util.WORKER_INPUT_PATH, "log_info.json")
            log_info = {
                "jobid": self._job_id,
                "job_type": self._command,
                "api_name": self._api_name,
                "server_name": self._server_name,
                "instance": self._running_worker.worker_ids[0],
                "provider": self._running_worker.provider_name
            }
            with file_util.temp_file(json.dumps(log_info)) as tmp_filepath:
                self.conn.send_file(tmp_filepath, log_info_file)
            self.conn.run(["chmod", "a+r", log_info_file])
            proc = self.conn.run_async(["python", api_util.WORKER_RUNNER_PATH])

            # Send the task parameter to the worker
            task_params_file = os.path.join(api_util.WORKER_INPUT_PATH, "task_params.json")
            task_params = {
                "jobid": self._job_id,
                "project_uid": self._project_uid,
                "toolchain": self._command,
                "params": json.dumps(self._params),
                "shutdown": "0" if self._running_worker.is_debug else "1"
            }
            with file_util.temp_file(json.dumps(task_params)) as tmp_filepath:
                self.conn.send_file(tmp_filepath, task_params_file)
            self.conn.run(["chmod", "a+r", task_params_file])
            self._proc = proc
        except error_util.abort_errors:
            with error_util.before_raising():
                log.info("Signal received, stopping process")
                if proc:
                    proc_util.ensure_stop_proc(proc, 2)
        except error_util.all_errors:
            with error_util.before_raising():
                try:
                    if proc:
                        proc_util.ensure_kill_proc(proc)
                except error_util.abort_errors:
                    pass
                except error_util.all_errors as e:
                    logging.getLogger("aziugo").exception(e)
        return self
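__enter__ stores the started runner in self._proc and returns self, so instances of this class are used as context managers (examples #9 and #11 below do so through cmd_util.TaskProcess). The matching __exit__ is not part of this snippet; a hypothetical counterpart, assuming its only duty is to stop the remote runner, might look like:

    def __exit__(self, exc_type, exc_value, traceback):
        # Hypothetical sketch: stop the runner when the with-block exits,
        # mirroring the stop/kill helpers used in __enter__ above.
        if self._proc:
            try:
                proc_util.ensure_stop_proc(self._proc, 2)
            except error_util.all_errors:
                proc_util.ensure_kill_proc(self._proc)
        return False  # never swallow the caller's exception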
Code example #3
    def init(self):
        try:
            self._placement_group = self._cloud.create_placement_group(
                self.get_name())
            self._security_group = self._cloud.create_security_group_for_cluster(
                self.get_name())
        except error_util.all_errors:
            with error_util.before_raising():
                self.clean()
Code example #4
    def init(self):
        try:
            self._shared_volume = re.sub(
                "_+", "_", re.sub("[^a-z0-9_]+", "_", self._name.lower()))
            cmd = ['docker', 'volume', 'create', self._shared_volume]
            subprocess.check_output(cmd, cwd=API_PATH, stderr=subprocess.PIPE)
        except error_util.all_errors:
            with error_util.before_raising():
                self.clean()
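The nested re.sub calls derive a Docker-safe volume name from self._name: every run of characters outside [a-z0-9_] becomes an underscore, and consecutive underscores are then collapsed to one. A quick illustration (the name is made up):

import re

name = "My Cluster #3"
volume = re.sub("_+", "_", re.sub("[^a-z0-9_]+", "_", name.lower()))
print(volume)  # -> my_cluster_3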
Code example #5
def try_append_file_to_project(user_id,
                               project_codename,
                               file_path,
                               filename=None,
                               key=None):
    generated = append_file_to_project(user_id, project_codename, file_path,
                                       filename, key)
    try:
        yield generated
    except error_util.all_errors:
        with error_util.before_raising():
            try:
                remove_file_from_project(user_id, project_codename,
                                         generated['id'])
            except error_util.all_errors as e:
                log.error("Unable to remove file " + repr(generated))
                error_util.log_error(log, e)
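try_append_file_to_project yields once, so it is presumably decorated with @contextlib.contextmanager just above this snippet (the decorator is not visible here). Under that assumption, a call site would look like this (hypothetical usage; register_file stands in for whatever the caller does with the yielded record):

with try_append_file_to_project(user_id, project_codename, "/tmp/upload.zip") as generated:
    register_file(generated['id'])  # if this raises, the appended file is removed again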
Code example #6
    def uploading_file(self, local_src, dest_filename):
        """
        Upload the file, yield so the caller can work with it, and remove the file if something goes wrong

        :param local_src:           The local path of the file to send to this storage
        :type local_src:            str
        :param dest_filename:       The name of the file on the cloud storage
        :type dest_filename:        str
        :return:                    The url of the file on the cloud storage, if any
        :rtype:                     str|None
        """
        result = self.upload_file(local_src, dest_filename)
        try:
            yield result
        except error_util.all_errors:
            with error_util.before_raising():
                self.delete_file(dest_filename)
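Like example #5, this is a generator meant to be used as a context manager: the upload happens before the yield, and the uploaded copy is deleted only if the caller's block fails. A hypothetical call site (storage and record_url are placeholders):

with storage.uploading_file("/tmp/result.zip", "result.zip") as url:
    record_url(url)  # if this raises, dest_filename is deleted from the storage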
Code example #7
def run_garbage_collector(api_name,
                          server_name,
                          redis_host="localhost",
                          redis_port=6379,
                          data_db=0,
                          pubsub_db=1):
    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)

    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host,
                                          redis_port, data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    allowed_providers = json.loads(conf.get("general", "allowed_providers"))
    providers = []
    for provider_name in allowed_providers:
        providers.append(api_util.get_provider(provider_name))
    allowed_storages = json.loads(conf.get("general", "allowed_storages"))
    storages = []
    for storage_name in allowed_storages:
        storages.append(api_util.get_storage(storage_name))

    running_jobs = RunningJobs()

    thread_list = []
    for provider in providers:
        running_workers = RunningWorkers()
        worker_collector = WorkerCollector(provider, api_name, server_name,
                                           running_jobs, running_workers)
        worker_collector.start()
        thread_list.append(worker_collector)
        provider_artefact_collector = ProviderArtefactCollector(
            provider, running_jobs, running_workers)
        provider_artefact_collector.start()
        thread_list.append(provider_artefact_collector)

    job_collector = JobCollector(running_jobs)
    job_collector.start()
    thread_list.append(job_collector)

    model_collector = ModelCollector()
    model_collector.start()
    thread_list.append(model_collector)

    # FIXME: Disable for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        while True:
            time.sleep(0.1)
            for proc in thread_list:
                if not proc.is_alive():
                    proc.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if error_util.is_abort(e):
                log.info("Signal received, exiting...")
            else:
                error_util.log_error(log, e)
            log.info("Garbage collection cleaning...")
            stop_and_join(thread_list)
            log.info("Garbage collection is cleaned")

    log.info("Garbage collection cleaning...")
    stop_and_join(thread_list)
    log.info("Garbage collection is cleaned")
Code example #8
def run(api_name, server_name, job_id, project_codename, mesh_name, calc_id,
        calc_param_file, provider_name, machine, nbr_machines, split_results,
        client_login, client_ip, api_version):
    """
    Run the calculation job

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (e.g. apidev.zephycloud.com)
    :type server_name:              str
    :param job_id:                  The id of the job to run
    :type job_id:                   int
    :param project_codename:        The uid of the project
    :type project_codename:         str
    :param mesh_name:               The name of the mesh
    :type mesh_name:                str
    :param calc_id:                 The id of the calculation to launch
    :type calc_id:                  int
    :param calc_param_file:         The name of the param file
    :type calc_param_file:          str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param split_results:           Whether the result file should be split
    :type split_results:            bool
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client IP address of the HTTP request that started this job
    :type client_ip:                str
    :param api_version:             The version of the HTTP API used to launch this job
    :type api_version:              str
    """

    try:
        # Configure better logging name
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Loading required information from database
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
        project = models.projects.get_project(job['user_id'],
                                              job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " +
                                          str(job['project_uid']))
        user_id = project["user_id"]
        mesh = models.meshes.get_mesh(user_id, project_codename, mesh_name)
        if not mesh:
            raise api_util.ToolchainError("Unknown mesh " + str(mesh_name) +
                                          " in project " +
                                          str(job['project_uid']))
        calculation = models.calc.get_calc(user_id, project['uid'], calc_id)
        if not calculation:
            raise api_util.ToolchainError("Unknown calculation " +
                                          str(calc_id))
        calc_name = calculation['name']
        try:
            models.calc.set_job(user_id, project_codename, calc_name, job_id)
            models.calc.set_calc_status(user_id, project_codename, calc_name,
                                        models.calc.STATUS_RUNNING)
            calculate(api_name, server_name, job, project, mesh, calculation,
                      calc_param_file, provider_name, machine, nbr_machines,
                      split_results, client_login, client_ip, api_version)
            calculation = models.calc.get_calc(user_id, project['uid'],
                                               calc_id)
            if calculation['status'] != models.calc.STATUS_STOPPED:
                models.calc.set_calc_status(user_id, project_codename,
                                            calc_name,
                                            models.calc.STATUS_COMPUTED)
        except error_util.abort_errors:
            with error_util.before_raising():
                models.calc.set_calc_status(user_id, project_codename,
                                            calc_name,
                                            models.calc.STATUS_CANCELED)
        except error_util.all_errors:
            with error_util.before_raising():
                models.calc.set_calc_status(user_id, project_codename,
                                            calc_name,
                                            models.calc.STATUS_KILLED)
    finally:
        if os.path.exists(calc_param_file):
            os.remove(calc_param_file)
Code example #9
def calculate(api_name, server_name, job, project, mesh, calculation,
              calc_param_file, provider_name, machine, nbr_machines,
              split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results, and save the results

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (e.g. apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param mesh:                    The mesh used for this calculation
    :type mesh:                     dict[str, any]
    :param calculation:             The calculation to launch
    :type calculation:              dict[str, any]
    :param calc_param_file:         The main job parameter file
    :type calc_param_file:          str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param split_results:           Whether the result file should be split
    :type split_results:            bool
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client IP address of the HTTP request that started this job
    :type client_ip:                str
    :param api_version:             The version of the HTTP API used to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id,
                                       "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename,
                                        result_name + "_workfiles.zip")
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename,
                                              result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename,
                                          result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename,
                                     calculation['name'], calc_param_file)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc",
                                      workers, [split_results]) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(
                    conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)

                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "project_file.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "anal.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id,
                                           project_codename,
                                           storage,
                                           worker_in_storage,
                                           "mesh.zip",
                                           tmp_folder,
                                           file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file,
                                              "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(
                    0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    elapsed = (datetime.datetime.utcnow() -
                               last_fetched_progress_time)
                    if elapsed.seconds > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename,
                                       calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    if not is_stopped:
                        calculation = models.calc.get_calc(
                            user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " +
                                                          str(calc_id) +
                                                          " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage,
                                            tmp_folder)

                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " +
                                  str(iterations_file) + " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage,
                                                    storage, tmp_folder)

                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) +
                                  " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage,
                                                tmp_folder)

                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage,
                                                  tmp_folder)
                else:
                    log.warning("No internal files found on server")

                fetch_progress(conn, user_id, project_codename,
                               calculation['name'], calculation['id'], storage,
                               tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(project["user_id"], job_id,
                                               "Cloud computation cost")

        # Uploading file on cloud storage
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(
            user_id) if internal_file.saved else None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id,
                                          iterations_file.file_id,
                                          reduce_file.file_id,
                                          internal_file_id)
        else:
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id, None, None,
                                          internal_file_id)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
Code example #10
def using_workers(api_name, provider, job_id, machine, nbr_machines, tags, debug_keep_instances_alive=False):
    machine_cost = models.provider_config.get_machine_provider_cost(provider.name, machine)
    if not machine_cost:
        raise RuntimeError("Unable to get the cost for provider " + str(provider.name))
    instance_price = api_util.price_to_float(machine_cost["cost_per_sec"]) * 3600  # In $/h, for aws spots
    nbr_machines = int(nbr_machines)
    alive_thread = None
    if nbr_machines == 1:
        workers = []
        try:
            log.info("Launching worker on provider " + str(provider.name))
            workers = provider.create_workers(int(nbr_machines), machine=machine, spot_price=instance_price)
            log.info("worker created")
            main_worker = workers[0]
            if main_worker.specific_cost:
                models.jobs.set_job_specific_cost(job_id, provider.name, machine, main_worker.specific_cost,
                                                  machine_cost["currency"], machine_cost["sec_granularity"],
                                                  machine_cost["min_sec_granularity"])

            # Tag instance
            provider.tag_workers(workers, {'Name': api_name + "_worker/job_" + str(job_id), "type": "worker"})
            if not debug_keep_instances_alive:
                debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
            tags = copy.copy(tags)
            tags['debug'] = "true" if debug_keep_instances_alive else "false"
            provider.tag_workers(workers, tags)

            # Connect to the worker
            ip = main_worker.public_ip if main_worker.public_ip else main_worker.private_ip
            log.info("Waiting for worker ssh connection to " + str(ip) + " ...")
            conn = ssh.SshConnection(ip, "aziugo", provider.get_key_path())
            conn.wait_for_connection()
            log.info("Connection with worker established")
            alive_thread = KeepAliveWorkerThread(conn)
            alive_thread.start()

            yield RunningWorkers(provider, workers, conn, debug_keep_instances_alive)
        finally:
            if alive_thread:
                alive_thread.stop()
                alive_thread.join()

            if workers and provider:
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    log.debug("Worker cleaning is disabled for debug purpose")
                else:
                    log.info("Stopping workers...")
                    cleanup_failed = False
                    try:
                        provider.terminate_workers(workers)
                    except error_util.abort_errors:
                        with error_util.before_raising():
                            try:
                                provider.terminate_workers(workers)
                                log.info("Workers stopped")
                            except error_util.abort_errors:
                                log.warning("Worker cleaned aborted.")
                                msg = "Workers of job "+str(job_id)+" are not killed. Please kill them manually"
                                log.error(msg)
                                api_util.send_admin_email("Worker cleaned aborted.", msg)
                    except error_util.all_errors as e:
                        cleanup_failed = True
                        msg = "Workers of job " + str(job_id) + " are not killed. Please kill them manually"
                        log.error(msg)
                        error_util.log_error(log, e)
                        api_util.send_admin_email("Worker cleaned aborted.", msg)
                    if not cleanup_failed:
                        log.info("Workers stopped")

    else:
        machine_info = models.provider_config.get_machine(provider.name, machine)
        if not machine_info:
            raise RuntimeError("Unable to get the description of machine " + str(machine))
        nbr_cores = int(machine_info['nbr_cores'])
        cluster_tags = copy.copy(tags)
        if not debug_keep_instances_alive:
            debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
        cluster_tags.update({
            "debug": "true" if debug_keep_instances_alive else "false",
            '%master%_Name': api_name + "_worker/job_" + str(job_id),
            '%master%_type': "cluster master",
            '%slave%_Name': api_name + "_worker/job_" + str(job_id) + " slave %slave_index%",
            '%slave%_type': "cluster slave",
        })
        log.info("Launching worker on provider " + str(provider.name))
        with core.cluster.Cluster(provider, "aziugo", nbr_cores, str(job_id), machine=machine,
                                  spot_price=instance_price, tags=cluster_tags,
                                  debug_no_terminate=debug_keep_instances_alive) as cluster:
            try:
                log.info("Main worker launched, with id " + str(cluster.master_id))

                log.info("Launching " + str(nbr_machines - 1) + " slave workers...")
                cluster.add_slaves(nbr_machines - 1)
                log.info("Slave workers launched")

                # Connect to the worker
                log.info("Waiting for worker ssh connection to "+str(cluster.ip)+" ...")
                conn = ssh.SshConnection(cluster.ip, "aziugo", provider.get_key_path())
                conn.wait_for_connection()
                log.info("Connection with worker established")
                alive_thread = KeepAliveClusterThread(cluster)
                alive_thread.start()
                yield RunningWorkers(provider, cluster.workers, conn, debug_keep_instances_alive)
            finally:
                if alive_thread:
                    alive_thread.stop()
                    alive_thread.join()

                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                    if debug_keep_instances_alive:
                        cluster.disable_clean()
                if not debug_keep_instances_alive:
                    log.info("Stopping workers...")
        if not debug_keep_instances_alive:
            log.info("Workers stopped")
Code example #11
def analyse(api_name, server_name, job, project, storage_name, project_file,
            provider_name, machine, nbr_machines, client_login, client_ip,
            api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results, and save the results

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (e.g. apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param storage_name:            The name of the storage where the project will be located
    :type storage_name:             str
    :param project_file:            The raw project file to analyse
    :type project_file:             str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client IP address of the HTTP request that started this job
    :type client_ip:                str
    :param api_version:             The version of the HTTP API used to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = job["project_uid"] + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")

    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Uploading file on cloud storage
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(
        user_id,
        job["project_uid"],
        project_file,
        "project_" + job["project_uid"] + ".zip",
        key=models.projects.PROJECT_FILE_RAW,
        overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, job["project_uid"], "anal",
                                      workers) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                conn.send_file(
                    project_file,
                    util.path_join(api_util.WORKER_INPUT_PATH,
                                   "project_file.zip"))
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    time.sleep(5)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage,
                                              tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(project["user_id"], job_id,
                                               "Cloud computation cost")

        # Uploading file on cloud storage
        analyzed_file.save_in_database(
            user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")