Example #1
0
def stop_calc(conn, project_codename):
    """
    Tell the calculation to stop, running a specific script on the worker

    The calculation output folder is looked up as the single direct
    sub-folder of the project "CALC" folder inside the worker work directory.

    :param conn:                    The ssh connection to the main worker
    :type conn:                     ssh.SshConnection
    :param project_codename:        The uuid of the project
    :type project_codename:         str
    :return:                        True if the stop script was run, or if the calc folder doesn't exist yet
                                    (nothing to stop). False if the calculation output folder can't be identified
    :rtype:                         bool
    """
    calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS",
                              "PROJECTS_CFD", project_codename, "CALC")
    if not conn.folder_exists(calc_dir):
        log.debug("calc folder " + calc_dir +
                  " doesn't exists yet, skipping...")
        return True
    # List only the direct sub-folders of the CALC folder
    _, out, _ = conn.run(
        ["find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"])
    out = out.strip()
    if not out or "\n" in out:  # No results or more than one result
        log.warning("Unable to get the calculation output folder")
        return False
    calc_dir = out.rstrip("/")
    stopper_command = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS",
                                     "APPLI", "TMP", "CFD_CALC_STOP.py")
    conn.run(["python", stopper_command, calc_dir])
    return True
Example #2
0
    def check_status(self):
        """
        Poll the worker and return the current status of the task

        The method first propagates a "shutdown disabled" request to the
        worker (unless the worker is already in debug mode), then aborts if the
        job was canceled. Otherwise it reads the "task_end.txt" file of the
        worker output folder: when the file is readable its content decides
        the final status. When it is not, the runner process liveness is
        checked and the job progress is refreshed from "progress.txt".

        :return:        The current status of the task (a models.jobs.JOB_STATUS_* value)
        :raise api_util.ToolchainCanceled:  If the job has been canceled
        :raise RuntimeError:                If the worker reports an unknown task result
        """
        worker = self._running_worker
        if not worker.is_debug and models.jobs.is_shutdown_disabled(self._job_id):
            worker.disable_shutdown()

        if self.is_canceled():
            self._status = models.jobs.JOB_STATUS_CANCELED
            raise api_util.ToolchainCanceled()

        # The worker writes its final result in this file
        end_marker = util.path_join(api_util.WORKER_OUTPUT_PATH, "task_end.txt")
        end_code, end_out, _ = self.conn.run(["cat", end_marker], can_fail=True)
        if end_code == 0:
            result = end_out.strip()
            final_statuses = {
                "success": models.jobs.JOB_STATUS_FINISHED,
                "cancel": models.jobs.JOB_STATUS_CANCELED,
                "error": models.jobs.JOB_STATUS_KILLED,
            }
            if result not in final_statuses:
                raise RuntimeError("Unknown worker task result: " + repr(result))
            self._status = final_statuses[result]
            return self._status

        # No final result yet: a dead runner process means the task was killed
        if not self._proc.is_running():
            self._status = models.jobs.JOB_STATUS_KILLED

        # Refresh the job progress, clamped between 0.0 and 1.0
        progress_file = api_util.WORKER_WORK_PATH + "/progress.txt"
        progress_code, progress_out, _ = self.conn.run(["cat", progress_file], can_fail=True)
        # NOTE(review): type_util.ll_float presumably tells if the text looks like a float -- confirm
        if progress_code == 0 and type_util.ll_float(progress_out.strip()):
            ratio = float(progress_out)
            models.jobs.set_job_progress(self._job_id, max(0.0, min(1.0, ratio)))
        return self._status
Example #3
0
 def stop_and_wait(self):
     """
     Signal the worker that all outputs were fetched, then wait for the runner process

     Writes '1' in the "output_fetched" file of the worker input folder and
     waits for the process and its streams. A timeout is ignored, and the
     process reference is dropped on normal completion or timeout.
     """
     fetched_flag = util.path_join(api_util.WORKER_INPUT_PATH, "output_fetched")
     self.conn.run("echo '1' > '" + fetched_flag + "'", shell=True)
     try:
         # NOTE(review): the second argument is presumably a timeout in seconds -- confirm
         proc_util.wait_for_proc_and_streams(self._proc, 2)
     except util.TimeoutError:
         # Machine stopped
         pass
     self._proc = None
Example #4
0
    def __enter__(self):
        """
        Start the worker runner script and send it its json input files

        Steps, in order:
          - upload "log_info.json" (job and instance description) in the worker input folder
          - launch the runner script asynchronously
          - upload "task_params.json" (task parameters) in the worker input folder

        If anything fails once the runner is launched, the runner process is
        stopped (abort errors) or killed (any other error) before the error
        handling of error_util.before_raising() takes over.
        NOTE(review): before_raising() presumably re-raises the current error -- confirm

        :return:        This instance, with the runner process stored in self._proc
        """
        runner_proc = None
        try:
            # Describe the job for the worker side logging
            remote_log_info = util.path_join(api_util.WORKER_INPUT_PATH, "log_info.json")
            log_info_json = json.dumps({
                "jobid": self._job_id,
                "job_type": self._command,
                "api_name": self._api_name,
                "server_name": self._server_name,
                "instance": self._running_worker.worker_ids[0],
                "provider": self._running_worker.provider_name
            })
            with file_util.temp_file(log_info_json) as local_path:
                self.conn.send_file(local_path, api_util.WORKER_INPUT_PATH + "/log_info.json")
            self.conn.run(["chmod", "a+r", remote_log_info])
            runner_proc = self.conn.run_async(["python", api_util.WORKER_RUNNER_PATH])

            # Send the task parameter to the worker
            remote_task_params = os.path.join(api_util.WORKER_INPUT_PATH, "task_params.json")
            task_params_json = json.dumps({
                "jobid": self._job_id,
                "project_uid": self._project_uid,
                "toolchain": self._command,
                "params": json.dumps(self._params),
                "shutdown": "0" if self._running_worker.is_debug else "1"
            })
            with file_util.temp_file(task_params_json) as local_path:
                self.conn.send_file(local_path, remote_task_params)
            self.conn.run(["chmod", "a+r", remote_task_params])
            self._proc = runner_proc
        except error_util.abort_errors:
            with error_util.before_raising():
                log.info("Signal received, stopping process")
                if runner_proc:
                    proc_util.ensure_stop_proc(runner_proc, 2)
        except error_util.all_errors:
            with error_util.before_raising():
                try:
                    if runner_proc:
                        proc_util.ensure_kill_proc(runner_proc)
                except error_util.abort_errors:
                    # An abort while cleaning up must not hide the original error
                    pass
                except error_util.all_errors as kill_error:
                    logging.getLogger("aziugo").exception(kill_error)
        return self
from lib import type_util
import provider
import storages

# Absolute path of the folder located two levels above the folder of this file
API_PATH = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", ".."))
# 10^8. NOTE(review): presumably the factor used to store prices as integers; the
# "92 billion" limit matches a signed 64 bits integer (2^63 / 10^8) -- confirm
PRICE_PRECISION = 100000000  # 10^8 (could be a problem after 92 billion dollars/zephycoins)

# Currency identifiers
CURRENCY_DOLLAR = "dollar"
CURRENCY_YUAN = "yuan"
CURRENCY_EURO = "euro"

# NOTE(review): presumably the share of the price given to OpenFOAM (5%) -- confirm against its usage
OPENFOAM_DONATION_RATIO = 0.05

# Worker path definitions
# Everything lives in the "worker_scripts" folder of the worker home:
# task inputs, task outputs, the runner script, the launcher script and the work directory
WORKER_HOME = "/home/aziugo"
WORKER_SCRIPTS_PATH = util.path_join(WORKER_HOME, "worker_scripts")
WORKER_INPUT_PATH = util.path_join(WORKER_SCRIPTS_PATH, "inputs")
WORKER_OUTPUT_PATH = util.path_join(WORKER_SCRIPTS_PATH, "outputs")
WORKER_RUNNER_PATH = util.path_join(WORKER_SCRIPTS_PATH, "aziugo_start.py")
WORKER_LAUNCHER_PATH = util.path_join(WORKER_SCRIPTS_PATH, "python_venv.sh")
WORKER_WORK_PATH = util.path_join(WORKER_SCRIPTS_PATH, "workdir")

# Logger shared by the whole application
log = logging.getLogger("aziugo")


class NoMoreCredits(RuntimeError):
    """
    Error raised by the toolchain commands when, after charging the job
    owner, the credit of the user is zero or below
    """


class ToolchainError(RuntimeError):
    """
    Error raised when a toolchain task can't complete, for example when the
    worker instance disappeared or when the expected result file is missing
    """
Example #6
0
def calculate(api_name, server_name, job, project, mesh, calculation,
              calc_param_file, provider_name, machine, nbr_machines,
              split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results and saving results

    While the task runs, the user is charged each time the paid period ends,
    the calculation status file is fetched periodically and a stop request
    (calculation status set to STATUS_STOPPED) is forwarded to the worker.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param mesh:                    The mesh used for this calculation
    :type mesh:                     dict[str, any]
    :param calculation:             The calculation to launch
    :type calculation:              dict[str, any]
    :param calc_param_file:         The main job parameter file. It is removed once sent to the worker
    :type calc_param_file:          str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param split_results:           Whether the result is split in several files (results, iterations, reduce)
    :type split_results:            bool
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request which started this job
    :type client_ip:                str
    :param api_version:             The version of the http api where the user ask to launch this job
    :type api_version:              str
    :raise api_util.NoMoreCredits:  If the user credit is zero or below after being charged
    :raise api_util.ToolchainError: If the calculation or the worker disappeared, or if a result file is missing
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id,
                                       "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename,
                                        result_name + "_workfiles.zip")
    # The two extra result files only exist when the results are split
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename,
                                              result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename,
                                          result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename,
                                     calculation['name'], calc_param_file)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc",
                                      workers, [split_results]) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(
                    conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)

                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "project_file.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "anal.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id,
                                           project_codename,
                                           storage,
                                           worker_in_storage,
                                           "mesh.zip",
                                           tmp_folder,
                                           file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file,
                                              "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                # Starting from the epoch forces a first fetch as soon as possible
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(
                    0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break
                    # total_seconds() is required here: timedelta.seconds drops
                    # the days component, so the delay measured from the epoch
                    # would depend on the time of the day
                    since_last_fetch = (datetime.datetime.utcnow() -
                                        last_fetched_progress_time)
                    if since_last_fetch.total_seconds() > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename,
                                       calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    if not is_stopped:
                        # Reload the calculation to detect a stop request
                        calculation = models.calc.get_calc(
                            user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " +
                                                          str(calc_id) +
                                                          " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            # NOTE(review): the stop is requested only once, even if
                            # stop_calc was unable to run the stop script -- confirm this is intended
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage,
                                            tmp_folder)

                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " +
                                  str(iterations_file) + " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage,
                                                    storage, tmp_folder)

                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) +
                                  " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage,
                                                tmp_folder)

                # The work files are optional
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage,
                                                  tmp_folder)
                else:
                    log.warning("No internal files found on server")

                # Last refresh of the calculation status file
                fetch_progress(conn, user_id, project_codename,
                               calculation['name'], calculation['id'], storage,
                               tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(project["user_id"], job_id,
                                               "Cloud computation cost")

        # Uploading file on cloud storage
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(
            user_id) if internal_file.saved else None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id,
                                          iterations_file.file_id,
                                          reduce_file.file_id,
                                          internal_file_id)
        else:
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id, None, None,
                                          internal_file_id)
    except error_util.all_errors:
        # Remove the files already saved on the storage before the error handling goes on
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
Example #7
0
def fetch_progress(conn, user_id, project_codename, calc_name, calc_id,
                   storage, tmp_folder):
    """
    Check if a progress file has been created on the main worker and save it if it exists

    The status archive is generated on the worker by a zipper script, copied
    on the project storage and registered as the status file of the
    calculation. The previous status file, if any, is then removed from the
    project.

    :param conn:                    The ssh connection to the main worker
    :type conn:                     ssh.SshConnection
    :param user_id:                 The id of the job owner
    :type user_id:                  int
    :param project_codename:        The project uuid
    :type project_codename:         str
    :param calc_name:               The name of the calculation
    :type calc_name:                str
    :param calc_id:                 The id of the calculation
    :type calc_id:                  int
    :param storage:                 The storage of the project
    :type storage:                  core.ssh.Storage
    :param tmp_folder:              A temporary folder to use
    :type tmp_folder:               str
    :return:                        True if success or if the calc folder doesn't exist yet,
                                    False if no file is found or a failure happens
    :rtype:                         bool
    """
    status_file_name = project_codename + "_calc_" + calc_name + "_status.zip"
    status_file = cmd_util.ResultFile(project_codename, status_file_name)
    old_status_file = None
    try:
        calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS",
                                  "PROJECTS_CFD", project_codename, "CALC")
        if not conn.folder_exists(calc_dir):
            log.debug("calc folder " + calc_dir +
                      " doesn't exists yet, skipping...")
            return True
        # List only the direct sub-folders of the CALC folder
        _, out, _ = conn.run([
            "find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"
        ])
        out = out.strip()
        if not out or "\n" in out:  # No results or more than one result
            log.warning("Unable to get the calculation output folder")
            return False
        calc_dir = out.rstrip("/")
        zipper_command = util.path_join(api_util.WORKER_WORK_PATH,
                                        "ZephyTOOLS", "APPLI", "TMP",
                                        "CFD_CALC_ZIP_STATUS.py")
        # Remember the current status file to remove it once replaced
        old_status_file = models.calc.get_calc_status_file(
            user_id, project_codename, calc_id)
        status_file_path = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          status_file_name)
        conn.run(
            ["python", zipper_command, "-i", calc_dir, "-o", status_file_path])

        worker_out_storage = storages.SshStorage(conn,
                                                 api_util.WORKER_OUTPUT_PATH,
                                                 IS_TOOLCHAIN_SECURED)
        if not status_file.exists(worker_out_storage):
            log.warning(
                "Unable to get calculation status file: file not found")
            return False
        status_file.save_on_storage(worker_out_storage, storage, tmp_folder)
        file_id = status_file.save_in_database(user_id)
        models.calc.save_status_file(user_id, project_codename, calc_id,
                                     file_id)
    except error_util.all_errors as e:
        with error_util.saved_stack() as error_stack:
            # Don't leave a partially saved status file on the storage
            status_file.delete_from_distant(storage)
            if error_util.is_abort(e):
                error_stack.reraise()
            else:
                # A failure to fetch the progress is logged but not propagated
                error_util.log_error(log, e)
                return False
    if old_status_file:
        models.projects.remove_file_from_project(user_id, project_codename,
                                                 old_status_file['id'])
    return True
Example #8
0
 def start(self):
     """
     Tell the worker to start the task and mark the task as running

     Removes the "task_end.txt" file of the worker output folder, then
     writes '1' in the "start_task" file of the worker input folder.
     """
     end_marker = util.path_join(api_util.WORKER_OUTPUT_PATH, "task_end.txt")
     self.conn.run(["rm", "-f", end_marker])
     start_flag = util.path_join(api_util.WORKER_INPUT_PATH, "start_task")
     self.conn.run("echo '1' > '" + start_flag + "'", shell=True)
     self._status = models.jobs.JOB_STATUS_RUNNING
Example #9
0
 def disable_shutdown(self):
     """
     Switch the workers to debug mode and ask the main worker not to shut down

     Tags the workers with 'debug' on the provider side, then writes '1'
     in the "disable_shutdown" file of the worker input folder.
     """
     self._debug = True
     debug_tag = {'debug': True}
     self._provider.tag_workers(self._workers, debug_tag)
     flag_path = util.path_join(api_util.WORKER_INPUT_PATH, "disable_shutdown")
     flag_command = "echo '1' > '" + flag_path + "'"
     self._conn.run(flag_command, shell=True)
def analyse(api_name, server_name, job, project, storage_name, project_file,
            provider_name, machine, nbr_machines, client_login, client_ip,
            api_version):
    """
    Analyse a raw project file on cloud worker(s) and save the analysed project

    The raw project file is saved in the project, then sent to a freshly
    launched worker. The worker task is polled until it leaves the running
    state, while the user is charged each time the paid period ends. The
    worker log and the analysed file are finally fetched and saved.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param storage_name:            The name of the storage where the project will be located
    :type storage_name:             str
    :param project_file:            The raw project file to analyse. It is removed once sent to the worker
    :type project_file:             str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request which started this job
    :type client_ip:                str
    :param api_version:             The version of the http api where the user ask to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = project_codename + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    worker_provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    computing_label = "Cloud computation cost"

    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Uploading file on cloud storage
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(
        user_id,
        project_codename,
        project_file,
        "project_" + project_codename + ".zip",
        key=models.projects.PROJECT_FILE_RAW,
        overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                worker_provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, project_codename, "anal",
                                      workers) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, computing_label)
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                remote_project_file = util.path_join(
                    api_util.WORKER_INPUT_PATH, "project_file.zip")
                conn.send_file(project_file, remote_project_file)
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    current_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, computing_label)
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if current_status != models.jobs.JOB_STATUS_RUNNING:
                        status_label = models.jobs.job_status_to_str(
                            current_status)
                        log.info("Computation finished with status: " +
                                 status_label)
                        break
                    time.sleep(5)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                remote_log = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                            "worker.log")
                if conn.file_exists(remote_log):
                    with file_util.temp_filename(dir=tmp_folder) as local_log:
                        conn.get_file(remote_log, local_log)
                        models.jobs.save_job_log(job_id, local_log)
                else:
                    log.warning("No worker log file")

                # The analysed project file is mandatory
                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage,
                                              tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(user_id, job_id,
                                               computing_label)

        # Uploading file on cloud storage
        analyzed_file.save_in_database(
            user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        # Remove the file already saved on the storage before the error handling goes on
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")