def run(api_name, server_name, job_id, project_file, anal_file, storage, client_login, client_ip, api_version):
    """
    Entry point of the job: link a raw project file and its analysed file to the project of the job

    The project is flagged as "analysing" while `link` is working and as "analysed" once it succeeded.
    If anything fails in between, the project status is set back to "raw".
    Whatever happens, the two local files are deleted before leaving this function.

    Note: `api_name`, `server_name`, `storage`, `client_login`, `client_ip` and `api_version`
    are accepted but not read by this function.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job_id:              The id of the job to run
    :type job_id:               int
    :param project_file:        Local path of the raw project file, removed at the end
    :type project_file:         str
    :param anal_file:           Local path of the analysed project file, removed at the end
    :type anal_file:            str
    :param storage:             The name of the storage where the project will be located
    :type storage:              str
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request which asked for this job
    :type client_ip:            str
    :param api_version:         The version of the http api where the user asked to launch this job
    :type api_version:          str
    :raise api_util.ToolchainError:     If the job or its project can't be found
    """
    try:
        # Give the logger a name specific to this command and job
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Fetch the job, then the project it refers to
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)

        project = models.projects.get_project(job['user_id'], job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " + str(job['project_uid']))

        def mark_project(status):
            # The keys are read at call time, exactly where each status update happens
            models.projects.set_project_status(project["user_id"], job["project_uid"], status)

        try:
            mark_project(models.projects.PROJECT_STATUS_ANALYSING)
            link(job, project, project_file, anal_file)
            mark_project(models.projects.PROJECT_STATUS_ANALYSED)
        except error_util.all_errors:
            # NOTE(review): before_raising() is expected to let the pending error propagate
            # once the rollback below is done - confirm in error_util
            with error_util.before_raising():
                mark_project(models.projects.PROJECT_STATUS_RAW)
    finally:
        # The local copies are never kept, success or not
        for local_path in (project_file, anal_file):
            if os.path.exists(local_path):
                os.remove(local_path)
def _save_required_result(result, worker_storage, storage, tmp_folder):
    """
    Copy a mandatory result file from the worker output storage to the project storage

    :param result:              The result file to copy
    :type result:               cmd_util.ResultFile
    :param worker_storage:      The storage exposing the worker output folder
    :param storage:             The storage of the project
    :param tmp_folder:          A local folder usable for temporary files
    :type tmp_folder:           str
    :raise api_util.ToolchainError:     If the file doesn't exist on the worker
    """
    if not result.exists(worker_storage):
        log.error("Unable to find file " + str(result) + " on worker")
        raise api_util.ToolchainError("Task failed, no result file")
    result.save_on_storage(worker_storage, storage, tmp_folder)


def calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file, provider_name, machine,
              nbr_machines, split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results
    and save the results

    The user is charged a fixed storage price up front, then charged for computing time each time
    the previously paid period is over. The computation is aborted as soon as the user has no credit left.
    While the task runs, progress is fetched at most every STATUS_FETCHING_DELAY seconds and the
    calculation is reloaded to detect a stop request.
    On error, result files already copied to the project storage are removed if REMOVE_RESULTS_ON_ERROR is set.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job:                 The job information
    :type job:                  dict[str, any]
    :param project:             The main project
    :type project:              dict[str, any]
    :param mesh:                The mesh used for this calculation
    :type mesh:                 dict[str, any]
    :param calculation:         The calculation to launch
    :type calculation:          dict[str, any]
    :param calc_param_file:     The main job parameter file. The local file is deleted once sent to the worker
    :type calc_param_file:      str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param split_results:       Should the result be split into separate iterations and reduce files?
    :type split_results:        bool
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request which asked for this job
    :type client_ip:            str
    :param api_version:         The version of the http api where the user asked to launch this job
    :type api_version:          str
    :raise api_util.NoMoreCredits:      If the user has no credit left
    :raise api_util.ToolchainError:     If the calculation or the worker disappears, or a result file is missing
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])

    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id, "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename, result_name + "_workfiles.zip")
    # Only defined for real when the results are split; always bound so later code can't hit a NameError
    iterations_file = None
    reduce_file = None
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename, result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename, result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename, calculation['name'], calc_param_file)

    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc", workers, [split_results]) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "project_file.zip", tmp_folder, key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "anal.zip", tmp_folder, key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "mesh.zip", tmp_folder, file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file, "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                # Starting from the epoch makes the first loop iteration fetch the progress right away
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " + models.jobs.job_status_to_str(task_status))
                        break

                    # total_seconds() and not .seconds: the latter only holds the seconds component
                    # of the delta and ignores the days, which is wrong when measured from the epoch
                    since_last_fetch = datetime.datetime.utcnow() - last_fetched_progress_time
                    if since_last_fetch.total_seconds() > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename, calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    # Reload the calculation to detect a stop request; the stop is only sent once
                    if not is_stopped:
                        calculation = models.calc.get_calc(user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " + str(calc_id) + " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True

                    time.sleep(1)
                    # Checking if the machine is still here
                    if not conn.ping():
                        models.jobs.save_job_text(job_id, "Worker instance disappeared")
                        raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                _save_required_result(result_file, worker_out_storage, storage, tmp_folder)
                if split_results:
                    _save_required_result(iterations_file, worker_out_storage, storage, tmp_folder)
                    _save_required_result(reduce_file, worker_out_storage, storage, tmp_folder)
                # The work files are optional: their absence is not an error
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                else:
                    log.warning("No internal files found on server")
                fetch_progress(conn, user_id, project_codename, calculation['name'], calculation['id'],
                               storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

            # Charge again if required
            if datetime.datetime.utcnow() > end_time:
                models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")

        # Registering the files in database
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(user_id) if internal_file.saved else None
        iterations_file_id = None
        reduce_file_id = None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            iterations_file_id = iterations_file.file_id
            reduce_file_id = reduce_file.file_id
        models.calc.save_result_files(user_id, project_codename, calculation['name'], result_file.file_id,
                                      iterations_file_id, reduce_file_id, internal_file_id)
    except error_util.all_errors:
        # NOTE(review): before_raising() is expected to let the pending error propagate
        # once the cleanup below is done - confirm in error_util
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
def run(api_name, server_name, job_id, project_codename, mesh_name, calc_id, calc_param_file, provider_name, machine,
        nbr_machines, split_results, client_login, client_ip, api_version):
    """
    Entry point of the calculation job: load the job, project, mesh and calculation, then run the calculation

    The calculation is attached to the job and flagged as running before `calculate` is called.
    Afterwards it is flagged as computed, unless it was stopped in the meantime.
    On an abort error it is flagged as canceled, on any other error as killed.
    Whatever happens, the local parameter file is deleted before leaving this function.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job_id:              The id of the job to run
    :type job_id:               int
    :param project_codename:    The uid of the project
    :type project_codename:     str
    :param mesh_name:           The name of the mesh
    :type mesh_name:            str
    :param calc_id:             The id of the calculation to launch
    :type calc_id:              int
    :param calc_param_file:     Local path of the param file, removed at the end
    :type calc_param_file:      str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param split_results:       Should the result be split into separate files?
    :type split_results:        bool
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request which asked for this job
    :type client_ip:            str
    :param api_version:         The version of the http api where the user asked to launch this job
    :type api_version:          str
    :raise api_util.ToolchainError:     If the job, project, mesh or calculation can't be found
    """
    try:
        # Configure better logging name
        cmd_util.config_cmd_log(COMMAND_NAME, job_id)

        # Loading required information from database
        job = models.jobs.get_job(job_id)
        if not job:
            raise api_util.ToolchainError("Unknown job " + str(job_id))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
        project = models.projects.get_project(job['user_id'], job['project_uid'])
        if not project:
            raise api_util.ToolchainError("Unknown project " + str(job['project_uid']))
        user_id = project["user_id"]
        mesh = models.meshes.get_mesh(user_id, project_codename, mesh_name)
        if not mesh:
            raise api_util.ToolchainError("Unknown mesh " + str(mesh_name) + " in project " + str(job['project_uid']))
        calculation = models.calc.get_calc(user_id, project['uid'], calc_id)
        if not calculation:
            raise api_util.ToolchainError("Unknown calculation " + str(calc_id))
        calc_name = calculation['name']

        try:
            models.calc.set_job(user_id, project_codename, calc_name, job_id)
            models.calc.set_calc_status(user_id, project_codename, calc_name, models.calc.STATUS_RUNNING)
            calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file, provider_name,
                      machine, nbr_machines, split_results, client_login, client_ip, api_version)

            # Reload the calculation: its status may have changed while it was running
            calculation = models.calc.get_calc(user_id, project['uid'], calc_id)
            if not calculation:
                # Fail with an explicit error instead of a TypeError on the lookup below
                raise api_util.ToolchainError("Calculation " + str(calc_id) + " disappeared")
            # A stopped calculation keeps its "stopped" status
            if calculation['status'] != models.calc.STATUS_STOPPED:
                models.calc.set_calc_status(user_id, project_codename, calc_name, models.calc.STATUS_COMPUTED)
        except api_util.abort_errors:
            # NOTE(review): before_raising() is expected to let the pending error propagate
            # once the status update is done - confirm in error_util
            with error_util.before_raising():
                models.calc.set_calc_status(user_id, project_codename, calc_name, models.calc.STATUS_CANCELED)
        except error_util.all_errors:
            with error_util.before_raising():
                models.calc.set_calc_status(user_id, project_codename, calc_name, models.calc.STATUS_KILLED)
    finally:
        # calculate() removes the file once sent to the worker; this covers every other path
        if os.path.exists(calc_param_file):
            os.remove(calc_param_file)
def analyse(api_name, server_name, job, project, storage_name, project_file, provider_name, machine, nbr_machines,
            client_login, client_ip, api_version):
    """
    Store the raw project file, analyse it on freshly launched worker(s) and save the analysed file

    The raw file is first attached to the project and a fixed storage price is charged.
    Computing time is then charged each time the previously paid period is over, and the analysis
    is aborted as soon as the user has no credit left.
    On error, the analysed file already copied to the storage is removed if REMOVE_RESULTS_ON_ERROR is set.

    :param api_name:            The name of the api
    :type api_name:             str
    :param server_name:         The name of the server (ex: apidev.zephycloud.com)
    :type server_name:          str
    :param job:                 The job information
    :type job:                  dict[str, any]
    :param project:             The main project
    :type project:              dict[str, any]
    :param storage_name:        The name of the storage where the project will be located
    :type storage_name:         str
    :param project_file:        The raw project file to analyse. The local file is deleted once sent to the worker
    :type project_file:         str
    :param provider_name:       The name of the provider
    :type provider_name:        str
    :param machine:             The type of machine to launch
    :type machine:              str
    :param nbr_machines:        The number of machines to run
    :type nbr_machines:         int
    :param client_login:        The login of the job owner
    :type client_login:         str
    :param client_ip:           The client ip address of the http request which asked for this job
    :type client_ip:            str
    :param api_version:         The version of the http api where the user asked to launch this job
    :type api_version:          str
    :raise api_util.NoMoreCredits:      If the user has no credit left
    :raise api_util.ToolchainError:     If the worker disappears or produced no analysed file
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    archive_name = project_codename + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")

    # Tags given to the launched machine(s)
    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Attach the raw file to the project before anything else
    log.info("Uploading project file to storage")
    raw_name = "project_" + project_codename + ".zip"
    models.projects.append_file_to_project(user_id, project_codename, project_file, raw_name,
                                           key=models.projects.PROJECT_FILE_RAW, overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analysis_archive = cmd_util.ResultFile(project_codename, archive_name)

    try:
        # Machines are only available inside this block
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as cluster:
            with cmd_util.TaskProcess(job_id, project_codename, "anal", cluster) as task:
                ssh = cluster.ssh_connection

                # First computing charge, refusing to go further without credit
                paid_until = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                remote_input = util.path_join(api_util.WORKER_INPUT_PATH, "project_file.zip")
                ssh.send_file(project_file, remote_input)
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Start the worker script, then poll it until it leaves the running state
                log.info("Starting the computation")
                task.start()
                while True:
                    status = task.check_status()

                    # Pay for a new period when the current one is over
                    if datetime.datetime.utcnow() > paid_until:
                        paid_until = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " + models.jobs.job_status_to_str(status))
                        break

                    time.sleep(5)
                    # A worker which doesn't answer anymore is considered lost
                    if not ssh.ping():
                        models.jobs.save_job_text(job_id, "Worker instance disappeared")
                        raise api_util.ToolchainError("Worker instance disappeared")

                # Bring back the worker log and the analysed file
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(ssh, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                remote_log = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if not ssh.file_exists(remote_log):
                    log.warning("No worker log file")
                else:
                    with file_util.temp_filename(dir=tmp_folder) as local_log:
                        ssh.get_file(remote_log, local_log)
                        models.jobs.save_job_log(job_id, local_log)

                if not analysis_archive.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analysis_archive) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                analysis_archive.save_on_storage(worker_out_storage, storage, tmp_folder)
                log.info("Computation result fetched")

                # Let the task know every output was fetched
                task.stop_and_wait()

            # Last charge if the paid period ended in the meantime
            if datetime.datetime.utcnow() > paid_until:
                models.users.charge_user_computing(project["user_id"], job_id, "Cloud computation cost")

        # Register the analysed file as the analysed file of the project
        analysis_archive.save_in_database(user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        # NOTE(review): before_raising() is expected to let the pending error propagate
        # once the cleanup below is done - confirm in error_util
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analysis_archive.delete_from_distant(storage)
    log.info("Result saved")