def _file_exists(storage_name, filename):
    storage = api_util.get_storage(storage_name)
    return storage.file_exists(filename)

def run_garbage_collector(api_name, server_name, redis_host="localhost", redis_port=6379, data_db=0, pubsub_db=1):
    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)
    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host, redis_port, data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    allowed_providers = json.loads(conf.get("general", "allowed_providers"))
    providers = []
    for provider_name in allowed_providers:
        providers.append(api_util.get_provider(provider_name))
    allowed_storages = json.loads(conf.get("general", "allowed_storages"))
    storages = []
    for storage_name in allowed_storages:
        storages.append(api_util.get_storage(storage_name))

    running_jobs = RunningJobs()
    thread_list = []
    for provider in providers:
        running_workers = RunningWorkers()
        worker_collector = WorkerCollector(provider, api_name, server_name, running_jobs, running_workers)
        worker_collector.start()
        thread_list.append(worker_collector)
        provider_artefact_collector = ProviderArtefactCollector(provider, running_jobs, running_workers)
        provider_artefact_collector.start()
        thread_list.append(provider_artefact_collector)
    job_collector = JobCollector(running_jobs)
    job_collector.start()
    thread_list.append(job_collector)
    model_collector = ModelCollector()
    model_collector.start()
    thread_list.append(model_collector)
    # FIXME: Disabled for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        while True:
            time.sleep(0.1)
            for proc in thread_list:
                if not proc.is_alive():
                    proc.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if error_util.is_abort(e):
                log.info("Signal received, exiting...")
            else:
                error_util.log_error(log, e)
            log.info("Garbage collection cleaning...")
            stop_and_join(thread_list)
            log.info("Garbage collection is cleaned")
    log.info("Garbage collection cleaning...")
    stop_and_join(thread_list)
    log.info("Garbage collection is cleaned")

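# Example entry point (illustrative sketch only: the api_name value "zephycloud" and
# running this module as a script are assumptions, not part of this code; the Redis
# parameters simply repeat the function's defaults):
#
#     if __name__ == "__main__":
#         run_garbage_collector("zephycloud", "apidev.zephycloud.com",
#                               redis_host="localhost", redis_port=6379,
#                               data_db=0, pubsub_db=1)
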
def _delete_file(storage_name, filename):
    storage = api_util.get_storage(storage_name)
    storage.delete_file(filename)

def _get_file_creation_date(storage_name, filename):
    storage = api_util.get_storage(storage_name)
    return storage.get_file_creation_date(filename)

def _list_files(storage_name):
    storage = api_util.get_storage(storage_name)
    return storage.list_files()

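# Illustrative sketch only: one way the helpers above could be combined to purge stale
# files from a storage, similar in spirit to the StorageCollector currently disabled in
# run_garbage_collector. The function name, the age threshold and the assumption that
# _get_file_creation_date() returns a datetime are hypothetical, not part of the
# original module.
def _purge_old_files(storage_name, max_age_days=30):
    now = datetime.datetime.utcnow()
    for filename in _list_files(storage_name):
        created = _get_file_creation_date(storage_name, filename)
        if (now - created).days > max_age_days:
            _delete_file(storage_name, filename)
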
def calculate(api_name, server_name, job, project, mesh, calculation, calc_param_file, provider_name, machine,
              nbr_machines, split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results,
    and save the results

    :param api_name: The name of the API
    :type api_name: str
    :param server_name: The name of the server (ex: apidev.zephycloud.com)
    :type server_name: str
    :param job: The job information
    :type job: dict[str, any]
    :param project: The main project
    :type project: dict[str, any]
    :param mesh: The mesh used for this calculation
    :type mesh: dict[str, any]
    :param calculation: The calculation to launch
    :type calculation: dict[str, any]
    :param calc_param_file: The main job parameter file
    :type calc_param_file: str
    :param provider_name: The name of the provider
    :type provider_name: str
    :param machine: The type of machine to launch
    :type machine: str
    :param nbr_machines: The number of machines to run
    :type nbr_machines: int
    :param split_results: Whether the result files should be split
    :type split_results: bool
    :param client_login: The login of the job owner
    :type client_login: str
    :param client_ip: The client IP address of the HTTP request that started this job
    :type client_ip: str
    :param api_version: The version of the HTTP API used to launch this job
    :type api_version: str
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id, "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename, result_name + "_workfiles.zip")
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename, result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename, result_name + "_reduce.zip")

    # Uploading file on cloud storage
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename, calculation['name'], calc_param_file)

    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc", workers, [split_results]) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage,
                                           "project_file.zip", tmp_folder, key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage, "anal.zip",
                                           tmp_folder, key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id, project_codename, storage, worker_in_storage, "mesh.zip",
                                           tmp_folder, file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file, "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " + models.jobs.job_status_to_str(task_status))
                        break

                    if (datetime.datetime.utcnow() - last_fetched_progress_time).seconds > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename, calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    if not is_stopped:
                        calculation = models.calc.get_calc(user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " + str(calc_id) + " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True

                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id, "Worker instance disappeared")
                    raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(iterations_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) + " on worker")
                        raise api_util.ToolchainError("Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                else:
                    log.warning("No internal files found on server")
                fetch_progress(conn, user_id, project_codename, calculation['name'], calculation['id'],
                               storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

                # Charge again if required
                if datetime.datetime.utcnow() > end_time:
                    models.users.charge_user_computing(project["user_id"], job_id, "Cloud computation cost")

                # Uploading file on cloud storage
                result_file.save_in_database(user_id)
                internal_file_id = internal_file.save_in_database(user_id) if internal_file.saved else None
                if split_results:
                    iterations_file.save_in_database(user_id)
                    reduce_file.save_in_database(user_id)
                    models.calc.save_result_files(user_id, project_codename, calculation['name'],
                                                  result_file.file_id, iterations_file.file_id,
                                                  reduce_file.file_id, internal_file_id)
                else:
                    models.calc.save_result_files(user_id, project_codename, calculation['name'],
                                                  result_file.file_id, None, None, internal_file_id)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")

def analyse(api_name, server_name, job, project, storage_name, project_file, provider_name, machine, nbr_machines,
            client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results,
    and save the results

    :param api_name: The name of the API
    :type api_name: str
    :param server_name: The name of the server (ex: apidev.zephycloud.com)
    :type server_name: str
    :param job: The job information
    :type job: dict[str, any]
    :param project: The main project
    :type project: dict[str, any]
    :param storage_name: The name of the storage where the project will be located
    :type storage_name: str
    :param project_file: The raw project file to analyse
    :type project_file: str
    :param provider_name: The name of the provider
    :type provider_name: str
    :param machine: The type of machine to launch
    :type machine: str
    :param nbr_machines: The number of machines to run
    :type nbr_machines: int
    :param client_login: The login of the job owner
    :type client_login: str
    :param client_ip: The client IP address of the HTTP request that started this job
    :type client_ip: str
    :param api_version: The version of the HTTP API used to launch this job
    :type api_version: str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = job["project_uid"] + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    tags = {
        'operation': "anal",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    # Uploading file on cloud storage
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(user_id, job["project_uid"], project_file,
                                           "project_" + job["project_uid"] + ".zip",
                                           key=models.projects.PROJECT_FILE_RAW, overwrite=True)
    log.info("Project file uploaded")
    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)

    try:
        # Creating worker
        with cmd_util.using_workers(api_name, provider, job_id, machine, nbr_machines, tags,
                                    debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, job["project_uid"], "anal", workers) as task_proc:
                conn = workers.ssh_connection

                # Charge user
                end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                conn.send_file(project_file, util.path_join(api_util.WORKER_INPUT_PATH, "project_file.zip"))
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " + models.jobs.job_status_to_str(task_status))
                        break
                    time.sleep(5)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id, "Worker instance disappeared")
                    raise api_util.ToolchainError("Worker instance disappeared")

                # Fetching computed data
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH, "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) + " on worker")
                    raise api_util.ToolchainError("Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage, tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

                # Charge again if required
                if datetime.datetime.utcnow() > end_time:
                    models.users.charge_user_computing(project["user_id"], job_id, "Cloud computation cost")

                # Uploading file on cloud storage
                analyzed_file.save_in_database(user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")