def _file_exists(storage_name, filename):
    """
    Ask a named storage whether it holds a given file

    :param storage_name:            The name of the storage, resolved through api_util.get_storage
    :param filename:                The file name handed to the storage's file_exists method
    :return:                        The value returned by the storage's file_exists method
    """
    return api_util.get_storage(storage_name).file_exists(filename)
def run_garbage_collector(api_name,
                          server_name,
                          redis_host="localhost",
                          redis_port=6379,
                          data_db=0,
                          pubsub_db=1):
    """
    Start every garbage collector thread, then supervise them until one of them dies or a signal is received

    :param api_name:                The name of the api
    :param server_name:             The name of the server
    :param redis_host:              Redis host forwarded to RedisContext.set_params
    :param redis_port:              Redis port forwarded to RedisContext.set_params
    :param data_db:                 Redis data database forwarded to RedisContext.set_params
    :param pubsub_db:               Redis pubsub database forwarded to RedisContext.set_params
    """
    # SIGTERM and SIGINT share the same handler
    for signum in (signal.SIGTERM, signal.SIGINT):
        signal.signal(signum, raise_keyboard_interrupt)

    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host,
                                          redis_port, data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    providers = [
        api_util.get_provider(name)
        for name in json.loads(conf.get("general", "allowed_providers"))
    ]
    # Only the disabled storage collectors below consume this list for now
    storages = [
        api_util.get_storage(name)
        for name in json.loads(conf.get("general", "allowed_storages"))
    ]

    running_jobs = RunningJobs()
    thread_list = []

    def launch(collector):
        # Start a collector thread and register it for supervision
        collector.start()
        thread_list.append(collector)

    def clean_up():
        # Stop all registered collector threads and wait for them
        log.info("Garbage collection cleaning...")
        stop_and_join(thread_list)
        log.info("Garbage collection is cleaned")

    # Each provider gets its own RunningWorkers, shared by its two collectors
    for provider in providers:
        running_workers = RunningWorkers()
        launch(
            WorkerCollector(provider, api_name, server_name, running_jobs,
                            running_workers))
        launch(
            ProviderArtefactCollector(provider, running_jobs,
                                      running_workers))

    launch(JobCollector(running_jobs))
    launch(ModelCollector())

    # FIXME: Disable for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        # Poll the threads; a dead one gets the chance to re-raise its error here
        while True:
            time.sleep(0.1)
            for collector in thread_list:
                if collector.is_alive():
                    continue
                collector.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if not error_util.is_abort(e):
                error_util.log_error(log, e)
            else:
                log.info("Signal received, exiting...")
            clean_up()

    clean_up()
 def _delete_file(storage_name, filename):
     """
     Ask a named storage to delete a given file

     :param storage_name:            The name of the storage, resolved through api_util.get_storage
     :param filename:                The file name handed to the storage's delete_file method
     """
     api_util.get_storage(storage_name).delete_file(filename)
 def _get_file_creation_date(storage_name, filename):
     """
     Ask a named storage for the creation date of a given file

     :param storage_name:            The name of the storage, resolved through api_util.get_storage
     :param filename:                The file name handed to the storage's get_file_creation_date method
     :return:                        The value returned by the storage's get_file_creation_date method
     """
     return api_util.get_storage(storage_name).get_file_creation_date(filename)
 def _list_files(storage_name):
     """
     Ask a named storage for its file listing

     :param storage_name:            The name of the storage, resolved through api_util.get_storage
     :return:                        The value returned by the storage's list_files method
     """
     return api_util.get_storage(storage_name).list_files()
Example #6
0
def calculate(api_name, server_name, job, project, mesh, calculation,
              calc_param_file, provider_name, machine, nbr_machines,
              split_results, client_login, client_ip, api_version):
    """
    Launch the machine(s), send the files, start the worker script, wait for progress and results, then save the results

    The user is charged a fixed storage price up front, then charged for computing each time the
    paid period returned by models.users.charge_user_computing has elapsed.
    While the task runs, progress is fetched at most every STATUS_FETCHING_DELAY seconds and the
    calculation record is polled so a stop request can be forwarded to the worker.

    Errors listed in error_util.all_errors are handled inside error_util.before_raising():
    when REMOVE_RESULTS_ON_ERROR is set, result files are deleted from the project storage.
    NOTE(review): the error is presumably re-raised by before_raising afterwards - confirm in error_util.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param mesh:                    The mesh used for this calculation
    :type mesh:                     dict[str, any]
    :param calculation:             The calculation to launch
    :type calculation:              dict[str, any]
    :param calc_param_file:         The main job parameter file (removed from local disk once sent to the worker)
    :type calc_param_file:          str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param split_results:           Should the result be split into separate iterations and reduce files?
    :type split_results:            bool
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request that started this job
    :type client_ip:                str
    :param api_version:             The version of the http api through which the user asked to launch this job
    :type api_version:              str
    :raises api_util.NoMoreCredits:     When the user credit is zero or less right after a computing charge
    :raises api_util.ToolchainError:    When the calculation disappears, the worker no longer answers
                                        to ping, or an expected result file is missing on the worker
    """
    job_id = int(job['id'])
    nbr_machines = int(nbr_machines)
    models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_RUNNING)
    project_codename = project['uid']
    user_id = project["user_id"]
    # Kept aside because `calculation` is re-fetched (and may become falsy) in the polling loop
    calc_id = calculation['id']
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(project['storage'])
    tags = {
        'operation': "calc",
        'job_id': str(job_id),
        'server': server_name,
        'api': api_name,
        'api_version': api_version,
        'client': client_login,
        'client_ip': client_ip,
        'debug': DO_NOT_KILL_INSTANCES,
        'trusted': IS_TOOLCHAIN_SECURED
    }

    models.users.charge_user_fix_price(user_id, job_id,
                                       "Calculation storage cost")
    result_name = project_codename + "-calc-" + str(job_id)
    result_file = cmd_util.ResultFile(project_codename, result_name + ".zip")
    internal_file = cmd_util.ResultFile(project_codename,
                                        result_name + "_workfiles.zip")
    # The two extra result files only exist when split results are requested
    if split_results:
        iterations_file = cmd_util.ResultFile(project_codename,
                                              result_name + "_iterations.zip")
        reduce_file = cmd_util.ResultFile(project_codename,
                                          result_name + "_reduce.zip")

    # Saving the parameter file of the calculation
    log.info("Uploading param file to storage")
    models.calc.save_calc_param_file(user_id, project_codename,
                                     calculation['name'], calc_param_file)
    try:
        # Creating worker
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            # Launch main script
            with cmd_util.TaskProcess(job_id, job["project_uid"], "calc",
                                      workers, [split_results]) as task_proc:
                conn = workers.ssh_connection
                # Charge user
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                worker_in_storage = storages.SshStorage(
                    conn, api_util.WORKER_INPUT_PATH, IS_TOOLCHAIN_SECURED)

                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "project_file.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_RAW)
                cmd_util.copy_project_file(
                    user_id,
                    project_codename,
                    storage,
                    worker_in_storage,
                    "anal.zip",
                    tmp_folder,
                    key=models.projects.PROJECT_FILE_ANALYSED)
                cmd_util.copy_project_file(user_id,
                                           project_codename,
                                           storage,
                                           worker_in_storage,
                                           "mesh.zip",
                                           tmp_folder,
                                           file_id=mesh['result_file_id'])
                worker_in_storage.upload_file(calc_param_file,
                                              "calc_params.zip")
                os.remove(calc_param_file)
                log.info("Project files sent to the worker")

                # Tell the script to start
                log.info("Starting the computation")
                task_proc.start()
                # Epoch start value so the very first loop iteration fetches progress
                last_fetched_progress_time = datetime.datetime.utcfromtimestamp(
                    0)
                is_stopped = False
                while True:
                    task_status = task_proc.check_status()

                    # Charge if we need
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if task_status != models.jobs.JOB_STATUS_RUNNING:
                        log.info("Computation finished with status: " +
                                 models.jobs.job_status_to_str(task_status))
                        break

                    # total_seconds() is required here: timedelta.seconds is only the
                    # seconds-within-the-day component, so with the epoch start value it
                    # equals the seconds elapsed since UTC midnight and would delay the
                    # first fetch for jobs started right after midnight.
                    since_last_fetch = (datetime.datetime.utcnow() -
                                        last_fetched_progress_time)
                    if since_last_fetch.total_seconds() > STATUS_FETCHING_DELAY:
                        fetch_progress(conn, user_id, project_codename,
                                       calculation['name'], calculation['id'],
                                       storage, tmp_folder)
                        last_fetched_progress_time = datetime.datetime.utcnow()

                    # Forward a stop request to the worker once; keep looping until the task ends
                    if not is_stopped:
                        calculation = models.calc.get_calc(
                            user_id, project['uid'], calculation['id'])
                        if not calculation:
                            raise api_util.ToolchainError("Calculation " +
                                                          str(calc_id) +
                                                          " disappeared")
                        if calculation['status'] == models.calc.STATUS_STOPPED:
                            log.info("Stopping computation")
                            stop_calc(conn, project_codename)
                            is_stopped = True
                    time.sleep(1)

                # Checking if the machine is still here
                if not conn.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Fetching computed data
                log.info("Saving results")
                worker_out_storage = storages.SshStorage(
                    conn, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                log_file = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          "worker.log")
                if conn.file_exists(log_file):
                    with file_util.temp_filename(dir=tmp_folder) as tmp:
                        conn.get_file(log_file, tmp)
                        models.jobs.save_job_log(job_id, tmp)
                else:
                    log.warning("No worker log file")

                # The main result file is mandatory
                if not result_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(result_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                result_file.save_on_storage(worker_out_storage, storage,
                                            tmp_folder)

                # With split results, both extra files are mandatory too
                if split_results:
                    if not iterations_file.exists(worker_out_storage):
                        log.error("Unable to find file " +
                                  str(iterations_file) + " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    iterations_file.save_on_storage(worker_out_storage,
                                                    storage, tmp_folder)

                    if not reduce_file.exists(worker_out_storage):
                        log.error("Unable to find file " + str(reduce_file) +
                                  " on worker")
                        raise api_util.ToolchainError(
                            "Task failed, no result file")
                    reduce_file.save_on_storage(worker_out_storage, storage,
                                                tmp_folder)

                # The work files are optional
                if internal_file.exists(worker_out_storage):
                    internal_file.save_on_storage(worker_out_storage, storage,
                                                  tmp_folder)
                else:
                    log.warning("No internal files found on server")

                # Last progress fetch, now that the task is over
                fetch_progress(conn, user_id, project_codename,
                               calculation['name'], calculation['id'], storage,
                               tmp_folder)
                log.info("Computation result fetched")

                # Signaling all output was fetched
                task_proc.stop_and_wait()

        # Charge again if required
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(user_id, job_id,
                                               "Cloud computation cost")

        # Registering the result files in the database
        result_file.save_in_database(user_id)
        internal_file_id = internal_file.save_in_database(
            user_id) if internal_file.saved else None
        if split_results:
            iterations_file.save_in_database(user_id)
            reduce_file.save_in_database(user_id)
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id,
                                          iterations_file.file_id,
                                          reduce_file.file_id,
                                          internal_file_id)
        else:
            models.calc.save_result_files(user_id, project_codename,
                                          calculation['name'],
                                          result_file.file_id, None, None,
                                          internal_file_id)
    except error_util.all_errors:
        with error_util.before_raising():
            # Do not leave partial results on the project storage
            if REMOVE_RESULTS_ON_ERROR:
                internal_file.delete_from_distant(storage)
                result_file.delete_from_distant(storage)
                if split_results:
                    iterations_file.delete_from_distant(storage)
                    reduce_file.delete_from_distant(storage)
    log.info("Results saved")
def analyse(api_name, server_name, job, project, storage_name, project_file,
            provider_name, machine, nbr_machines, client_login, client_ip,
            api_version):
    """
    Run the analysis of a raw project file on cloud worker(s) and save the analysed file

    The raw project file is first attached to the project, then sent to the worker. The task
    status is polled every 5 seconds, charging the user for computing each time the paid period
    has elapsed. Once the task is over, the worker log and the analysed file are fetched.

    :param api_name:                The name of the api
    :type api_name:                 str
    :param server_name:             The name of the server (ex: apidev.zephycloud.com)
    :type server_name:              str
    :param job:                     The job information
    :type job:                      dict[str, any]
    :param project:                 The main project
    :type project:                  dict[str, any]
    :param storage_name:            The name of the storage where the project will be located
    :type storage_name:             str
    :param project_file:            The raw project file to analyse (removed from local disk once sent to the worker)
    :type project_file:             str
    :param provider_name:           The name of the provider
    :type provider_name:            str
    :param machine:                 The type of machine to launch
    :type machine:                  str
    :param nbr_machines:            The number of machines to run
    :type nbr_machines:             int
    :param client_login:            The login of the job owner
    :type client_login:             str
    :param client_ip:               The client ip address of the http request behind this job
    :type client_ip:                str
    :param api_version:             The version of the http api used to launch this job
    :type api_version:              str
    """
    job_id = int(job['id'])
    project_codename = job["project_uid"]
    analyzed_filename = project_codename + "-anal-" + str(job_id) + ".zip"
    user_id = project["user_id"]
    provider = api_util.get_provider(provider_name)
    storage = api_util.get_storage(storage_name)
    tmp_folder = api_util.get_conf().get("general", "tmp_folder")

    # Tags handed to cmd_util.using_workers along with the worker request
    tags = dict(
        operation="anal",
        job_id=str(job_id),
        server=server_name,
        api=api_name,
        api_version=api_version,
        client=client_login,
        client_ip=client_ip,
        trusted=IS_TOOLCHAIN_SECURED)

    # Attach the raw project file to the project
    log.info("Uploading project file to storage")
    models.projects.append_file_to_project(
        user_id,
        project_codename,
        project_file,
        "project_" + project_codename + ".zip",
        key=models.projects.PROJECT_FILE_RAW,
        overwrite=True)
    log.info("Project file uploaded")

    models.users.charge_user_fix_price(user_id, job_id, "Project storage cost")
    analyzed_file = cmd_util.ResultFile(project_codename, analyzed_filename)
    try:
        # Worker creation
        with cmd_util.using_workers(
                api_name,
                provider,
                job_id,
                machine,
                nbr_machines,
                tags,
                debug_keep_instances_alive=DO_NOT_KILL_INSTANCES) as workers:
            with cmd_util.TaskProcess(job_id, project_codename, "anal",
                                      workers) as task_proc:
                ssh = workers.ssh_connection

                # First computing charge, it gives the end of the paid period
                end_time = models.users.charge_user_computing(
                    user_id, job_id, "Cloud computation cost")
                if models.users.get_credit(user_id) <= 0:
                    raise api_util.NoMoreCredits()

                log.info("Sending project files on worker")
                remote_project_file = util.path_join(
                    api_util.WORKER_INPUT_PATH, "project_file.zip")
                ssh.send_file(project_file, remote_project_file)
                os.remove(project_file)
                log.info("Project files sent to the worker")

                # Start the task and poll it until it leaves the running state
                log.info("Starting the computation")
                task_proc.start()
                while True:
                    status = task_proc.check_status()

                    # Paid period is over: charge again and check the credit
                    if datetime.datetime.utcnow() > end_time:
                        end_time = models.users.charge_user_computing(
                            user_id, job_id, "Cloud computation cost")
                        if models.users.get_credit(user_id) <= 0:
                            models.jobs.save_job_text(job_id, "No more credit")
                            raise api_util.NoMoreCredits()

                    if status != models.jobs.JOB_STATUS_RUNNING:
                        status_label = models.jobs.job_status_to_str(status)
                        log.info("Computation finished with status: " +
                                 status_label)
                        break
                    time.sleep(5)

                # The worker must still answer before fetching anything
                if not ssh.ping():
                    models.jobs.save_job_text(job_id,
                                              "Worker instance disappeared")
                    raise api_util.ToolchainError(
                        "Worker instance disappeared")

                # Results retrieval
                log.info("Fetching results")
                worker_out_storage = storages.SshStorage(
                    ssh, api_util.WORKER_OUTPUT_PATH, IS_TOOLCHAIN_SECURED)
                remote_log = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                            "worker.log")
                # The worker log is optional
                if not ssh.file_exists(remote_log):
                    log.warning("No worker log file")
                else:
                    with file_util.temp_filename(dir=tmp_folder) as local_log:
                        ssh.get_file(remote_log, local_log)
                        models.jobs.save_job_log(job_id, local_log)

                # The analysed file is mandatory
                if not analyzed_file.exists(worker_out_storage):
                    log.error("Unable to find file " + str(analyzed_file) +
                              " on worker")
                    raise api_util.ToolchainError(
                        "Task failed, no result file")
                analyzed_file.save_on_storage(worker_out_storage, storage,
                                              tmp_folder)
                log.info("Computation result fetched")

                # Tell the task that every output was fetched
                task_proc.stop_and_wait()

        # Final computing charge if the paid period elapsed in the meantime
        if datetime.datetime.utcnow() > end_time:
            models.users.charge_user_computing(user_id, job_id,
                                               "Cloud computation cost")

        # Register the analysed file in the database
        analyzed_file.save_in_database(
            user_id, key=models.projects.PROJECT_FILE_ANALYSED)
    except error_util.all_errors:
        with error_util.before_raising():
            # Optionally drop the analysed file from the project storage
            if REMOVE_RESULTS_ON_ERROR:
                analyzed_file.delete_from_distant(storage)
    log.info("Result saved")