Example no. 1
 def work(self, *args, **kwargs):
     try:
         # FIXME: Disabled because it locks the database: with core.api_util.DatabaseContext.using_conn():
         for machine in models.provider_config.list_machines(
                 self._provider.name):
             if self.should_stop():
                 return
             cost = models.provider_config.get_machine_provider_cost(
                 self._provider.name, machine['machine_code'])
             cost = core.api_util.price_to_float(
                 int(cost['cost_per_sec']) * 3600)
             prices = async_util.run_proc(SpotIndexUpdater.load_prices,
                                          self._provider.name,
                                          machine['machine_code'])
             if not prices:
                 log.error("No price history found for machine " +
                           str(machine['machine_code']))
                 continue
             price_variance = compute_variance(prices)
             price_max = max(prices)
             index = max(0,
                         ((price_max - abs(price_variance)) / price_max) -
                         (max(0, cost - price_max) / cost)**10)
             models.provider_config.set_spot_index(self._provider.name,
                                                   machine['machine_code'],
                                                   index)
     except error_util.all_errors as e:
         error_util.log_error(log, e)
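
Note on the snippet above: it relies on a compute_variance helper that is not part of the excerpt. A minimal sketch of what it could look like, assuming it simply returns the population variance of the spot price samples (an assumption for illustration, not the project's actual implementation):

def compute_variance(values):
    # Assumed helper: plain population variance of the collected price samples
    values = [float(v) for v in values]
    mean = sum(values) / len(values)
    return sum((v - mean) ** 2 for v in values) / len(values)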
Example no. 2
    def work(self, *args, **kwargs):
        try:
            if self.should_stop():
                return

            conf = core.api_util.get_conf()
            currency_api_url = conf.get("currency", "currency_api_url")
            if type_util.is_array(currency_api_url):
                log.warning("This should not be a list: " +
                            repr(currency_api_url))
                currency_api_url = "".join(currency_api_url)
            currency_api_token = conf.get("currency", "currency_api_token")
            aws_pricing_api = conf.get("general", "provider_pricing_api")

            # FIXME: Disabled because it locks the database: with core.api_util.DatabaseContext.using_conn():
            models.currencies.update_currency_exchange_rates(
                currency_api_url, currency_api_token)
            if self.should_stop():
                return
            for provider in core.api_util.get_all_providers():
                if self.should_stop():
                    return
                models.provider_config.update_provider_costs(
                    aws_pricing_api, provider.name)
            models.provider_config.update_machine_prices()
        except StandardError as e:
            error_util.log_error(log, e)
Example no. 3
    def work(self, *args, **kwargs):
        try:
            core.api_util.wait_for_redis()

            with core.api_util.RedisContext.using_pubsub_conn() as redis_conn:
                channel = core.api_util.RedisContext.get_channel("launcher")
                for message in redis_util.listen_pubsub(
                        redis_conn, channel, datetime.timedelta(seconds=1)):
                    if self.should_stop():
                        return
                    if message is None:
                        continue
                    if not re.match(r"^.*_\d+$", message['data']):
                        log.warning(message['data'] + ' format is invalid')
                        continue
                    task, jobid_str = message['data'].rsplit('_', 1)
                    self._msg_queue.put({
                        'type': REDIS_MESSAGE,
                        "data": {
                            'task': task,
                            'jobid': int(jobid_str)
                        }
                    })
        except StandardError as e:
            error_util.log_error(log, e)
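
For context, the listener above only accepts payloads of the form "<task>_<jobid>". A hedged sketch of a matching publisher written with plain redis-py (the real project publishes through its own redis_util/RedisContext wrappers, so the connection details below are assumptions):

import redis

def publish_job_task(channel, task, job_id, host="localhost", port=6379):
    # Illustration only: build the payload the way the listener splits it with rsplit('_', 1)
    conn = redis.StrictRedis(host=host, port=port)
    conn.publish(channel, "%s_%d" % (task, int(job_id)))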
Example no. 4
 def clean(self):
     if not self._shared_volume:
         return
     cmd = ['docker', 'volume', 'rm', self._shared_volume]
     try:
         subprocess.check_output(cmd, cwd=API_PATH, stderr=subprocess.PIPE)
     except error_util.all_errors as e:
         error_util.log_error(log, e)
Example no. 5
    def work(self, *args, **kwargs):
        try:
            if self._running_jobs.get_update_date(
            ) <= datetime.datetime.utcfromtimestamp(0):
                return

            running_jobs = []
            if self._last_date < self._running_jobs.get_update_date():
                self._last_date = self._running_jobs.get_update_date()
                running_jobs = self._running_jobs.get_list()
                for job_id in running_jobs:
                    if self.should_stop():
                        return
                    self._date_by_job_id[int(job_id)] = self._last_date

            if GC_DEBUG_MODE:
                if datetime.datetime.utcnow(
                ) - self._last_date > datetime.timedelta(hours=1):
                    log.warning("GC: Should not happen:\n" + "\tlast_date: " +
                                str(self._last_date) + "\n" + "\tnow: " +
                                str(datetime.datetime.utcnow()) + "\n" +
                                "\trunning jobs update date: " +
                                str(self._running_jobs.get_update_date()))
                    return

            unfinished_jobs = models.jobs.list_unfinished_jobs()
            unfinished_job_ids = set(
                [int(job['id']) for job in unfinished_jobs])
            for job_id in unfinished_job_ids:
                if self.should_stop():
                    return
                if job_id not in self._date_by_job_id.keys():
                    self._date_by_job_id[job_id] = datetime.datetime.utcnow()

            for job_id in unfinished_job_ids:
                if self.should_stop():
                    return
                date = self._date_by_job_id[job_id]
                if datetime.datetime.utcnow() - date > datetime.timedelta(
                        hours=2):
                    if GC_DEBUG_MODE:
                        if job_id in self._deleted_jobs:
                            continue
                        self._deleted_jobs.add(job_id)
                        gc_debug("GC: Cleaning job " + str(job_id) +
                                 " because no worker is working on it.\n" +
                                 "Details: \n  date:" + str(date) +
                                 "\nlast_date:" + str(self._last_date) +
                                 "\nrunning_jobs: " + repr(running_jobs) +
                                 "\nnow: " + str(datetime.datetime.utcnow()))
                    else:
                        log.warning("GC: Cleaning job " + str(job_id) +
                                    " because no worker is working on it")
                        models.jobs.cancel_job(job_id)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
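
GC_DEBUG_MODE and gc_debug are used throughout the garbage-collector snippets but never defined in these excerpts. A plausible minimal version, assuming gc_debug simply forwards dry-run messages to the module logger (an assumption for illustration):

def gc_debug(msg):
    # Assumed helper: in debug (dry-run) mode the collector only reports what it would have done
    log.debug(msg)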
Example no. 6
    def work(self, *args, **kwargs):
        # Check that every file present on the storage is still supposed to exist
        try:
            a_week_ago = datetime.datetime.utcnow() - datetime.timedelta(
                days=7)
            files = async_util.run_proc(StorageCollector._list_files,
                                        self._storage.name)
            for filename in files:
                if self.should_stop():
                    return
                if models.projects.file_exists(filename):
                    continue
                try:
                    creation_date = async_util.run_proc(
                        StorageCollector._get_file_creation_date,
                        self._storage.name, filename)
                except core.storages.FileMissingError:
                    continue  # The file has been already deleted
                if creation_date > a_week_ago:
                    continue
                if GC_DEBUG_MODE:
                    if filename in self._dirty_files:
                        continue
                    self._dirty_files.add(filename)
                    gc_debug("Removing old file " + filename + " on storage " +
                             self._storage.name)
                else:
                    log.warning("GC: Removing old file " + filename +
                                " on storage " + self._storage.name)
                    async_util.run_proc(StorageCollector._delete_file,
                                        self._storage.name, filename)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)

        # Check if there is a missing file
        try:
            files = models.projects.list_files_on_storage(self._storage.name)
            for file_info in files:
                filename = file_info['filename']
                if async_util.run_proc(StorageCollector._file_exists,
                                       self._storage.name, filename):
                    continue
                if GC_DEBUG_MODE:
                    if filename in self._missing_files:
                        continue
                    self._missing_files.add(filename)
                    gc_debug("Missing file " + filename + " on storage " +
                             self._storage.name)
                else:
                    log.error("GC: Missing file " + filename + " on storage " +
                              self._storage.name)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
Example no. 7
 def work(self, *args, **kwargs):
     try:
         if self.should_stop():
             return
         now = datetime.datetime.utcnow()
         last_charge_limit = now - datetime.timedelta(minutes=1)
         # FIXME: Disabled because it locks the database: with core.api_util.DatabaseContext.using_conn():
         models.projects.charge_all(last_charge_limit)
         models.meshes.charge_all(last_charge_limit)
         models.calc.charge_all(last_charge_limit)
     except StandardError as e:
         error_util.log_error(log, e)
Example no. 8
 def work(self, *args, **kargs):
     try:
         time.sleep(5)  # Don't start immediately
         evt_handler = SrcCodeEventHandler(self._msg_queue)
         observer = watchdog.observers.Observer()
         observer.schedule(evt_handler,
                           os.path.join(API_PATH, "app"),
                           recursive=True)
         observer.start()
         self.wait_for_stop()
         observer.stop()
         observer.join()
     except StandardError as e:
         error_util.log_error(log, e)
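
SrcCodeEventHandler is not shown in the excerpt above. A hedged sketch of what it presumably looks like, given that run_server (Example no. 15) restarts when a SRC_CHANGED_MESSAGE arrives on the same queue (the class body below is an assumption):

import watchdog.events

class SrcCodeEventHandler(watchdog.events.FileSystemEventHandler):
    def __init__(self, msg_queue):
        super(SrcCodeEventHandler, self).__init__()
        self._msg_queue = msg_queue

    def on_any_event(self, event):
        # Any change under the watched source tree triggers a reload request
        if event.is_directory:
            return
        self._msg_queue.put({'type': SRC_CHANGED_MESSAGE, "data": None})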
Example no. 9
    def work(self, *args, **kwargs):
        try:
            yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
            to_delete = []
            for filename in os.listdir(self._uploaded_file_dir):
                if self.should_stop():
                    return
                if filename in ("/", "", ".", ".."):
                    continue
                file_path = os.path.abspath(
                    os.path.join(self._uploaded_file_dir, filename))
                creation_date = datetime.datetime.utcfromtimestamp(
                    os.path.getmtime(file_path))
                if creation_date < yesterday:
                    to_delete.append(file_path)

            for file_path in to_delete:
                if self.should_stop():
                    return
                if GC_DEBUG_MODE:
                    try:
                        if file_path in self._dirty_files:
                            continue
                        self._dirty_files.add(file_path)
                        gc_debug("GC: Removing old temp file: " +
                                 str(file_path))
                    except error_util.abort_errors:
                        raise
                    except error_util.all_errors as e:
                        log.error("Unable to remove old uploaded dir " +
                                  repr(file_path))
                        error_util.log_error(log, e)
                else:
                    if os.path.isdir(file_path):
                        try:
                            log.warning("GC: Removing old temp folder: " +
                                        str(file_path))
                            shutil.rmtree(file_path)
                        except error_util.abort_errors:
                            raise
                        except error_util.all_errors as e:
                            log.error("Unable to remove old uploaded dir " +
                                      repr(file_path))
                            error_util.log_error(log, e)
                    else:
                        try:
                            log.warning("GC: Removing old temp file: " +
                                        str(file_path))
                            os.remove(file_path)
                        except error_util.abort_errors:
                            raise
                        except error_util.all_errors as e:
                            log.error("Unable to remove old uploaded file " +
                                      repr(file_path))
                            error_util.log_error(log, e)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
Example no. 10
    def work(self, *args, **kwargs):
        if self._running_jobs.get_update_date(
        ) <= datetime.datetime.utcfromtimestamp(0):
            return
        if self._running_workers.get_update_date(
        ) <= datetime.datetime.utcfromtimestamp(0):
            return
        try:
            artefacts = self._fetch_artefacts()
            if self.should_stop():
                return
            job_ids = [int(job_id) for job_id in self._running_jobs.get_list()]
            worker_ids = [
                w.worker_id for w in self._running_workers.get_list()
            ]
            for artefact in artefacts:
                if self.should_stop():
                    return
                try:
                    if artefact.job_id is not None:
                        if artefact.job_id in job_ids:
                            continue
                    elif artefact.worker_id is not None:
                        if artefact.worker_id in worker_ids:
                            continue
                    else:
                        log.warning(
                            "Bad artefact: no job or worker specified: " +
                            str(artefact))
                        continue

                    if GC_DEBUG_MODE:
                        if artefact in self._dirty_artefacts:
                            continue
                        self._dirty_artefacts.add(artefact)
                        gc_debug("GC: Cleaning " + str(artefact) +
                                 " because no worker uses it anymore")
                    else:
                        log.warning("GC: Cleaning " + str(artefact) +
                                    " because no worker uses it anymore")
                        self._provider.delete_artefact(artefact)
                except StandardError as e:
                    error_util.log_error(log, e)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
Example no. 11
def try_append_file_to_project(user_id,
                               project_codename,
                               file_path,
                               filename=None,
                               key=None):
    generated = append_file_to_project(user_id, project_codename, file_path,
                                       filename, key)
    try:
        yield generated
    except error_util.all_errors:
        with error_util.before_raising():
            try:
                remove_file_from_project(user_id, project_codename,
                                         generated['id'])
            except error_util.all_errors as e:
                log.error("Unable to remove file " + repr(generated))
                error_util.log_error(log, e)
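
Since try_append_file_to_project yields exactly once, it is presumably decorated with contextlib.contextmanager in the original module. A hedged usage sketch under that assumption (the arguments below are invented for illustration):

# Assumption: the original definition is wrapped, e.g.
#   try_append_file_to_project = contextlib.contextmanager(try_append_file_to_project)
with try_append_file_to_project(42, "p-0001-abcd", "/tmp/upload.stl") as generated:
    log.info("Appended file " + str(generated['id']))
    # If anything in this block raises, the freshly appended file is removed from the project again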
Example no. 12
def run_pending_jobs(api_name, server_name, log_level, log_output):
    """
    Run all pending jobs. This is useful at script startup or when the redis server has been down.

    :param api_name:            The name of current API
    :type api_name:             str
    :param server_name:         The name of current server
    :type server_name:          str
    :param log_level:           The level of log we want
    :type log_level:            int
    :param log_output:          Where we should output the logs. Should be "stdout", "stderr", "syslog" or a file
    :type log_output:           str
    """
    task_lib = models.jobs.list_tasks()
    for task in task_lib:
        try:
            run_task(api_name, server_name, task['task'], task['job_id'],
                     log_level, log_output)
        except StandardError as e:
            error_util.log_error(log, e)
Example no. 13
def init_process(fork, api_name, job_id, toolchain):
    if fork:
        try:
            in_child, parent_pid, child_pid = proc_util.double_fork()
        except exceptions.OSError as e:
            log.error("Unable to run command, fork failed, cause:")
            error_util.log_error(log, e)
            return False

        if not in_child:
            return False  # Nothing to do in parent, we continue the main loop

    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)

    try:
        setproctitle.setproctitle(api_name + " " + toolchain + " job " +
                                  str(job_id))
    except StandardError as e:
        log.warning(str(e))
    return True
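
init_process registers a raise_keyboard_interrupt signal handler that is not part of this excerpt. A minimal sketch of what such a handler usually looks like (an assumption, not necessarily the project's exact code):

def raise_keyboard_interrupt(signum, frame):
    # Convert SIGTERM/SIGINT into the normal KeyboardInterrupt flow so cleanup code still runs
    raise KeyboardInterrupt()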
Example no. 14
def handle_error(e):
    code = 500
    msg = str(e) if app.debug else "Internal Error"
    if isinstance(e, HTTPException):
        code = int(e.code)
        msg = e.name
        warning_msg = e.name
        if e.description and app.debug:
            msg = e.name + ": " + e.description
            if len(e.description) < 80:
                warning_msg += ": " + e.description
        if code != 404:
            log.warning(warning_msg)
            error_util.log_error(log, e)
    else:
        error_util.log_error(log, e)
    if code == 404 and request.path.strip("/").split("/")[0] in ("v1",
                                                                 "admin"):
        response = jsonify({"success": 0, "error_msgs": [msg], "data": None})
    else:
        response = "<html><body><h1>" + escape(msg) + "</h1></body></html>"
    return response, code
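
handle_error is written as a catch-all Flask error handler. A hedged sketch of how it might be registered (the wiring below is an assumption, not taken from the original application):

from werkzeug.exceptions import HTTPException

# Route both HTTP errors and uncaught exceptions through the same handler
app.register_error_handler(HTTPException, handle_error)
app.register_error_handler(Exception, handle_error)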
Example no. 15
def run_server(api_name,
               server_name,
               redis_host="localhost",
               redis_port=6379,
               data_db=0,
               pubsub_db=1,
               auto_reload=False,
               log_level=logging.INFO,
               log_output="syslog",
               pid_file=None):
    """
    Main loop function.
    It listens for pending jobs and runs them.
    It burns the recurring prices for files saved on storages.
    It also runs a garbage collector to ensure killed jobs and faulty workers are cleaned up.

    :param api_name:            The name of current API
    :type api_name:             str
    :param server_name:         The fqdn of current server
    :type server_name:          str
    :param data_db:             The redis database for data. Optional, default 0
    :type data_db:              int
    :param pubsub_db:           The redis database for pubsub events. Optional, default 1
    :type pubsub_db:            int
    :param redis_host:          The redis server to connect to. Optional, default "localhost"
    :type redis_host:           str
    :param redis_port:          The redis server port. Optional, default 6379
    :type redis_port:           int
    :param auto_reload:         Do we want the server to restart if the source code changed? Optional, default False
    :type auto_reload:          bool
    :param log_level:           The level of log we want
    :type log_level:            int
    :param log_output:          Where we should output the logs. Should be "stdout", "stderr", "syslog" or a file
    :type log_output:           str
    :param pid_file:            The pid file to create if any, None otherwise
    :type pid_file:             str|None
    """
    # Write the pid file
    if pid_file:
        pid_file = os.path.join("/var", "run", api_name, api_name + ".pid")
        with open(pid_file, "w") as fh:
            fh.write(str(os.getpid()) + "\n")

    log.info("Starting " + api_name + " server")

    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host,
                                          redis_port, data_db, pubsub_db)

    core.api_util.wait_for_postgres()
    queue = async_util.create_thread_queue()
    running_threads = []

    redis_thread = RedisReceiver(queue, api_name, server_name)
    running_threads.append(redis_thread)
    redis_thread.start()

    def reload_signal_handler(*args):
        queue.put({'type': SRC_CHANGED_MESSAGE, "data": None})

    signal.signal(signal.SIGUSR1, reload_signal_handler)

    burner_thread = PriceBurner()
    running_threads.append(burner_thread)
    burner_thread.start()

    price_updater_thread = PriceUpdater()
    running_threads.append(price_updater_thread)
    price_updater_thread.start()

    for provider in core.api_util.get_all_providers():
        if provider.type == "aws_spot":
            spot_thread = SpotIndexUpdater(provider)
            running_threads.append(spot_thread)
            spot_thread.start()

    if auto_reload:
        change_thread = SourceChangeNotifier(queue)
        running_threads.append(change_thread)
        change_thread.start()
        debug_util.register_for_debug()
    else:

        def ignore_signal(*args):
            log.info("Server not in debug mode, ignoring signal...")

        signal.signal(signal.SIGUSR2, ignore_signal)

    if GC_ENABLED:
        cmd = [
            "python",
            os.path.join(API_PATH, "app", "garbage_collector.py"),
            "--log-level",
            logging.getLevelName(log_level), "--log-output", log_output
        ]
        if auto_reload:
            cmd.append("--debug")
        gc = subprocess.Popen(cmd)
    else:
        gc = None

    # Run all pending jobs we may have missed during a redis or server.py shutdown
    run_pending_jobs(api_name, server_name, log_level, log_output)

    should_restart = False
    abort_exception = None

    while True:
        try:
            # Wait for events
            try:
                event = queue.get(block=True, timeout=60)
            except async_util.QueueEmpty:
                # No events during 1min, perhaps redis is dead so we check pending tasks
                run_pending_jobs(api_name, server_name, log_level, log_output)
                continue
            # Running
            msg_type = event['type']
            msg_data = event['data']
            if msg_type == REDIS_MESSAGE:
                run_task(api_name, server_name, msg_data['task'],
                         msg_data['jobid'], log_level, log_output)
            elif msg_type == SRC_CHANGED_MESSAGE:
                should_restart = True
                break
            else:
                log.error("Unknown message received: " + str(msg_type))
        except error_util.abort_errors as e:
            abort_exception = e
            break
        except StandardError as e:
            error_util.log_error(log, e)

    try:
        log.info("Exit confirmed, cleaning...")
        for thread in running_threads:
            thread.stop()
        if gc:
            proc_util.ensure_stop_proc(gc)
        for thread in running_threads:
            thread.join()
    finally:
        if pid_file:
            try:
                os.remove(pid_file)
            except StandardError:
                pass
        log.info("Everything is cleaned")

    if abort_exception:
        raise abort_exception
    elif should_restart:
        restart_server()
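
restart_server is called at the end of the loop but not defined in this excerpt. A plausible minimal implementation, assuming it simply re-executes the current process (an assumption for illustration):

import os
import sys

def restart_server():
    # Replace the running process with a fresh copy of itself, keeping the same arguments
    log.info("Restarting server...")
    os.execv(sys.executable, [sys.executable] + sys.argv)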
Example no. 16
    def work(self, *args, **kwargs):
        try:
            # Clean projects
            dirty_projects = models.projects.list_failed_and_dirty()
            if self.should_stop():
                return
            for project in dirty_projects:
                if self.should_stop():
                    return
                if GC_DEBUG_MODE:
                    if project['uid'] in self._dirty_projects:
                        continue
                    self._dirty_projects.add(project['uid'])
                    gc_debug("GC: Setting project as not analyzed " +
                             str(project['uid']) +
                             " because no analysis succeeded")
                else:
                    log.warning("GC: Setting project as not analyzed " +
                                str(project['uid']) +
                                " because no analysis succeeded")
                    models.projects.set_project_status(
                        project['user_id'], project['uid'],
                        models.projects.PROJECT_STATUS_RAW)

            # Clean meshes
            if self.should_stop():
                return
            dirty_meshes = models.meshes.list_failed_and_dirty()
            if self.should_stop():
                return
            for mesh in dirty_meshes:
                if self.should_stop():
                    return
                if GC_DEBUG_MODE:
                    if mesh['id'] in self._dirty_meshes:
                        continue
                    self._dirty_meshes.add(mesh['id'])
                    gc_debug("GC: Setting mesh as failed " + str(mesh['id']) +
                             " because mesh job failed")
                else:
                    log.warning("GC: Setting mesh as failed " +
                                str(mesh['id']) + " because mesh job failed")
                    models.meshes.set_mesh_status(mesh['user_id'],
                                                  mesh['project_uid_id'],
                                                  mesh['name'],
                                                  models.meshes.STATUS_KILLED)

            # Clean calculations
            if self.should_stop():
                return
            dirty_calculations = models.calc.list_failed_and_dirty()
            if self.should_stop():
                return
            for calc in dirty_calculations:
                if self.should_stop():
                    return
                if GC_DEBUG_MODE:
                    if calc['id'] in self._dirty_calcs:
                        continue
                    self._dirty_calcs.add(calc['id'])
                    gc_debug("GC: Setting calc as failed " + str(calc['id']) +
                             " because calc job failed")
                else:
                    log.warning("GC: Setting calc as failed " +
                                str(calc['id']) + " because calc job failed")
                    models.calc.set_calc_status(calc['user_id'],
                                                calc['project_uid_id'],
                                                calc['name'],
                                                models.calc.STATUS_KILLED)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
Example no. 17
    def work(self, *args, **kwargs):
        try:
            now = datetime.datetime.utcnow()

            # fetching data about running workers
            thread_list = []
            workers = async_util.run_proc(WorkerCollector._list_workers,
                                          self._provider.name)
            self._running_workers.set_list(workers, now)
            for worker in workers:
                if self.should_stop():
                    return
                if worker.worker_id not in self._observers.keys():
                    self._observers[worker.worker_id] = WorkerObserver(
                        worker, self._provider, self._api_name,
                        self._server_name, WORKER_PROCESS_LAUNCHER)
                thread = threading.Thread(
                    target=WorkerCollector._update_worker,
                    args=(self._observers[worker.worker_id], worker))
                thread.start()
                thread_list.append(thread)

            for thread in thread_list:
                if self.should_stop():
                    return
                thread.join()

            # Listing active jobs, and grouping cluster
            job_list = []
            cluster_list = ClusterList()
            for worker in workers:
                if self.should_stop():
                    return
                observer = self._observers[worker.worker_id]
                if observer.jobid and observer.status not in (
                        Worker.Status.SHUTTING_DOWN, Worker.Status.TERMINATED):
                    job_list.append(observer.jobid)
                    if observer.is_cluster_master():
                        cluster_list.append_master(observer)
                    elif observer.is_cluster_slave():
                        cluster_list.append_slave(observer)
            self._running_jobs.set_list(job_list, now)

            for worker in workers:
                if self.should_stop():
                    return
                observer = self._observers[worker.worker_id]
                sentence = judge_worker(observer, self._rules, cluster_list,
                                        now)
                if sentence is None:
                    continue
                if sentence.penalty == Penalty.DEATH:
                    msg = WorkerCollector.EMAIL_MSG % (
                        observer, sentence.description, "KILLING INSTANCE !!!",
                        observer.description)
                    if GC_DEBUG_MODE:
                        if worker.worker_id in self._dirty_kill_workers:
                            continue
                        self._dirty_kill_workers.add(worker.worker_id)
                        gc_debug(msg)
                    else:
                        log.warning("killing instance %s: %s" %
                                    (observer, sentence.description))
                        core.api_util.send_admin_email("Watchdog KILL", msg)
                        async_util.run_proc(WorkerCollector._kill_worker,
                                            self._provider.name, worker)
                    observer.mark_as_killed()
                elif sentence.penalty == Penalty.PROBATION:
                    msg = WorkerCollector.EMAIL_MSG % (
                        observer, sentence.description, "",
                        observer.description)
                    if GC_DEBUG_MODE:
                        if worker.worker_id in self._dirty_warning_workers:
                            continue
                        self._dirty_warning_workers.add(worker.worker_id)
                        gc_debug(msg)
                    else:
                        log.warning("Strange behaviour for instance %s: %s" %
                                    (observer, sentence.description))
                        core.api_util.send_admin_email("Watchdog warning", msg)
        except error_util.abort_errors:
            raise
        except error_util.all_errors as e:
            error_util.log_error(log, e)
Example no. 18
def run_toolchain(api_name, server_name, job_id, toolchain):
    """
    Execute a task, which could be to cancel a running toolchain or to launch a specific toolchain.
    It will launch a separate subprocess and return before the task is completed.

    :param api_name:            The name of current API
    :type api_name:             str
    :param server_name:         The fqdn of current server
    :type server_name:          str
    :param job_id:              The id of the job the task is related to
    :type job_id:               int
    :param toolchain:           The task to launch
    :type toolchain:            str
    """
    # FIXME: Disabled because it locks the database: with core.api_util.DatabaseContext.using_conn():
    try:
        task = models.jobs.get_task_info(job_id)
        if not task:
            log.info("Job " + str(job_id) + " is already running, skipping")
            return
        models.jobs.dequeue_task(job_id)
        job = models.jobs.get_job(job_id)
        if not job:
            log.error("Unknown job " + str(job_id))
            return
        time.sleep(0.2)
        if int(job['status']) != models.jobs.JOB_STATUS_PENDING:
            log.info("Job " + str(job_id) + " already launched, skipping")
            return

        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_LAUNCHING)
        with core.api_util.RedisContext.using_data_conn() as r:
            r.set(
                api_name + ":" + server_name + ":job-" + str(job_id) + "-pid",
                int(os.getpid()))
        if toolchain != task['task']:
            log.error("Bad task toolchain")
            return
        if toolchain == models.jobs.TASK_UPLOAD_AND_ANALYSE:
            commands.upload_and_analyze.run(api_name, server_name, job_id,
                                            **task["params"])
        elif toolchain == models.jobs.TASK_UPLOAD_AND_LINK:
            commands.upload_and_link.run(api_name, server_name, job_id,
                                         **task["params"])
        elif toolchain == models.jobs.TASK_MESH:
            commands.mesh.run(api_name, server_name, job_id, **task["params"])
        elif toolchain == models.jobs.TASK_CALC:
            commands.calc.run(api_name, server_name, job_id, **task["params"])
        elif toolchain == models.jobs.TASK_RESTART_CALC:
            commands.restart_calc.run(api_name, server_name, job_id,
                                      **task["params"])
        else:
            log.error("Task not implemented: " + str(toolchain))
            return
        log.info("Command successfully finished")
    except core.api_util.abort_errors:
        log.warning("Operation canceled")
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_CANCELED)
    except core.api_util.ToolchainError as e:
        log.error(str(e))
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
    except error_util.all_errors as e:
        error_util.log_error(log, e)
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
    else:
        models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_FINISHED)
    finally:
        with core.api_util.RedisContext.using_data_conn() as redis_conn:
            redis_conn.delete(api_name + ":" + server_name + ":job-" +
                              str(job_id) + "-pid")
    sys.exit(0)
Example no. 19
def run_task(api_name, server_name, task_order, job_id, log_level, log_output):
    """
    Execute a task, which could be to cancel a running toolchain or to launch a specific toolchain.
    It will launch a separate subprocess and return before the task is completed.

    :param api_name:            The name of current API
    :type api_name:             str
    :param server_name:         The fqdn of current server
    :type server_name:          str
    :param task_order:          The task to do (launch or cancel)
    :type task_order:           int
    :param job_id:              The id of the job the task is related to
    :type job_id:               int
    :param log_level:           The level of log we want
    :type log_level:            int
    :param log_output:          Where we should output the logs. Should be "stdout", "stderr", "syslog" or a file
    :type log_output:           str
    """

    if task_order == models.jobs.TASK_CANCEL:
        models.jobs.dequeue_task(job_id)
        proc = multiprocessing.Process(target=cancel_job,
                                       args=(
                                           api_name,
                                           server_name,
                                           job_id,
                                       ))
        proc.daemon = True
        proc.start()
    elif task_order in [
            models.jobs.TASK_UPLOAD_AND_ANALYSE,
            models.jobs.TASK_UPLOAD_AND_LINK, models.jobs.TASK_MESH,
            models.jobs.TASK_CALC, models.jobs.TASK_RESTART_CALC
    ]:
        try:
            # The task will be dequeued by the process to get the task parameters
            subprocess.check_call([
                "python",
                os.path.join(API_PATH, "app",
                             "run_job.py"), "--fork", "--log-level",
                logging.getLevelName(log_level), "--log-output", log_output,
                '--redis-host',
                core.api_util.RedisContext.get_host(), '--redis-port',
                str(core.api_util.RedisContext.get_port()), '--redis-data-db',
                str(core.api_util.RedisContext.get_data_db()),
                '--redis-pubsub-db',
                str(core.api_util.RedisContext.get_pubsub_db()),
                str(job_id),
                str(task_order)
            ])
        except core.api_util.abort_errors:
            models.jobs.dequeue_task(job_id)
            log.warning("Operation canceled")
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_CANCELED)
        except core.api_util.ToolchainError as e:
            models.jobs.dequeue_task(job_id)
            log.error(str(e))
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
        except error_util.all_errors as e:
            models.jobs.dequeue_task(job_id)
            error_util.log_error(log, e)
            models.jobs.set_job_status(job_id, models.jobs.JOB_STATUS_KILLED)
    else:
        models.jobs.dequeue_task(job_id)
        log.error("Task not implemented: " + str(task_order))
        return
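
run_task dispatches TASK_CANCEL to a cancel_job function that is not shown. Given that run_toolchain (Example no. 18) stores its pid under "<api>:<server>:job-<id>-pid", a hedged sketch of what cancel_job could do (pure assumption, not the project's code):

import os
import signal

def cancel_job(api_name, server_name, job_id):
    # Assumed behaviour: look up the recorded pid and ask the toolchain process to stop
    with core.api_util.RedisContext.using_data_conn() as redis_conn:
        pid = redis_conn.get(api_name + ":" + server_name + ":job-" + str(job_id) + "-pid")
    if pid:
        os.kill(int(pid), signal.SIGTERM)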
Example no. 20
def fetch_progress(conn, user_id, project_codename, calc_name, calc_id,
                   storage, tmp_folder):
    """
    Check if a progress file has been created on the main worker and save it if it exists

    :param conn:                    The ssh connection to the main worker
    :type conn:                     ssh.SshConnection
    :param user_id:                 The id of the job owner
    :type user_id:                  int
    :param project_codename:        The project uuid
    :type project_codename:         str
    :param calc_name:               The name of the calculation
    :type calc_name:                str
    :param calc_id:                 The id of the calculation
    :type calc_id:                  int
    :param storage:                 The storage of the project
    :type storage:                  core.ssh.Storage
    :param tmp_folder:              A temporary folder to use
    :type tmp_folder:               str
    :return:                        True if success, False if no file is found or a failure happens
    :rtype:                         bool
    """
    status_file_name = project_codename + "_calc_" + calc_name + "_status.zip"
    status_file = cmd_util.ResultFile(project_codename, status_file_name)
    old_status_file = None
    try:
        calc_dir = util.path_join(api_util.WORKER_WORK_PATH, "ZephyTOOLS",
                                  "PROJECTS_CFD", project_codename, "CALC")
        if not conn.folder_exists(calc_dir):
            log.debug("calc folder " + calc_dir +
                      " doesn't exist yet, skipping...")
            return True
        _, out, _ = conn.run([
            "find", calc_dir, "-mindepth", "1", "-maxdepth", "1", "-type", "d"
        ])
        out = out.strip()
        if not out or "\n" in out:  # No results or more than one result
            log.warning("Unable to get the calculation output folder")
            return False
        calc_dir = out.rstrip("/")
        zipper_command = util.path_join(api_util.WORKER_WORK_PATH,
                                        "ZephyTOOLS", "APPLI", "TMP",
                                        "CFD_CALC_ZIP_STATUS.py")
        old_status_file = models.calc.get_calc_status_file(
            user_id, project_codename, calc_id)
        status_file_path = util.path_join(api_util.WORKER_OUTPUT_PATH,
                                          status_file_name)
        conn.run(
            ["python", zipper_command, "-i", calc_dir, "-o", status_file_path])

        worker_out_storage = storages.SshStorage(conn,
                                                 api_util.WORKER_OUTPUT_PATH,
                                                 IS_TOOLCHAIN_SECURED)
        if not status_file.exists(worker_out_storage):
            log.warning(
                "Unable to get calculation status file: file not found")
            return False
        status_file.save_on_storage(worker_out_storage, storage, tmp_folder)
        file_id = status_file.save_in_database(user_id)
        models.calc.save_status_file(user_id, project_codename, calc_id,
                                     file_id)
    except error_util.all_errors as e:
        with error_util.saved_stack() as error_stack:
            status_file.delete_from_distant(storage)
            if error_util.is_abort(e):
                error_stack.reraise()
            else:
                error_util.log_error(log, e)
                return False
    if old_status_file:
        models.projects.remove_file_from_project(user_id, project_codename,
                                                 old_status_file['id'])
    return True
Example no. 21
def run_garbage_collector(api_name,
                          server_name,
                          redis_host="localhost",
                          redis_port=6379,
                          data_db=0,
                          pubsub_db=1):
    signal.signal(signal.SIGTERM, raise_keyboard_interrupt)
    signal.signal(signal.SIGINT, raise_keyboard_interrupt)

    core.api_util.DatabaseContext.load_conf()
    core.api_util.RedisContext.set_params(api_name, server_name, redis_host,
                                          redis_port, data_db, pubsub_db)

    # Loading providers and storages
    conf = api_util.get_conf()
    conf.read(os.path.join(API_PATH, 'config.conf'))
    allowed_providers = json.loads(conf.get("general", "allowed_providers"))
    providers = []
    for provider_name in allowed_providers:
        providers.append(api_util.get_provider(provider_name))
    allowed_storages = json.loads(conf.get("general", "allowed_storages"))
    storages = []
    for storage_name in allowed_storages:
        storages.append(api_util.get_storage(storage_name))

    running_jobs = RunningJobs()

    thread_list = []
    for provider in providers:
        running_workers = RunningWorkers()
        worker_collector = WorkerCollector(provider, api_name, server_name,
                                           running_jobs, running_workers)
        worker_collector.start()
        thread_list.append(worker_collector)
        provider_artefact_collector = ProviderArtefactCollector(
            provider, running_jobs, running_workers)
        provider_artefact_collector.start()
        thread_list.append(provider_artefact_collector)

    job_collector = JobCollector(running_jobs)
    job_collector.start()
    thread_list.append(job_collector)

    model_collector = ModelCollector()
    model_collector.start()
    thread_list.append(model_collector)

    # FIXME: Disable for now
    # file_collector = FileCollector(api_name)
    # file_collector.start()
    # thread_list.append(file_collector)
    #
    # for storage in storages:
    #     storage_collector = StorageCollector(storage)
    #     storage_collector.start()
    #     thread_list.append(storage_collector)

    try:
        while True:
            time.sleep(0.1)
            for proc in thread_list:
                if not proc.is_alive():
                    proc.reraise()
    except error_util.all_errors as e:
        with error_util.before_raising():
            if error_util.is_abort(e):
                log.info("Signal received, exiting...")
            else:
                error_util.log_error(log, e)
            log.info("Garbage collection cleaning...")
            stop_and_join(thread_list)
            log.info("Garbage collection is cleaned")

    log.info("Garbage collection cleaning...")
    stop_and_join(thread_list)
    log.info("Garbage collection is cleaned")
Example no. 22
def using_workers(api_name, provider, job_id, machine, nbr_machines, tags, debug_keep_instances_alive=False):
    machine_cost = models.provider_config.get_machine_provider_cost(provider.name, machine)
    if not machine_cost:
        raise RuntimeError("Unable to get the cost for provider " + str(provider.name))
    instance_price = api_util.price_to_float(machine_cost["cost_per_sec"]) * 3600  # In $/h, for aws spots
    nbr_machines = int(nbr_machines)
    alive_thread = None
    if nbr_machines == 1:
        workers = []
        try:
            log.info("Launching worker on provider " + str(provider.name))
            workers = provider.create_workers(int(nbr_machines), machine=machine, spot_price=instance_price)
            log.info("worker created")
            main_worker = workers[0]
            if main_worker.specific_cost:
                models.jobs.set_job_specific_cost(job_id, provider.name, machine, main_worker.specific_cost,
                                                  machine_cost["currency"], machine_cost["sec_granularity"],
                                                  machine_cost["min_sec_granularity"])

            # Tag instance
            provider.tag_workers(workers, {'Name': api_name + "_worker/job_" + str(job_id), "type": "worker"})
            if not debug_keep_instances_alive:
                debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
            tags = copy.copy(tags)
            tags['debug'] = "true" if debug_keep_instances_alive else "false"
            provider.tag_workers(workers, tags)

            # Connect to the worker
            ip = main_worker.public_ip if main_worker.public_ip else main_worker.private_ip
            log.info("Waiting for worker ssh connection to " + str(ip) + " ...")
            conn = ssh.SshConnection(ip, "aziugo", provider.get_key_path())
            conn.wait_for_connection()
            log.info("Connection with worker established")
            alive_thread = KeepAliveWorkerThread(conn)
            alive_thread.start()

            yield RunningWorkers(provider, workers, conn, debug_keep_instances_alive)
        finally:
            if alive_thread:
                alive_thread.stop()
                alive_thread.join()

            if workers and provider:
                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                if debug_keep_instances_alive:
                    log.debug("Worker cleaning is disabled for debug purpose")
                else:
                    log.info("Stopping workers...")
                    cleanup_failed = False
                    try:
                        provider.terminate_workers(workers)
                    except error_util.abort_errors:
                        with error_util.before_raising():
                            try:
                                provider.terminate_workers(workers)
                                log.info("Workers stopped")
                            except error_util.abort_errors:
                                log.warning("Worker cleaned aborted.")
                                msg = "Workers of job "+str(job_id)+" are not killed. Please kill them manually"
                                log.error(msg)
                                api_util.send_admin_email("Worker cleaned aborted.", msg)
                    except error_util.all_errors as e:
                        cleanup_failed = True
                        msg = "Workers of job " + str(job_id) + " are not killed. Please kill them manually"
                        log.error(msg)
                        error_util.log_error(log, e)
                        api_util.send_admin_email("Worker cleaned aborted.", msg)
                    if not cleanup_failed:
                        log.info("Workers stopped")

    else:
        machine_info = models.provider_config.get_machine(provider.name, machine)
        if not machine_info:
            raise RuntimeError("Unable to get the description of machine " + str(machine))
        nbr_cores = int(machine_info['nbr_cores'])
        cluster_tags = copy.copy(tags)
        if not debug_keep_instances_alive:
            debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
        cluster_tags.update({
            "debug": "true" if debug_keep_instances_alive else "false",
            '%master%_Name': api_name + "_worker/job_" + str(job_id),
            '%master%_type': "cluster master",
            '%slave%_Name': api_name + "_worker/job_" + str(job_id) + " slave %slave_index%",
            '%slave%_type': "cluster slave",
        })
        log.info("Launching worker on provider " + str(provider.name))
        with core.cluster.Cluster(provider, "aziugo", nbr_cores, str(job_id), machine=machine,
                                  spot_price=instance_price, tags=cluster_tags,
                                  debug_no_terminate=debug_keep_instances_alive) as cluster:
            try:
                log.info("Main worker launched, with id " + str(cluster.master_id))

                log.info("Launching " + str(nbr_machines - 1) + " slave workers...")
                cluster.add_slaves(nbr_machines - 1)
                log.info("Slave workers launched")

                # Connect to the worker
                log.info("Waiting for worker ssh connection to "+str(cluster.ip)+" ...")
                conn = ssh.SshConnection(cluster.ip, "aziugo", provider.get_key_path())
                conn.wait_for_connection()
                log.info("Connection with worker established")
                alive_thread = KeepAliveClusterThread(cluster)
                alive_thread.start()
                yield RunningWorkers(provider, cluster.workers, conn, debug_keep_instances_alive)
            finally:
                if alive_thread:
                    alive_thread.stop()
                    alive_thread.join()

                if not debug_keep_instances_alive:
                    try:
                        debug_keep_instances_alive = models.jobs.is_shutdown_disabled(job_id)
                    except error_util.all_errors as e:
                        log.warning(str(e))
                    if debug_keep_instances_alive:
                        cluster.disable_clean()
                if not debug_keep_instances_alive:
                    log.info("Stopping workers...")
        if not debug_keep_instances_alive:
            log.info("Workers stopped")
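
Like Example no. 11, using_workers yields and is presumably wrapped with contextlib.contextmanager. A hedged usage sketch (the provider object, machine name and tags below are invented for illustration):

with using_workers("my_api", provider, job_id=1234, machine="c4.xlarge",
                   nbr_machines=1, tags={"project": "demo"}) as running:
    # Run the toolchain against the launched workers here; cleanup happens on exit
    pass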