Example #1
    def build_runtime(self, runtime_name, requirements_file=None):
        shutil.rmtree(az_config.BUILD_DIR, ignore_errors=True)

        action_name = self._format_action_name(runtime_name)

        build_dir = os.path.join(az_config.BUILD_DIR, action_name)
        os.makedirs(build_dir, exist_ok=True)

        logger.info('Building default runtime in {}'.format(build_dir))

        action_dir = os.path.join(build_dir, az_config.ACTION_DIR)
        os.makedirs(action_dir, exist_ok=True)

        req_file = os.path.join(build_dir, 'requirements.txt')
        with open(req_file, 'w') as reqf:
            reqf.write(az_config.REQUIREMENTS_FILE)
            if not is_unix_system():
                if 'dev' in lithops.__version__:
                    reqf.write('git+https://github.com/lithops-cloud/lithops')
                else:
                    reqf.write('lithops=={}'.format(lithops.__version__))

        host_file = os.path.join(build_dir, 'host.json')
        with open(host_file, 'w') as hstf:
            hstf.write(az_config.HOST_FILE)

        fn_file = os.path.join(action_dir, 'function.json')
        if self.invocation_type == 'event':
            with open(fn_file, 'w') as fnf:
                in_q_name = self._format_queue_name(action_name, az_config.IN_QUEUE)
                az_config.BINDINGS_QUEUE['bindings'][0]['queueName'] = in_q_name
                out_q_name = self._format_queue_name(action_name, az_config.OUT_QUEUE)
                az_config.BINDINGS_QUEUE['bindings'][1]['queueName'] = out_q_name
                fnf.write(json.dumps(az_config.BINDINGS_QUEUE))

        elif self.invocation_type == 'http':
            with open(fn_file, 'w') as fnf:
                fnf.write(json.dumps(az_config.BINDINGS_HTTP))

        entry_point = os.path.join(os.path.dirname(__file__), 'entry_point.py')
        main_file = os.path.join(action_dir, '__init__.py')
        shutil.copy(entry_point, main_file)

        if is_unix_system():
            mod_dir = os.path.join(build_dir, az_config.ACTION_MODULES_DIR)
            os.chdir(build_dir)
            cmd = '{} -m pip install -U -t {} -r requirements.txt'.format(sys.executable, mod_dir)
            if logger.getEffectiveLevel() != logging.DEBUG:
                cmd = cmd + " >{} 2>&1".format(os.devnull)
            os.system(cmd)
            create_handler_zip(az_config.FH_ZIP_LOCATION, entry_point, '__init__.py')
            archive = zipfile.ZipFile(az_config.FH_ZIP_LOCATION)
            archive.extractall(path=mod_dir)
            os.remove(os.path.join(mod_dir, '__init__.py'))
            os.remove(az_config.FH_ZIP_LOCATION)
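
For orientation, the directory this method assembles follows the standard Azure Functions project layout. A sketch (the bracketed names stand for the az_config constants used above):

    <BUILD_DIR>/<action_name>/
        requirements.txt      base dependencies; lithops itself is appended on non-Unix hosts
        host.json
        <ACTION_DIR>/
            function.json     queue or HTTP bindings, depending on invocation_type
            __init__.py       copied from entry_point.py
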
Example #2
    def __init__(self, config, executor_id, job_id, log_level):

        logging.basicConfig(filename=log_file, level=log_level)

        self.log_active = logger.getEffectiveLevel() != logging.WARNING
        self.config = config
        self.queue = Queue()
        self.use_threads = not is_unix_system()
        self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
        self.workers = []

        sys.stdout = open(log_file, 'a')
        sys.stderr = open(log_file, 'a')

        if self.use_threads:
            for worker_id in range(self.num_workers):
                p = Thread(target=self._process_runner, args=(worker_id,))
                self.workers.append(p)
                p.start()
        else:
            for worker_id in range(self.num_workers):
                p = Process(target=self._process_runner, args=(worker_id,))
                self.workers.append(p)
                p.start()

        logger.info('ExecutorID {} | JobID {} - Localhost Executor started - {} workers'
                    .format(executor_id, job_id, self.num_workers))
Example #3
    def __init__(self, config, executor_id, internal_storage, compute_handler):
        super().__init__(config, executor_id, internal_storage,
                         compute_handler)

        self.remote_invoker = self.config['serverless'].get(
            'remote_invoker', False)
        self.use_threads = (self.is_lithops_worker or not is_unix_system()
                            or mp.get_start_method() != 'fork')
        self.invokers = []
        self.ongoing_activations = 0

        if self.use_threads:
            self.token_bucket_q = queue.Queue()
            self.pending_calls_q = queue.Queue()
            self.running_flag = SimpleNamespace(value=0)
            self.INVOKER = Thread
        else:
            self.token_bucket_q = mp.Queue()
            self.pending_calls_q = mp.Queue()
            self.running_flag = mp.Value('i', 0)
            self.INVOKER = mp.Process

        self.job_monitor = JobMonitor(self.config, self.internal_storage,
                                      self.token_bucket_q)

        logger.debug('ExecutorID {} - Serverless invoker created'.format(
            self.executor_id))
Example #4
    def __init__(self, config, executor_id, internal_storage, compute_handler):
        self.log_active = logger.getEffectiveLevel() != logging.WARNING
        self.config = config
        self.executor_id = executor_id
        self.storage_config = extract_storage_config(self.config)
        self.internal_storage = internal_storage
        self.compute_handler = compute_handler
        self.is_lithops_worker = is_lithops_worker()
        self.invokers = []

        self.remote_invoker = self.config['serverless'].get(
            'remote_invoker', False)
        self.workers = self.config['lithops'].get('workers')
        logger.debug('ExecutorID {} - Total available workers: {}'.format(
            self.executor_id, self.workers))

        if not is_lithops_worker() and is_unix_system():
            self.token_bucket_q = multiprocessing.Queue()
            self.pending_calls_q = multiprocessing.Queue()
            self.running_flag = multiprocessing.Value('i', 0)
        else:
            self.token_bucket_q = queue.Queue()
            self.pending_calls_q = queue.Queue()
            self.running_flag = SimpleNamespace(value=0)

        self.ongoing_activations = 0
        self.job_monitor = JobMonitor(self.config, self.internal_storage,
                                      self.token_bucket_q)

        logger.debug('ExecutorID {} - Serverless invoker created'.format(
            self.executor_id))
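
Both invoker variants above switch the shared running flag together with the worker type: a plain SimpleNamespace(value=0) is only visible across threads, since separate processes (forked or spawned) do not share ordinary Python objects, which is why the process branch uses mp.Value. A minimal illustration of the two flag kinds:

    import multiprocessing as mp
    from types import SimpleNamespace

    # Threads share the interpreter's memory, so a plain attribute works.
    thread_flag = SimpleNamespace(value=0)
    thread_flag.value = 1

    # Separate processes need OS-backed shared memory ('i' = signed int).
    process_flag = mp.Value('i', 0)
    with process_flag.get_lock():  # guard read-modify-write across processes
        process_flag.value = 1
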
Example #5
    def __init__(self, local_config):
        self.log_active = logger.getEffectiveLevel() != logging.WARNING
        self.config = local_config
        self.name = 'local'
        self.alive = True
        self.queue = Queue()
        self.logs_dir = os.path.join(STORAGE_FOLDER, LOGS_PREFIX)
        self.num_workers = self.config['workers']
        self.use_threads = not is_unix_system()

        self.workers = []

        atexit.register(self.close)

        if self.use_threads:
            for worker_id in range(self.num_workers):
                p = Thread(target=self._process_runner, args=(worker_id, ))
                self.workers.append(p)
                p.daemon = True
                p.start()
        else:
            for worker_id in range(self.num_workers):
                p = Process(target=self._process_runner, args=(worker_id, ))
                self.workers.append(p)
                p.start()

        log_msg = 'Lithops v{} init for Localhost - Total workers: {}'.format(
            __version__, self.num_workers)
        logger.info(log_msg)
        if not self.log_active:
            print(log_msg)
Example #6
    def __init__(self, config, executor_id, job_id):
        self.config = config
        self.executor_id = executor_id
        self.job_id = job_id
        self.use_threads = not is_unix_system()
        self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
        self.workers = []

        if self.use_threads:
            self.queue = queue.Queue()
            WORKER = Thread
            WORKER_PROCESS = self._thread_runner
        else:
            if 'fork' in mp.get_all_start_methods():
                mp.set_start_method('fork')
            self.queue = mp.Queue()
            WORKER = mp.Process
            WORKER_PROCESS = self._process_runner

        for worker_id in range(self.num_workers):
            p = WORKER(target=WORKER_PROCESS, args=(worker_id,))
            self.workers.append(p)
            p.start()

        logger.info('ExecutorID {} | JobID {} - Localhost runner started '
                    '- {} workers'.format(self.executor_id,
                                          self.job_id,
                                          self.num_workers))
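
The selection logic above, picking Thread on systems without fork and Process elsewhere while driving both through the same queue interface, reduces to a small reusable sketch (names here are illustrative, not part of Lithops):

    import multiprocessing as mp
    import queue
    import threading

    def make_pool(num_workers, target, use_threads):
        """Return (task_queue, workers) backed by threads or processes."""
        if use_threads:
            task_queue = queue.Queue()
            worker_cls = threading.Thread
        else:
            task_queue = mp.Queue()
            worker_cls = mp.Process
        workers = []
        for worker_id in range(num_workers):
            w = worker_cls(target=target, args=(worker_id, task_queue))
            w.start()
            workers.append(w)
        return task_queue, workers
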
Example #7
    def _create_function(self, docker_image_name, memory, timeout):
        """
        Create and publish an Azure Function
        """
        action_name = self._format_action_name(docker_image_name, memory)
        logger.info(
            'Creating new Lithops runtime for Azure Function: {}'.format(
                action_name))

        if self.invocation_type == 'event':
            try:
                in_q_name = self._format_queue_name(action_name,
                                                    az_config.IN_QUEUE)
                logger.debug('Creating queue {}'.format(in_q_name))
                self.queue_service.create_queue(in_q_name)
            except Exception:
                in_queue = self.queue_service.get_queue_client(in_q_name)
                in_queue.clear_messages()
            try:
                out_q_name = self._format_queue_name(action_name,
                                                     az_config.OUT_QUEUE)
                logger.debug('Creating queue {}'.format(out_q_name))
                self.queue_service.create_queue(out_q_name)
            except Exception:
                out_queue = self.queue_service.get_queue_client(out_q_name)
                out_queue.clear_messages()

        python_version = version_str(sys.version_info)
        cmd = (
            'az functionapp create --name {} --storage-account {} '
            '--resource-group {} --os-type Linux --runtime python '
            '--runtime-version {} --functions-version {} --consumption-plan-location {}'
            .format(action_name, self.storage_account_name,
                    self.resource_group, python_version,
                    self.functions_version, self.location))
        if logger.getEffectiveLevel() != logging.DEBUG:
            cmd = cmd + " >{} 2>&1".format(os.devnull)
        res = os.system(cmd)
        if res != 0:
            raise Exception(
                'There was an error creating the function in Azure. cmd: {}'.
                format(cmd))

        logger.debug('Publishing function: {}'.format(action_name))
        build_dir = os.path.join(az_config.BUILD_DIR, action_name)
        os.chdir(build_dir)
        res = 1
        while res != 0:
            time.sleep(5)
            if is_unix_system():
                cmd = 'func azure functionapp publish {} --python --no-build'.format(
                    action_name)
            else:
                cmd = 'func azure functionapp publish {} --python'.format(
                    action_name)
            if logger.getEffectiveLevel() != logging.DEBUG:
                cmd = cmd + " >{} 2>&1".format(os.devnull)
            res = os.system(cmd)

        time.sleep(10)
Example #8
    def run(self, job_payload, job_filename):
        """
        Runs a job
        """
        executor_id = job_payload['executor_id']
        job_id = job_payload['job_id']
        total_calls = len(job_payload['call_ids'])
        job_key = job_payload['job_key']

        logger.debug(f'ExecutorID {executor_id} | JobID {job_id} - Running '
                     f'{total_calls} activations in the localhost worker')

        if not os.path.isfile(RUNNER):
            self.setup()

        tmp_path = Path(TEMP).as_posix()
        cmd = f'docker run --name lithops_{job_key} '
        cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else ''
        cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" {self.runtime} /tmp/lithops/runner.py run {job_filename}'

        log = open(RN_LOG_FILE, 'a')
        process = sp.Popen(shlex.split(cmd),
                           stdout=log,
                           stderr=log,
                           start_new_session=True)
        self.jobs[job_key] = process

        return process
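
A side note on shlex.split in the snippet above: it turns the command string into an argv list without invoking a shell, while still honoring the quoting around the entrypoint. For instance:

    import shlex

    cmd = 'docker run --rm --entrypoint "python3" img /tmp/lithops/runner.py run job.json'
    print(shlex.split(cmd))
    # ['docker', 'run', '--rm', '--entrypoint', 'python3', 'img',
    #  '/tmp/lithops/runner.py', 'run', 'job.json']
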
Example #9
    def __init__(self, config):
        self.config = config
        self.use_threads = not is_unix_system()
        self.num_workers = self.config['lithops'].get('workers', CPU_COUNT)
        self.workers = []

        log_file_stream = open(LH_LOG_FILE, 'a')
        sys.stdout = log_file_stream
        sys.stderr = log_file_stream

        if self.use_threads:
            self.queue = queue.Queue()
            for worker_id in range(self.num_workers):
                p = Thread(target=self._process_runner, args=(worker_id, ))
                self.workers.append(p)
                p.start()
        else:
            self.queue = mp.Queue()
            for worker_id in range(self.num_workers):
                p = mp.Process(target=self._process_runner, args=(worker_id, ))
                self.workers.append(p)
                p.start()

        logger.info('Localhost Executor started - {} workers'.format(
            self.num_workers))
Example #10
 def get_execution_cmd(self, runtime):
     if is_unix_system():
         cmd = ('docker run --user $(id -u):$(id -g) --rm -v {}:/tmp --entrypoint '
                '"python3" {} /tmp/lithops/runner.py'.format(TEMP, self.runtime))
     else:
         cmd = ('docker run --rm -v {}:/tmp --entrypoint "python3" {} '
                '/tmp/lithops/runner.py'.format(TEMP, self.runtime))
     return cmd
Example #11
 def get_execution_cmd(self, runtime):
     if is_unix_system():
         cmd = (
             f'docker run --user $(id -u):$(id -g) --rm -v {TEMP}:/tmp --entrypoint '
             f'"python3" {self.runtime} /tmp/lithops/runner.py')
     else:
         cmd = (f'docker run --rm -v {TEMP}:/tmp --entrypoint "python3" '
                f'{self.runtime} /tmp/lithops/runner.py')
     return cmd
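
A caveat that applies to both variants: $(id -u):$(id -g) is shell command substitution, so the returned string only works when it is ultimately executed through a shell; passed through shlex.split straight into Popen it would reach Docker unexpanded. A sketch with a placeholder image name:

    import subprocess

    cmd = ('docker run --user $(id -u):$(id -g) --rm -v /tmp:/tmp '
           '--entrypoint "python3" my-runtime /tmp/lithops/runner.py')

    # shell=True is what expands $(id -u) and $(id -g) before Docker
    # ever sees the arguments.
    subprocess.run(cmd, shell=True, check=True)
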
Example #12
 def kill_job(self, job_key):
     if self.jobs[job_key].poll() is None:
         logger.debug(
             f'Killing job {job_key} with PID {self.jobs[job_key].pid}')
         PID = self.jobs[job_key].pid
         if is_unix_system():
             PGID = os.getpgid(PID)
             os.killpg(PGID, signal.SIGKILL)
         else:
             os.kill(PID, signal.SIGTERM)
     del self.jobs[job_key]
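
The process-group kill above pairs with launching jobs via start_new_session=True (as in the run() example earlier): the child becomes the leader of its own session and process group, so os.killpg can take down the wrapper process and anything it spawned. A self-contained, Unix-only sketch:

    import os
    import signal
    import subprocess
    import time

    # start_new_session=True makes the child a new session (and process
    # group) leader, so its PGID equals its PID.
    proc = subprocess.Popen(['sleep', '600'], start_new_session=True)
    time.sleep(1)

    if proc.poll() is None:  # still running
        os.killpg(os.getpgid(proc.pid), signal.SIGKILL)
    proc.wait()
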
Example #13
 def _start_invoker_process(self):
     """
     Starts the invoker processes responsible for spawning pending calls in the background
     """
     if self.is_lithops_function or not is_unix_system():
         for inv_id in range(INVOKER_PROCESSES):
             p = Thread(target=self._run_invoker_process, args=(inv_id, ))
             self.invokers.append(p)
             p.daemon = True
             p.start()
     else:
         for inv_id in range(INVOKER_PROCESSES):
             p = Process(target=self._run_invoker_process, args=(inv_id, ))
             self.invokers.append(p)
             p.daemon = True
             p.start()
Example #14
    def preinstalls(self):
        if not os.path.isfile(RUNNER):
            self.setup()

        tmp_path = Path(TEMP).as_posix()
        cmd = 'docker run '
        cmd += f'--user {self.uid}:{self.gid} ' if is_unix_system() else ''
        cmd += f'--rm -v {tmp_path}:/tmp --entrypoint "python3" {self.runtime} /tmp/lithops/runner.py preinstalls'

        process = sp.run(shlex.split(cmd),
                         check=True,
                         stdout=sp.PIPE,
                         universal_newlines=True,
                         start_new_session=True)
        runtime_meta = json.loads(process.stdout.strip())

        return runtime_meta
Example #15
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    if not is_unix_system():
        return
    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
Example #16
def get_memory_usage(formatted=True):
    """
    Gets the current memory usage of the runtime.
    To be used only in the action code.
    """
    from lithops.libs import ps_mem
    if not is_unix_system() or os.geteuid() != 0:
        # Non-Unix systems and non-root users can't
        # run the ps_mem module
        return

    split_args = False
    pids_to_show = None
    discriminate_by_pid = False

    ps_mem.verify_environment(pids_to_show)
    sorted_cmds, shareds, count, total, swaps, total_swap = \
        ps_mem.get_memory_usage(pids_to_show, split_args, discriminate_by_pid,
                                include_self=True, only_self=False)
    if formatted:
        return sizeof_fmt(int(ps_mem.human(total, units=1)))
    else:
        return int(ps_mem.human(total, units=1))
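
Under the stated constraints (a Unix system, and in this second variant a root user), a call from inside the action code might look like this sketch:

    usage = get_memory_usage()               # human-readable string, or None
    raw = get_memory_usage(formatted=False)  # plain integer, or None

    if usage is not None:
        print('Current runtime memory usage:', usage)
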
Example #17
    def wait(self,
             fs=None,
             throw_except=True,
             return_when=ALL_COMPLETED,
             download_results=False,
             timeout=None,
             THREADPOOL_SIZE=128,
             WAIT_DUR_SEC=1):
        """
        Wait for the Future instances (possibly created by different Executor instances)
        given by fs to complete. Returns a named 2-tuple of sets. The first set, named done,
        contains the futures that completed (finished or cancelled futures) before the wait
        completed. The second set, named not_done, contains the futures that did not complete
        (pending or running futures). timeout can be used to control the maximum number of
        seconds to wait before returning.

        :param fs: Futures list. Default None
        :param throw_except: Re-raise exception if call raised. Default True.
        :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
        :param download_results: Download results. Default false (Only get statuses)
        :param timeout: Timeout of waiting for results.
        :param THREADPOOL_SIZE: Number of threads to use. Default 128
        :param WAIT_DUR_SEC: Time interval between each check.

        :return: `(fs_done, fs_notdone)`
            where `fs_done` is a list of futures that have completed
            and `fs_notdone` is a list of futures that have not completed.
        :rtype: 2-tuple of list
        """
        futures = fs or self.futures
        if type(futures) != list:
            futures = [futures]

        if not futures:
            raise Exception(
                'You must run call_async(), map() or map_reduce(), or provide'
                ' a list of futures, before calling the wait()/get_result() method'
            )

        if download_results:
            msg = 'ExecutorID {} - Getting results'.format(self.executor_id)
            fs_done = [f for f in futures if f.done]
            fs_not_done = [f for f in futures if not f.done]
            fs_not_ready = [f for f in futures if not f.ready]

        else:
            msg = 'ExecutorID {} - Waiting for functions to complete'.format(
                self.executor_id)
            fs_done = [f for f in futures if f.ready or f.done]
            fs_not_done = [f for f in futures if not f.ready and not f.done]
            fs_not_ready = [f for f in futures if not f.ready]

        if not fs_not_done:
            return fs_done, fs_not_done

        logger.info(msg)

        if is_unix_system() and timeout is not None:
            logger.debug(
                'Setting waiting timeout to {} seconds'.format(timeout))
            error_msg = 'Timeout of {} seconds exceeded waiting for function activations to finish'.format(
                timeout)
            signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
            signal.alarm(timeout)

        pbar = None
        error = False

        if not self.is_lithops_worker and self.setup_progressbar and fs_not_ready:
            from tqdm.auto import tqdm

            if is_notebook():
                pbar = tqdm(bar_format='{n}/|/ {n_fmt}/{total_fmt}',
                            total=len(fs_not_done))  # ncols=800
            else:
                print()
                pbar = tqdm(bar_format='  {l_bar}{bar}| {n_fmt}/{total_fmt}  ',
                            total=len(fs_not_done),
                            disable=None)

        try:
            if self.rabbitmq_monitor:
                logger.debug('Using RabbitMQ to monitor function activations')
                wait_rabbitmq(futures,
                              self.internal_storage,
                              rabbit_amqp_url=self.rabbit_amqp_url,
                              download_results=download_results,
                              throw_except=throw_except,
                              pbar=pbar,
                              return_when=return_when,
                              THREADPOOL_SIZE=THREADPOOL_SIZE)
            else:
                wait_storage(futures,
                             self.internal_storage,
                             download_results=download_results,
                             throw_except=throw_except,
                             return_when=return_when,
                             pbar=pbar,
                             THREADPOOL_SIZE=THREADPOOL_SIZE,
                             WAIT_DUR_SEC=WAIT_DUR_SEC)

        except KeyboardInterrupt as e:
            if download_results:
                not_dones_call_ids = [(f.job_id, f.call_id) for f in futures
                                      if not f.done]
            else:
                not_dones_call_ids = [(f.job_id, f.call_id) for f in futures
                                      if not f.ready and not f.done]
            msg = ('ExecutorID {} - Cancelled - Total Activations not done: {}'
                   .format(self.executor_id, len(not_dones_call_ids)))
            if pbar:
                pbar.close()
                print()
            logger.info(msg)
            error = True
            if self.data_cleaner and not self.is_lithops_worker:
                self.clean(clean_cloudobjects=False, force=True)
            raise e

        except Exception as e:
            error = True
            if self.data_cleaner and not self.is_lithops_worker:
                self.clean(clean_cloudobjects=False, force=True)
            raise e

        finally:
            self.invoker.stop()
            if is_unix_system():
                signal.alarm(0)
            if pbar and not pbar.disable:
                pbar.close()
                if not is_notebook():
                    print()
            if self.data_cleaner and not self.is_lithops_worker:
                self.clean(clean_cloudobjects=False)
            if not fs and error and is_notebook():
                del self.futures[len(self.futures) - len(futures):]

        if download_results:
            fs_done = [f for f in futures if f.done]
            fs_notdone = [f for f in futures if not f.done]
        else:
            fs_done = [f for f in futures if f.ready or f.done]
            fs_notdone = [f for f in futures if not f.ready and not f.done]

        return fs_done, fs_notdone
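
A usage sketch for this method, assuming the surrounding FunctionExecutor API from Lithops (the mapped function and its inputs are illustrative):

    import lithops

    def double(x):
        return x * 2

    fexec = lithops.FunctionExecutor()
    futures = fexec.map(double, [1, 2, 3, 4])

    # Wait for every activation; the alarm-based timeout is only armed
    # on Unix systems, as the code above shows.
    fs_done, fs_notdone = fexec.wait(fs=futures, timeout=300)
    print(len(fs_done), 'done,', len(fs_notdone), 'not done')
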
Example #18
 def __init__(self, docker_image, pull_runtime):
     logger.debug(f'Starting Docker Environment for {docker_image}')
     super().__init__(runtime=docker_image)
     self.pull_runtime = pull_runtime
     self.uid = os.getuid() if is_unix_system() else None
     self.gid = os.getgid() if is_unix_system() else None
Example #19
def run_job(job):
    """
    Runs a single job within a separate process
    """
    start_tstamp = time.time()
    setup_lithops_logger(job.log_level)

    logger.info("Lithops v{} - Starting execution".format(__version__))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))
    logger.debug("Runtime name: {}".format(job.runtime_name))
    if job.runtime_memory:
        logger.debug("Runtime memory: {}MB".format(job.runtime_memory))
    logger.debug("Function timeout: {}s".format(job.execution_timeout))

    env = job.extra_env
    env['LITHOPS_WORKER'] = 'True'
    env['PYTHONUNBUFFERED'] = 'True'
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['PYTHONPATH'] = "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(job.config, internal_storage)
    call_status.response['worker_start_tstamp'] = start_tstamp
    call_status.response['host_submit_tstamp'] = job.host_submit_tstamp
    call_status.response['call_id'] = job.call_id
    call_status.response['job_id'] = job.job_id
    call_status.response['executor_id'] = job.executor_id

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if __version__ != job.lithops_version:
            msg = (
                "Lithops version mismatch. Host version: {} - Runtime version: {}"
                .format(job.lithops_version, __version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        job.jr_stats_file = os.path.join(job.job_dir, 'jobrunner.stats.txt')
        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        jrp = Process(target=jobrunner.run) if is_unix_system() else Thread(
            target=jobrunner.run)
        jrp.start()

        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after jrp.join(job.execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # thread does not have terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by jobrunner when it finishes.
            # If no message, this means that the jobrunner process was killed.
            # 99% of the time the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.jr_stats_file):
            with open(job.jr_stats_file, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in [
                            'exception', 'exc_pickle_fail', 'result',
                            'new_futures'
                    ]:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()

        with open(job.log_file, 'rb') as lf:
            log_str = base64.b64encode(zlib.compress(lf.read())).decode()
            call_status.response['logs'] = log_str

        call_status.send('__end__')

        # Unset specific env vars
        for key in job.extra_env:
            os.environ.pop(key, None)
        os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)

        logger.info("Finished")
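
The core guard in run_job, running the work in a child, joining with a timeout, and treating a missing completion message as an out-of-memory kill, can be isolated into a minimal sketch (names are illustrative, not the Lithops API):

    import time
    from multiprocessing import Pipe, Process

    def work(conn):
        time.sleep(1)
        conn.send('done')  # the single completion message

    def run_with_timeout(timeout):
        parent_conn, child_conn = Pipe()
        p = Process(target=work, args=(child_conn,))
        p.start()
        p.join(timeout)  # returns early once work() exits

        if p.is_alive():  # still alive: the job ran too long
            p.terminate()
            raise TimeoutError('function exceeded maximum time')
        if not parent_conn.poll():  # died without reporting: assume OOM
            raise MemoryError('function exceeded maximum memory')
        return parent_conn.recv()

    if __name__ == '__main__':
        print(run_with_timeout(5))  # -> done
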
Example #20
def run_job(job):
    """
    Runs a single job within a separate process
    """
    setup_lithops_logger(job.log_level)

    backend = os.environ.get('__LITHOPS_BACKEND', '')
    logger.info("Lithops v{} - Starting {} execution".format(
        __version__, backend))
    logger.info("Execution ID: {}/{}".format(job.job_key, job.call_id))

    env = job.extra_env
    env['LITHOPS_CONFIG'] = json.dumps(job.config)
    env['__LITHOPS_SESSION_ID'] = '-'.join([job.job_key, job.call_id])
    os.environ.update(env)

    storage_config = extract_storage_config(job.config)
    internal_storage = InternalStorage(storage_config)
    call_status = create_call_status(job, internal_storage)

    if job.runtime_memory:
        logger.debug('Runtime: {} - Memory: {}MB - Timeout: {} seconds'.format(
            job.runtime_name, job.runtime_memory, job.execution_timeout))
    else:
        logger.debug('Runtime: {} - Timeout: {} seconds'.format(
            job.runtime_name, job.execution_timeout))

    job_interrupted = False

    try:
        # send init status event
        call_status.send_init_event()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(job, jobrunner_conn, internal_storage)
        logger.debug('Starting JobRunner process')
        jrp = Process(target=jobrunner.run) if is_unix_system() else Thread(
            target=jobrunner.run)
        jrp.start()
        jrp.join(job.execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If the process is still alive after jrp.join(job.execution_timeout), kill it
            try:
                jrp.terminate()
            except Exception:
                # thread does not have terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(job.execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by jobrunner when it finishes.
            # If no message, this means that the jobrunner process was killed.
            # 99% of the time the jobrunner is killed due to an OOM, so we assume an OOM here.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(job.stats_file):
            with open(job.stats_file, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.add(key, float(value))
                    except Exception:
                        call_status.add(key, value)
                    if key in ['exception', 'exc_pickle_fail', 'result']:
                        call_status.add(key, eval(value))

    except KeyboardInterrupt:
        job_interrupted = True
        logger.debug("Job interrupted")

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------')
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------')
        call_status.add('exception', True)

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.add('exc_info', str(pickled_exc))

    finally:
        if not job_interrupted:
            call_status.add('worker_end_tstamp', time.time())

            # Flush log stream and save it to the call status
            job.log_stream.flush()
            if os.path.isfile(job.log_file):
                with open(job.log_file, 'rb') as lf:
                    log_str = base64.b64encode(zlib.compress(
                        lf.read())).decode()
                    call_status.add('logs', log_str)

            call_status.send_finish_event()

        logger.info("Finished")
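
Both run_job variants ship the worker log back as zlib-compressed, base64-encoded text; the consuming side would reverse the two steps, roughly:

    import base64
    import zlib

    def decode_logs(log_str):
        """Reverse the base64(zlib(...)) encoding used for 'logs' above."""
        return zlib.decompress(base64.b64decode(log_str)).decode()
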
Example #21
def wait(fs,
         throw_except=True,
         return_when=ALL_COMPLETED,
         download_results=False,
         timeout=None,
         THREADPOOL_SIZE=128,
         WAIT_DUR_SEC=1,
         internal_storage=None):
    """
    Wait for the Future instances (possibly created by different Executor instances)
    given by fs to complete. Returns a named 2-tuple of sets. The first set, named done,
    contains the futures that completed (finished or cancelled futures) before the wait
    completed. The second set, named not_done, contains the futures that did not complete
    (pending or running futures). timeout can be used to control the maximum number of
    seconds to wait before returning.

    :param fs: Futures list.
    :param throw_except: Re-raise exception if call raised. Default True.
    :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
    :param download_results: Download results. Default false (Only get statuses)
    :param timeout: Timeout of waiting for results.
    :param THREADPOOL_SIZE: Number of threads to use. Default 128
    :param WAIT_DUR_SEC: Time interval between each check.

    :return: `(fs_done, fs_notdone)`
        where `fs_done` is a list of futures that have completed
        and `fs_notdone` is a list of futures that have not completed.
    :rtype: 2-tuple of list
    """
    if type(fs) != list:
        fs = [fs]

    if not internal_storage:
        internal_storage = InternalStorage(fs[0].storage_config)

    setup_progressbar = (not is_lithops_worker()
                         and logger.getEffectiveLevel() == logging.INFO)

    if download_results:
        msg = 'Getting results from functions'
        fs_done = [f for f in fs if f.done]
        fs_not_done = [f for f in fs if not f.done]
        # fs_not_ready = [f for f in futures if not f.ready and not f.done]

    else:
        msg = 'Waiting for functions to complete'
        fs_done = [f for f in fs if f.ready or f.done]
        fs_not_done = [f for f in fs if not f.ready and not f.done]
        # fs_not_ready = [f for f in futures if not f.ready and not f.done]

    if not fs_not_done:
        return fs_done, fs_not_done

    logger.info(msg)

    if is_unix_system() and timeout is not None:
        logger.debug('Setting waiting timeout to {} seconds'.format(timeout))
        error_msg = 'Timeout of {} seconds exceeded waiting for function activations to finish'.format(
            timeout)
        signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
        signal.alarm(timeout)

    pbar = None

    if setup_progressbar:
        from tqdm.auto import tqdm

        if not is_notebook():
            print()
        pbar = tqdm(bar_format='  {l_bar}{bar}| {n_fmt}/{total_fmt}  ',
                    total=len(fs_not_done),
                    disable=None)

    try:
        wait_storage(fs,
                     internal_storage,
                     download_results=download_results,
                     throw_except=throw_except,
                     return_when=return_when,
                     pbar=pbar,
                     THREADPOOL_SIZE=THREADPOOL_SIZE,
                     WAIT_DUR_SEC=WAIT_DUR_SEC)

    except KeyboardInterrupt as e:
        if download_results:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.done]
        else:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.ready and not f.done]
        msg = ('Cancelled - Total Activations not done: {}'.format(
            len(not_dones_call_ids)))
        if pbar:
            pbar.close()
            print()
        logger.info(msg)
        raise e

    except Exception as e:
        raise e

    finally:
        if is_unix_system():
            signal.alarm(0)
        if pbar and not pbar.disable:
            pbar.close()
            if not is_notebook():
                print()

    if download_results:
        fs_done = [f for f in fs if f.done]
        fs_notdone = [f for f in fs if not f.done]
    else:
        fs_done = [f for f in fs if f.ready or f.done]
        fs_notdone = [f for f in fs if not f.ready and not f.done]

    return fs_done, fs_notdone
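
The Unix-only timeout used by all the wait variants relies on SIGALRM. Stripped to its essentials (the handler here is illustrative, not the Lithops timeout_handler):

    import signal
    import time
    from functools import partial

    def timeout_handler(error_msg, signum, frame):
        raise TimeoutError(error_msg)

    signal.signal(signal.SIGALRM, partial(timeout_handler, 'wait timed out'))
    signal.alarm(2)      # deliver SIGALRM in 2 seconds
    try:
        time.sleep(10)   # stand-in for the blocking wait_storage() call
    except TimeoutError as e:
        print(e)
    finally:
        signal.alarm(0)  # always cancel any pending alarm
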
Example #22
def wait(fs,
         internal_storage=None,
         throw_except=True,
         timeout=None,
         return_when=ALL_COMPLETED,
         download_results=False,
         job_monitor=None,
         threadpool_size=THREADPOOL_SIZE,
         wait_dur_sec=WAIT_DUR_SEC):
    """
    Wait for the Future instances (possibly created by different Executor instances)
    given by fs to complete. Returns a named 2-tuple of sets. The first set, named done,
    contains the futures that completed (finished or cancelled futures) before the wait
    completed. The second set, named not_done, contains the futures that did not complete
    (pending or running futures). timeout can be used to control the maximum number of
    seconds to wait before returning.

    :param fs: Futures list.
    :param throw_except: Re-raise exception if call raised. Default True.
    :param return_when: Percentage of done futures
    :param download_results: Download results. Default false (Only get statuses)
    :param timeout: Timeout of waiting for results.
    :param threadpool_size: Number of threads to use. Default 64
    :param wait_dur_sec: Time interval between each check.

    :return: `(fs_done, fs_notdone)`
        where `fs_done` is a list of futures that have completed
        and `fs_notdone` is a list of futures that have not completed.
    :rtype: 2-tuple of list
    """
    if not fs:
        return

    if type(fs) != list and type(fs) != FuturesList:
        fs = [fs]

    if download_results:
        msg = 'ExecutorID {} - Getting results from functions'.format(
            fs[0].executor_id)
        fs_done = [f for f in fs if f.done]
        fs_not_done = [f for f in fs if not f.done]

    else:
        msg = 'ExecutorID {} - Waiting for {}% of functions to complete'.format(
            fs[0].executor_id, return_when)
        fs_done = [f for f in fs if f.success or f.done]
        fs_not_done = [f for f in fs if not (f.success or f.done)]

    logger.info(msg)

    if not fs_not_done:
        return fs_done, fs_not_done

    if is_unix_system() and timeout is not None:
        logger.debug('Setting waiting timeout to {} seconds'.format(timeout))
        error_msg = 'Timeout of {} seconds exceeded waiting for function activations to finish'.format(
            timeout)
        signal.signal(signal.SIGALRM, partial(timeout_handler, error_msg))
        signal.alarm(timeout)

    # Setup progress bar
    pbar = None
    if not is_lithops_worker() and logger.getEffectiveLevel() == logging.INFO:
        from tqdm.auto import tqdm
        if not is_notebook():
            print()
        pbar = tqdm(bar_format='  {l_bar}{bar}| {n_fmt}/{total_fmt}  ',
                    total=len(fs),
                    disable=None)
        pbar.update(len(fs_done))

    try:
        executors_data = _create_executors_data_from_futures(
            fs, internal_storage)

        if not job_monitor:
            for executor_data in executors_data:
                job_monitor = JobMonitor(
                    executor_id=executor_data.executor_id,
                    internal_storage=executor_data.internal_storage)
                job_monitor.start(fs=executor_data.futures)

        sleep_sec = wait_dur_sec if job_monitor.backend == 'storage' else 0.3

        if return_when == ALWAYS:
            for executor_data in executors_data:
                _get_executor_data(fs,
                                   executor_data,
                                   pbar=pbar,
                                   throw_except=throw_except,
                                   download_results=download_results,
                                   threadpool_size=threadpool_size)
        else:
            while not _check_done(fs, return_when, download_results):
                for executor_data in executors_data:
                    new_data = _get_executor_data(
                        fs,
                        executor_data,
                        pbar=pbar,
                        throw_except=throw_except,
                        download_results=download_results,
                        threadpool_size=threadpool_size)
                time.sleep(0 if new_data else sleep_sec)

    except KeyboardInterrupt as e:
        if download_results:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.done]
        else:
            not_dones_call_ids = [(f.job_id, f.call_id) for f in fs
                                  if not f.success and not f.done]
        msg = ('Cancelled - Total Activations not done: {}'.format(
            len(not_dones_call_ids)))
        if pbar:
            pbar.close()
            print()
        logger.info(msg)
        raise e

    except Exception as e:
        raise e

    finally:
        if is_unix_system():
            signal.alarm(0)
        if pbar and not pbar.disable:
            pbar.close()
            if not is_notebook():
                print()

    if download_results:
        fs_done = [f for f in fs if f.done]
        fs_notdone = [f for f in fs if not f.done]
    else:
        fs_done = [f for f in fs if f.success or f.done]
        fs_notdone = [f for f in fs if not f.success and not f.done]

    return fs_done, fs_notdone
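
Since return_when is a completion percentage in this variant, partial waits become possible. A sketch, assuming futures produced by a FunctionExecutor as in the earlier examples (the exact import path of the constants may vary by version):

    import lithops
    from lithops.wait import wait, ALL_COMPLETED  # assumed location

    fexec = lithops.FunctionExecutor()
    futures = fexec.map(lambda x: x + 1, range(10))

    # Return as soon as roughly half of the activations have finished;
    # ALL_COMPLETED simply maps to 100 in this percentage scheme.
    fs_done, fs_notdone = wait(futures, return_when=50)
    fs_done, fs_notdone = wait(futures, return_when=ALL_COMPLETED)
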
Example #23
import os
import sys
import pkgutil
import logging
import pickle
import subprocess
from contextlib import contextmanager

from lithops.version import __version__ as lithops_ver
from lithops.utils import sizeof_fmt, is_unix_system, b64str_to_bytes
from lithops.constants import LITHOPS_TEMP_DIR, MODULES_DIR

logger = logging.getLogger(__name__)

if is_unix_system():
    # Windows hosts can't use the ps_mem module
    import ps_mem


def get_function_and_modules(job, internal_storage):
    """
    Gets the function and the modules from storage
    """
    logger.debug("Getting function and modules")

    mode = job.config['lithops']['mode']
    customized_runtime = job.config[mode].get('customized_runtime', False)

    func_obj = None
    if customized_runtime:
Example #24
    def _init_runtime(self, docker_image_name):
        name = self._format_runtime_name(docker_image_name)

        if self._is_localhost:
            if is_unix_system():
                uid_cmd = "id -u $USER"
                uid = subprocess.check_output(uid_cmd,
                                              shell=True).decode().strip()
            else:
                uid = None  # docker-py simply omits the user option when it is None

            if self.docker_client:
                running_containers = self.docker_client.containers.list(
                    filters={'name': 'lithops'})
                running_runtimes = [c.name for c in running_containers]

                if name not in running_runtimes:
                    self.docker_client.containers.run(
                        docker_image_name,
                        entrypoint='python',
                        command='/tmp/{}/__main__.py'.format(
                            DOCKER_BASE_FOLDER),
                        volumes=['{}:/tmp'.format(TEMP)],
                        detach=True,
                        auto_remove=True,
                        user=uid,
                        name=name,
                        ports={'8080/tcp': docker_config.LITHOPS_SERVER_PORT})
                    time.sleep(5)
            else:
                running_runtimes_cmd = "docker ps --format '{{.Names}}' -f name=lithops"
                running_runtimes = subprocess.run(
                    running_runtimes_cmd, shell=True,
                    stdout=subprocess.PIPE).stdout.decode()
                if name not in running_runtimes:
                    if is_unix_system():
                        cmd = (
                            'docker run -d --name {} --user {} -v {}:/tmp -p 8080:{}'
                            ' --entrypoint "python" {} /tmp/{}/__main__.py'.
                            format(name, uid, TEMP,
                                   docker_config.LITHOPS_SERVER_PORT,
                                   docker_image_name, DOCKER_BASE_FOLDER))
                    else:
                        cmd = ('docker run -d --name {}  -v {}:/tmp -p 8080:{}'
                               ' --entrypoint "python" {} /tmp/{}/__main__.py'.
                               format(name, TEMP,
                                      docker_config.LITHOPS_SERVER_PORT,
                                      docker_image_name, DOCKER_BASE_FOLDER))

                    if not self.log_active:
                        cmd = cmd + " >{} 2>&1".format(os.devnull)
                    res = os.system(cmd)
                    if res != 0:
                        raise Exception(
                            'There was an error starting the runtime')
                    time.sleep(5)

        else:
            running_runtimes_cmd = "docker ps --format '{{.Names}}' -f name=lithops"
            running_runtimes = self._ssh_run_remote_command(
                running_runtimes_cmd)
            used_runtimes_cmd = "docker ps -a --format '{{.Names}}' -f name=lithops"
            used_runtimes = self._ssh_run_remote_command(used_runtimes_cmd)

            if name not in running_runtimes and name in used_runtimes:
                cmd = 'docker rm -f {}'.format(name)
                self._ssh_run_remote_command(cmd)

            cmd = (
                'docker run -d --name {} --user $(id -u):$(id -g) -v {}:/tmp -p 8080:{}'
                ' --entrypoint "python" {} /tmp/{}/__main__.py'.format(
                    name, LITHOPS_TEMP, docker_config.LITHOPS_SERVER_PORT,
                    docker_image_name, DOCKER_BASE_FOLDER))
            if name not in running_runtimes:
                self._ssh_run_remote_command(cmd)
                time.sleep(5)
                # install missing dependency
                cmd = ('docker exec {} pip install pyyaml'.format(name))
                try:
                    self._ssh_run_remote_command(cmd)
                except Exception as e:
                    if 'upgrade pip' in str(e):
                        pass
                    else:
                        raise e