Example #1
    def run_job(self, job_payload):
        """
        Run the job description against the selected environment
        """
        executor_id = job_payload['executor_id']
        job_id = job_payload['job_id']
        runtime = job_payload['job_description']['runtime_name']

        job_key = create_job_key(executor_id, job_id)
        log_file = os.path.join(LOGS_DIR, job_key + '.log')
        logger.info("Running job in {}. View execution logs at {}".format(
            runtime, log_file))

        if not os.path.isfile(RUNNER):
            self.env.setup()

        exec_command = self.env.get_execution_cmd(runtime)

        storage_bucket = job_payload['config']['lithops']['storage_bucket']

        job_dir = os.path.join(LITHOPS_TEMP_DIR, storage_bucket, JOBS_PREFIX)
        os.makedirs(job_dir, exist_ok=True)
        jobr_filename = os.path.join(job_dir, '{}-job.json'.format(job_key))

        with open(jobr_filename, 'w') as jl:
            json.dump(job_payload, jl)

        log_file_stream = open(RN_LOG_FILE, 'a')
        sp.Popen(exec_command + ' run ' + jobr_filename,
                 shell=True,
                 stdout=log_file_stream,
                 stderr=log_file_stream,
                 universal_newlines=True)
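
Every example on this page builds a per-job identifier with create_job_key. As a point of reference, a minimal sketch of what such a helper could look like, assuming it simply joins the two IDs with a dash (consistent with the rsplit('-', 1) parsing used in later examples); this is an illustration, not the actual Lithops implementation:

def create_job_key(executor_id, job_id):
    # Hypothetical sketch: concatenate executor and job IDs with a dash
    return '-'.join([executor_id, job_id])

print(create_job_key('abc123-0', 'A000'))   # abc123-0-A000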
Example #2
def run():
    log_file_stream = open(RN_LOG_FILE, 'a')
    sys.stdout = log_file_stream
    sys.stderr = log_file_stream

    job_filename = sys.argv[2]
    logger.info('Got {} job file'.format(job_filename))

    with open(job_filename, 'rb') as jf:
        job = SimpleNamespace(**json.load(jf))

    logger.info('ExecutorID {} | JobID {} - Starting execution'
                .format(job.executor_id, job.job_id))

    runner = Runner(job.config, job.executor_id, job.job_id)
    runner.run(job.job_description, job.log_level)
    runner.wait()

    job_key = create_job_key(job.executor_id, job.job_id)
    done = os.path.join(JOBS_DONE_DIR, job_key+'.done')
    Path(done).touch()

    if os.path.exists(job_filename):
        os.remove(job_filename)

    logger.info('ExecutorID {} | JobID {} - Execution Finished'
                .format(job.executor_id, job.job_id))
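
The run() entry point above turns the JSON job file into attribute access via SimpleNamespace. A minimal illustration of that pattern (the field values are placeholders, not Lithops data):

from types import SimpleNamespace

job = SimpleNamespace(**{'executor_id': 'abc123-0', 'job_id': 'A000'})
print(job.executor_id, job.job_id)   # attribute access instead of dict lookups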
Example #3
    def _process_runner(self, worker_id):
        logger.debug('Localhost worker process {} started'.format(worker_id))
        os.environ['__LITHOPS_LOCAL_EXECUTION'] = 'True'

        p_logger = logging.getLogger('lithops')

        while True:
            with io.StringIO() as buf,  redirect_stdout(buf), redirect_stderr(buf):
                event = self.queue.get(block=True)
                if isinstance(event, ShutdownSentinel):
                    break
                act_id = str(uuid.uuid4()).replace('-', '')[:12]
                os.environ['__LITHOPS_ACTIVATION_ID'] = act_id
                executor_id = event['executor_id']
                job_id = event['job_id']
                setup_logger(event['log_level'])
                p_logger.info("Lithops v{} - Starting execution".format(__version__))
                function_handler(event)
                log_output = buf.getvalue()

            job_key = create_job_key(executor_id, job_id)
            log_file = os.path.join(LOGS_DIR, job_key+'.log')
            header = "Activation: '{}' ({})\n[\n".format(event['runtime_name'], act_id)
            tail = ']\n\n'
            output = log_output.replace('\n', '\n    ', log_output.count('\n')-1)
            with open(log_file, 'a') as lf:
                lf.write(header+'    '+output+tail)
            with open(FN_LOG_FILE, 'a') as lf:
                lf.write(header+'    '+output+tail)
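
The worker loop above captures everything the task prints by redirecting stdout and stderr into an in-memory buffer. A self-contained sketch of that capture idiom:

import io
from contextlib import redirect_stdout, redirect_stderr

with io.StringIO() as buf, redirect_stdout(buf), redirect_stderr(buf):
    print('hello from the task')     # would normally go to the console
    log_output = buf.getvalue()      # read it before the buffer is closed

# log_output now holds the captured text and can be appended to a per-job log file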
Example #4
    def _send_status_rabbitmq(self):
        """
        Send the status event to RabbitMQ
        """
        dmpd_response_status = json.dumps(self.response)
        drs = sizeof_fmt(len(dmpd_response_status))

        executor_id = self.response['executor_id']
        job_id = self.response['job_id']

        rabbit_amqp_url = self.config['rabbitmq'].get('amqp_url')
        status_sent = False
        output_query_count = 0
        params = pika.URLParameters(rabbit_amqp_url)
        job_key = create_job_key(executor_id, job_id)
        exchange = 'lithops-{}'.format(job_key)

        while not status_sent and output_query_count < 5:
            output_query_count = output_query_count + 1
            try:
                connection = pika.BlockingConnection(params)
                channel = connection.channel()
                channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
                channel.basic_publish(exchange=exchange, routing_key='',
                                      body=dmpd_response_status)
                connection.close()
                logger.info("Execution status sent to rabbitmq - Size: {}".format(drs))
                status_sent = True
            except Exception as e:
                logger.error("Unable to send status to rabbitmq")
                logger.error(str(e))
                logger.info('Retrying to send status to rabbitmq')
                time.sleep(0.2)
Example #5
    def clean(self,
              fs=None,
              cs=None,
              clean_cloudobjects=True,
              spawn_cleaner=True,
              force=False):
        """
        Deletes all the temp files from storage. These files include the function,
        the data serialization and the function invocation results. It can also clean
        cloudobjects.

        :param fs: list of futures to clean
        :param cs: list of cloudobjects to clean
        :param clean_cloudobjects: true/false
        :param spawn_cleaner: true/false
        :param force: Clean all future objects even if they have not been completed
        """

        os.makedirs(CLEANER_DIR, exist_ok=True)

        def save_data_to_clean(data):
            with tempfile.NamedTemporaryFile(dir=CLEANER_DIR,
                                             delete=False) as temp:
                pickle.dump(data, temp)

        if cs:
            data = {
                'cos_to_clean': list(cs),
                'storage_config': self.internal_storage.get_storage_config()
            }
            save_data_to_clean(data)
            if not fs:
                return

        futures = fs or self.futures
        futures = [futures] if type(futures) != list else futures
        present_jobs = {
            create_job_key(f.executor_id, f.job_id)
            for f in futures
            if (f.executor_id.count('-') == 1 and f.done) or force
        }
        jobs_to_clean = present_jobs - self.cleaned_jobs

        if jobs_to_clean:
            logger.info("ExecutorID {} - Cleaning temporary data".format(
                self.executor_id))
            data = {
                'jobs_to_clean': jobs_to_clean,
                'clean_cloudobjects': clean_cloudobjects,
                'storage_config': self.internal_storage.get_storage_config()
            }
            save_data_to_clean(data)
            self.cleaned_jobs.update(jobs_to_clean)

            self.compute_handler.clear()

        if (jobs_to_clean or cs) and spawn_cleaner:
            log_file = open(CLEANER_LOG_FILE, 'a')
            cmdstr = '{} -m lithops.scripts.cleaner'.format(sys.executable)
            sp.Popen(cmdstr, shell=True, stdout=log_file, stderr=log_file)
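
For context, a hedged caller-side sketch of this clean() method; it assumes a lithops.FunctionExecutor exposing map(), get_result() and clean() as above, and my_func is a placeholder workload:

import lithops

def my_func(x):          # placeholder function
    return x * 2

fexec = lithops.FunctionExecutor()
futures = fexec.map(my_func, range(10))
print(fexec.get_result(futures))
fexec.clean(fs=futures, clean_cloudobjects=False)   # drop the temp data for these futures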
Example #6
    def start_job_monitoring(self, job):
        logger.debug('ExecutorID {} | JobID {} - Starting job monitoring'
                     .format(job.executor_id, job.job_id))
        if self.rabbitmq_monitor:
            th = Thread(target=self._job_monitoring_rabbitmq, args=(job,))
        else:
            th = Thread(target=self._job_monitoring_os, args=(job,))

        if not self.is_lithops_worker:
            th.daemon = True

        job_key = create_job_key(job.executor_id, job.job_id)
        self.monitors[job_key] = {'thread': th, 'should_run': True}
        th.start()
Example #7
    def create_resources(rabbit_amqp_url, executor_id, job_id):
        job_key = create_job_key(executor_id, job_id)
        exchange = 'lithops-{}'.format(job_key)
        queue_0 = '{}-0'.format(exchange)  # For waiting
        queue_1 = '{}-1'.format(exchange)  # For invoker

        params = pika.URLParameters(rabbit_amqp_url)
        connection = pika.BlockingConnection(params)
        channel = connection.channel()
        channel.exchange_declare(exchange=exchange, exchange_type='fanout', auto_delete=True)
        channel.queue_declare(queue=queue_0, auto_delete=True)
        channel.queue_bind(exchange=exchange, queue=queue_0)
        channel.queue_declare(queue=queue_1, auto_delete=True)
        channel.queue_bind(exchange=exchange, queue=queue_1)
        connection.close()
Example #8
    def run(self, job_description, log_level):
        logger.info("Localhost run method")
        job = SimpleNamespace(**job_description)
        job_key = create_job_key(job.executor_id, job.job_id)

        if not hasattr(job, 'call_id'):
            logger.info("Running entire job {}".format(job_key))
            for i in range(job.total_calls):
                call_id = "{:05d}".format(i)
                self._invoke(job, call_id, log_level)
        else:
            logger.info("Running single call id {}-{}".format(
                job_key, job.call_id))
            self._invoke(job, job.call_id, log_level)

        for i in self.workers:
            self.job_queue.put(ShutdownSentinel())
Example #9
    def _job_monitoring_os(self, job):
        total_callids_done = 0
        job_key = create_job_key(job.executor_id, job.job_id)

        while self.monitors[job_key]['should_run'] and total_callids_done < job.total_calls:
            time.sleep(1)
            callids_running, callids_done = self.internal_storage.get_job_status(job.executor_id, job.job_id)
            total_new_tokens = len(callids_done) - total_callids_done
            total_callids_done = total_callids_done + total_new_tokens
            for i in range(total_new_tokens):
                if self.monitors[job_key]['should_run']:
                    self.token_bucket_q.put('#')
                else:
                    break

        logger.debug('ExecutorID {} | JobID {} - Job monitoring finished'
                     .format(job.executor_id,  job.job_id))
Example #10
    def _start_log_monitor(self, executor_id, job_id):
        """
        Starts a process that polls the remote log into a local file
        """

        job_key = create_job_key(executor_id, job_id)

        def log_monitor():
            os.makedirs(LOGS_DIR, exist_ok=True)
            log_file = os.path.join(LOGS_DIR, job_key + '.log')
            fdout_0 = open(log_file, 'wb')
            fdout_1 = open(FN_LOG_FILE, 'ab')

            ssh_client = self.ssh_client.create_client(self.ip_address)
            cmd = 'tail -n +1 -F /tmp/lithops/logs/{}.log'.format(job_key)
            stdin, stdout, stderr = ssh_client.exec_command(cmd)
            channel = stdout.channel
            stdin.close()
            channel.shutdown_write()

            data = None
            while not channel.closed:
                try:
                    readq, _, _ = select.select([channel], [], [], 10)
                    if readq and readq[0].recv_ready():
                        data = channel.recv(len(readq[0].in_buffer))
                        fdout_0.write(data)
                        fdout_0.flush()
                        fdout_1.write(data)
                        fdout_1.flush()
                    else:
                        if data:
                            cmd = 'ls /tmp/lithops/jobs/{}.done'.format(
                                job_key)
                            _, out, _ = ssh_client.exec_command(cmd)
                            if out.read().decode().strip():
                                break
                        time.sleep(0.5)
                except Exception:
                    pass

        if not self.is_lithops_worker:
            Thread(target=log_monitor, daemon=True).start()
            logger.debug('ExecutorID {} | JobID {} - Remote log monitor '
                         'started'.format(executor_id, job_id))
Example #11
def delete_rabbitmq_resources(rabbit_amqp_url, executor_id, job_id):
    """
    Deletes RabbitMQ queues and exchanges of a given job.
    Only called when an exception is produced, otherwise resources are
    automatically deleted.
    """
    job_key = create_job_key(executor_id, job_id)
    exchange = 'lithops-{}'.format(job_key)
    queue_0 = '{}-0'.format(exchange)  # For waiting
    queue_1 = '{}-1'.format(exchange)  # For invoker

    params = pika.URLParameters(rabbit_amqp_url)
    connection = pika.BlockingConnection(params)
    channel = connection.channel()
    channel.queue_delete(queue=queue_0)
    channel.queue_delete(queue=queue_1)
    channel.exchange_delete(exchange=exchange)
    connection.close()
Example #12
    def get_job_status(self, executor_id, job_id):
        """
        Get the status of a callset.
        :param executor_id: executor's ID
        :param job_id: job's ID
        :return: Two sets of call IDs: those currently running and those already done.
        """
        job_key = create_job_key(executor_id, job_id)
        callset_prefix = '/'.join([JOBS_PREFIX, job_key])
        keys = self.storage.list_keys(self.bucket, callset_prefix)

        running_keys = [k.split('/') for k in keys if init_key_suffix in k]
        running_callids = [(tuple(k[1].rsplit("-", 1) + [k[2]]),
                            k[3].replace(init_key_suffix, ''))
                           for k in running_keys]

        done_keys = [k.split('/')[1:] for k in keys if status_key_suffix in k]
        done_callids = [tuple(k[0].rsplit("-", 1) + [k[1]]) for k in done_keys]

        return set(running_callids), set(done_callids)
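
The key parsing above assumes object keys of the form <JOBS_PREFIX>/<executor_id>-<job_id>/<call_id>/<suffix>. A small sketch of that assumption with a purely illustrative key (the real prefix and suffixes come from the Lithops constants):

key = 'lithops.jobs/abc123-0-A000/00001/status.json'   # hypothetical key

_prefix, job_key, call_id, _suffix = key.split('/')
executor_id, job_id = job_key.rsplit('-', 1)           # same rsplit used in get_job_status
print(executor_id, job_id, call_id)                    # abc123-0 A000 00001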
Example #13
def run():
    """
    Run a job
    """
    global last_usage_time
    global backend_handler
    global jobs

    message = flask.request.get_json(force=True, silent=True)
    if message and not isinstance(message, dict):
        return error('The action did not receive a dictionary as an argument.')

    try:
        runtime = message['job_description']['runtime_name']
        verify_runtime_name(runtime)
    except Exception as e:
        return error(str(e))

    last_usage_time = time.time()

    standalone_config = message['config']['standalone']
    backend_handler.auto_dismantle = standalone_config['auto_dismantle']
    backend_handler.soft_dismantle_timeout = standalone_config[
        'soft_dismantle_timeout']
    backend_handler.hard_dismantle_timeout = standalone_config[
        'hard_dismantle_timeout']

    act_id = str(uuid.uuid4()).replace('-', '')[:12]
    executor_id = message['executor_id']
    job_id = message['job_id']
    job_key = create_job_key(executor_id, job_id)
    jobs[job_key] = 'running'

    localhost_handler = LocalhostHandler({'runtime': runtime})
    localhost_handler.run_job(message)

    response = flask.jsonify({'activationId': act_id})
    response.status_code = 202

    return response
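
Client side, the Flask endpoint above would be invoked by POSTing the job payload. A hedged sketch with placeholder field values, using the same 127.0.0.1:8080/run address that appears in the next example:

import json
import requests

payload = {
    'executor_id': 'abc123-0',
    'job_id': 'A000',
    'job_description': {'runtime_name': 'python3'},
    'config': {'standalone': {'auto_dismantle': True,
                              'soft_dismantle_timeout': 300,
                              'hard_dismantle_timeout': 3600}},
}
r = requests.post('http://127.0.0.1:8080/run', data=json.dumps(payload))
print(r.json()['activationId'])   # the 202 response carries the activation id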
Example #14
    def run_job(self, job_payload):
        """
        Run the job description against the selected environment
        """
        executor_id = job_payload['executor_id']
        job_id = job_payload['job_id']
        job_key = create_job_key(executor_id, job_id)
        log_file = os.path.join(LOGS_DIR, job_key + '.log')

        if not self._is_proxy_ready():
            # The VM instance is stopped
            if not self.log_active:
                print(
                    'ExecutorID {} - Starting VM instance'.format(executor_id))
            init_time = time.time()
            self.backend.start()
            self._wait_proxy_ready()
            total_start_time = round(time.time() - init_time, 2)
            logger.info(
                'VM instance ready in {} seconds'.format(total_start_time))

        self._start_log_monitor(executor_id, job_id)

        logger.info('ExecutorID {} | JobID {} - Running job'.format(
            executor_id, job_id))
        logger.info("View execution logs at {}".format(log_file))

        if self.is_lithops_worker:
            url = "http://{}:{}/run".format('127.0.0.1', PROXY_SERVICE_PORT)
            r = requests.post(url, data=json.dumps(job_payload), verify=True)
            response = r.json()
        else:
            cmd = ('curl -X POST http://127.0.0.1:8080/run -d {} '
                   '-H \'Content-Type: application/json\''.format(
                       shlex.quote(json.dumps(job_payload))))
            out = self.ssh_client.run_remote_command(self.ip_address, cmd)
            response = json.loads(out)

        return response['activationId']
Example #15
    def _job_monitoring_rabbitmq(self, job):
        total_callids_done = 0
        job_key = create_job_key(job.executor_id, job.job_id)

        exchange = 'lithops-{}'.format(job_key)
        queue_1 = '{}-1'.format(exchange)

        params = pika.URLParameters(self.rabbit_amqp_url)
        connection = pika.BlockingConnection(params)
        channel = connection.channel()

        def callback(ch, method, properties, body):
            nonlocal total_callids_done
            call_status = json.loads(body.decode("utf-8"))
            if call_status['type'] == '__end__':
                if self.monitors[job_key]['should_run']:
                    self.token_bucket_q.put('#')
                total_callids_done += 1
            if total_callids_done == job.total_calls or \
               not self.monitors[job_key]['should_run']:
                ch.stop_consuming()

        channel.basic_consume(callback, queue=queue_1, no_ack=True)
        channel.start_consuming()
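
Note that the basic_consume call above uses the pre-1.0 pika signature. With pika >= 1.0 the equivalent call, using the names from the snippet above, would be:

channel.basic_consume(queue=queue_1, on_message_callback=callback, auto_ack=True)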
Example #16
    def clean(self,
              fs: Optional[Union[ResponseFuture, List[ResponseFuture]]] = None,
              cs: Optional[List[CloudObject]] = None,
              clean_cloudobjects: Optional[bool] = True,
              clean_fn: Optional[bool] = False,
              force: Optional[bool] = False):
        """
        Deletes all the temp files from storage. These files include the function,
        the data serialization and the function invocation results. It can also clean
        cloudobjects.

        :param fs: List of futures to clean
        :param cs: List of cloudobjects to clean
        :param clean_cloudobjects: Delete all cloudobjects created with this executor
        :param clean_fn: Delete cached functions in this executor
        :param force: Clean all future objects even if they have not been completed
        """
        global CLEANER_PROCESS

        def save_data_to_clean(data):
            with tempfile.NamedTemporaryFile(dir=CLEANER_DIR,
                                             delete=False) as temp:
                pickle.dump(data, temp)

        if cs:
            data = {
                'cos_to_clean': list(cs),
                'storage_config': self.internal_storage.get_storage_config()
            }
            save_data_to_clean(data)
            if not fs:
                return

        if clean_fn:
            data = {
                'fn_to_clean': self.executor_id,
                'storage_config': self.internal_storage.get_storage_config()
            }
            save_data_to_clean(data)

        futures = fs or self.futures
        futures = [futures] if type(futures) != list else futures
        present_jobs = {
            create_job_key(f.executor_id, f.job_id)
            for f in futures
            if (f.executor_id.count('-') == 1 and f.done) or force
        }
        jobs_to_clean = present_jobs - self.cleaned_jobs

        if jobs_to_clean:
            logger.info(
                f'ExecutorID {self.executor_id} - Cleaning temporary data')
            data = {
                'jobs_to_clean': jobs_to_clean,
                'clean_cloudobjects': clean_cloudobjects,
                'storage_config': self.internal_storage.get_storage_config()
            }
            save_data_to_clean(data)
            self.cleaned_jobs.update(jobs_to_clean)

        spawn_cleaner = not (CLEANER_PROCESS
                             and CLEANER_PROCESS.poll() is None)
        if (jobs_to_clean or cs) and spawn_cleaner:
            cmd = [sys.executable, '-m', 'lithops.scripts.cleaner']
            CLEANER_PROCESS = sp.Popen(cmd, start_new_session=True)
Example #17
def _create_job(config,
                internal_storage,
                executor_id,
                job_id,
                func,
                iterdata,
                runtime_meta,
                runtime_memory,
                extra_env,
                include_modules,
                exclude_modules,
                execution_timeout,
                host_job_meta,
                chunksize=None,
                worker_processes=None,
                invoke_pool_threads=16):
    """
    Creates a new Job
    """
    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = worker_processes or config['lithops']['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__
    job.total_calls = len(iterdata)

    mode = config['lithops']['mode']

    if mode == SERVERLESS:
        job.invoke_pool_threads = invoke_pool_threads or config['serverless']['invoke_pool_threads']
        job.runtime_memory = runtime_memory or config['serverless']['runtime_memory']
        job.runtime_timeout = config['serverless']['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config['standalone']['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = execution_timeout

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug(
        'ExecutorID {} | JobID {} - Serializing function and data'.format(
            executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules,
                                              exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps(
        {
            'func': func_str,
            'module_data': module_data
        }, -1)
    func_module_size_bytes = len(func_module_str)
    total_size = utils.sizeof_fmt(data_size_bytes + func_module_size_bytes)
    host_job_meta['host_job_serialize_time'] = round(
        time.time() - job_serialize_start, 6)

    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE

    if data_limit and data_size_bytes > data_limit * 1024**2:
        log_msg = (
            'ExecutorID {} | JobID {} - Total data exceeded maximum size '
            'of {}'.format(executor_id, job_id,
                           sizeof_fmt(data_limit * 1024**2)))
        raise Exception(log_msg)

    logger.info('ExecutorID {} | JobID {} - Uploading function and data '
                '- Total: {}'.format(executor_id, job_id, total_size))

    # Upload data
    data_key = create_agg_data_key(JOBS_PREFIX, executor_id, job_id)
    job.data_key = data_key
    data_bytes, data_byte_ranges = utils.agg_data(data_strs)
    job.data_byte_ranges = data_byte_ranges
    data_upload_start = time.time()
    internal_storage.put_data(data_key, data_bytes)
    data_upload_end = time.time()

    host_job_meta['host_data_upload_time'] = round(
        data_upload_end - data_upload_start, 6)
    func_upload_start = time.time()

    # Upload function and modules
    if config[mode].get('customized_runtime'):
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        function_hash = hashlib.md5(open(function_file,
                                         'rb').read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(
            sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]

        uuid = '{}{}'.format(function_hash, mod_hash)
        func_key = create_func_key(JOBS_PREFIX, uuid, "")

        _store_func_and_modules(func_key, func_str, module_data)

        job.ext_runtime_uuid = uuid
    else:
        func_key = create_func_key(JOBS_PREFIX, executor_id, job_id)
        internal_storage.put_func(func_key, func_module_str)

    job.func_key = func_key
    func_upload_end = time.time()

    host_job_meta['host_func_upload_time'] = round(
        func_upload_end - func_upload_start, 6)

    host_job_meta['host_job_created_time'] = round(
        time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
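
The data upload relies on utils.agg_data to pack all serialized items into one object plus per-item byte ranges. A minimal sketch of that aggregation, under the assumption that it is a plain concatenation with recorded offsets (not the actual Lithops helper):

def agg_data_sketch(data_strs):
    """Concatenate byte strings and record each item's (start, end) byte range."""
    ranges, chunks, pos = [], [], 0
    for ds in data_strs:
        chunks.append(ds)
        ranges.append((pos, pos + len(ds) - 1))
        pos += len(ds)
    return b''.join(chunks), ranges

data_bytes, data_byte_ranges = agg_data_sketch([b'item-0', b'item-1'])
print(data_byte_ranges)   # [(0, 5), (6, 11)]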
Example #18
    def status(self,
               throw_except=True,
               internal_storage=None,
               check_only=False):
        """
        Return the status returned by the call.
        If the call raised an exception, this method will raise the same exception
        If the future is cancelled before completing then CancelledError will be raised.

        :param check_only: Return None immediately if job is not complete. Default False.
        :param throw_except: Reraise exception if call raised. Default True.
        :param internal_storage: Storage handler to poll cloud storage. Default None.
        :return: Status of the call.
        :raises CancelledError: If the job is cancelled before completed.
        :raises TimeoutError: If job is not complete after `timeout` seconds.
        """
        if self._state == ResponseFuture.State.New:
            raise ValueError("task not yet invoked")

        if self.success or self.done:
            return self._call_status

        if self.ready and self._new_futures:
            self._set_state(ResponseFuture.State.Done)
            return self._call_status

        if self._call_status is None or self._call_status['type'] == '__init__':
            if internal_storage is None:
                internal_storage = InternalStorage(self._storage_config)
            check_storage_path(internal_storage.get_storage_config(),
                               self._storage_path)
            self._call_status = internal_storage.get_call_status(
                self.executor_id, self.job_id, self.call_id)
            self._status_query_count += 1

            if check_only:
                return self._call_status

            while self._call_status is None:
                time.sleep(self.GET_RESULT_SLEEP_SECS)
                self._call_status = internal_storage.get_call_status(
                    self.executor_id, self.job_id, self.call_id)
                self._status_query_count += 1
            self._host_status_done_tstamp = time.time()

        self.stats['host_status_done_tstamp'] = \
            self._host_status_done_tstamp or time.time()
        self.stats['host_status_query_count'] = self._status_query_count
        self.activation_id = self._call_status['activation_id']

        if 'logs' in self._call_status:
            self.logs = zlib.decompress(
                base64.b64decode(self._call_status['logs'].encode())).decode()
            job_key = create_job_key(self.executor_id, self.job_id)
            log_file = os.path.join(LOGS_DIR, job_key + '.log')
            header = "Activation: '{}' ({})\n[\n".format(
                self.runtime_name, self.activation_id)
            tail = ']\n\n'
            output = self.logs.replace('\r', '').replace(
                '\n', '\n    ', self.logs.count('\n') - 1)
            with open(log_file, 'a') as lf:
                lf.write(header + '    ' + output + tail)
            with open(FN_LOG_FILE, 'a') as lf:
                lf.write(header + '    ' + output + tail)

        if self._call_status['exception']:
            self._set_state(ResponseFuture.State.Error)
            self._exception = pickle.loads(eval(self._call_status['exc_info']))

            msg1 = (
                'ExecutorID {} | JobID {} - There was an exception - Activation '
                'ID: {}'.format(self.executor_id, self.job_id,
                                self.activation_id))

            if not self._call_status.get('exc_pickle_fail', False):
                fn_exctype = self._exception[0]
                fn_exc = self._exception[1]
                if fn_exc.args and fn_exc.args[0] == "HANDLER":
                    self._handler_exception = True
                    try:
                        del fn_exc.errno
                    except Exception:
                        pass
                    fn_exc.args = (fn_exc.args[1], )
            else:
                fn_exctype = Exception
                fn_exc = Exception(self._exception['exc_value'])
                self._exception = (fn_exctype, fn_exc,
                                   self._exception['exc_traceback'])

            def exception_hook(exctype, exc, trcbck):
                if exctype == fn_exctype and str(exc) == str(fn_exc):
                    logger.warning(msg1)
                    if self._handler_exception:
                        msg2 = 'Exception: {} - {}'.format(
                            fn_exctype.__name__, fn_exc)
                        logger.warning(msg2)
                    else:
                        traceback.print_exception(*self._exception)
                else:
                    sys.excepthook = sys.__excepthook__
                    traceback.print_exception(exctype, exc, trcbck)

            if throw_except:
                sys.excepthook = exception_hook
                reraise(*self._exception)
            else:
                logger.warning(msg1)
                msg2 = 'Exception: {} - {}'.format(self._exception[0].__name__,
                                                   self._exception[1])
                logger.warning(msg2)
                return None

        for key in self._call_status:
            if any(ss in key
                   for ss in ['time', 'tstamp', 'count', 'size', 'container']):
                self.stats[key] = self._call_status[key]

        self.stats['worker_exec_time'] = round(
            self.stats['worker_end_tstamp'] -
            self.stats['worker_start_tstamp'], 8)
        total_time = format(round(self.stats['worker_exec_time'], 2), '.2f')

        logger.debug(
            'ExecutorID {} | JobID {} - Got status from call {} - Activation '
            'ID: {} - Time: {} seconds'.format(self.executor_id, self.job_id,
                                               self.call_id,
                                               self.activation_id,
                                               str(total_time)))

        self._set_state(ResponseFuture.State.Success)

        if not self._call_status['result']:
            self._produce_output = False

        if not self._produce_output:
            self._set_state(ResponseFuture.State.Done)

        if 'new_futures' in self._call_status and not self._new_futures:
            new_futures = pickle.loads(eval(self._call_status['new_futures']))
            self._new_futures = [
                new_futures
            ] if type(new_futures) == ResponseFuture else new_futures
            self._set_state(ResponseFuture.State.Futures)

        return self._call_status
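
A hedged caller-side sketch of the status() method above; fut is assumed to be a ResponseFuture produced by an executor, and no exception re-raising is requested:

call_status = fut.status(throw_except=False, check_only=True)
if call_status and call_status.get('type') == '__end__':
    print('Call {} reported completion'.format(fut.call_id))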
Example #19
def _create_job(config, internal_storage, executor_id, job_id, func,
                iterdata,  runtime_meta, runtime_memory, extra_env,
                include_modules, exclude_modules, execution_timeout,
                host_job_meta, chunksize=None):
    """
    Creates a new Job
    """
    global FUNCTION_CACHE

    ext_env = {} if extra_env is None else extra_env.copy()
    if ext_env:
        ext_env = utils.convert_bools_to_string(ext_env)
        logger.debug("Extra environment vars {}".format(ext_env))

    mode = config['lithops']['mode']
    backend = config['lithops']['backend']

    job = SimpleNamespace()
    job.chunksize = chunksize or config['lithops']['chunksize']
    job.worker_processes = config[backend]['worker_processes']
    job.execution_timeout = execution_timeout or config['lithops']['execution_timeout']
    job.executor_id = executor_id
    job.job_id = job_id
    job.job_key = create_job_key(job.executor_id, job.job_id)
    job.extra_env = ext_env
    job.function_name = func.__name__ if inspect.isfunction(func) or inspect.ismethod(func) else type(func).__name__
    job.total_calls = len(iterdata)

    if mode == SERVERLESS:
        job.runtime_memory = runtime_memory or config[backend]['runtime_memory']
        job.runtime_timeout = config[backend]['runtime_timeout']
        if job.execution_timeout >= job.runtime_timeout:
            job.execution_timeout = job.runtime_timeout - 5

    elif mode == STANDALONE:
        job.runtime_memory = None
        runtime_timeout = config[STANDALONE]['hard_dismantle_timeout']
        if job.execution_timeout >= runtime_timeout:
            job.execution_timeout = runtime_timeout - 10

    elif mode == LOCALHOST:
        job.runtime_memory = None
        job.runtime_timeout = None

    exclude_modules_cfg = config['lithops'].get('exclude_modules', [])
    include_modules_cfg = config['lithops'].get('include_modules', [])

    exc_modules = set()
    inc_modules = set()
    if exclude_modules_cfg:
        exc_modules.update(exclude_modules_cfg)
    if exclude_modules:
        exc_modules.update(exclude_modules)
    if include_modules_cfg is not None:
        inc_modules.update(include_modules_cfg)
    if include_modules_cfg is None and not include_modules:
        inc_modules = None
    if include_modules is not None and include_modules:
        inc_modules.update(include_modules)
    if include_modules is None:
        inc_modules = None

    logger.debug('ExecutorID {} | JobID {} - Serializing function and data'.format(executor_id, job_id))
    job_serialize_start = time.time()
    serializer = SerializeIndependent(runtime_meta['preinstalls'])
    func_and_data_ser, mod_paths = serializer([func] + iterdata, inc_modules, exc_modules)
    data_strs = func_and_data_ser[1:]
    data_size_bytes = sum(len(x) for x in data_strs)
    module_data = create_module_data(mod_paths)
    func_str = func_and_data_ser[0]
    func_module_str = pickle.dumps({'func': func_str, 'module_data': module_data}, -1)
    func_module_size_bytes = len(func_module_str)

    host_job_meta['host_job_serialize_time'] = round(time.time()-job_serialize_start, 6)
    host_job_meta['data_size_bytes'] = data_size_bytes
    host_job_meta['func_module_size_bytes'] = func_module_size_bytes

    # Check data limit
    if 'data_limit' in config['lithops']:
        data_limit = config['lithops']['data_limit']
    else:
        data_limit = MAX_AGG_DATA_SIZE
    if data_limit and data_size_bytes > data_limit*1024**2:
        log_msg = ('ExecutorID {} | JobID {} - Total data exceeded maximum size '
                   'of {}'.format(executor_id, job_id, utils.sizeof_fmt(data_limit*1024**2)))
        raise Exception(log_msg)

    # Upload function and data
    upload_function = not config['lithops'].get('customized_runtime', False)
    upload_data = not (len(str(data_strs[0])) * job.chunksize < 8*1024 and backend in FAAS_BACKENDS)

    # Upload function and modules
    if upload_function:
        function_hash = hashlib.md5(func_module_str).hexdigest()
        job.func_key = create_func_key(executor_id, function_hash)
        if job.func_key not in FUNCTION_CACHE:
            logger.debug('ExecutorID {} | JobID {} - Uploading function and modules '
                         'to the storage backend'.format(executor_id, job_id))
            func_upload_start = time.time()
            internal_storage.put_func(job.func_key, func_module_str)
            func_upload_end = time.time()
            host_job_meta['host_func_upload_time'] = round(func_upload_end - func_upload_start, 6)
            FUNCTION_CACHE.add(job.func_key)
        else:
            logger.debug('ExecutorID {} | JobID {} - Function and modules '
                         'found in local cache'.format(executor_id, job_id))
            host_job_meta['host_func_upload_time'] = 0

    else:
        # Prepare function and modules locally to store in the runtime image later
        function_file = func.__code__.co_filename
        function_hash = hashlib.md5(open(function_file, 'rb').read()).hexdigest()[:16]
        mod_hash = hashlib.md5(repr(sorted(mod_paths)).encode('utf-8')).hexdigest()[:16]
        job.func_key = func_key_suffix
        job.ext_runtime_uuid = '{}{}'.format(function_hash, mod_hash)
        job.local_tmp_dir = os.path.join(CUSTOM_RUNTIME_DIR, job.ext_runtime_uuid)
        _store_func_and_modules(job.local_tmp_dir, job.func_key, func_str, module_data)
        host_job_meta['host_func_upload_time'] = 0

    # upload data
    if upload_data:
        # Upload iterdata to COS only if a single element is greater than 8KB
        logger.debug('ExecutorID {} | JobID {} - Uploading data to the storage backend'
                     .format(executor_id, job_id))
        # pass iterdata through an object storage file
        data_key = create_data_key(executor_id, job_id)
        job.data_key = data_key
        data_bytes, data_byte_ranges = utils.agg_data(data_strs)
        job.data_byte_ranges = data_byte_ranges
        data_upload_start = time.time()
        internal_storage.put_data(data_key, data_bytes)
        data_upload_end = time.time()
        host_job_meta['host_data_upload_time'] = round(data_upload_end-data_upload_start, 6)

    else:
        # pass iterdata as part of the invocation payload
        logger.debug('ExecutorID {} | JobID {} - Data per activation is < '
                     '{}. Passing data through invocation payload'
                     .format(executor_id, job_id, utils.sizeof_fmt(8*1024)))
        job.data_key = None
        job.data_byte_ranges = None
        job.data_byte_strs = data_strs
        host_job_meta['host_data_upload_time'] = 0

    host_job_meta['host_job_created_time'] = round(time.time() - host_job_meta['host_job_create_tstamp'], 6)

    job.metadata = host_job_meta

    return job
Example #20
def wait_rabbitmq(fs,
                  internal_storage,
                  rabbit_amqp_url,
                  download_results=False,
                  throw_except=True,
                  pbar=None,
                  return_when=ALL_COMPLETED,
                  THREADPOOL_SIZE=128):
    """
    Wait for the Future instances `fs` to complete. Returns a 2-tuple of
    lists. The first list contains the futures that completed
    (finished or cancelled) before the wait completed. The second
    contains uncompleted futures.

    :param fs: A list of futures.
    :param internal_storage: Storage handler to poll cloud storage.
    :param rabbit_amqp_url: amqp url for accessing rabbitmq.
    :param pbar: Progress bar.
    :param return_when: One of `ALL_COMPLETED`, `ANY_COMPLETED`, `ALWAYS`
    :return: `(fs_dones, fs_notdones)`
        where `fs_dones` is a list of futures that have completed
        and `fs_notdones` is a list of futures that have not completed.
    :rtype: 2-tuple of lists
    """
    if return_when != ALL_COMPLETED:
        raise NotImplementedError(return_when)

    thread_pool = ThreadPoolExecutor(max_workers=THREADPOOL_SIZE)
    present_jobs = {}
    done_call_ids = {}

    for f in fs:
        if (download_results and not f.done) or (not download_results
                                                 and not (f.ready or f.done)):
            job_key = create_job_key(f.executor_id, f.job_id)
            if job_key not in present_jobs:
                present_jobs[job_key] = {}
            present_jobs[job_key][f.call_id] = f

    job_monitor_q = queue.Queue()
    for job_key in present_jobs.keys():
        total_calls = len(present_jobs[job_key])
        done_call_ids[job_key] = {'total': total_calls, 'call_ids': []}
        job_monitor = Thread(target=_job_monitor_thread,
                             args=(job_key, total_calls, rabbit_amqp_url,
                                   job_monitor_q))
        job_monitor.daemon = True
        job_monitor.start()

    # thread to check possible function activations unexpected errors.
    # It will raise a Timeout error if the status is not received after X seconds.
    running_futures = []
    ftc = Thread(target=_future_timeout_checker_thread,
                 args=(running_futures, job_monitor_q, throw_except))
    ftc.daemon = True
    ftc.start()

    def reception_finished():
        """
        Method to check if the call_status from all the function activations
        have been received.
        """
        for job_key in done_call_ids:
            total = done_call_ids[job_key]['total']
            received_call_ids = len(done_call_ids[job_key]['call_ids'])

            if total is None or total > received_call_ids:
                return False

        return True

    get_result_futures = []

    def get_result(f):
        f.result(throw_except=throw_except, internal_storage=internal_storage)

    while not reception_finished():
        try:
            call_status = job_monitor_q.get()
        except KeyboardInterrupt:
            raise KeyboardInterrupt

        rcvd_executor_id = call_status['executor_id']
        rcvd_job_id = call_status['job_id']
        rcvd_call_id = call_status['call_id']
        job_key = create_job_key(rcvd_executor_id, rcvd_job_id)
        fut = present_jobs[job_key][rcvd_call_id]
        fut._call_status = call_status
        fut.status(throw_except=throw_except,
                   internal_storage=internal_storage)

        if call_status['type'] == '__init__':
            running_futures.append(fut)

        if call_status['type'] == '__end__':
            done_call_ids[job_key]['call_ids'].append(rcvd_call_id)

            if pbar:
                pbar.update(1)
                pbar.refresh()

            if 'new_futures' in call_status:
                new_futures = fut.result()
                fs.extend(new_futures)

                if pbar:
                    pbar.total = pbar.total + len(new_futures)
                    pbar.refresh()

                present_jobs_new_futures = {
                    create_job_key(f.executor_id, f.job_id)
                    for f in new_futures
                }

                for f in new_futures:
                    job_key_new_futures = create_job_key(
                        f.executor_id, f.job_id)
                    if job_key_new_futures not in present_jobs:
                        present_jobs[job_key_new_futures] = {}
                    present_jobs[job_key_new_futures][f.call_id] = f

                for job_key_new_futures in present_jobs_new_futures:
                    total_calls = len(present_jobs[job_key_new_futures])
                    done_call_ids[job_key_new_futures] = {
                        'total': total_calls,
                        'call_ids': []
                    }
                    job_monitor = Thread(target=_job_monitor_thread,
                                         args=(job_key_new_futures, total_calls,
                                               rabbit_amqp_url, job_monitor_q))
                    job_monitor.daemon = True
                    job_monitor.start()

            if 'new_futures' not in call_status and download_results:
                gr_ft = thread_pool.submit(get_result, fut)
                get_result_futures.append(gr_ft)

    wait(get_result_futures)

    return fs, []
Example #21
def function_handler(event):
    start_tstamp = time.time()

    logger.debug("Action handler started")

    extra_env = event.get('extra_env', {})
    os.environ.update(extra_env)
    os.environ.update({'LITHOPS_WORKER': 'True', 'PYTHONUNBUFFERED': 'True'})

    config = event['config']
    call_id = event['call_id']
    job_id = event['job_id']
    executor_id = event['executor_id']
    job_key = create_job_key(executor_id, job_id)
    logger.info("Execution ID: {}/{}".format(job_key, call_id))

    runtime_name = event['runtime_name']
    runtime_memory = event['runtime_memory']
    execution_timeout = event['execution_timeout']

    logger.debug("Runtime name: {}".format(runtime_name))
    if runtime_memory:
        logger.debug("Runtime memory: {}MB".format(runtime_memory))
    logger.debug("Function timeout: {}s".format(execution_timeout))

    func_key = event['func_key']
    data_key = event['data_key']
    data_byte_range = event['data_byte_range']

    storage_config = extract_storage_config(config)
    internal_storage = InternalStorage(storage_config)

    call_status = CallStatus(config, internal_storage)
    call_status.response['host_submit_tstamp'] = event['host_submit_tstamp']
    call_status.response['worker_start_tstamp'] = start_tstamp
    context_dict = {
        'python_version': os.environ.get("PYTHON_VERSION"),
        'call_id': call_id,
        'job_id': job_id,
        'executor_id': executor_id,
        'activation_id': os.environ.get('__LITHOPS_ACTIVATION_ID')
    }
    call_status.response.update(context_dict)

    show_memory_peak = strtobool(os.environ.get('SHOW_MEMORY_PEAK', 'False'))

    try:
        if version.__version__ != event['lithops_version']:
            msg = (
                "Lithops version mismatch. Host version: {} - Runtime version: {}"
                .format(event['lithops_version'], version.__version__))
            raise RuntimeError('HANDLER', msg)

        # send init status event
        call_status.send('__init__')

        # call_status.response['free_disk_bytes'] = free_disk_space("/tmp")
        custom_env = {
            'LITHOPS_CONFIG': json.dumps(config),
            '__LITHOPS_SESSION_ID': '-'.join([job_key, call_id]),
            'PYTHONPATH': "{}:{}".format(os.getcwd(), LITHOPS_LIBS_PATH)
        }
        os.environ.update(custom_env)

        jobrunner_stats_dir = os.path.join(LITHOPS_TEMP_DIR,
                                           storage_config['bucket'],
                                           JOBS_PREFIX, job_key, call_id)
        os.makedirs(jobrunner_stats_dir, exist_ok=True)
        jobrunner_stats_filename = os.path.join(jobrunner_stats_dir,
                                                'jobrunner.stats.txt')

        jobrunner_config = {
            'lithops_config': config,
            'call_id': call_id,
            'job_id': job_id,
            'executor_id': executor_id,
            'func_key': func_key,
            'data_key': data_key,
            'data_byte_range': data_byte_range,
            'output_key': create_output_key(JOBS_PREFIX, executor_id, job_id, call_id),
            'stats_filename': jobrunner_stats_filename
        }

        if show_memory_peak:
            mm_handler_conn, mm_conn = Pipe()
            memory_monitor = Thread(target=memory_monitor_worker,
                                    args=(mm_conn, ))
            memory_monitor.start()

        handler_conn, jobrunner_conn = Pipe()
        jobrunner = JobRunner(jobrunner_config, jobrunner_conn,
                              internal_storage)
        logger.debug('Starting JobRunner process')
        local_execution = strtobool(
            os.environ.get('__LITHOPS_LOCAL_EXECUTION', 'False'))
        jrp = Thread(target=jobrunner.run) if local_execution else Process(
            target=jobrunner.run)
        jrp.start()

        jrp.join(execution_timeout)
        logger.debug('JobRunner process finished')

        if jrp.is_alive():
            # If process is still alive after jr.join(job_max_runtime), kill it
            try:
                jrp.terminate()
            except Exception:
                # thread does not have terminate method
                pass
            msg = ('Function exceeded maximum time of {} seconds and was '
                   'killed'.format(execution_timeout))
            raise TimeoutError('HANDLER', msg)

        if show_memory_peak:
            mm_handler_conn.send('STOP')
            memory_monitor.join()
            peak_memory_usage = int(mm_handler_conn.recv())
            logger.info("Peak memory usage: {}".format(
                sizeof_fmt(peak_memory_usage)))
            call_status.response['peak_memory_usage'] = peak_memory_usage

        if not handler_conn.poll():
            logger.error(
                'No completion message received from JobRunner process')
            logger.debug('Assuming memory overflow...')
            # Only 1 message is returned by jobrunner when it finishes.
            # If no message, this means that the jobrunner process was killed.
            # 99% of times the jobrunner is killed due an OOM, so we assume here an OOM.
            msg = 'Function exceeded maximum memory and was killed'
            raise MemoryError('HANDLER', msg)

        if os.path.exists(jobrunner_stats_filename):
            with open(jobrunner_stats_filename, 'r') as fid:
                for line in fid.readlines():
                    key, value = line.strip().split(" ", 1)
                    try:
                        call_status.response[key] = float(value)
                    except Exception:
                        call_status.response[key] = value
                    if key in [
                            'exception', 'exc_pickle_fail', 'result',
                            'new_futures'
                    ]:
                        call_status.response[key] = eval(value)

    except Exception:
        # internal runtime exceptions
        print('----------------------- EXCEPTION !-----------------------',
              flush=True)
        traceback.print_exc(file=sys.stdout)
        print('----------------------------------------------------------',
              flush=True)
        call_status.response['exception'] = True

        pickled_exc = pickle.dumps(sys.exc_info())
        pickle.loads(
            pickled_exc)  # this is just to make sure they can be unpickled
        call_status.response['exc_info'] = str(pickled_exc)

    finally:
        call_status.response['worker_end_tstamp'] = time.time()
        call_status.send('__end__')

        # Unset specific env vars
        for key in extra_env:
            os.environ.pop(key, None)
        os.environ.pop('__LITHOPS_TOTAL_EXECUTORS', None)

        logger.info("Finished")